LUCENE-9450 Switch to BinaryDocValues instead of stored fields in Lucene's facet implementation, yielding ~4-5% red-line QPS gain in pure faceting benchmarks (#1733)

2020-11-12 14:13:31 -08:00 · 2020-11-12 14:13:31 -08:00 · 3f8f84f9b0
parent 06877b2c6e
commit 3f8f84f9b0
7 changed files with 143 additions and 5 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -156,6 +156,9 @@ Improvements
 * LUCENE-9531: Consolidated CharStream and FastCharStream classes: these have been moved
  from each query parser package to org.apache.lucene.queryparser.charstream (Dawid Weiss).

+* LUCENE-9450: Use BinaryDocValues for the taxonomy index instead of StoredFields.
+  Add backwards compatibility tests for the taxonomy index. (Gautam Worah, Michael McCandless)
+
 Bug fixes

 * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while 
--- a/lucene/facet/build.gradle
+++ b/lucene/facet/build.gradle
@ -27,4 +27,6 @@ dependencies {

  testImplementation project(':lucene:test-framework')
  testImplementation project(':lucene:queries')
+  // Required for opening older indexes for backward compatibility tests
+  testCompile group: 'org.apache.lucene', name: 'lucene-codecs', version: '8.6.3'
 }
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@ -31,12 +31,15 @@ import org.apache.lucene.facet.taxonomy.FacetLabel;
 import org.apache.lucene.facet.taxonomy.LRUHashMap;
 import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException; // javadocs
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
@ -322,9 +325,24 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
        return res;
      }
    }
-    
-    Document doc = indexReader.document(ordinal);
-    FacetLabel ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
+
+    int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
+    LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
+    // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
+    BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);
+
+    FacetLabel ret;
+
+    if (values == null || values.advanceExact(ordinal-indexReader.leaves().get(readerIndex).docBase) == false) {
+      // The index uses the older StoredField format to store the mapping
+      // On recreating the index, the values will be stored using the BinaryDocValuesField format
+      Document doc = indexReader.document(ordinal);
+      ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
+    } else {
+      // The index uses the BinaryDocValuesField format to store the mapping
+      ret = new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
+    }
+
    synchronized (categoryCache) {
      categoryCache.put(catIDInteger, ret);
    }
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@ -193,7 +194,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setOmitNorms(true);
    parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
-    fullPathField = new StringField(Consts.FULL, "", Field.Store.YES);
+    fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);

    nextID = indexWriter.getDocStats().maxDoc;

@ -492,8 +493,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    Document d = new Document();
    d.add(parentStreamField);

-    fullPathField.setStringValue(FacetsConfig.pathToString(categoryPath.components, categoryPath.length));
+    String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
+    fullPathField.setStringValue(fieldPath);
    d.add(fullPathField);
+    d.add(new BinaryDocValuesField(Consts.FULL, new BytesRef(fieldPath)));

    // Note that we do no pass an Analyzer here because the fields that are
    // added to the Document are untokenized or contains their own TokenStream.
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.junit.Ignore;
+
+/*
+  Verify we can read previous versions' taxonomy indexes, do searches
+  against them, and add documents to them.
+*/
+// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove files on Windows machines occasionally
+public class TestBackwardsCompatibility extends LuceneTestCase {
+
+    // To generate backcompat indexes with the current default codec, run the following gradle command:
+    //  gradlew test -Dtestcase=TestBackwardsCompatibility -Dtests.bwcdir=/path/to/store/indexes
+    //           -Dtests.codec=default -Dtests.useSecurityManager=false
+    // Also add testmethod with one of the index creation methods below, for example:
+    //    -Dtestmethod=testCreateOldTaxonomy
+    //
+    // Zip up the generated indexes:
+    //
+    //    cd /path/to/store/indexes/index.cfs   ; zip index.<VERSION>-cfs.zip *
+    //
+    // Then move the zip file to your trunk checkout and use it in your test cases
+
+    public static final String oldTaxonomyIndexName = "taxonomy.8.6.3-cfs";
+
+    public void testCreateNewTaxonomy() throws IOException {
+        createNewTaxonomyIndex(oldTaxonomyIndexName);
+    }
+
+    // Opens up a pre-existing old taxonomy index and adds new BinaryDocValues based fields
+    private void createNewTaxonomyIndex(String dirName) throws IOException {
+        Path indexDir = createTempDir(oldTaxonomyIndexName);
+        TestUtil.unzip(getDataInputStream(dirName + ".zip"), indexDir);
+        Directory dir = newFSDirectory(indexDir);
+
+        DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
+
+        FacetLabel cp_b = new FacetLabel("b");
+        writer.addCategory(cp_b);
+        writer.getInternalIndexWriter().forceMerge(1);
+        writer.commit();
+
+        TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
+
+        int ord1 = reader.getOrdinal(new FacetLabel("a"));
+        assert ord1 != TaxonomyReader.INVALID_ORDINAL;
+        // Just asserting ord1 != TaxonomyReader.INVALID_ORDINAL is not enough to check compatibility
+        assertNotNull(reader.getPath(ord1));
+
+        int ord2 = reader.getOrdinal(cp_b);
+        assert ord2 != TaxonomyReader.INVALID_ORDINAL;
+        assertNotNull(reader.getPath(ord2));
+
+        reader.close();
+        writer.close();
+        dir.close();
+    }
+
+    // Used to create a fresh taxonomy index with StoredFields
+    @Ignore
+    public void testCreateOldTaxonomy() throws IOException {
+        createOldTaxonomyIndex(oldTaxonomyIndexName);
+    }
+
+    private void createOldTaxonomyIndex(String dirName) throws IOException {
+        Path indexDir = getIndexDir().resolve(dirName);
+        Files.deleteIfExists(indexDir);
+        Directory dir = newFSDirectory(indexDir);
+
+        TaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
+
+        writer.addCategory(new FacetLabel("a"));
+        writer.commit();
+        writer.close();
+        dir.close();
+    }
+
+    private Path getIndexDir() {
+        String path = System.getProperty("tests.bwcdir");
+        assumeTrue("backcompat creation tests must be run with -Dtests.bwcdir=/path/to/write/indexes", path != null);
+        return Paths.get(path);
+    }
+}
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/taxonomy.8.6.3-cfs.zip
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/taxonomy.8.6.3-cfs.zip
--- a/versions.lock
+++ b/versions.lock
@ -204,6 +204,8 @@ org.apache.kerby:kerb-server:1.0.1 (1 constraints: 0405f135)
 org.apache.kerby:kerb-simplekdc:1.0.1 (1 constraints: 0405f135)
 org.apache.kerby:kerby-kdc:1.0.1 (1 constraints: 0405f135)
 org.apache.logging.log4j:log4j-1.2-api:2.13.2 (1 constraints: 3a053a3b)
+org.apache.lucene:lucene-codecs:8.6.3 (1 constraints: 13052836)
+org.apache.lucene:lucene-core:8.6.3 (1 constraints: 7f0d022f)
 org.asciidoctor:asciidoctorj:1.6.2 (1 constraints: 0b050436)
 org.asciidoctor:asciidoctorj-api:1.6.2 (1 constraints: e30cfb0d)
 org.hsqldb:hsqldb:2.4.0 (1 constraints: 08050136)