LUCENE-9450 Switch to BinaryDocValues instead of stored fields in Lucene's facet implementation, yielding ~4-5% red-line QPS gain in pure faceting benchmarks (#1733)

This commit is contained in:
Gautam Worah 2020-11-12 14:13:31 -08:00 committed by GitHub
parent 06877b2c6e
commit 3f8f84f9b0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 143 additions and 5 deletions

View File

@ -156,6 +156,9 @@ Improvements
* LUCENE-9531: Consolidated CharStream and FastCharStream classes: these have been moved
from each query parser package to org.apache.lucene.queryparser.charstream (Dawid Weiss).
* LUCENE-9450: Use BinaryDocValues for the taxonomy index instead of StoredFields.
Add backwards compatibility tests for the taxonomy index. (Gautam Worah, Michael McCandless)
Bug fixes
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while

View File

@ -27,4 +27,6 @@ dependencies {
testImplementation project(':lucene:test-framework')
testImplementation project(':lucene:queries')
// Required for opening older indexes for backward compatibility tests
testCompile group: 'org.apache.lucene', name: 'lucene-codecs', version: '8.6.3'
}

View File

@ -31,12 +31,15 @@ import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.LRUHashMap;
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
@ -323,8 +326,23 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
}
}
int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
// TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);

// Declare once, assign in exactly one branch below. (The previous text both
// declared and re-declared `ret` inside the if-branch, which shadows this
// variable and fails to compile: "variable ret already defined".)
FacetLabel ret;

// advanceExact takes a segment-local doc id, so subtract the leaf's docBase.
if (values == null
    || values.advanceExact(ordinal - indexReader.leaves().get(readerIndex).docBase) == false) {
  // The index uses the older StoredField format to store the mapping.
  // On recreating the index, the values will be stored using the BinaryDocValuesField format.
  Document doc = indexReader.document(ordinal);
  ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
} else {
  // The index uses the BinaryDocValuesField format to store the mapping.
  ret = new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
}

// categoryCache is shared across threads; guard the put.
synchronized (categoryCache) {
  categoryCache.put(catIDInteger, ret);
}

View File

@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
@ -193,7 +194,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setOmitNorms(true);
parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
// The full path is still indexed (for path -> ordinal lookup) but no longer
// stored: the ordinal -> path mapping now lives in a BinaryDocValues field
// (LUCENE-9450). The previous duplicate assignment with Field.Store.YES was
// dead code left over from the old stored-field approach and is removed.
fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);

nextID = indexWriter.getDocStats().maxDoc;
@ -492,8 +493,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
Document d = new Document();
d.add(parentStreamField);

// Compute the path string once and reuse it for both the indexed StringField
// and the BinaryDocValues payload. (The previous text called
// FacetsConfig.pathToString twice via a dead duplicate setStringValue call.)
String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
fullPathField.setStringValue(fieldPath);
d.add(fullPathField);
// Store the same path as BinaryDocValues so readers can resolve
// ordinal -> label without stored fields (LUCENE-9450).
d.add(new BinaryDocValuesField(Consts.FULL, new BytesRef(fieldPath)));
// Note that we do not pass an Analyzer here because the fields that are
// added to the Document are untokenized or contain their own TokenStream.

View File

@ -0,0 +1,110 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.Ignore;
/*
Verify we can read previous versions' taxonomy indexes, do searches
against them, and add documents to them.
*/
// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove files on Windows machines occasionally
public class TestBackwardsCompatibility extends LuceneTestCase {

  // To generate backcompat indexes with the current default codec, run the following gradle command:
  // gradlew test -Dtestcase=TestBackwardsCompatibility -Dtests.bwcdir=/path/to/store/indexes
  // -Dtests.codec=default -Dtests.useSecurityManager=false
  // Also add testmethod with one of the index creation methods below, for example:
  // -Dtestmethod=testCreateOldTaxonomy
  //
  // Zip up the generated indexes:
  //
  // cd /path/to/store/indexes/index.cfs ; zip index.<VERSION>-cfs.zip *
  //
  // Then move the zip file to your trunk checkout and use it in your test cases
  public static final String oldTaxonomyIndexName = "taxonomy.8.6.3-cfs";

  /**
   * Verifies a pre-existing 8.6.3 (StoredFields-based) taxonomy index can be
   * opened, appended to with the current (BinaryDocValues-based) writer, and
   * that both old and new categories resolve to valid ordinals and paths.
   */
  public void testCreateNewTaxonomy() throws IOException {
    createNewTaxonomyIndex(oldTaxonomyIndexName);
  }

  // Opens up a pre-existing old taxonomy index and adds new BinaryDocValues based fields
  private void createNewTaxonomyIndex(String dirName) throws IOException {
    Path indexDir = createTempDir(oldTaxonomyIndexName);
    TestUtil.unzip(getDataInputStream(dirName + ".zip"), indexDir);
    Directory dir = newFSDirectory(indexDir);

    DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
    FacetLabel cp_b = new FacetLabel("b");
    writer.addCategory(cp_b);
    // Force the old (stored-field) and new (BinaryDocValues) segments into one,
    // so the reader must handle a mixed-provenance segment.
    writer.getInternalIndexWriter().forceMerge(1);
    writer.commit();

    TaxonomyReader reader = new DirectoryTaxonomyReader(writer);

    int ord1 = reader.getOrdinal(new FacetLabel("a"));
    // Use JUnit assertions instead of the bare `assert` keyword: the latter is
    // silently skipped unless the JVM runs with -ea.
    assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord1);
    // Just asserting ord1 != TaxonomyReader.INVALID_ORDINAL is not enough to check compatibility
    assertNotNull(reader.getPath(ord1));

    int ord2 = reader.getOrdinal(cp_b);
    assertNotEquals(TaxonomyReader.INVALID_ORDINAL, ord2);
    assertNotNull(reader.getPath(ord2));

    reader.close();
    writer.close();
    dir.close();
  }

  // Used to create a fresh taxonomy index with StoredFields
  @Ignore
  public void testCreateOldTaxonomy() throws IOException {
    createOldTaxonomyIndex(oldTaxonomyIndexName);
  }

  private void createOldTaxonomyIndex(String dirName) throws IOException {
    Path indexDir = getIndexDir().resolve(dirName);
    // NOTE(review): deleteIfExists only removes a plain file or an *empty*
    // directory; rerunning against a previously generated, non-empty index
    // directory will throw DirectoryNotEmptyException. Confirm whether a
    // recursive cleanup (e.g. IOUtils.rm) is intended here.
    Files.deleteIfExists(indexDir);
    Directory dir = newFSDirectory(indexDir);

    TaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
    writer.addCategory(new FacetLabel("a"));
    writer.commit();
    writer.close();
    dir.close();
  }

  /** Resolves -Dtests.bwcdir; assumption-skips the creation helpers when it is unset. */
  private Path getIndexDir() {
    String path = System.getProperty("tests.bwcdir");
    assumeTrue(
        "backcompat creation tests must be run with -Dtests.bwcdir=/path/to/write/indexes",
        path != null);
    return Paths.get(path);
  }
}

View File

@ -204,6 +204,8 @@ org.apache.kerby:kerb-server:1.0.1 (1 constraints: 0405f135)
org.apache.kerby:kerb-simplekdc:1.0.1 (1 constraints: 0405f135)
org.apache.kerby:kerby-kdc:1.0.1 (1 constraints: 0405f135)
org.apache.logging.log4j:log4j-1.2-api:2.13.2 (1 constraints: 3a053a3b)
org.apache.lucene:lucene-codecs:8.6.3 (1 constraints: 13052836)
org.apache.lucene:lucene-core:8.6.3 (1 constraints: 7f0d022f)
org.asciidoctor:asciidoctorj:1.6.2 (1 constraints: 0b050436)
org.asciidoctor:asciidoctorj-api:1.6.2 (1 constraints: e30cfb0d)
org.hsqldb:hsqldb:2.4.0 (1 constraints: 08050136)