mirror of https://github.com/apache/lucene.git
LUCENE-9450 Switch to BinaryDocValues instead of stored fields in Lucene's facet implementation, yielding ~4-5% red-line QPS gain in pure faceting benchmarks (#1733)
parent 06877b2c6e
commit 3f8f84f9b0
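The speedup comes from replacing a per-ordinal stored-fields document load with a columnar BinaryDocValues lookup in DirectoryTaxonomyReader.getPath (see the hunks below). A minimal sketch, not part of the commit, contrasting the two lookup styles; it assumes the field name literal "$full_path$" mirrors the value of Consts.FULL in the facet module:

// Sketch only: old stored-fields label lookup vs. new BinaryDocValues lookup.
import java.io.IOException;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;

public class LabelLookupSketch {
  static final String FULL = "$full_path$"; // assumption: mirrors Consts.FULL

  // Old style: decompress and load a stored-fields document per ordinal.
  static String labelViaStoredFields(DirectoryReader reader, int ordinal) throws IOException {
    return reader.document(ordinal).get(FULL);
  }

  // New style: per-leaf doc values, addressed by a leaf-local doc ID.
  static String labelViaDocValues(DirectoryReader reader, int ordinal) throws IOException {
    LeafReaderContext leaf = reader.leaves().get(ReaderUtil.subIndex(ordinal, reader.leaves()));
    BinaryDocValues values = leaf.reader().getBinaryDocValues(FULL);
    if (values != null && values.advanceExact(ordinal - leaf.docBase)) {
      return values.binaryValue().utf8ToString();
    }
    return null; // no doc values: index was written before this change
  }
}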
lucene/CHANGES.txt
@@ -156,6 +156,9 @@ Improvements
 * LUCENE-9531: Consolidated CharStream and FastCharStream classes: these have been moved
   from each query parser package to org.apache.lucene.queryparser.charstream (Dawid Weiss).
 
+* LUCENE-9450: Use BinaryDocValues for the taxonomy index instead of StoredFields.
+  Add backwards compatibility tests for the taxonomy index. (Gautam Worah, Michael McCandless)
+
 Bug fixes
 
 * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
lucene/facet/build.gradle
@@ -27,4 +27,6 @@ dependencies {
 
   testImplementation project(':lucene:test-framework')
   testImplementation project(':lucene:queries')
+  // Required for opening older indexes for backward compatibility tests
+  testCompile group: 'org.apache.lucene', name: 'lucene-codecs', version: '8.6.3'
 }
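Per the comment in the hunk above, the pinned lucene-codecs 8.6.3 test dependency keeps the older codec classes resolvable so the zipped 8.6.3 taxonomy index added below can be opened. A hedged sketch, not part of the commit, of how a test might confirm which codec wrote each segment of such an unzipped index:

// Sketch: print the codec recorded in each segment of an existing index.
// Opening a segment requires its codec to be available on the classpath.
import java.io.IOException;

import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.Directory;

public class CodecCheckSketch {
  static void printSegmentCodecs(Directory dir) throws IOException {
    SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
    for (SegmentCommitInfo si : infos) {
      System.out.println(si.info.name + " -> " + si.info.getCodec().getName());
    }
  }
}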
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -31,12 +31,15 @@ import org.apache.lucene.facet.taxonomy.FacetLabel;
 import org.apache.lucene.facet.taxonomy.LRUHashMap;
 import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.CorruptIndexException; // javadocs
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.MultiTerms;
 import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.SegmentReader;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.Directory;
@@ -322,9 +325,24 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountable
         return res;
       }
     }
-    Document doc = indexReader.document(ordinal);
-    FacetLabel ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
+
+    int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
+    LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
+    // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
+    BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);
+
+    FacetLabel ret;
+
+    if (values == null || values.advanceExact(ordinal - indexReader.leaves().get(readerIndex).docBase) == false) {
+      // The index uses the older StoredField format to store the mapping
+      // On recreating the index, the values will be stored using the BinaryDocValuesField format
+      Document doc = indexReader.document(ordinal);
+      ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL)));
+    } else {
+      // The index uses the BinaryDocValuesField format to store the mapping
+      ret = new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
+    }
 
     synchronized (categoryCache) {
       categoryCache.put(catIDInteger, ret);
     }
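Doc values are stored per segment and addressed by leaf-local doc IDs, so getPath first maps the global ordinal to a leaf with ReaderUtil.subIndex and then subtracts that leaf's docBase before calling advanceExact. A worked illustration, not part of the commit, with hypothetical segment sizes (the linear scan stands in for subIndex's binary search):

// Illustration of the global-ordinal -> (leaf, local doc ID) translation.
public class DocBaseSketch {
  public static void main(String[] args) {
    // Three hypothetical segments of 100, 50, and 200 docs.
    int[] docBases = {0, 100, 150};
    int ordinal = 130; // global ordinal to resolve

    int readerIndex = 0;
    for (int i = 0; i < docBases.length; i++) {
      if (ordinal >= docBases[i]) {
        readerIndex = i; // last leaf whose docBase is <= ordinal
      }
    }
    int localDocId = ordinal - docBases[readerIndex];
    // Prints: leaf=1 localDoc=30 -- advanceExact would be called with 30.
    System.out.println("leaf=" + readerIndex + " localDoc=" + localDocId);
  }
}

When advanceExact returns false, the segment has no BinaryDocValues for Consts.FULL, meaning it was written before this change, and getPath falls back to the stored-fields lookup; old and new segments can therefore coexist in one taxonomy index.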
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -31,6 +31,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -193,7 +194,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
     ft.setOmitNorms(true);
     parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
-    fullPathField = new StringField(Consts.FULL, "", Field.Store.YES);
+    fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);
 
     nextID = indexWriter.getDocStats().maxDoc;
 
@@ -492,8 +493,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     Document d = new Document();
     d.add(parentStreamField);
 
-    fullPathField.setStringValue(FacetsConfig.pathToString(categoryPath.components, categoryPath.length));
+    String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
+    fullPathField.setStringValue(fieldPath);
     d.add(fullPathField);
+    d.add(new BinaryDocValuesField(Consts.FULL, new BytesRef(fieldPath)));
 
     // Note that we do not pass an Analyzer here because the fields that are
     // added to the Document are untokenized or contain their own TokenStream.
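After this hunk, each category document carries its path twice under the same field name: an indexed, unstored StringField that getOrdinal uses to look the ordinal up by path, and a BinaryDocValuesField that getPath uses for the reverse mapping. Lucene permits an indexed field and a doc-values field to share a name, which is what makes the swap possible without a field rename. A minimal sketch, not part of the commit, assuming "$full_path$" mirrors Consts.FULL:

// Sketch: one category document holding both representations of the path.
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.util.BytesRef;

public class CategoryDocSketch {
  public static Document categoryDoc(String fullPath) {
    Document d = new Document();
    // Indexed, unstored term: getOrdinal finds the category by its path.
    d.add(new StringField("$full_path$", fullPath, Field.Store.NO));
    // Doc values: getPath resolves the ordinal back to the path.
    d.add(new BinaryDocValuesField("$full_path$", new BytesRef(fullPath)));
    return d;
  }
}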
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.apache.lucene.facet.taxonomy.TaxonomyReader;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+import org.junit.Ignore;
+
+/*
+  Verify we can read previous versions' taxonomy indexes, do searches
+  against them, and add documents to them.
+*/
+// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove files on Windows machines occasionally
+public class TestBackwardsCompatibility extends LuceneTestCase {
+
+  // To generate backcompat indexes with the current default codec, run the following gradle command:
+  //  gradlew test -Dtestcase=TestBackwardsCompatibility -Dtests.bwcdir=/path/to/store/indexes
+  //    -Dtests.codec=default -Dtests.useSecurityManager=false
+  // Also add testmethod with one of the index creation methods below, for example:
+  //  -Dtestmethod=testCreateOldTaxonomy
+  //
+  // Zip up the generated indexes:
+  //
+  //  cd /path/to/store/indexes/index.cfs ; zip index.<VERSION>-cfs.zip *
+  //
+  // Then move the zip file to your trunk checkout and use it in your test cases
+
+  public static final String oldTaxonomyIndexName = "taxonomy.8.6.3-cfs";
+
+  public void testCreateNewTaxonomy() throws IOException {
+    createNewTaxonomyIndex(oldTaxonomyIndexName);
+  }
+
+  // Opens up a pre-existing old taxonomy index and adds new BinaryDocValues based fields
+  private void createNewTaxonomyIndex(String dirName) throws IOException {
+    Path indexDir = createTempDir(oldTaxonomyIndexName);
+    TestUtil.unzip(getDataInputStream(dirName + ".zip"), indexDir);
+    Directory dir = newFSDirectory(indexDir);
+
+    DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
+
+    FacetLabel cp_b = new FacetLabel("b");
+    writer.addCategory(cp_b);
+    writer.getInternalIndexWriter().forceMerge(1);
+    writer.commit();
+
+    TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
+
+    int ord1 = reader.getOrdinal(new FacetLabel("a"));
+    assert ord1 != TaxonomyReader.INVALID_ORDINAL;
+    // Just asserting ord1 != TaxonomyReader.INVALID_ORDINAL is not enough to check compatibility
+    assertNotNull(reader.getPath(ord1));
+
+    int ord2 = reader.getOrdinal(cp_b);
+    assert ord2 != TaxonomyReader.INVALID_ORDINAL;
+    assertNotNull(reader.getPath(ord2));
+
+    reader.close();
+    writer.close();
+    dir.close();
+  }
+
+  // Used to create a fresh taxonomy index with StoredFields
+  @Ignore
+  public void testCreateOldTaxonomy() throws IOException {
+    createOldTaxonomyIndex(oldTaxonomyIndexName);
+  }
+
+  private void createOldTaxonomyIndex(String dirName) throws IOException {
+    Path indexDir = getIndexDir().resolve(dirName);
+    Files.deleteIfExists(indexDir);
+    Directory dir = newFSDirectory(indexDir);
+
+    TaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
+
+    writer.addCategory(new FacetLabel("a"));
+    writer.commit();
+    writer.close();
+    dir.close();
+  }
+
+  private Path getIndexDir() {
+    String path = System.getProperty("tests.bwcdir");
+    assumeTrue("backcompat creation tests must be run with -Dtests.bwcdir=/path/to/write/indexes", path != null);
+    return Paths.get(path);
+  }
+}
lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/taxonomy.8.6.3-cfs.zip
Binary file not shown.
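For reference, a condensed round trip through the new code path, not part of the commit: write a category, commit, reopen, and resolve the ordinal back to its label, which on an index created after this change is served from BinaryDocValues.

// End-to-end sketch of the taxonomy ordinal/label round trip.
import java.io.IOException;

import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class TaxonomyRoundTrip {
  public static void main(String[] args) throws IOException {
    try (Directory dir = new ByteBuffersDirectory()) {
      DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
      int ordinal = writer.addCategory(new FacetLabel("color", "red"));
      writer.commit();

      DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(dir);
      FacetLabel label = reader.getPath(ordinal);
      System.out.println(label); // the components ["color", "red"]
      reader.close();
      writer.close();
    }
  }
}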
|
@ -204,6 +204,8 @@ org.apache.kerby:kerb-server:1.0.1 (1 constraints: 0405f135)
|
|||
org.apache.kerby:kerb-simplekdc:1.0.1 (1 constraints: 0405f135)
|
||||
org.apache.kerby:kerby-kdc:1.0.1 (1 constraints: 0405f135)
|
||||
org.apache.logging.log4j:log4j-1.2-api:2.13.2 (1 constraints: 3a053a3b)
|
||||
org.apache.lucene:lucene-codecs:8.6.3 (1 constraints: 13052836)
|
||||
org.apache.lucene:lucene-core:8.6.3 (1 constraints: 7f0d022f)
|
||||
org.asciidoctor:asciidoctorj:1.6.2 (1 constraints: 0b050436)
|
||||
org.asciidoctor:asciidoctorj-api:1.6.2 (1 constraints: e30cfb0d)
|
||||
org.hsqldb:hsqldb:2.4.0 (1 constraints: 08050136)
|
||||
|
|