LUCENE-10122 Use NumericDocValues to store taxonomy parent array (#454)

This commit is contained in:
Patrick Zhai 2021-11-19 10:05:56 -08:00 committed by GitHub
parent 631d1ad749
commit 6b99f03cdd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 95 additions and 36 deletions

View File

@ -232,7 +232,7 @@ Improvements
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)
* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai)
* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Patrick Zhai)
* LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O.
Previously Lucene was too conservative, jumping through hoops to detect if disks were SSD-backed.
@ -439,6 +439,8 @@ Build
Other
---------------------
* LUCENE-10122: Use NumericDocValues to store taxonomy parent array (Patrick Zhai)
* LUCENE-10136: allow 'var' declarations in source code (Dawid Weiss)
* LUCENE-9570, LUCENE-9564: Apply google java format and enforce it on source Java files.
@ -447,7 +449,7 @@ Other
* LUCENE-9631: Properly override slice() on subclasses of OffsetRange. (Dawid Weiss)
* LUCENE-9391: Upgrade HPPC to 0.8.2. (Haoyu Zhai)
* LUCENE-9391: Upgrade HPPC to 0.8.2. (Patrick Zhai)
* LUCENE-10021: Upgrade HPPC to 0.9.0. Replace usage of ...ScatterMap to ...HashMap. (Patrick Zhai)
@ -508,7 +510,7 @@ Improvements
* LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments.
(Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir)
* LUCENE-10103: Make QueryCache respect Accountable queries. (Haoyu Zhai)
* LUCENE-10103: Make QueryCache respect Accountable queries. (Patrick Zhai)
Optimizations
---------------------
@ -720,7 +722,7 @@ New Features
(Cameron VandenBerg)
* LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes
on a consistent multi-segment index even when they require re-indexing. (Haoyu Zhai)
on a consistent multi-segment index even when they require re-indexing. (Patrick Zhai)
* LUCENE-9385: Add FacetsConfig option to control which drill-down
terms are indexed for a FacetLabel (Zachary Chen)
@ -923,7 +925,7 @@ Improvements
* LUCENE-8574: Add a new ExpressionValueSource which will enforce only one value per name
per hit in dependencies, ExpressionFunctionValues will no longer
recompute already computed values (Haoyu Zhai)
recompute already computed values (Patrick Zhai)
* LUCENE-9416: Fix CheckIndex to print an invalid non-zero norm as
unsigned long when detecting corruption.
@ -996,7 +998,7 @@ Bug Fixes
Documentation
---------------------
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Patrick Zhai)
Changes in Runtime Behavior
---------------------

View File

@ -21,7 +21,13 @@ import org.apache.lucene.util.BytesRef;
/**
 * Field names and payload constants shared by the directory taxonomy writer and reader classes.
 *
 * @lucene.experimental
 */
abstract class Consts {
  /** Name of the field that holds the full category path. */
  static final String FULL = "$full_path$";

  /** Name of the NumericDocValues field that holds the parent ordinal of a category. */
  static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";

  // TODO: Remove the 3 constants below in Lucene 10. They are only used when parent
  // ordinals are stored as term positions, a format that was deprecated in Lucene 9
  // in favor of NumericDocValues.
  static final String PAYLOAD_PARENT = "p";
  static final String FIELD_PAYLOADS = "$payloads$";
  static final BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT);
}

View File

@ -34,6 +34,7 @@ import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.FacetsConfig;
@ -91,16 +92,18 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
private final Directory dir;
private final IndexWriter indexWriter;
private final boolean useOlderStoredFieldIndex;
private final boolean useOlderFormat;
private final TaxonomyWriterCache cache;
private final AtomicInteger cacheMisses = new AtomicInteger(0);
// Records the taxonomy index epoch, updated on replaceTaxonomy as well.
private long indexEpoch;
// TODO: remove following 2 fields in Lucene 10
private SinglePositionTokenStream parentStream =
new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
private Field parentStreamField;
private Field fullPathField;
private int cacheMissesUntilFill = 11;
private boolean shouldFillCache = true;
@ -160,13 +163,13 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
if (DirectoryReader.indexExists(directory) == false) {
indexEpoch = 1;
// no commit exists so we can safely use the new doc-values-based format
// (BinaryDocValues for the full path, NumericDocValues for the parent ordinal)
useOlderStoredFieldIndex = false;
useOlderFormat = false;
} else {
String epochStr = null;
SegmentInfos infos = SegmentInfos.readLatestCommit(dir);
/* a previous commit exists, so check the version of the last commit */
useOlderStoredFieldIndex = infos.getIndexCreatedVersionMajor() <= 8;
useOlderFormat = infos.getIndexCreatedVersionMajor() <= 8;
Map<String, String> commitData = infos.getUserData();
if (commitData != null) {
@ -181,12 +184,17 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
++indexEpoch;
}
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setOmitNorms(true);
parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
if (useOlderStoredFieldIndex) {
if (useOlderFormat) {
// parent ordinal field
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setOmitNorms(true);
parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
// full path field
fullPathField = new StringField(Consts.FULL, "", Field.Store.YES);
} else {
parentStreamField = null;
fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);
}
@ -466,23 +474,28 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* effectively synchronized as well.
*/
private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException {
// Before Lucene 2.9, position increments >=0 were supported, so we
// added 1 to parent to allow the parent -1 (the parent of the root).
// Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
// no longer enough, since 0 is not encoded consistently either (see
// comment in SinglePositionTokenStream). But because we must be
// backward-compatible with existing indexes, we can't just fix what
// we write here (e.g., to write parent+2), and need to do a workaround
// in the reader (which knows that anyway only category 0 has a parent
// -1).
parentStream.set(Math.max(parent + 1, 1));
Document d = new Document();
d.add(parentStreamField);
if (useOlderFormat) {
// Before Lucene 2.9, position increments >=0 were supported, so we
// added 1 to parent to allow the parent -1 (the parent of the root).
// Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
// no longer enough, since 0 is not encoded consistently either (see
// comment in SinglePositionTokenStream). But because we must be
// backward-compatible with existing indexes, we can't just fix what
// we write here (e.g., to write parent+2), and need to do a workaround
// in the reader (which knows that anyway only category 0 has a parent
// -1).
assert parentStreamField != null;
parentStream.set(Math.max(parent + 1, 1));
d.add(parentStreamField);
} else {
d.add(new NumericDocValuesField(Consts.FIELD_PARENT_ORDINAL_NDV, parent));
}
String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
fullPathField.setStringValue(fieldPath);
if (useOlderStoredFieldIndex == false) {
if (useOlderFormat == false) {
/* Lucene 9 switches to BinaryDocValuesField for storing taxonomy categories */
d.add(new BinaryDocValuesField(Consts.FULL, new BytesRef(fieldPath)));
}
@ -508,6 +521,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
return id;
}
// TODO: remove this class in Lucene 10
private static class SinglePositionTokenStream extends TokenStream {
private CharTermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;

View File

@ -25,7 +25,9 @@ import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Accountable;
@ -58,13 +60,6 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
parents = new int[reader.maxDoc()];
if (parents.length > 0) {
initParents(reader, 0);
// Starting Lucene 2.9, following the change LUCENE-1542, we can
// no longer reliably read the parent "-1" (see comment in
// LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
// to fix this in indexing without breaking backward-compatibility
// with existing indexes, so what we'll do instead is just
// hard-code the parent of ordinal 0 to be -1, and assume (as is
// indeed the case) that no other parent can be -1.
parents[0] = TaxonomyReader.INVALID_ORDINAL;
}
}
@ -130,6 +125,47 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
return;
}
if (getMajorVersion(reader) <= 8) {
loadParentUsingTermPosition(reader, first);
return;
}
for (LeafReaderContext leafContext : reader.leaves()) {
int leafDocNum = leafContext.reader().maxDoc();
if (leafContext.docBase + leafDocNum <= first) {
// skip this leaf if it does not contain new categories
continue;
}
NumericDocValues parentValues =
leafContext.reader().getNumericDocValues(Consts.FIELD_PARENT_ORDINAL_NDV);
if (parentValues == null) {
throw new CorruptIndexException(
"Parent data field " + Consts.FIELD_PARENT_ORDINAL_NDV + " not exists",
leafContext.reader().toString());
}
for (int doc = Math.max(first - leafContext.docBase, 0); doc < leafDocNum; doc++) {
if (parentValues.advanceExact(doc) == false) {
throw new CorruptIndexException(
"Missing parent data for category " + (doc + leafContext.docBase), reader.toString());
}
// we're putting an int and converting it back so it should be safe
parents[doc + leafContext.docBase] = Math.toIntExact(parentValues.longValue());
}
}
}
/**
 * Returns the created-version major reported by the metadata of the reader's first leaf. The
 * reader is expected to have at least one leaf.
 */
private static int getMajorVersion(IndexReader reader) {
  assert reader.leaves().isEmpty() == false;
  LeafReaderContext firstLeaf = reader.leaves().get(0);
  return firstLeaf.reader().getMetaData().getCreatedVersionMajor();
}
/**
 * Loads the parent array the old way, from the term positions that store parent ordinals in
 * indexes created by Lucene 8 or earlier. Throws CorruptIndexException if parent data is missing.
 */
// TODO: Remove in Lucene 10, this is only for back-compatibility
private void loadParentUsingTermPosition(IndexReader reader, int first) throws IOException {
// it's ok to use MultiTerms because we only iterate on one posting list.
// breaking it to loop over the leaves() only complicates code for no
// apparent gain.
@ -140,7 +176,7 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
// shouldn't really happen, if it does, something's wrong
if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
throw new CorruptIndexException(
"Missing parent data for category " + first, reader.toString());
"[Lucene 8] Missing parent data for category " + first, reader.toString());
}
int num = reader.maxDoc();
@ -148,7 +184,7 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
if (positions.docID() == i) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException(
"Missing parent data for category " + i, reader.toString());
"[Lucene 8] Missing parent data for category " + i, reader.toString());
}
parents[i] = positions.nextPosition();
@ -156,12 +192,13 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
if (i + 1 < num) {
throw new CorruptIndexException(
"Missing parent data for category " + (i + 1), reader.toString());
"[Lucene 8] Missing parent data for category " + (i + 1), reader.toString());
}
break;
}
} else { // this shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
throw new CorruptIndexException(
"[Lucene 8] Missing parent data for category " + i, reader.toString());
}
}
}