LUCENE-10122 Use NumericDocValue to store taxonomy parent array instead of custom term positions (#451)

This commit is contained in:
Patrick Zhai 2021-11-17 16:32:34 -08:00 committed by GitHub
parent bae095ae48
commit b4476e4318
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 32 additions and 116 deletions

View File

@ -251,7 +251,7 @@ Improvements
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss) (Dawid Weiss)
* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai) * LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Patrick Zhai)
* LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O. * LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O.
Previously Lucene was too conservative, jumping through hoops to detect if disks were SSD-backed. Previously Lucene was too conservative, jumping through hoops to detect if disks were SSD-backed.
@ -460,6 +460,8 @@ Build
Other Other
--------------------- ---------------------
* LUCENE-10122: Use NumericDocValues to store taxonomy parent array (Patrick Zhai)
* LUCENE-10136: allow 'var' declarations in source code (Dawid Weiss) * LUCENE-10136: allow 'var' declarations in source code (Dawid Weiss)
* LUCENE-9570, LUCENE-9564: Apply google java format and enforce it on source Java files. * LUCENE-9570, LUCENE-9564: Apply google java format and enforce it on source Java files.
@ -468,7 +470,7 @@ Other
* LUCENE-9631: Properly override slice() on subclasses of OffsetRange. (Dawid Weiss) * LUCENE-9631: Properly override slice() on subclasses of OffsetRange. (Dawid Weiss)
* LUCENE-9391: Upgrade HPPC to 0.8.2. (Haoyu Zhai) * LUCENE-9391: Upgrade HPPC to 0.8.2. (Patrick Zhai)
* LUCENE-10021: Upgrade HPPC to 0.9.0. Replace usage of ...ScatterMap to ...HashMap. (Patrick Zhai) * LUCENE-10021: Upgrade HPPC to 0.9.0. Replace usage of ...ScatterMap to ...HashMap. (Patrick Zhai)
@ -529,7 +531,7 @@ Improvements
* LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments. * LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments.
(Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir) (Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir)
* LUCENE-10103: Make QueryCache respect Accountable queries. (Haoyu Zhai) * LUCENE-10103: Make QueryCache respect Accountable queries. (Patrick Zhai)
Optimizations Optimizations
--------------------- ---------------------
@ -741,7 +743,7 @@ New Features
(Cameron VandenBerg) (Cameron VandenBerg)
* LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes * LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes
on a consistent multi-segment index even when they require re-indexing. (Haoyu Zhai) on a consistent multi-segment index even when they require re-indexing. (Patrick Zhai)
* LUCENE-9385: Add FacetsConfig option to control which drill-down * LUCENE-9385: Add FacetsConfig option to control which drill-down
terms are indexed for a FacetLabel (Zachary Chen) terms are indexed for a FacetLabel (Zachary Chen)
@ -944,7 +946,7 @@ Improvements
* LUCENE-8574: Add a new ExpressionValueSource which will enforce only one value per name * LUCENE-8574: Add a new ExpressionValueSource which will enforce only one value per name
per hit in dependencies, ExpressionFunctionValues will no longer per hit in dependencies, ExpressionFunctionValues will no longer
recompute already computed values (Haoyu Zhai) recompute already computed values (Patrick Zhai)
* LUCENE-9416: Fix CheckIndex to print an invalid non-zero norm as * LUCENE-9416: Fix CheckIndex to print an invalid non-zero norm as
unsigned long when detecting corruption. unsigned long when detecting corruption.
@ -1017,7 +1019,7 @@ Bug Fixes
Documentation Documentation
--------------------- ---------------------
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai) * LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Patrick Zhai)
Changes in Runtime Behavior Changes in Runtime Behavior
--------------------- ---------------------

View File

@ -16,12 +16,8 @@
*/ */
package org.apache.lucene.facet.taxonomy.directory; package org.apache.lucene.facet.taxonomy.directory;
import org.apache.lucene.util.BytesRef;
/** @lucene.experimental */ /** @lucene.experimental */
abstract class Consts { abstract class Consts {
static final String FULL = "$full_path$"; static final String FULL = "$full_path$";
static final String FIELD_PAYLOADS = "$payloads$"; static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
static final String PAYLOAD_PARENT = "p";
static final BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT);
} }

View File

@ -27,15 +27,11 @@ import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Objects; import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType; import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField; import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel; import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.facet.taxonomy.TaxonomyReader;
@ -97,9 +93,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
// Records the taxonomy index epoch, updated on replaceTaxonomy as well. // Records the taxonomy index epoch, updated on replaceTaxonomy as well.
private long indexEpoch; private long indexEpoch;
private SinglePositionTokenStream parentStream =
new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
private Field parentStreamField;
private Field fullPathField; private Field fullPathField;
private int cacheMissesUntilFill = 11; private int cacheMissesUntilFill = 11;
private boolean shouldFillCache = true; private boolean shouldFillCache = true;
@ -176,9 +169,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
++indexEpoch; ++indexEpoch;
} }
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.setOmitNorms(true);
parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
fullPathField = new StringField(Consts.FULL, "", Field.Store.NO); fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);
nextID = indexWriter.getDocStats().maxDoc; nextID = indexWriter.getDocStats().maxDoc;
@ -457,18 +447,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* effectively synchronized as well. * effectively synchronized as well.
*/ */
private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException { private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException {
// Before Lucene 2.9, position increments >=0 were supported, so we
// added 1 to parent to allow the parent -1 (the parent of the root).
// Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
// no longer enough, since 0 is not encoded consistently either (see
// comment in SinglePositionTokenStream). But because we must be
// backward-compatible with existing indexes, we can't just fix what
// we write here (e.g., to write parent+2), and need to do a workaround
// in the reader (which knows that anyway only category 0 has a parent
// -1).
parentStream.set(Math.max(parent + 1, 1));
Document d = new Document(); Document d = new Document();
d.add(parentStreamField); /* Lucene 9 switches to NumericDocValuesField for storing parent ordinals */
d.add(new NumericDocValuesField(Consts.FIELD_PARENT_ORDINAL_NDV, parent));
String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length); String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
fullPathField.setStringValue(fieldPath); fullPathField.setStringValue(fieldPath);
@ -497,50 +478,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
return id; return id;
} }
private static class SinglePositionTokenStream extends TokenStream {
private CharTermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
private boolean returned;
private int val;
private final String word;
public SinglePositionTokenStream(String word) {
termAtt = addAttribute(CharTermAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
this.word = word;
returned = true;
}
/**
* Set the value we want to keep, as the position increment. Note that when
* TermPositions.nextPosition() is later used to retrieve this value, val-1 will be returned,
* not val.
*
* <p>IMPORTANT NOTE: Before Lucene 2.9, val&gt;=0 were safe (for val==0, the retrieved position
* would be -1). But starting with Lucene 2.9, this unfortunately changed, and only val&gt;0 are
* safe. val=0 can still be used, but don't count on the value you retrieve later (it could be 0
* or -1, depending on circumstances or versions). This change is described in Lucene's JIRA:
* LUCENE-1542.
*/
public void set(int val) {
this.val = val;
returned = false;
}
@Override
public boolean incrementToken() throws IOException {
if (returned) {
return false;
}
clearAttributes();
posIncrAtt.setPositionIncrement(val);
termAtt.setEmpty();
termAtt.append(word);
returned = true;
return true;
}
}
private void addToCache(FacetLabel categoryPath, int id) throws IOException { private void addToCache(FacetLabel categoryPath, int id) throws IOException {
if (cache.put(categoryPath, id)) { if (cache.put(categoryPath, id)) {
// If cache.put() returned true, it means the cache was limited in // If cache.put() returned true, it means the cache was limited in

View File

@ -25,9 +25,8 @@ import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables; import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
@ -58,13 +57,6 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
parents = new int[reader.maxDoc()]; parents = new int[reader.maxDoc()];
if (parents.length > 0) { if (parents.length > 0) {
initParents(reader, 0); initParents(reader, 0);
// Starting Lucene 2.9, following the change LUCENE-1542, we can
// no longer reliably read the parent "-1" (see comment in
// LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
// to fix this in indexing without breaking backward-compatibility
// with existing indexes, so what we'll do instead is just
// hard-code the parent of ordinal 0 to be -1, and assume (as is
// indeed the case) that no other parent can be -1.
parents[0] = TaxonomyReader.INVALID_ORDINAL; parents[0] = TaxonomyReader.INVALID_ORDINAL;
} }
} }
@ -130,38 +122,27 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
return; return;
} }
// it's ok to use MultiTerms because we only iterate on one posting list. for (LeafReaderContext leafContext : reader.leaves()) {
// breaking it to loop over the leaves() only complicates code for no int leafDocNum = leafContext.reader().maxDoc();
// apparent gain. if (leafContext.docBase + leafDocNum <= first) {
PostingsEnum positions = // skip this leaf if it does not contain new categories
MultiTerms.getTermPostingsEnum( continue;
reader, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, PostingsEnum.PAYLOADS); }
NumericDocValues parentValues =
leafContext.reader().getNumericDocValues(Consts.FIELD_PARENT_ORDINAL_NDV);
if (parentValues == null) {
throw new CorruptIndexException(
"Parent data field " + Consts.FIELD_PARENT_ORDINAL_NDV + " not exists",
leafContext.reader().toString());
}
// shouldn't really happen, if it does, something's wrong for (int doc = Math.max(first - leafContext.docBase, 0); doc < leafDocNum; doc++) {
if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) { if (parentValues.advanceExact(doc) == false) {
throw new CorruptIndexException(
"Missing parent data for category " + first, reader.toString());
}
int num = reader.maxDoc();
for (int i = first; i < num; i++) {
if (positions.docID() == i) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException( throw new CorruptIndexException(
"Missing parent data for category " + i, reader.toString()); "Missing parent data for category " + (doc + leafContext.docBase), reader.toString());
} }
// we're putting an int and converting it back so it should be safe
parents[i] = positions.nextPosition(); parents[doc + leafContext.docBase] = Math.toIntExact(parentValues.longValue());
if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
if (i + 1 < num) {
throw new CorruptIndexException(
"Missing parent data for category " + (i + 1), reader.toString());
}
break;
}
} else { // this shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
} }
} }
} }