mirror of https://github.com/apache/lucene.git
LUCENE-10122 Use NumericDocValue to store taxonomy parent array instead of custom term positions (#451)
This commit is contained in:
parent
bae095ae48
commit
b4476e4318
|
@ -251,7 +251,7 @@ Improvements
|
||||||
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
|
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
|
||||||
(Dawid Weiss)
|
(Dawid Weiss)
|
||||||
|
|
||||||
* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai)
|
* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Patrick Zhai)
|
||||||
|
|
||||||
* LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O.
|
* LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O.
|
||||||
Previously Lucene was too conservative, jumping through hoops to detect if disks were SSD-backed.
|
Previously Lucene was too conservative, jumping through hoops to detect if disks were SSD-backed.
|
||||||
|
@ -460,6 +460,8 @@ Build
|
||||||
Other
|
Other
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
|
* LUCENE-10122: Use NumericDocValues to store taxonomy parent array (Patrick Zhai)
|
||||||
|
|
||||||
* LUCENE-10136: allow 'var' declarations in source code (Dawid Weiss)
|
* LUCENE-10136: allow 'var' declarations in source code (Dawid Weiss)
|
||||||
|
|
||||||
* LUCENE-9570, LUCENE-9564: Apply google java format and enforce it on source Java files.
|
* LUCENE-9570, LUCENE-9564: Apply google java format and enforce it on source Java files.
|
||||||
|
@ -468,7 +470,7 @@ Other
|
||||||
|
|
||||||
* LUCENE-9631: Properly override slice() on subclasses of OffsetRange. (Dawid Weiss)
|
* LUCENE-9631: Properly override slice() on subclasses of OffsetRange. (Dawid Weiss)
|
||||||
|
|
||||||
* LUCENE-9391: Upgrade HPPC to 0.8.2. (Haoyu Zhai)
|
* LUCENE-9391: Upgrade HPPC to 0.8.2. (Patrick Zhai)
|
||||||
|
|
||||||
* LUCENE-10021: Upgrade HPPC to 0.9.0. Replace usage of ...ScatterMap to ...HashMap. (Patrick Zhai)
|
* LUCENE-10021: Upgrade HPPC to 0.9.0. Replace usage of ...ScatterMap to ...HashMap. (Patrick Zhai)
|
||||||
|
|
||||||
|
@ -529,7 +531,7 @@ Improvements
|
||||||
* LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments.
|
* LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments.
|
||||||
(Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir)
|
(Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir)
|
||||||
|
|
||||||
* LUCENE-10103: Make QueryCache respect Accountable queries. (Haoyu Zhai)
|
* LUCENE-10103: Make QueryCache respect Accountable queries. (Patrick Zhai)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
@ -741,7 +743,7 @@ New Features
|
||||||
(Cameron VandenBerg)
|
(Cameron VandenBerg)
|
||||||
|
|
||||||
* LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes
|
* LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes
|
||||||
on a consistent multi-segment index even when they require re-indexing. (Haoyu Zhai)
|
on a consistent multi-segment index even when they require re-indexing. (Patrick Zhai)
|
||||||
|
|
||||||
* LUCENE-9385: Add FacetsConfig option to control which drill-down
|
* LUCENE-9385: Add FacetsConfig option to control which drill-down
|
||||||
terms are indexed for a FacetLabel (Zachary Chen)
|
terms are indexed for a FacetLabel (Zachary Chen)
|
||||||
|
@ -944,7 +946,7 @@ Improvements
|
||||||
|
|
||||||
* LUCENE-8574: Add a new ExpressionValueSource which will enforce only one value per name
|
* LUCENE-8574: Add a new ExpressionValueSource which will enforce only one value per name
|
||||||
per hit in dependencies, ExpressionFunctionValues will no longer
|
per hit in dependencies, ExpressionFunctionValues will no longer
|
||||||
recompute already computed values (Haoyu Zhai)
|
recompute already computed values (Patrick Zhai)
|
||||||
|
|
||||||
* LUCENE-9416: Fix CheckIndex to print an invalid non-zero norm as
|
* LUCENE-9416: Fix CheckIndex to print an invalid non-zero norm as
|
||||||
unsigned long when detecting corruption.
|
unsigned long when detecting corruption.
|
||||||
|
@ -1017,7 +1019,7 @@ Bug Fixes
|
||||||
Documentation
|
Documentation
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)
|
* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Patrick Zhai)
|
||||||
|
|
||||||
Changes in Runtime Behavior
|
Changes in Runtime Behavior
|
||||||
---------------------
|
---------------------
|
||||||
|
|
|
@ -16,12 +16,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.facet.taxonomy.directory;
|
package org.apache.lucene.facet.taxonomy.directory;
|
||||||
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
|
||||||
|
|
||||||
/** @lucene.experimental */
|
/** @lucene.experimental */
|
||||||
abstract class Consts {
|
abstract class Consts {
|
||||||
static final String FULL = "$full_path$";
|
static final String FULL = "$full_path$";
|
||||||
static final String FIELD_PAYLOADS = "$payloads$";
|
static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
|
||||||
static final String PAYLOAD_PARENT = "p";
|
|
||||||
static final BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,15 +27,11 @@ import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
|
||||||
import org.apache.lucene.document.BinaryDocValuesField;
|
import org.apache.lucene.document.BinaryDocValuesField;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.FieldType;
|
import org.apache.lucene.document.NumericDocValuesField;
|
||||||
import org.apache.lucene.document.StringField;
|
import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.document.TextField;
|
|
||||||
import org.apache.lucene.facet.FacetsConfig;
|
import org.apache.lucene.facet.FacetsConfig;
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||||
|
@ -97,9 +93,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
// Records the taxonomy index epoch, updated on replaceTaxonomy as well.
|
// Records the taxonomy index epoch, updated on replaceTaxonomy as well.
|
||||||
private long indexEpoch;
|
private long indexEpoch;
|
||||||
|
|
||||||
private SinglePositionTokenStream parentStream =
|
|
||||||
new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
|
|
||||||
private Field parentStreamField;
|
|
||||||
private Field fullPathField;
|
private Field fullPathField;
|
||||||
private int cacheMissesUntilFill = 11;
|
private int cacheMissesUntilFill = 11;
|
||||||
private boolean shouldFillCache = true;
|
private boolean shouldFillCache = true;
|
||||||
|
@ -176,9 +169,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
++indexEpoch;
|
++indexEpoch;
|
||||||
}
|
}
|
||||||
|
|
||||||
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
|
|
||||||
ft.setOmitNorms(true);
|
|
||||||
parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
|
|
||||||
fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);
|
fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);
|
||||||
|
|
||||||
nextID = indexWriter.getDocStats().maxDoc;
|
nextID = indexWriter.getDocStats().maxDoc;
|
||||||
|
@ -457,18 +447,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
* effectively synchronized as well.
|
* effectively synchronized as well.
|
||||||
*/
|
*/
|
||||||
private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException {
|
private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException {
|
||||||
// Before Lucene 2.9, position increments >=0 were supported, so we
|
|
||||||
// added 1 to parent to allow the parent -1 (the parent of the root).
|
|
||||||
// Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
|
|
||||||
// no longer enough, since 0 is not encoded consistently either (see
|
|
||||||
// comment in SinglePositionTokenStream). But because we must be
|
|
||||||
// backward-compatible with existing indexes, we can't just fix what
|
|
||||||
// we write here (e.g., to write parent+2), and need to do a workaround
|
|
||||||
// in the reader (which knows that anyway only category 0 has a parent
|
|
||||||
// -1).
|
|
||||||
parentStream.set(Math.max(parent + 1, 1));
|
|
||||||
Document d = new Document();
|
Document d = new Document();
|
||||||
d.add(parentStreamField);
|
/* Lucene 9 switches to NumericDocValuesField for storing parent ordinals */
|
||||||
|
d.add(new NumericDocValuesField(Consts.FIELD_PARENT_ORDINAL_NDV, parent));
|
||||||
|
|
||||||
String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
|
String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
|
||||||
fullPathField.setStringValue(fieldPath);
|
fullPathField.setStringValue(fieldPath);
|
||||||
|
@ -497,50 +478,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class SinglePositionTokenStream extends TokenStream {
|
|
||||||
private CharTermAttribute termAtt;
|
|
||||||
private PositionIncrementAttribute posIncrAtt;
|
|
||||||
private boolean returned;
|
|
||||||
private int val;
|
|
||||||
private final String word;
|
|
||||||
|
|
||||||
public SinglePositionTokenStream(String word) {
|
|
||||||
termAtt = addAttribute(CharTermAttribute.class);
|
|
||||||
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
|
||||||
this.word = word;
|
|
||||||
returned = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Set the value we want to keep, as the position increment. Note that when
|
|
||||||
* TermPositions.nextPosition() is later used to retrieve this value, val-1 will be returned,
|
|
||||||
* not val.
|
|
||||||
*
|
|
||||||
* <p>IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0, the retrieved position
|
|
||||||
* would be -1). But starting with Lucene 2.9, this unfortunately changed, and only val>0 are
|
|
||||||
* safe. val=0 can still be used, but don't count on the value you retrieve later (it could be 0
|
|
||||||
* or -1, depending on circumstances or versions). This change is described in Lucene's JIRA:
|
|
||||||
* LUCENE-1542.
|
|
||||||
*/
|
|
||||||
public void set(int val) {
|
|
||||||
this.val = val;
|
|
||||||
returned = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean incrementToken() throws IOException {
|
|
||||||
if (returned) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
clearAttributes();
|
|
||||||
posIncrAtt.setPositionIncrement(val);
|
|
||||||
termAtt.setEmpty();
|
|
||||||
termAtt.append(word);
|
|
||||||
returned = true;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addToCache(FacetLabel categoryPath, int id) throws IOException {
|
private void addToCache(FacetLabel categoryPath, int id) throws IOException {
|
||||||
if (cache.put(categoryPath, id)) {
|
if (cache.put(categoryPath, id)) {
|
||||||
// If cache.put() returned true, it means the cache was limited in
|
// If cache.put() returned true, it means the cache was limited in
|
||||||
|
|
|
@ -25,9 +25,8 @@ import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
|
||||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||||
import org.apache.lucene.index.CorruptIndexException;
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.MultiTerms;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
import org.apache.lucene.search.DocIdSetIterator;
|
|
||||||
import org.apache.lucene.util.Accountable;
|
import org.apache.lucene.util.Accountable;
|
||||||
import org.apache.lucene.util.Accountables;
|
import org.apache.lucene.util.Accountables;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
@ -58,13 +57,6 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
|
||||||
parents = new int[reader.maxDoc()];
|
parents = new int[reader.maxDoc()];
|
||||||
if (parents.length > 0) {
|
if (parents.length > 0) {
|
||||||
initParents(reader, 0);
|
initParents(reader, 0);
|
||||||
// Starting Lucene 2.9, following the change LUCENE-1542, we can
|
|
||||||
// no longer reliably read the parent "-1" (see comment in
|
|
||||||
// LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
|
|
||||||
// to fix this in indexing without breaking backward-compatibility
|
|
||||||
// with existing indexes, so what we'll do instead is just
|
|
||||||
// hard-code the parent of ordinal 0 to be -1, and assume (as is
|
|
||||||
// indeed the case) that no other parent can be -1.
|
|
||||||
parents[0] = TaxonomyReader.INVALID_ORDINAL;
|
parents[0] = TaxonomyReader.INVALID_ORDINAL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -130,38 +122,27 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// it's ok to use MultiTerms because we only iterate on one posting list.
|
for (LeafReaderContext leafContext : reader.leaves()) {
|
||||||
// breaking it to loop over the leaves() only complicates code for no
|
int leafDocNum = leafContext.reader().maxDoc();
|
||||||
// apparent gain.
|
if (leafContext.docBase + leafDocNum <= first) {
|
||||||
PostingsEnum positions =
|
// skip this leaf if it does not contain new categories
|
||||||
MultiTerms.getTermPostingsEnum(
|
continue;
|
||||||
reader, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, PostingsEnum.PAYLOADS);
|
}
|
||||||
|
NumericDocValues parentValues =
|
||||||
// shouldn't really happen, if it does, something's wrong
|
leafContext.reader().getNumericDocValues(Consts.FIELD_PARENT_ORDINAL_NDV);
|
||||||
if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
|
if (parentValues == null) {
|
||||||
throw new CorruptIndexException(
|
throw new CorruptIndexException(
|
||||||
"Missing parent data for category " + first, reader.toString());
|
"Parent data field " + Consts.FIELD_PARENT_ORDINAL_NDV + " not exists",
|
||||||
|
leafContext.reader().toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
int num = reader.maxDoc();
|
for (int doc = Math.max(first - leafContext.docBase, 0); doc < leafDocNum; doc++) {
|
||||||
for (int i = first; i < num; i++) {
|
if (parentValues.advanceExact(doc) == false) {
|
||||||
if (positions.docID() == i) {
|
|
||||||
if (positions.freq() == 0) { // shouldn't happen
|
|
||||||
throw new CorruptIndexException(
|
throw new CorruptIndexException(
|
||||||
"Missing parent data for category " + i, reader.toString());
|
"Missing parent data for category " + (doc + leafContext.docBase), reader.toString());
|
||||||
}
|
}
|
||||||
|
// we're putting an int and converting it back so it should be safe
|
||||||
parents[i] = positions.nextPosition();
|
parents[doc + leafContext.docBase] = Math.toIntExact(parentValues.longValue());
|
||||||
|
|
||||||
if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
|
|
||||||
if (i + 1 < num) {
|
|
||||||
throw new CorruptIndexException(
|
|
||||||
"Missing parent data for category " + (i + 1), reader.toString());
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else { // this shouldn't happen
|
|
||||||
throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue