diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9e7ac8d14bd..e6af773c679 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -251,7 +251,7 @@ Improvements * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). (Dawid Weiss) -* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai) +* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Patrick Zhai) * LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O. Previously Lucene was too conservative, jumping through hoops to detect if disks were SSD-backed. @@ -460,6 +460,8 @@ Build Other --------------------- +* LUCENE-10122: Use NumericDocValues to store taxonomy parent array (Patrick Zhai) + * LUCENE-10136: allow 'var' declarations in source code (Dawid Weiss) * LUCENE-9570, LUCENE-9564: Apply google java format and enforce it on source Java files. @@ -468,7 +470,7 @@ Other * LUCENE-9631: Properly override slice() on subclasses of OffsetRange. (Dawid Weiss) -* LUCENE-9391: Upgrade HPPC to 0.8.2. (Haoyu Zhai) +* LUCENE-9391: Upgrade HPPC to 0.8.2. (Patrick Zhai) * LUCENE-10021: Upgrade HPPC to 0.9.0. Replace usage of ...ScatterMap to ...HashMap. (Patrick Zhai) @@ -529,7 +531,7 @@ Improvements * LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments. (Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir) -* LUCENE-10103: Make QueryCache respect Accountable queries. (Haoyu Zhai) +* LUCENE-10103: Make QueryCache respect Accountable queries. (Patrick Zhai) Optimizations --------------------- @@ -741,7 +743,7 @@ New Features (Cameron VandenBerg) * LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes - on a consistent multi-segment index even when they require re-indexing. (Haoyu Zhai) + on a consistent multi-segment index even when they require re-indexing. (Patrick Zhai) * LUCENE-9385: Add FacetsConfig option to control which drill-down terms are indexed for a FacetLabel (Zachary Chen) @@ -944,7 +946,7 @@ Improvements * LUCENE-8574: Add a new ExpressionValueSource which will enforce only one value per name per hit in dependencies, ExpressionFunctionValues will no longer - recompute already computed values (Haoyu Zhai) + recompute already computed values (Patrick Zhai) * LUCENE-9416: Fix CheckIndex to print an invalid non-zero norm as unsigned long when detecting corruption. @@ -1017,7 +1019,7 @@ Bug Fixes Documentation --------------------- -* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai) +* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Patrick Zhai) Changes in Runtime Behavior --------------------- diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java index 104bfdfb862..e5256db380a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java @@ -16,12 +16,8 @@ */ package org.apache.lucene.facet.taxonomy.directory; -import org.apache.lucene.util.BytesRef; - /** @lucene.experimental */ abstract class Consts { static final String FULL = "$full_path$"; - static final String FIELD_PAYLOADS = "$payloads$"; - static final String PAYLOAD_PARENT = "p"; - static final BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT); + static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$"; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java index 7e2decf8918..123fa8eb5e8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java @@ -27,15 +27,11 @@ import java.util.HashMap; import java.util.Map; import java.util.Objects; import java.util.concurrent.atomic.AtomicInteger; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.taxonomy.FacetLabel; import org.apache.lucene.facet.taxonomy.TaxonomyReader; @@ -97,9 +93,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { // Records the taxonomy index epoch, updated on replaceTaxonomy as well. private long indexEpoch; - private SinglePositionTokenStream parentStream = - new SinglePositionTokenStream(Consts.PAYLOAD_PARENT); - private Field parentStreamField; private Field fullPathField; private int cacheMissesUntilFill = 11; private boolean shouldFillCache = true; @@ -176,9 +169,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { ++indexEpoch; } - FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); - ft.setOmitNorms(true); - parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft); fullPathField = new StringField(Consts.FULL, "", Field.Store.NO); nextID = indexWriter.getDocStats().maxDoc; @@ -457,18 +447,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { * effectively synchronized as well. */ private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException { - // Before Lucene 2.9, position increments >=0 were supported, so we - // added 1 to parent to allow the parent -1 (the parent of the root). - // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is - // no longer enough, since 0 is not encoded consistently either (see - // comment in SinglePositionTokenStream). But because we must be - // backward-compatible with existing indexes, we can't just fix what - // we write here (e.g., to write parent+2), and need to do a workaround - // in the reader (which knows that anyway only category 0 has a parent - // -1). - parentStream.set(Math.max(parent + 1, 1)); Document d = new Document(); - d.add(parentStreamField); + /* Lucene 9 switches to NumericDocValuesField for storing parent ordinals */ + d.add(new NumericDocValuesField(Consts.FIELD_PARENT_ORDINAL_NDV, parent)); String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length); fullPathField.setStringValue(fieldPath); @@ -497,50 +478,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter { return id; } - private static class SinglePositionTokenStream extends TokenStream { - private CharTermAttribute termAtt; - private PositionIncrementAttribute posIncrAtt; - private boolean returned; - private int val; - private final String word; - - public SinglePositionTokenStream(String word) { - termAtt = addAttribute(CharTermAttribute.class); - posIncrAtt = addAttribute(PositionIncrementAttribute.class); - this.word = word; - returned = true; - } - - /** - * Set the value we want to keep, as the position increment. Note that when - * TermPositions.nextPosition() is later used to retrieve this value, val-1 will be returned, - * not val. - * - *

IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0, the retrieved position - * would be -1). But starting with Lucene 2.9, this unfortunately changed, and only val>0 are - * safe. val=0 can still be used, but don't count on the value you retrieve later (it could be 0 - * or -1, depending on circumstances or versions). This change is described in Lucene's JIRA: - * LUCENE-1542. - */ - public void set(int val) { - this.val = val; - returned = false; - } - - @Override - public boolean incrementToken() throws IOException { - if (returned) { - return false; - } - clearAttributes(); - posIncrAtt.setPositionIncrement(val); - termAtt.setEmpty(); - termAtt.append(word); - returned = true; - return true; - } - } - private void addToCache(FacetLabel categoryPath, int id) throws IOException { if (cache.put(categoryPath, id)) { // If cache.put() returned true, it means the cache was limited in diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/TaxonomyIndexArrays.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/TaxonomyIndexArrays.java index 35c04958ce1..fbb3f80356a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/TaxonomyIndexArrays.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/TaxonomyIndexArrays.java @@ -25,9 +25,8 @@ import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays; import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiTerms; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.util.Accountable; import org.apache.lucene.util.Accountables; import org.apache.lucene.util.ArrayUtil; @@ -58,13 +57,6 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable parents = new int[reader.maxDoc()]; if (parents.length > 0) { initParents(reader, 0); - // Starting Lucene 2.9, following the change LUCENE-1542, we can - // no longer reliably read the parent "-1" (see comment in - // LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way - // to fix this in indexing without breaking backward-compatibility - // with existing indexes, so what we'll do instead is just - // hard-code the parent of ordinal 0 to be -1, and assume (as is - // indeed the case) that no other parent can be -1. parents[0] = TaxonomyReader.INVALID_ORDINAL; } } @@ -130,38 +122,27 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable return; } - // it's ok to use MultiTerms because we only iterate on one posting list. - // breaking it to loop over the leaves() only complicates code for no - // apparent gain. - PostingsEnum positions = - MultiTerms.getTermPostingsEnum( - reader, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, PostingsEnum.PAYLOADS); + for (LeafReaderContext leafContext : reader.leaves()) { + int leafDocNum = leafContext.reader().maxDoc(); + if (leafContext.docBase + leafDocNum <= first) { + // skip this leaf if it does not contain new categories + continue; + } + NumericDocValues parentValues = + leafContext.reader().getNumericDocValues(Consts.FIELD_PARENT_ORDINAL_NDV); + if (parentValues == null) { + throw new CorruptIndexException( + "Parent data field " + Consts.FIELD_PARENT_ORDINAL_NDV + " not exists", + leafContext.reader().toString()); + } - // shouldn't really happen, if it does, something's wrong - if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) { - throw new CorruptIndexException( - "Missing parent data for category " + first, reader.toString()); - } - - int num = reader.maxDoc(); - for (int i = first; i < num; i++) { - if (positions.docID() == i) { - if (positions.freq() == 0) { // shouldn't happen + for (int doc = Math.max(first - leafContext.docBase, 0); doc < leafDocNum; doc++) { + if (parentValues.advanceExact(doc) == false) { throw new CorruptIndexException( - "Missing parent data for category " + i, reader.toString()); + "Missing parent data for category " + (doc + leafContext.docBase), reader.toString()); } - - parents[i] = positions.nextPosition(); - - if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { - if (i + 1 < num) { - throw new CorruptIndexException( - "Missing parent data for category " + (i + 1), reader.toString()); - } - break; - } - } else { // this shouldn't happen - throw new CorruptIndexException("Missing parent data for category " + i, reader.toString()); + // we're putting an int and converting it back so it should be safe + parents[doc + leafContext.docBase] = Math.toIntExact(parentValues.longValue()); } } }