From 871844711260a89a36e6e37031ab36befd69a89d Mon Sep 17 00:00:00 2001
From: Eric Pugh
Date: Thu, 9 Jan 2020 08:58:14 -0500
Subject: [PATCH 1/8] SOLR-13927: Correct v2 /schema APIs in docs (#1010)

---
 solr/solr-ref-guide/src/schema-api.adoc | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/solr/solr-ref-guide/src/schema-api.adoc b/solr/solr-ref-guide/src/schema-api.adoc
index 96de55e794a..2f5a5e028d8 100644
--- a/solr/solr-ref-guide/src/schema-api.adoc
+++ b/solr/solr-ref-guide/src/schema-api.adoc
@@ -73,7 +73,7 @@ bin/solr -e cloud -noprompt
 
 == Modify the Schema
 
-To add, remove or replace fields, dynamic field rules, copy field rules, or new field types, you can send a POST request to the `/collection/schema/` endpoint with a sequence of commands in JSON format to perform the requested actions. The following commands are supported:
+To add, remove or replace fields, dynamic field rules, copy field rules, or new field types, you can send a POST request to the `/api/{collections|cores}/{name}/schema/` endpoint with a sequence of commands in JSON format to perform the requested actions. The following commands are supported:
 
 * `add-field`: add a new field with parameters you provide.
 * `delete-field`: delete a field.
@@ -127,7 +127,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
      "name":"sell_by",
      "type":"pdate",
      "stored":true }
-}' http://localhost:8983/api/cores/gettingstarted/schema
+}' http://localhost:8983/api/collections/gettingstarted/schema
 ----
 ====
 --
@@ -158,7 +158,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
 ----
 curl -X POST -H 'Content-type:application/json' --data-binary '{
   "delete-field" : { "name":"sell_by" }
-}' http://localhost:8983/api/cores/gettingstarted/schema
+}' http://localhost:8983/api/collections/gettingstarted/schema
 ----
 ====
 --
@@ -197,7 +197,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
      "name":"sell_by",
      "type":"date",
      "stored":false }
-}' http://localhost:8983/api/cores/gettingstarted/schema
+}' http://localhost:8983/api/collections/gettingstarted/schema
 ----
 ====
 --
@@ -236,7 +236,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
      "name":"*_s",
      "type":"string",
      "stored":true }
-}' http://localhost:8983/api/cores/gettingstarted/schema
+}' http://localhost:8983/api/collections/gettingstarted/schema
 ----
 ====
 --
@@ -267,7 +267,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
 ----
 curl -X POST -H 'Content-type:application/json' --data-binary '{
   "delete-dynamic-field":{ "name":"*_s" }
-}' http://localhost:8983/api/cores/gettingstarted/schema
+}' http://localhost:8983/api/collections/gettingstarted/schema
 ----
 ====
 --
@@ -388,7 +388,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
     "queryAnalyzer":{
       "tokenizer":{ "name":"keyword" }}}
-}' http://localhost:8983/api/cores/gettingstarted/schema
+}' http://localhost:8983/api/collections/gettingstarted/schema
 ----
 ====
 --
@@ -419,7 +419,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
 ----
 curl -X POST -H 'Content-type:application/json' --data-binary '{
   "delete-field-type":{ "name":"myNewTxtField" }
-}' http://localhost:8983/api/cores/gettingstarted/schema
+}' http://localhost:8983/api/collections/gettingstarted/schema
 ----
 ====
 --
@@ -464,7 +464,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{
     "analyzer":{
       "tokenizer":{ "name":"standard" }}}
-}' http://localhost:8983/api/cores/gettingstarted/schema
+}'
http://localhost:8983/api/collections/gettingstarted/schema ---- ==== -- @@ -510,7 +510,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{ "add-copy-field":{ "source":"shelf", "dest":[ "location", "catchall" ]} -}' http://localhost:8983/api/cores/gettingstarted/schema +}' http://localhost:8983/api/collections/gettingstarted/schema ---- ==== -- @@ -543,7 +543,7 @@ curl -X POST -H 'Content-type:application/json' --data-binary '{ ---- curl -X POST -H 'Content-type:application/json' --data-binary '{ "delete-copy-field":{ "source":"shelf", "dest":"location" } -}' http://localhost:8983/api/cores/gettingstarted/schema +}' http://localhost:8983/api/collections/gettingstarted/schema ---- ==== -- From ffe75fb441e1b6d772404560d5e0a10e2a5db39a Mon Sep 17 00:00:00 2001 From: andywebb1975 Date: Thu, 9 Jan 2020 14:04:10 +0000 Subject: [PATCH 2/8] SOLR-14165: set SolrResponse's serialVersionUID explicitly --- .../src/java/org/apache/solr/client/solrj/SolrResponse.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/SolrResponse.java b/solr/solrj/src/java/org/apache/solr/client/solrj/SolrResponse.java index 7abde81fa49..c9f2cc5eb58 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/SolrResponse.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/SolrResponse.java @@ -37,6 +37,9 @@ import java.io.Serializable; */ public abstract class SolrResponse implements Serializable, MapWriter { + /** make this compatible with earlier versions */ + private static final long serialVersionUID = -7931100103360242645L; + /** Elapsed time in milliseconds for the request as seen from the client. */ public abstract long getElapsedTime(); From b11c3cffe4954f35afd3461c0d3a7d0ac5a34054 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 9 Jan 2020 15:09:21 +0100 Subject: [PATCH 3/8] LUCENE-9118: BlockTreeTermsReader uses `Arrays#compareUnsigned` to compare suffixes. (#1150) --- .../blocktree/SegmentTermsEnumFrame.java | 164 +++++++----------- 1 file changed, 62 insertions(+), 102 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java index a32bdac427c..fdb4cc6955b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.blocktree; import java.io.IOException; +import java.util.Arrays; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.index.IndexOptions; @@ -523,8 +524,7 @@ final class SegmentTermsEnumFrame { assert prefixMatches(target); // Loop over each entry (term or sub-block) in this block: - //nextTerm: while(nextEnt < entCount) { - nextTerm: while (true) { + do { nextEnt++; suffix = suffixesReader.readVInt(); @@ -537,60 +537,37 @@ final class SegmentTermsEnumFrame { // System.out.println(" cycle: term " + (nextEnt-1) + " (of " + entCount + ") suffix=" + brToString(suffixBytesRef)); // } - final int termLen = prefix + suffix; startBytePos = suffixesReader.getPosition(); suffixesReader.skipBytes(suffix); - final int targetLimit = target.offset + (target.length < termLen ? 
target.length : termLen); - int targetPos = target.offset + prefix; + // Loop over bytes in the suffix, comparing to the target + final int cmp = Arrays.compareUnsigned( + suffixBytes, startBytePos, startBytePos + suffix, + target.bytes, target.offset + prefix, target.offset + target.length); - // Loop over bytes in the suffix, comparing to - // the target - int bytePos = startBytePos; - while(true) { - final int cmp; - final boolean stop; - if (targetPos < targetLimit) { - cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF); - stop = false; - } else { - assert targetPos == targetLimit; - cmp = termLen - target.length; - stop = true; - } + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! Current entry is after target -- + // return NOT_FOUND: + fillTerm(); - if (cmp < 0) { - // Current entry is still before the target; - // keep scanning + //if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! - if (nextEnt == entCount) { - // We are done scanning this block - break nextTerm; - } else { - continue nextTerm; - } - } else if (cmp > 0) { + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: - // Done! Current entry is after target -- - // return NOT_FOUND: - fillTerm(); - - //if (DEBUG) System.out.println(" not found"); - return SeekStatus.NOT_FOUND; - } else if (stop) { - // Exact match! - - // This cannot be a sub-block because we - // would have followed the index to this - // sub-block from the start: - - assert ste.termExists; - fillTerm(); - //if (DEBUG) System.out.println(" found!"); - return SeekStatus.FOUND; - } + assert ste.termExists; + fillTerm(); + //if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; } - } + } while (nextEnt < entCount); // It is possible (and OK) that terms index pointed us // at this block, but, we scanned the entire block and @@ -631,7 +608,7 @@ final class SegmentTermsEnumFrame { assert prefixMatches(target); // Loop over each entry (term or sub-block) in this block: - nextTerm: while(nextEnt < entCount) { + while(nextEnt < entCount) { nextEnt++; @@ -658,65 +635,48 @@ final class SegmentTermsEnumFrame { lastSubFP = fp - subCode; } - final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); - int targetPos = target.offset + prefix; + final int cmp = Arrays.compareUnsigned( + suffixBytes, startBytePos, startBytePos + suffix, + target.bytes, target.offset + prefix, target.offset + target.length); - // Loop over bytes in the suffix, comparing to - // the target - int bytePos = startBytePos; - while (true) { - final int cmp; - final boolean stop; - if (targetPos < targetLimit) { - cmp = (suffixBytes[bytePos++]&0xFF) - (target.bytes[targetPos++]&0xFF); - stop = false; - } else { - assert targetPos == targetLimit; - cmp = termLen - target.length; - stop = true; - } + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! Current entry is after target -- + // return NOT_FOUND: + fillTerm(); - if (cmp < 0) { - // Current entry is still before the target; - // keep scanning - continue nextTerm; - } else if (cmp > 0) { + //if (DEBUG) System.out.println(" maybe done exactOnly=" + exactOnly + " ste.termExists=" + ste.termExists); - // Done! 
Current entry is after target -- - // return NOT_FOUND: - fillTerm(); - - //if (DEBUG) System.out.println(" maybe done exactOnly=" + exactOnly + " ste.termExists=" + ste.termExists); - - if (!exactOnly && !ste.termExists) { - //System.out.println(" now pushFrame"); - // TODO this - // We are on a sub-block, and caller wants - // us to position to the next term after - // the target, so we must recurse into the - // sub-frame(s): - ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen); + if (!exactOnly && !ste.termExists) { + //System.out.println(" now pushFrame"); + // TODO this + // We are on a sub-block, and caller wants + // us to position to the next term after + // the target, so we must recurse into the + // sub-frame(s): + ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen); + ste.currentFrame.loadBlock(); + while (ste.currentFrame.next()) { + ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); ste.currentFrame.loadBlock(); - while (ste.currentFrame.next()) { - ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); - ste.currentFrame.loadBlock(); - } } - - //if (DEBUG) System.out.println(" not found"); - return SeekStatus.NOT_FOUND; - } else if (stop) { - // Exact match! - - // This cannot be a sub-block because we - // would have followed the index to this - // sub-block from the start: - - assert ste.termExists; - fillTerm(); - //if (DEBUG) System.out.println(" found!"); - return SeekStatus.FOUND; } + + //if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! + + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: + + assert ste.termExists; + fillTerm(); + //if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; } } From 7ad33c3a98a15dfbf3d442a2c65b13fa2a748979 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 9 Jan 2020 15:15:30 +0100 Subject: [PATCH 4/8] LUCENE-9115: NRTCachingDirectory shouldn't cache files of unknown size. (#1145) --- lucene/CHANGES.txt | 3 ++ .../lucene/store/NRTCachingDirectory.java | 2 ++ .../lucene/store/TestNRTCachingDirectory.java | 30 +++++++++++++++++++ 3 files changed, 35 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 51d94755ea7..fcc359f7326 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -123,6 +123,9 @@ Bug Fixes * LUCENE-9084: Fix potential deadlock due to circular synchronization in AnalyzingInfixSuggester (Paul Ward) +* LUCENE-9115: NRTCachingDirectory no longer caches files of unknown size. 
+  (Adrien Grand)
+
 Other
 ---------------------
 
diff --git a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
index 63526742372..0e4b9222b4d 100644
--- a/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
+++ b/lucene/core/src/java/org/apache/lucene/store/NRTCachingDirectory.java
@@ -232,6 +232,8 @@ public class NRTCachingDirectory extends FilterDirectory implements Accountable
       bytes = context.mergeInfo.estimatedMergeBytes;
     } else if (context.flushInfo != null) {
       bytes = context.flushInfo.estimatedSegmentSize;
+    } else {
+      return false;
     }
 
     return (bytes <= maxMergeSizeBytes) && (bytes + cacheSize.get()) <= maxCachedBytes;
diff --git a/lucene/core/src/test/org/apache/lucene/store/TestNRTCachingDirectory.java b/lucene/core/src/test/org/apache/lucene/store/TestNRTCachingDirectory.java
index 0f2e200bf26..502b18106e4 100644
--- a/lucene/core/src/test/org/apache/lucene/store/TestNRTCachingDirectory.java
+++ b/lucene/core/src/test/org/apache/lucene/store/TestNRTCachingDirectory.java
@@ -136,4 +136,34 @@ public class TestNRTCachingDirectory extends BaseDirectoryTestCase {
     nrtDir.close();
     fsDir.close();
   }
+
+  public void testUnknownFileSize() throws IOException {
+    Directory dir = newDirectory();
+
+    Directory nrtDir1 = new NRTCachingDirectory(dir, 1, 1) {
+      @Override
+      protected boolean doCacheWrite(String name, IOContext context) {
+        boolean cache = super.doCacheWrite(name, context);
+        assertTrue(cache);
+        return cache;
+      }
+    };
+    IOContext ioContext = new IOContext(new FlushInfo(3, 42));
+    nrtDir1.createOutput("foo", ioContext).close();
+    nrtDir1.createTempOutput("bar", "baz", ioContext).close();
+
+    Directory nrtDir2 = new NRTCachingDirectory(dir, 1, 1) {
+      @Override
+      protected boolean doCacheWrite(String name, IOContext context) {
+        boolean cache = super.doCacheWrite(name, context);
+        assertFalse(cache);
+        return cache;
+      }
+    };
+    ioContext = IOContext.DEFAULT;
+    nrtDir2.createOutput("foo", ioContext).close();
+    nrtDir2.createTempOutput("bar", "baz", ioContext).close();
+
+    dir.close();
+  }
 }

From d0b4a166e06521757cc3be25dafd7705e1eeecdc Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Thu, 9 Jan 2020 15:16:26 +0100
Subject: [PATCH 5/8] LUCENE-9116: Remove long[] from `PostingsWriterBase#encodeTerm`. (#1149)

LUCENE-9116: Remove long[] from `PostingsWriterBase#encodeTerm`.

All the metadata can be directly encoded in the `DataOutput`.
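For illustration, a minimal Java sketch of what the new contract looks like for a postings format. It is a sketch, not code from this patch: the class name EncodeTermSketch, the parameter names and the sample file-pointer values are hypothetical. Only the pattern itself, delta-encoding per-term file pointers straight into the DataOutput and accumulating them back from the DataInput instead of staging them in a long[] whose length setField had to report, follows the Lucene50/Lucene84 hunks later in this patch.

import java.io.IOException;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;

public class EncodeTermSketch {

  // Writer side after the change: the deltas go straight into the DataOutput,
  // mirroring docStartFP/posStartFP/payStartFP in the postings writer hunks below.
  static void encodeTerm(DataOutput out, long lastDocFP, long docFP,
                         long lastPosFP, long posFP,
                         long lastPayFP, long payFP) throws IOException {
    out.writeVLong(docFP - lastDocFP);
    out.writeVLong(posFP - lastPosFP);
    out.writeVLong(payFP - lastPayFP);
  }

  // Reader side: the same three VLongs are read back and accumulated,
  // as in the Lucene50PostingsReader.decodeTerm hunk below.
  static long[] decodeTerm(DataInput in, long lastDocFP, long lastPosFP, long lastPayFP)
      throws IOException {
    return new long[] {
        lastDocFP + in.readVLong(), lastPosFP + in.readVLong(), lastPayFP + in.readVLong()};
  }

  public static void main(String[] args) throws IOException {
    ByteBuffersDataOutput out = new ByteBuffersDataOutput();
    encodeTerm(out, 0, 1024, 0, 2048, 0, 4096);
    long[] fps = decodeTerm(out.toDataInput(), 0, 0, 0);
    System.out.println(fps[0] + " " + fps[1] + " " + fps[2]); // 1024 2048 4096
  }
}

The real changes follow the same shape: in Lucene50PostingsWriter, `longs[0] = state.docStartFP - lastState.docStartFP` becomes `out.writeVLong(state.docStartFP - lastState.docStartFP)`, and `setField` no longer returns the number of metadata longs.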
---
 lucene/CHANGES.txt                            |   6 +
 .../lucene50/Lucene50PostingsReader.java      |   8 +-
 .../lucene50/Lucene50PostingsWriter.java      |  19 +-
 .../codecs/blockterms/BlockTermsReader.java   |  14 +-
 .../codecs/blockterms/BlockTermsWriter.java   |  20 +-
 .../OrdsBlockTreeTermsReader.java             |   3 +-
 .../OrdsBlockTreeTermsWriter.java             |  29 +-
 .../codecs/blocktreeords/OrdsFieldReader.java |   4 +-
 .../OrdsIntersectTermsEnumFrame.java          |  12 +-
 .../OrdsSegmentTermsEnumFrame.java            |  12 +-
 .../codecs/memory/FSTOrdPostingsFormat.java   |  78 --
 .../codecs/memory/FSTOrdTermsReader.java      | 884 ------------------
 .../codecs/memory/FSTOrdTermsWriter.java      | 386 --------
 .../codecs/memory/FSTPostingsFormat.java      |  78 --
 .../lucene/codecs/memory/FSTTermOutputs.java  | 383 --------
 .../lucene/codecs/memory/FSTTermsReader.java  | 785 ----------------
 .../lucene/codecs/memory/FSTTermsWriter.java  | 291 ------
 .../DeltaBaseTermStateSerializer.java         |   4 +-
 .../org.apache.lucene.codecs.PostingsFormat  |   2 -
 .../memory/TestFSTOrdPostingsFormat.java      |  34 -
 .../codecs/memory/TestFSTPostingsFormat.java  |  34 -
 .../uniformsplit/TestTermBytesComparator.java |   2 +-
 .../sharedterms/STBlockReaderTest.java        |   2 +-
 .../lucene/codecs/PostingsReaderBase.java     |   2 +-
 .../lucene/codecs/PostingsWriterBase.java     |  15 +-
 .../lucene/codecs/PushPostingsWriterBase.java |   4 +-
 .../blocktree/BlockTreeTermsReader.java       |  15 +-
 .../blocktree/BlockTreeTermsWriter.java       |  30 +-
 .../lucene/codecs/blocktree/FieldReader.java  |   4 +-
 .../blocktree/IntersectTermsEnumFrame.java    |  13 +-
 .../blocktree/SegmentTermsEnumFrame.java      |  12 +-
 .../lucene84/Lucene84PostingsReader.java      |   8 +-
 .../lucene84/Lucene84PostingsWriter.java      |  19 +-
 .../idversion/IDVersionPostingsReader.java    |   2 +-
 .../idversion/IDVersionPostingsWriter.java    |   7 +-
 .../IDVersionSegmentTermsEnumFrame.java       |  12 +-
 .../VersionBlockTreeTermsReader.java          |   3 +-
 .../VersionBlockTreeTermsWriter.java          |  29 +-
 .../codecs/idversion/VersionFieldReader.java  |   4 +-
 .../mockrandom/MockRandomPostingsFormat.java  |  58 +-
 .../org/apache/lucene/index/RandomCodec.java  |   4 -
 41 files changed, 92 insertions(+), 3239 deletions(-)
 delete mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java
 delete mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java
 delete mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java
 delete mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java
 delete mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java
 delete mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java
 delete mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java
 delete mode 100644 lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTOrdPostingsFormat.java
 delete mode 100644 lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTPostingsFormat.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index fcc359f7326..1b1f250547e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -88,6 +88,12 @@ API Changes
   yield Passages sized a little different due to the fact that the sizing pivot
   is now the center of the first match and not its left edge.
 
+* LUCENE-9116: PostingsWriterBase and PostingsReaderBase no longer support
+  setting a field's metadata via a `long[]`. (Adrien Grand)
+
+* LUCENE-9116: The FST and FSTOrd postings formats have been removed.
+ (Adrien Grand) + New Features --------------------- (No changes) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java index 0ea8c802cd2..adae891c4ab 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java @@ -154,7 +154,7 @@ public final class Lucene50PostingsReader extends PostingsReaderBase { } @Override - public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IntBlockTermState termState = (IntBlockTermState) _termState; final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; @@ -167,11 +167,11 @@ public final class Lucene50PostingsReader extends PostingsReaderBase { termState.payStartFP = 0; } - termState.docStartFP += longs[0]; + termState.docStartFP += in.readVLong(); if (fieldHasPositions) { - termState.posStartFP += longs[1]; + termState.posStartFP += in.readVLong(); if (fieldHasOffsets || fieldHasPayloads) { - termState.payStartFP += longs[2]; + termState.payStartFP += in.readVLong(); } } if (termState.docFreq == 1) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java index a600e61fb32..8f425a2036c 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java @@ -187,20 +187,11 @@ public final class Lucene50PostingsWriter extends PushPostingsWriterBase { } @Override - public int setField(FieldInfo fieldInfo) { + public void setField(FieldInfo fieldInfo) { super.setField(fieldInfo); skipWriter.setField(writePositions, writeOffsets, writePayloads); lastState = emptyState; fieldHasNorms = fieldInfo.hasNorms(); - if (writePositions) { - if (writePayloads || writeOffsets) { - return 3; // doc + pos + pay FP - } else { - return 2; // doc + pos FP - } - } else { - return 1; // doc FP - } } @Override @@ -463,16 +454,16 @@ public final class Lucene50PostingsWriter extends PushPostingsWriterBase { } @Override - public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + public void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { IntBlockTermState state = (IntBlockTermState)_state; if (absolute) { lastState = emptyState; } - longs[0] = state.docStartFP - lastState.docStartFP; + out.writeVLong(state.docStartFP - lastState.docStartFP); if (writePositions) { - longs[1] = state.posStartFP - lastState.posStartFP; + out.writeVLong(state.posStartFP - lastState.posStartFP); if (writePayloads || writeOffsets) { - longs[2] = state.payStartFP - lastState.payStartFP; + out.writeVLong(state.payStartFP - lastState.payStartFP); } } if (state.singletonDocID != -1) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java 
b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java index 964f616c6ff..480f5fde271 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java @@ -145,7 +145,6 @@ public class BlockTermsReader extends FieldsProducer { // when frequencies are omitted, sumDocFreq=totalTermFreq and we only write one value final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : in.readVLong(); final int docCount = in.readVInt(); - final int longsSize = in.readVInt(); if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), in); } @@ -155,7 +154,7 @@ public class BlockTermsReader extends FieldsProducer { if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, in); } - FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount, longsSize)); + FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount)); if (previous != null) { throw new CorruptIndexException("duplicate fields: " + fieldInfo.name, in); } @@ -223,9 +222,8 @@ public class BlockTermsReader extends FieldsProducer { final long sumTotalTermFreq; final long sumDocFreq; final int docCount; - final int longsSize; - FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) { + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.numTerms = numTerms; @@ -233,7 +231,6 @@ public class BlockTermsReader extends FieldsProducer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; - this.longsSize = longsSize; } @Override @@ -326,7 +323,6 @@ public class BlockTermsReader extends FieldsProducer { private final ByteArrayDataInput freqReader = new ByteArrayDataInput(); private int metaDataUpto; - private long[] longs; private byte[] bytes; private ByteArrayDataInput bytesReader; @@ -343,7 +339,6 @@ public class BlockTermsReader extends FieldsProducer { termSuffixes = new byte[128]; docFreqBytes = new byte[64]; //System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader); - longs = new long[longsSize]; } // TODO: we may want an alternate mode here which is @@ -826,10 +821,7 @@ public class BlockTermsReader extends FieldsProducer { //System.out.println(" totTF=" + state.totalTermFreq); } // metadata - for (int i = 0; i < longs.length; i++) { - longs[i] = bytesReader.readVLong(); - } - postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute); + postingsReader.decodeTerm(bytesReader, fieldInfo, state, absolute); metaDataUpto++; absolute = false; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java index f620bd83d0f..e064aa1ecf2 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java 
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java @@ -81,9 +81,8 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { public final long sumTotalTermFreq; public final long sumDocFreq; public final int docCount; - public final int longsSize; - public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) { + public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.termsStartPointer = termsStartPointer; @@ -91,7 +90,6 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; - this.longsSize = longsSize; } } @@ -176,7 +174,6 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { } out.writeVLong(field.sumDocFreq); out.writeVInt(field.docCount); - out.writeVInt(field.longsSize); } writeTrailer(dirStart); CodecUtil.writeFooter(out); @@ -206,7 +203,6 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { long sumTotalTermFreq; long sumDocFreq; int docCount; - int longsSize; private TermEntry[] pendingTerms; @@ -226,7 +222,7 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { } termsStartPointer = out.getFilePointer(); this.postingsWriter = postingsWriter; - this.longsSize = postingsWriter.setField(fieldInfo); + postingsWriter.setField(fieldInfo); } private final BytesRefBuilder lastPrevTerm = new BytesRefBuilder(); @@ -285,8 +281,7 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { termsStartPointer, fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 ? sumTotalTermFreq : -1, sumDocFreq, - docsSeen.cardinality(), - longsSize)); + docsSeen.cardinality())); } } @@ -307,7 +302,6 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { } private final ByteBuffersDataOutput bytesWriter = ByteBuffersDataOutput.newResettableInstance(); - private final ByteBuffersDataOutput bufferWriter = ByteBuffersDataOutput.newResettableInstance(); private void flushBlock() throws IOException { //System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer()); @@ -353,16 +347,10 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { bytesWriter.reset(); // 4th pass: write the metadata - long[] longs = new long[longsSize]; boolean absolute = true; for(int termCount=0;termCount 0; this.fieldInfo = fieldInfo; @@ -159,7 +158,6 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; - this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; } @@ -424,7 +422,6 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { class TermsWriter { private final FieldInfo fieldInfo; - private final int longsSize; private long numTerms; final FixedBitSet docsSeen; long sumTotalTermFreq; @@ -439,8 +436,6 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { private final BytesRefBuilder lastTerm = new BytesRefBuilder(); private int[] prefixStarts = new int[8]; - private final long[] longs; - // Pending stack of terms and blocks. 
As terms arrive (in sorted order) // we append to this stack, and once the top of the stack has enough // terms starting with a common prefix, we write a new block with @@ -633,13 +628,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { } // Write term meta data - postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); - for (int pos = 0; pos < longsSize; pos++) { - assert longs[pos] >= 0; - metaWriter.writeVLong(longs[pos]); - } - bytesWriter.copyTo(metaWriter); - bytesWriter.reset(); + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); absolute = false; } totalTermCount = end-start; @@ -684,13 +673,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { // separate anymore: // Write term meta data - postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); - for (int pos = 0; pos < longsSize; pos++) { - assert longs[pos] >= 0; - metaWriter.writeVLong(longs[pos]); - } - bytesWriter.copyTo(metaWriter); - bytesWriter.reset(); + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); absolute = false; totalTermCount++; @@ -763,8 +746,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { TermsWriter(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; docsSeen = new FixedBitSet(maxDoc); - this.longsSize = postingsWriter.setField(fieldInfo); - this.longs = new long[longsSize]; + postingsWriter.setField(fieldInfo); } /** Writes one term's worth of postings. */ @@ -874,7 +856,6 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { sumTotalTermFreq, sumDocFreq, docsSeen.cardinality(), - longsSize, minTerm, maxTerm)); } else { assert docsSeen.cardinality() == 0; @@ -884,7 +865,6 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { private final ByteBuffersDataOutput suffixWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); - private final ByteBuffersDataOutput bytesWriter = ByteBuffersDataOutput.newResettableInstance(); } private boolean closed; @@ -916,7 +896,6 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { } out.writeVLong(field.sumDocFreq); out.writeVInt(field.docCount); - out.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); writeBytesRef(out, field.minTerm); writeBytesRef(out, field.maxTerm); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java index 5d02258837d..54954e85d3d 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java @@ -46,7 +46,6 @@ final class OrdsFieldReader extends Terms implements Accountable { final Output rootCode; final BytesRef minTerm; final BytesRef maxTerm; - final int longsSize; final OrdsBlockTreeTermsReader parent; final FST index; @@ -54,7 +53,7 @@ final class OrdsFieldReader extends Terms implements Accountable { OrdsFieldReader(OrdsBlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, Output rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, - long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { + long indexStartFP, IndexInput indexIn, 
BytesRef minTerm, BytesRef maxTerm) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); @@ -65,7 +64,6 @@ final class OrdsFieldReader extends Terms implements Accountable { this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; - this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; // if (DEBUG) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java index a34f0fda1d0..ab7eab73427 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java @@ -84,9 +84,7 @@ final class OrdsIntersectTermsEnumFrame { final BlockTermState termState; - // metadata buffer, holding monotonic values - public long[] longs; - // metadata buffer, holding general values + // metadata public byte[] bytes; ByteArrayDataInput bytesReader; @@ -103,7 +101,6 @@ final class OrdsIntersectTermsEnumFrame { this.ord = ord; this.termState = ite.fr.parent.postingsReader.newTermState(); this.termState.totalTermFreq = -1; - this.longs = new long[ite.fr.longsSize]; } void loadNextFloorBlock() throws IOException { @@ -298,11 +295,8 @@ final class OrdsIntersectTermsEnumFrame { termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); } - // metadata - for (int i = 0; i < ite.fr.longsSize; i++) { - longs[i] = bytesReader.readVLong(); - } - ite.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ite.fr.fieldInfo, termState, absolute); + // metadata + ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java index ee3782f29cd..240e781c7cc 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java @@ -97,9 +97,7 @@ final class OrdsSegmentTermsEnumFrame { final BlockTermState state; - // metadata buffer, holding monotonic values - public long[] longs; - // metadata buffer, holding general values + // metadata public byte[] bytes; ByteArrayDataInput bytesReader; @@ -110,7 +108,6 @@ final class OrdsSegmentTermsEnumFrame { this.ord = ord; this.state = ste.fr.parent.postingsReader.newTermState(); this.state.totalTermFreq = -1; - this.longs = new long[ste.fr.longsSize]; } public void setFloorData(ByteArrayDataInput in, BytesRef source) { @@ -507,11 +504,8 @@ final class OrdsSegmentTermsEnumFrame { } //if (DEBUG) System.out.println(" longsSize=" + ste.fr.longsSize); - // metadata - for (int i = 0; i < ste.fr.longsSize; i++) { - longs[i] = bytesReader.readVLong(); - } - ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); + // metadata + ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java 
b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java deleted file mode 100644 index 0ce12178a90..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.memory; - - - -import java.io.IOException; - -import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader; -import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.util.IOUtils; - -/** - * FSTOrd term dict + Lucene50PBF - */ - -public final class FSTOrdPostingsFormat extends PostingsFormat { - public FSTOrdPostingsFormat() { - super("FSTOrd50"); - } - - @Override - public String toString() { - return getName(); - } - - @Override - public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene84PostingsWriter(state); - - boolean success = false; - try { - FieldsConsumer ret = new FSTOrdTermsWriter(state, postingsWriter); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsWriter); - } - } - } - - @Override - public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene84PostingsReader(state); - boolean success = false; - try { - FieldsProducer ret = new FSTOrdTermsReader(state, postingsReader); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsReader); - } - } - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java deleted file mode 100644 index 7ecf19cc9f6..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java +++ /dev/null @@ -1,884 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.memory; - - -import java.io.IOException; -import java.util.ArrayList; -import java.util.BitSet; -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.TreeMap; - -import org.apache.lucene.codecs.BlockTermState; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.ImpactsEnum; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.TermState; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.ByteArrayDataInput; -import org.apache.lucene.store.ChecksumIndexInput; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.Accountables; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.automaton.ByteRunAutomaton; -import org.apache.lucene.util.automaton.CompiledAutomaton; -import org.apache.lucene.util.fst.BytesRefFSTEnum; -import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Outputs; -import org.apache.lucene.util.fst.PositiveIntOutputs; -import org.apache.lucene.util.fst.Util; - -/** - * FST-based terms dictionary reader. - * - * The FST index maps each term and its ord, and during seek - * the ord is used to fetch metadata from a single block. - * The term dictionary is fully memory resident. 
- * - * @lucene.experimental - */ -public class FSTOrdTermsReader extends FieldsProducer { - static final int INTERVAL = FSTOrdTermsWriter.SKIP_INTERVAL; - final TreeMap fields = new TreeMap<>(); - final PostingsReaderBase postingsReader; - //static final boolean TEST = false; - - public FSTOrdTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException { - final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTOrdTermsWriter.TERMS_INDEX_EXTENSION); - final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTOrdTermsWriter.TERMS_BLOCK_EXTENSION); - - this.postingsReader = postingsReader; - ChecksumIndexInput indexIn = null; - IndexInput blockIn = null; - boolean success = false; - try { - indexIn = state.directory.openChecksumInput(termsIndexFileName, state.context); - blockIn = state.directory.openInput(termsBlockFileName, state.context); - int version = CodecUtil.checkIndexHeader(indexIn, FSTOrdTermsWriter.TERMS_INDEX_CODEC_NAME, - FSTOrdTermsWriter.VERSION_START, - FSTOrdTermsWriter.VERSION_CURRENT, - state.segmentInfo.getId(), state.segmentSuffix); - int version2 = CodecUtil.checkIndexHeader(blockIn, FSTOrdTermsWriter.TERMS_CODEC_NAME, - FSTOrdTermsWriter.VERSION_START, - FSTOrdTermsWriter.VERSION_CURRENT, - state.segmentInfo.getId(), state.segmentSuffix); - - if (version != version2) { - throw new CorruptIndexException("Format versions mismatch: index=" + version + ", terms=" + version2, blockIn); - } - - CodecUtil.checksumEntireFile(blockIn); - - this.postingsReader.init(blockIn, state); - seekDir(blockIn); - - final FieldInfos fieldInfos = state.fieldInfos; - final int numFields = blockIn.readVInt(); - for (int i = 0; i < numFields; i++) { - FieldInfo fieldInfo = fieldInfos.fieldInfo(blockIn.readVInt()); - boolean hasFreq = fieldInfo.getIndexOptions() != IndexOptions.DOCS; - long numTerms = blockIn.readVLong(); - long sumTotalTermFreq = blockIn.readVLong(); - // if freqs are omitted, sumDocFreq=sumTotalTermFreq and we only write one value - long sumDocFreq = hasFreq ? 
blockIn.readVLong() : sumTotalTermFreq; - int docCount = blockIn.readVInt(); - int longsSize = blockIn.readVInt(); - FST index = new FST<>(indexIn, PositiveIntOutputs.getSingleton()); - - TermsReader current = new TermsReader(fieldInfo, blockIn, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, index); - TermsReader previous = fields.put(fieldInfo.name, current); - checkFieldSummary(state.segmentInfo, indexIn, blockIn, current, previous); - } - CodecUtil.checkFooter(indexIn); - success = true; - } finally { - if (success) { - IOUtils.close(indexIn, blockIn); - } else { - IOUtils.closeWhileHandlingException(indexIn, blockIn); - } - } - } - - private void seekDir(IndexInput in) throws IOException { - in.seek(in.length() - CodecUtil.footerLength() - 8); - in.seek(in.readLong()); - } - private void checkFieldSummary(SegmentInfo info, IndexInput indexIn, IndexInput blockIn, TermsReader field, TermsReader previous) throws IOException { - // #docs with field must be <= #docs - if (field.docCount < 0 || field.docCount > info.maxDoc()) { - throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.maxDoc() + " (blockIn=" + blockIn + ")", indexIn); - } - // #postings must be >= #docs with field - if (field.sumDocFreq < field.docCount) { - throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount + " (blockIn=" + blockIn + ")", indexIn); - } - // #positions must be >= #postings - if (field.sumTotalTermFreq < field.sumDocFreq) { - throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq + " (blockIn=" + blockIn + ")", indexIn); - } - if (previous != null) { - throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name + " (blockIn=" + blockIn + ")", indexIn); - } - } - - @Override - public Iterator iterator() { - return Collections.unmodifiableSet(fields.keySet()).iterator(); - } - - @Override - public Terms terms(String field) throws IOException { - assert field != null; - return fields.get(field); - } - - @Override - public int size() { - return fields.size(); - } - - @Override - public void close() throws IOException { - try { - IOUtils.close(postingsReader); - } finally { - fields.clear(); - } - } - - final class TermsReader extends Terms implements Accountable { - final FieldInfo fieldInfo; - final long numTerms; - final long sumTotalTermFreq; - final long sumDocFreq; - final int docCount; - final int longsSize; - final FST index; - - final int numSkipInfo; - final long[] skipInfo; - final byte[] statsBlock; - final byte[] metaLongsBlock; - final byte[] metaBytesBlock; - - TermsReader(FieldInfo fieldInfo, IndexInput blockIn, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST index) throws IOException { - this.fieldInfo = fieldInfo; - this.numTerms = numTerms; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - this.docCount = docCount; - this.longsSize = longsSize; - this.index = index; - - assert (numTerms & (~0xffffffffL)) == 0; - final int numBlocks = (int)(numTerms + INTERVAL - 1) / INTERVAL; - this.numSkipInfo = longsSize + 3; - this.skipInfo = new long[numBlocks * numSkipInfo]; - this.statsBlock = new byte[(int)blockIn.readVLong()]; - this.metaLongsBlock = new byte[(int)blockIn.readVLong()]; - this.metaBytesBlock = new byte[(int)blockIn.readVLong()]; - - int last = 0, next = 0; - for (int i = 1; i < numBlocks; i++) { - next = numSkipInfo * i; - for (int j = 
0; j < numSkipInfo; j++) { - skipInfo[next + j] = skipInfo[last + j] + blockIn.readVLong(); - } - last = next; - } - blockIn.readBytes(statsBlock, 0, statsBlock.length); - blockIn.readBytes(metaLongsBlock, 0, metaLongsBlock.length); - blockIn.readBytes(metaBytesBlock, 0, metaBytesBlock.length); - } - - public boolean hasFreqs() { - return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - } - - @Override - public boolean hasOffsets() { - return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; - } - - @Override - public boolean hasPositions() { - return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - } - - @Override - public boolean hasPayloads() { - return fieldInfo.hasPayloads(); - } - - @Override - public long size() { - return numTerms; - } - - @Override - public long getSumTotalTermFreq() { - return sumTotalTermFreq; - } - - @Override - public long getSumDocFreq() throws IOException { - return sumDocFreq; - } - - @Override - public int getDocCount() throws IOException { - return docCount; - } - - @Override - public TermsEnum iterator() throws IOException { - return new SegmentTermsEnum(); - } - - @Override - public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { - throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); - } - return new IntersectTermsEnum(compiled, startTerm); - } - - @Override - public long ramBytesUsed() { - long ramBytesUsed = 0; - if (index != null) { - ramBytesUsed += index.ramBytesUsed(); - ramBytesUsed += RamUsageEstimator.sizeOf(metaBytesBlock); - ramBytesUsed += RamUsageEstimator.sizeOf(metaLongsBlock); - ramBytesUsed += RamUsageEstimator.sizeOf(skipInfo); - ramBytesUsed += RamUsageEstimator.sizeOf(statsBlock); - } - return ramBytesUsed; - } - - @Override - public Collection getChildResources() { - if (index == null) { - return Collections.emptyList(); - } else { - return Collections.singletonList(Accountables.namedAccountable("terms", index)); - } - } - - @Override - public String toString() { - return "FSTOrdTerms(terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")"; - } - - // Only wraps common operations for PBF interact - abstract class BaseTermsEnum extends org.apache.lucene.index.BaseTermsEnum { - - /* Current term's ord, starts from 0 */ - long ord; - - /* Current term stats + decoded metadata (customized by PBF) */ - final BlockTermState state; - - /* Datainput to load stats & metadata */ - final ByteArrayDataInput statsReader = new ByteArrayDataInput(); - final ByteArrayDataInput metaLongsReader = new ByteArrayDataInput(); - final ByteArrayDataInput metaBytesReader = new ByteArrayDataInput(); - - /* To which block is buffered */ - int statsBlockOrd; - int metaBlockOrd; - - /* Current buffered metadata (long[] & byte[]) */ - long[][] longs; - int[] bytesStart; - int[] bytesLength; - - /* Current buffered stats (df & ttf) */ - int[] docFreq; - long[] totalTermFreq; - - BaseTermsEnum() throws IOException { - this.state = postingsReader.newTermState(); - this.statsReader.reset(statsBlock); - this.metaLongsReader.reset(metaLongsBlock); - this.metaBytesReader.reset(metaBytesBlock); - - this.longs = new long[INTERVAL][longsSize]; - this.bytesStart = new int[INTERVAL]; - this.bytesLength = new int[INTERVAL]; - this.docFreq = new int[INTERVAL]; - 
this.totalTermFreq = new long[INTERVAL]; - this.statsBlockOrd = -1; - this.metaBlockOrd = -1; - } - - /** Decodes stats data into term state */ - void decodeStats() throws IOException { - final int upto = (int)ord % INTERVAL; - final int oldBlockOrd = statsBlockOrd; - statsBlockOrd = (int)ord / INTERVAL; - if (oldBlockOrd != statsBlockOrd) { - refillStats(); - } - state.docFreq = docFreq[upto]; - state.totalTermFreq = totalTermFreq[upto]; - } - - /** Let PBF decode metadata */ - void decodeMetaData() throws IOException { - final int upto = (int)ord % INTERVAL; - final int oldBlockOrd = metaBlockOrd; - metaBlockOrd = (int)ord / INTERVAL; - if (metaBlockOrd != oldBlockOrd) { - refillMetadata(); - } - metaBytesReader.setPosition(bytesStart[upto]); - postingsReader.decodeTerm(longs[upto], metaBytesReader, fieldInfo, state, true); - } - - /** Load current stats shard */ - final void refillStats() throws IOException { - final int offset = statsBlockOrd * numSkipInfo; - final int statsFP = (int)skipInfo[offset]; - statsReader.setPosition(statsFP); - for (int i = 0; i < INTERVAL && !statsReader.eof(); i++) { - int code = statsReader.readVInt(); - if (hasFreqs()) { - docFreq[i] = (code >>> 1); - if ((code & 1) == 1) { - totalTermFreq[i] = docFreq[i]; - } else { - totalTermFreq[i] = docFreq[i] + statsReader.readVLong(); - } - } else { - docFreq[i] = code; - totalTermFreq[i] = code; - } - } - } - - /** Load current metadata shard */ - final void refillMetadata() throws IOException { - final int offset = metaBlockOrd * numSkipInfo; - final int metaLongsFP = (int)skipInfo[offset + 1]; - final int metaBytesFP = (int)skipInfo[offset + 2]; - metaLongsReader.setPosition(metaLongsFP); - for (int j = 0; j < longsSize; j++) { - longs[0][j] = skipInfo[offset + 3 + j] + metaLongsReader.readVLong(); - } - bytesStart[0] = metaBytesFP; - bytesLength[0] = (int)metaLongsReader.readVLong(); - for (int i = 1; i < INTERVAL && !metaLongsReader.eof(); i++) { - for (int j = 0; j < longsSize; j++) { - longs[i][j] = longs[i-1][j] + metaLongsReader.readVLong(); - } - bytesStart[i] = bytesStart[i-1] + bytesLength[i-1]; - bytesLength[i] = (int)metaLongsReader.readVLong(); - } - } - - @Override - public TermState termState() throws IOException { - decodeMetaData(); - return state.clone(); - } - - @Override - public int docFreq() throws IOException { - return state.docFreq; - } - - @Override - public long totalTermFreq() throws IOException { - return state.totalTermFreq; - } - - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { - decodeMetaData(); - return postingsReader.postings(fieldInfo, state, reuse, flags); - } - - @Override - public ImpactsEnum impacts(int flags) throws IOException { - decodeMetaData(); - return postingsReader.impacts(fieldInfo, state, flags); - } - - // TODO: this can be achieved by making use of Util.getByOutput() - // and should have related tests - @Override - public void seekExact(long ord) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long ord() { - throw new UnsupportedOperationException(); - } - } - - // Iterates through all terms in this field - private final class SegmentTermsEnum extends BaseTermsEnum { - final BytesRefFSTEnum fstEnum; - /* Current term, null when enum ends or unpositioned */ - BytesRef term; - - /* True when current term's metadata is decoded */ - boolean decoded; - - /* True when current enum is 'positioned' by seekExact(TermState) */ - boolean seekPending; - - SegmentTermsEnum() throws 
IOException { - this.fstEnum = new BytesRefFSTEnum<>(index); - this.decoded = false; - this.seekPending = false; - } - - @Override - public BytesRef term() throws IOException { - return term; - } - - @Override - void decodeMetaData() throws IOException { - if (!decoded && !seekPending) { - super.decodeMetaData(); - decoded = true; - } - } - - // Update current enum according to FSTEnum - void updateEnum(final InputOutput pair) throws IOException { - if (pair == null) { - term = null; - } else { - term = pair.input; - ord = pair.output; - decodeStats(); - } - decoded = false; - seekPending = false; - } - - @Override - public BytesRef next() throws IOException { - if (seekPending) { // previously positioned, but termOutputs not fetched - seekPending = false; - SeekStatus status = seekCeil(term); - assert status == SeekStatus.FOUND; // must positioned on valid term - } - updateEnum(fstEnum.next()); - return term; - } - - @Override - public boolean seekExact(BytesRef target) throws IOException { - updateEnum(fstEnum.seekExact(target)); - return term != null; - } - - @Override - public SeekStatus seekCeil(BytesRef target) throws IOException { - updateEnum(fstEnum.seekCeil(target)); - if (term == null) { - return SeekStatus.END; - } else { - return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND; - } - } - - @Override - public void seekExact(BytesRef target, TermState otherState) { - if (!target.equals(term)) { - state.copyFrom(otherState); - term = BytesRef.deepCopyOf(target); - seekPending = true; - } - } - } - - // Iterates intersect result with automaton (cannot seek!) - private final class IntersectTermsEnum extends BaseTermsEnum { - /* Current term, null when enum ends or unpositioned */ - BytesRefBuilder term; - - /* True when current term's metadata is decoded */ - boolean decoded; - - /* True when there is pending term when calling next() */ - boolean pending; - - /* stack to record how current term is constructed, - * used to accumulate metadata or rewind term: - * level == term.length + 1, - * == 0 when term is null */ - Frame[] stack; - int level; - - /* term dict fst */ - final FST fst; - final FST.BytesReader fstReader; - final Outputs fstOutputs; - - /* query automaton to intersect with */ - final ByteRunAutomaton fsa; - - private final class Frame { - /* fst stats */ - FST.Arc arc; - - Long output; - - /* automaton stats */ - int state; - - Frame() { - this.arc = new FST.Arc<>(); - this.state = -1; - } - - public String toString() { - return "arc=" + arc + " state=" + state; - } - } - - IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - //if (TEST) System.out.println("Enum init, startTerm=" + startTerm); - this.fst = index; - this.fstReader = fst.getBytesReader(); - this.fstOutputs = index.outputs; - this.fsa = compiled.runAutomaton; - this.level = -1; - this.stack = new Frame[16]; - for (int i = 0 ; i < stack.length; i++) { - this.stack[i] = new Frame(); - } - - Frame frame; - frame = loadVirtualFrame(newFrame()); - this.level++; - frame = loadFirstFrame(newFrame()); - pushFrame(frame); - - this.decoded = false; - this.pending = false; - - if (startTerm == null) { - pending = isAccept(topFrame()); - } else { - doSeekCeil(startTerm); - pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame()); - } - } - - @Override - public BytesRef term() throws IOException { - return term == null ? 
null : term.get(); - } - - @Override - void decodeMetaData() throws IOException { - if (!decoded) { - super.decodeMetaData(); - decoded = true; - } - } - - @Override - void decodeStats() throws IOException { - ord = topFrame().output; - super.decodeStats(); - } - - @Override - public SeekStatus seekCeil(BytesRef target) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public BytesRef next() throws IOException { - //if (TEST) System.out.println("Enum next()"); - if (pending) { - pending = false; - decodeStats(); - return term(); - } - decoded = false; - DFS: - while (level > 0) { - Frame frame = newFrame(); - if (loadExpandFrame(topFrame(), frame) != null) { // has valid target - pushFrame(frame); - if (isAccept(frame)) { // gotcha - break; - } - continue; // check next target - } - frame = popFrame(); - while(level > 0) { - if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling - pushFrame(frame); - if (isAccept(frame)) { // gotcha - break DFS; - } - continue DFS; // check next target - } - frame = popFrame(); - } - return null; - } - decodeStats(); - return term(); - } - - BytesRef doSeekCeil(BytesRef target) throws IOException { - //if (TEST) System.out.println("Enum doSeekCeil()"); - Frame frame= null; - int label, upto = 0, limit = target.length; - while (upto < limit) { // to target prefix, or ceil label (rewind prefix) - frame = newFrame(); - label = target.bytes[upto] & 0xff; - frame = loadCeilFrame(label, topFrame(), frame); - if (frame == null || frame.arc.label() != label) { - break; - } - assert isValid(frame); // target must be fetched from automaton - pushFrame(frame); - upto++; - } - if (upto == limit) { // got target - return term(); - } - if (frame != null) { // got larger term('s prefix) - pushFrame(frame); - return isAccept(frame) ? term() : next(); - } - while (level > 0) { // got target's prefix, advance to larger term - frame = popFrame(); - while (level > 0 && !canRewind(frame)) { - frame = popFrame(); - } - if (loadNextFrame(topFrame(), frame) != null) { - pushFrame(frame); - return isAccept(frame) ? 
term() : next(); - } - } - return null; - } - - /** Virtual frame, never pop */ - Frame loadVirtualFrame(Frame frame) { - frame.output = fstOutputs.getNoOutput(); - frame.state = -1; - return frame; - } - - /** Load frame for start arc(node) on fst */ - Frame loadFirstFrame(Frame frame) { - frame.arc = fst.getFirstArc(frame.arc); - frame.output = frame.arc.output(); - frame.state = 0; - return frame; - } - - /** Load frame for target arc(node) on fst */ - Frame loadExpandFrame(Frame top, Frame frame) throws IOException { - if (!canGrow(top)) { - return null; - } - frame.arc = fst.readFirstRealTargetArc(top.arc.target(), frame.arc, fstReader); - frame.state = fsa.step(top.state, frame.arc.label()); - frame.output = frame.arc.output(); - //if (TEST) System.out.println(" loadExpand frame="+frame); - if (frame.state == -1) { - return loadNextFrame(top, frame); - } - return frame; - } - - /** Load frame for sibling arc(node) on fst */ - Frame loadNextFrame(Frame top, Frame frame) throws IOException { - if (!canRewind(frame)) { - return null; - } - while (!frame.arc.isLast()) { - frame.arc = fst.readNextRealArc(frame.arc, fstReader); - frame.output = frame.arc.output(); - frame.state = fsa.step(top.state, frame.arc.label()); - if (frame.state != -1) { - break; - } - } - //if (TEST) System.out.println(" loadNext frame="+frame); - if (frame.state == -1) { - return null; - } - return frame; - } - - /** Load frame for target arc(node) on fst, so that - * arc.label >= label and !fsa.reject(arc.label) */ - Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { - FST.Arc arc = frame.arc; - arc = Util.readCeilArc(label, fst, top.arc, arc, fstReader); - if (arc == null) { - return null; - } - frame.state = fsa.step(top.state, arc.label()); - //if (TEST) System.out.println(" loadCeil frame="+frame); - if (frame.state == -1) { - return loadNextFrame(top, frame); - } - frame.output = arc.output(); - return frame; - } - - boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts - return fsa.isAccept(frame.state) && frame.arc.isFinal(); - } - boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject - return /*frame != null &&*/ frame.state != -1; - } - boolean canGrow(Frame frame) { // can walk forward on both fst&fsa - return frame.state != -1 && FST.targetHasArcs(frame.arc); - } - boolean canRewind(Frame frame) { // can jump to sibling - return !frame.arc.isLast(); - } - - void pushFrame(Frame frame) { - final FST.Arc arc = frame.arc; - frame.output = fstOutputs.add(topFrame().output, frame.output); - term = grow(arc.label()); - level++; - assert frame == stack[level]; - } - - Frame popFrame() { - term = shrink(); - return stack[level--]; - } - - Frame newFrame() { - if (level+1 == stack.length) { - final Frame[] temp = new Frame[ArrayUtil.oversize(level+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(stack, 0, temp, 0, stack.length); - for (int i = stack.length; i < temp.length; i++) { - temp[i] = new Frame(); - } - stack = temp; - } - return stack[level+1]; - } - - Frame topFrame() { - return stack[level]; - } - - BytesRefBuilder grow(int label) { - if (term == null) { - term = new BytesRefBuilder(); - } else { - term.append((byte) label); - } - return term; - } - - BytesRefBuilder shrink() { - if (term.length() == 0) { - term = null; - } else { - term.setLength(term.length() - 1); - } - return term; - } - } - } - - static void walk(FST fst) throws IOException { - final ArrayList> queue = new ArrayList<>(); - final BitSet seen = new 
BitSet(); - final FST.BytesReader reader = fst.getBytesReader(); - final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); - queue.add(startArc); - while (!queue.isEmpty()) { - final FST.Arc arc = queue.remove(0); - final long node = arc.target(); - //System.out.println(arc); - if (FST.targetHasArcs(arc) && !seen.get((int) node)) { - seen.set((int) node); - fst.readFirstRealTargetArc(node, arc, reader); - while (true) { - queue.add(new FST.Arc().copyFrom(arc)); - if (arc.isLast()) { - break; - } else { - fst.readNextRealArc(arc, reader); - } - } - } - } - } - - @Override - public long ramBytesUsed() { - long ramBytesUsed = postingsReader.ramBytesUsed(); - for (TermsReader r : fields.values()) { - ramBytesUsed += r.ramBytesUsed(); - } - return ramBytesUsed; - } - - @Override - public Collection getChildResources() { - List resources = new ArrayList<>(Accountables.namedAccountables("field", fields)); - resources.add(Accountables.namedAccountable("delegate", postingsReader)); - return Collections.unmodifiableList(resources); - } - - @Override - public String toString() { - return getClass().getSimpleName() + "(fields=" + fields.size() + ",delegate=" + postingsReader + ")"; - } - - @Override - public void checkIntegrity() throws IOException { - postingsReader.checkIntegrity(); - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java deleted file mode 100644 index a31a2f940b3..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java +++ /dev/null @@ -1,386 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.codecs.memory; - - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.lucene.codecs.BlockTermState; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.NormsProducer; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.ByteBuffersDataOutput; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.FSTCompiler; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.PositiveIntOutputs; -import org.apache.lucene.util.fst.Util; - -/** - * FST-based term dict, using ord as FST output. - * - * The FST holds the mapping between <term, ord>, and - * term's metadata is delta encoded into a single byte block. - * - * Typically the byte block consists of four parts: - * 1. term statistics: docFreq, totalTermFreq; - * 2. monotonic long[], e.g. the pointer to the postings list for that term; - * 3. generic byte[], e.g. other information customized by postings base. - * 4. single-level skip list to speed up metadata decoding by ord. - * - *

- * Files:
- *   .tix: Term Index
- *   .tbk: Term Block
- *
- * Term Index
- *
- * The .tix contains a list of FSTs, one for each field.
- * The FST maps a term to its corresponding order in the current field.
- *
- *   - TermIndex(.tix) --> Header, TermFST^NumFields, Footer
- *   - TermFST --> {@link FST FST<long>}
- *   - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
- *   - Footer --> {@link CodecUtil#writeFooter CodecFooter}
- *
- * Notes:
- *
- *   - Since terms are already sorted before writing to the Term Block,
- *     their ords can be used directly to seek term metadata from the term block.
- *
- * Term Block
- *
- * The .tbk contains all the statistics and metadata for terms, along with field summary (e.g.
- * per-field data like number of documents in the current field). For each field, there are four blocks:
- *
- *   - statistics bytes block: contains term statistics;
- *   - metadata longs block: delta-encodes the monotonic part of metadata;
- *   - metadata bytes block: encodes other parts of metadata;
- *   - skip block: contains skip data, to speed up metadata seeking and decoding
- *
- * File Format:
- *
- *   - TermBlock(.tbk) --> Header, PostingsHeader, FieldSummary, DirOffset
- *   - FieldSummary --> NumFields, < FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
- *     DocCount, LongsSize, DataBlock >^NumFields, Footer
- *   - DataBlock --> StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
- *     SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock
- *   - SkipBlock --> < StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
- *     MetaLongsSkipDelta^LongsSize >^NumTerms
- *   - StatsBlock --> < DocFreq[Same?], (TotalTermFreq-DocFreq)? >^NumTerms
- *   - MetaLongsBlock --> < LongDelta^LongsSize, BytesSize >^NumTerms
- *   - MetaBytesBlock --> Byte^MetaBytesBlockLength
- *   - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
- *   - DirOffset --> {@link DataOutput#writeLong Uint64}
- *   - NumFields, FieldNumber, DocCount, DocFreq, LongsSize --> {@link DataOutput#writeVInt VInt}
- *   - NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength,
- *     MetaBytesBlockLength, StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
- *     MetaLongsSkipStart, TotalTermFreq, LongDelta --> {@link DataOutput#writeVLong VLong}
- *   - Footer --> {@link CodecUtil#writeFooter CodecFooter}
- *
- * Notes:
- *
- *   - The format of PostingsHeader and MetaBytes is customized by the specific postings implementation:
- *     they contain arbitrary per-file data (such as parameters or versioning information), and per-term data
- *     (non-monotonic ones like pulsed postings data).
- *   - During initialization the reader will load all the blocks into memory. SkipBlock will be decoded, so that
- *     during seek the term dict can look up file pointers directly. StatsFPDelta, MetaLongsSkipFPDelta, etc. are
- *     file offsets for every SkipInterval'th term. MetaLongsSkipDelta is the difference from the previous one,
- *     which indicates the value of the preceding metadata longs for every SkipInterval'th term.
- *   - DocFreq is the count of documents which contain the term. TotalTermFreq is the total number of occurrences
- *     of the term. Usually these two values are the same for long-tail terms, therefore one bit is stolen from
- *     DocFreq to flag this case, so that encoding of TotalTermFreq may be omitted.
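- *
- * The last note above is easiest to see in code. The following is a minimal sketch (hypothetical
- * helper names; the real logic lives in TermsWriter.finishTerm and FSTOrdTermsReader.refillStats)
- * of how one StatsBlock entry is written and read back when the field has frequencies:
- *
- *   // encode: steal the low bit of DocFreq to flag TotalTermFreq == DocFreq
- *   void writeStatsEntry(DataOutput out, int docFreq, long totalTermFreq) throws IOException {
- *     if (totalTermFreq == docFreq) {
- *       out.writeVInt(docFreq << 1 | 1);          // same: no extra VLong written
- *     } else {
- *       out.writeVInt(docFreq << 1);              // differ: delta follows
- *       out.writeVLong(totalTermFreq - docFreq);
- *     }
- *   }
- *
- *   // decode: mirror of the above, as done per slot in refillStats
- *   long[] readStatsEntry(DataInput in) throws IOException {
- *     int code = in.readVInt();
- *     long docFreq = code >>> 1;
- *     long totalTermFreq = (code & 1) == 1 ? docFreq : docFreq + in.readVLong();
- *     return new long[] { docFreq, totalTermFreq };
- *   }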
- * - * @lucene.experimental - */ - -public class FSTOrdTermsWriter extends FieldsConsumer { - static final String TERMS_INDEX_EXTENSION = "tix"; - static final String TERMS_BLOCK_EXTENSION = "tbk"; - static final String TERMS_CODEC_NAME = "FSTOrdTerms"; - static final String TERMS_INDEX_CODEC_NAME = "FSTOrdIndex"; - - public static final int VERSION_START = 2; - public static final int VERSION_CURRENT = VERSION_START; - public static final int SKIP_INTERVAL = 8; - - final PostingsWriterBase postingsWriter; - final FieldInfos fieldInfos; - final int maxDoc; - final List fields = new ArrayList<>(); - IndexOutput blockOut = null; - IndexOutput indexOut = null; - - public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { - final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); - final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION); - - this.postingsWriter = postingsWriter; - this.fieldInfos = state.fieldInfos; - this.maxDoc = state.segmentInfo.maxDoc(); - - boolean success = false; - try { - this.indexOut = state.directory.createOutput(termsIndexFileName, state.context); - this.blockOut = state.directory.createOutput(termsBlockFileName, state.context); - CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, - state.segmentInfo.getId(), state.segmentSuffix); - CodecUtil.writeIndexHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT, - state.segmentInfo.getId(), state.segmentSuffix); - this.postingsWriter.init(blockOut, state); - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(indexOut, blockOut); - } - } - } - - @Override - public void write(Fields fields, NormsProducer norms) throws IOException { - for(String field : fields) { - Terms terms = fields.terms(field); - if (terms == null) { - continue; - } - FieldInfo fieldInfo = fieldInfos.fieldInfo(field); - boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - TermsEnum termsEnum = terms.iterator(); - TermsWriter termsWriter = new TermsWriter(fieldInfo); - - long sumTotalTermFreq = 0; - long sumDocFreq = 0; - FixedBitSet docsSeen = new FixedBitSet(maxDoc); - while (true) { - BytesRef term = termsEnum.next(); - if (term == null) { - break; - } - BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen, norms); - if (termState != null) { - termsWriter.finishTerm(term, termState); - sumTotalTermFreq += termState.totalTermFreq; - sumDocFreq += termState.docFreq; - } - } - - termsWriter.finish(hasFreq ? 
sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality()); - } - } - - @Override - public void close() throws IOException { - if (blockOut != null) { - boolean success = false; - try { - final long blockDirStart = blockOut.getFilePointer(); - - // write field summary - blockOut.writeVInt(fields.size()); - for (FieldMetaData field : fields) { - blockOut.writeVInt(field.fieldInfo.number); - blockOut.writeVLong(field.numTerms); - if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) { - blockOut.writeVLong(field.sumTotalTermFreq); - } - blockOut.writeVLong(field.sumDocFreq); - blockOut.writeVInt(field.docCount); - blockOut.writeVInt(field.longsSize); - blockOut.writeVLong(field.statsOut.size()); - blockOut.writeVLong(field.metaLongsOut.size()); - blockOut.writeVLong(field.metaBytesOut.size()); - - field.skipOut.copyTo(blockOut); - field.statsOut.copyTo(blockOut); - field.metaLongsOut.copyTo(blockOut); - field.metaBytesOut.copyTo(blockOut); - field.dict.save(indexOut); - } - writeTrailer(blockOut, blockDirStart); - CodecUtil.writeFooter(indexOut); - CodecUtil.writeFooter(blockOut); - success = true; - } finally { - if (success) { - IOUtils.close(blockOut, indexOut, postingsWriter); - } else { - IOUtils.closeWhileHandlingException(blockOut, indexOut, postingsWriter); - } - blockOut = null; - } - } - } - - private void writeTrailer(IndexOutput out, long dirStart) throws IOException { - out.writeLong(dirStart); - } - - private static class FieldMetaData { - public FieldInfo fieldInfo; - public long numTerms; - public long sumTotalTermFreq; - public long sumDocFreq; - public int docCount; - public int longsSize; - public FST dict; - - // TODO: block encode each part - - // vint encode next skip point (fully decoded when reading) - public ByteBuffersDataOutput skipOut; - // vint encode df, (ttf-df) - public ByteBuffersDataOutput statsOut; - // vint encode monotonic long[] and length for corresponding byte[] - public ByteBuffersDataOutput metaLongsOut; - // generic byte[] - public ByteBuffersDataOutput metaBytesOut; - } - - final class TermsWriter { - private final FSTCompiler fstCompiler; - private final PositiveIntOutputs outputs; - private final FieldInfo fieldInfo; - private final int longsSize; - private long numTerms; - - private final IntsRefBuilder scratchTerm = new IntsRefBuilder(); - private final ByteBuffersDataOutput statsOut = new ByteBuffersDataOutput(); - private final ByteBuffersDataOutput metaLongsOut = new ByteBuffersDataOutput(); - private final ByteBuffersDataOutput metaBytesOut = new ByteBuffersDataOutput(); - private final ByteBuffersDataOutput skipOut = new ByteBuffersDataOutput(); - private long lastBlockStatsFP; - private long lastBlockMetaLongsFP; - private long lastBlockMetaBytesFP; - private long[] lastBlockLongs; - - private long[] lastLongs; - private long lastMetaBytesFP; - - TermsWriter(FieldInfo fieldInfo) { - this.numTerms = 0; - this.fieldInfo = fieldInfo; - this.longsSize = postingsWriter.setField(fieldInfo); - this.outputs = PositiveIntOutputs.getSingleton(); - this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); - - this.lastBlockStatsFP = 0; - this.lastBlockMetaLongsFP = 0; - this.lastBlockMetaBytesFP = 0; - this.lastBlockLongs = new long[longsSize]; - - this.lastLongs = new long[longsSize]; - this.lastMetaBytesFP = 0; - } - - public void finishTerm(BytesRef text, BlockTermState state) throws IOException { - if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) { - bufferSkip(); - } - // write term meta data into fst - final long longs[] = 
new long[longsSize]; - final long delta = state.totalTermFreq - state.docFreq; - if (state.totalTermFreq > 0) { - if (delta == 0) { - statsOut.writeVInt(state.docFreq<<1|1); - } else { - statsOut.writeVInt(state.docFreq<<1); - statsOut.writeVLong(state.totalTermFreq-state.docFreq); - } - } else { - statsOut.writeVInt(state.docFreq); - } - postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true); - for (int i = 0; i < longsSize; i++) { - metaLongsOut.writeVLong(longs[i] - lastLongs[i]); - lastLongs[i] = longs[i]; - } - metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP); - - fstCompiler.add(Util.toIntsRef(text, scratchTerm), numTerms); - numTerms++; - - lastMetaBytesFP = metaBytesOut.size(); - } - - public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { - if (numTerms > 0) { - final FieldMetaData metadata = new FieldMetaData(); - metadata.fieldInfo = fieldInfo; - metadata.numTerms = numTerms; - metadata.sumTotalTermFreq = sumTotalTermFreq; - metadata.sumDocFreq = sumDocFreq; - metadata.docCount = docCount; - metadata.longsSize = longsSize; - metadata.skipOut = skipOut; - metadata.statsOut = statsOut; - metadata.metaLongsOut = metaLongsOut; - metadata.metaBytesOut = metaBytesOut; - metadata.dict = fstCompiler.compile(); - fields.add(metadata); - } - } - - private void bufferSkip() throws IOException { - skipOut.writeVLong(statsOut.size() - lastBlockStatsFP); - skipOut.writeVLong(metaLongsOut.size() - lastBlockMetaLongsFP); - skipOut.writeVLong(metaBytesOut.size() - lastBlockMetaBytesFP); - for (int i = 0; i < longsSize; i++) { - skipOut.writeVLong(lastLongs[i] - lastBlockLongs[i]); - } - lastBlockStatsFP = statsOut.size(); - lastBlockMetaLongsFP = metaLongsOut.size(); - lastBlockMetaBytesFP = metaBytesOut.size(); - System.arraycopy(lastLongs, 0, lastBlockLongs, 0, longsSize); - } - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java deleted file mode 100644 index 97a799604e0..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.codecs.memory; - - - -import java.io.IOException; - -import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader; -import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.util.IOUtils; - -/** - * FST term dict + Lucene50PBF - */ - -public final class FSTPostingsFormat extends PostingsFormat { - public FSTPostingsFormat() { - super("FST50"); - } - - @Override - public String toString() { - return getName(); - } - - @Override - public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene84PostingsWriter(state); - - boolean success = false; - try { - FieldsConsumer ret = new FSTTermsWriter(state, postingsWriter); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsWriter); - } - } - } - - @Override - public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene84PostingsReader(state); - boolean success = false; - try { - FieldsProducer ret = new FSTTermsReader(state, postingsReader); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsReader); - } - } - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java deleted file mode 100644 index 3695fe872e5..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.memory; - - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.fst.Outputs; - -/** - * An FST {@link Outputs} implementation for - * {@link FSTTermsWriter}. 
- * - * @lucene.experimental - */ - -// NOTE: outputs should be per-field, since -// longsSize is fixed for each field -class FSTTermOutputs extends Outputs { - private final static TermData NO_OUTPUT = new TermData(); - //private static boolean TEST = false; - private final boolean hasPos; - private final int longsSize; - - /** - * Represents the metadata for one term. - * On an FST, only long[] part is 'shared' and pushed towards root. - * byte[] and term stats will be kept on deeper arcs. - */ - static class TermData implements Accountable { - private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermData.class); - long[] longs; - byte[] bytes; - int docFreq; - long totalTermFreq; - TermData() { - this.longs = null; - this.bytes = null; - this.docFreq = 0; - this.totalTermFreq = -1; - } - TermData(long[] longs, byte[] bytes, int docFreq, long totalTermFreq) { - this.longs = longs; - this.bytes = bytes; - this.docFreq = docFreq; - this.totalTermFreq = totalTermFreq; - } - - @Override - public long ramBytesUsed() { - long ramBytesUsed = BASE_RAM_BYTES_USED; - if (longs != null) { - ramBytesUsed += RamUsageEstimator.sizeOf(longs); - } - if (bytes != null) { - ramBytesUsed += RamUsageEstimator.sizeOf(bytes); - } - return ramBytesUsed; - } - - // NOTE: actually, FST nodes are seldom - // identical when outputs on their arcs - // aren't NO_OUTPUTs. - @Override - public int hashCode() { - int hash = 0; - if (longs != null) { - final int end = longs.length; - for (int i = 0; i < end; i++) { - hash -= longs[i]; - } - } - if (bytes != null) { - hash = -hash; - final int end = bytes.length; - for (int i = 0; i < end; i++) { - hash += bytes[i]; - } - } - hash += docFreq + totalTermFreq; - return hash; - } - - @Override - public String toString() { - return "FSTTermOutputs$TermData longs=" + Arrays.toString(longs) + " bytes=" + Arrays.toString(bytes) + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq; - } - - @Override - public boolean equals(Object other_) { - if (other_ == this) { - return true; - } else if (!(other_ instanceof FSTTermOutputs.TermData)) { - return false; - } - TermData other = (TermData) other_; - return statsEqual(this, other) && - longsEqual(this, other) && - bytesEqual(this, other); - - } - } - - protected FSTTermOutputs(FieldInfo fieldInfo, int longsSize) { - this.hasPos = fieldInfo.getIndexOptions() != IndexOptions.DOCS; - this.longsSize = longsSize; - } - - @Override - public long ramBytesUsed(TermData output) { - return output.ramBytesUsed(); - } - - @Override - // - // The return value will be the smaller one, when these two are - // 'comparable', i.e. - // 1. every value in t1 is not larger than in t2, or - // 2. every value in t1 is not smaller than t2. 
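- //
- // Illustration with hypothetical values (longsSize = 2): common({2,5}, {3,7}) keeps the
- // smaller {2,5} (its byte[] and stats are dropped), while common({2,7}, {3,5}) is not
- // comparable in either direction and therefore collapses to NO_OUTPUT.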
- // - public TermData common(TermData t1, TermData t2) { - //if (TEST) System.out.print("common("+t1+", "+t2+") = "); - if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) { - //if (TEST) System.out.println("ret:"+NO_OUTPUT); - return NO_OUTPUT; - } - assert t1.longs.length == t2.longs.length; - - long[] min = t1.longs, max = t2.longs; - int pos = 0; - TermData ret; - - while (pos < longsSize && min[pos] == max[pos]) { - pos++; - } - if (pos < longsSize) { // unequal long[] - if (min[pos] > max[pos]) { - min = t2.longs; - max = t1.longs; - } - // check whether strictly smaller - while (pos < longsSize && min[pos] <= max[pos]) { - pos++; - } - if (pos < longsSize || allZero(min)) { // not comparable or all-zero - ret = NO_OUTPUT; - } else { - ret = new TermData(min, null, 0, -1); - } - } else { // equal long[] - if (statsEqual(t1, t2) && bytesEqual(t1, t2)) { - ret = t1; - } else if (allZero(min)) { - ret = NO_OUTPUT; - } else { - ret = new TermData(min, null, 0, -1); - } - } - //if (TEST) System.out.println("ret:"+ret); - return ret; - } - - @Override - public TermData subtract(TermData t1, TermData t2) { - //if (TEST) System.out.print("subtract("+t1+", "+t2+") = "); - if (t2 == NO_OUTPUT) { - //if (TEST) System.out.println("ret:"+t1); - return t1; - } - assert t1.longs.length == t2.longs.length; - - int pos = 0; - long diff = 0; - long[] share = new long[longsSize]; - - while (pos < longsSize) { - share[pos] = t1.longs[pos] - t2.longs[pos]; - diff += share[pos]; - pos++; - } - - TermData ret; - if (diff == 0 && statsEqual(t1, t2) && bytesEqual(t1, t2)) { - ret = NO_OUTPUT; - } else { - ret = new TermData(share, t1.bytes, t1.docFreq, t1.totalTermFreq); - } - //if (TEST) System.out.println("ret:"+ret); - return ret; - } - - // TODO: if we refactor a 'addSelf(TermData other)', - // we can gain about 5~7% for fuzzy queries, however this also - // means we are putting too much stress on FST Outputs decoding? - @Override - public TermData add(TermData t1, TermData t2) { - //if (TEST) System.out.print("add("+t1+", "+t2+") = "); - if (t1 == NO_OUTPUT) { - //if (TEST) System.out.println("ret:"+t2); - return t2; - } else if (t2 == NO_OUTPUT) { - //if (TEST) System.out.println("ret:"+t1); - return t1; - } - assert t1.longs.length == t2.longs.length; - - int pos = 0; - long[] accum = new long[longsSize]; - - while (pos < longsSize) { - accum[pos] = t1.longs[pos] + t2.longs[pos]; - pos++; - } - - TermData ret; - if (t2.bytes != null || t2.docFreq > 0) { - ret = new TermData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq); - } else { - ret = new TermData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq); - } - //if (TEST) System.out.println("ret:"+ret); - return ret; - } - - @Override - public void write(TermData data, DataOutput out) throws IOException { - assert hasPos || data.totalTermFreq == -1; - int bit0 = allZero(data.longs) ? 0 : 1; - int bit1 = ((data.bytes == null || data.bytes.length == 0) ? 0 : 1) << 1; - int bit2 = ((data.docFreq == 0) ? 
0 : 1) << 2; - int bits = bit0 | bit1 | bit2; - if (bit1 > 0) { // determine extra length - if (data.bytes.length < 32) { - bits |= (data.bytes.length << 3); - out.writeByte((byte)bits); - } else { - out.writeByte((byte)bits); - out.writeVInt(data.bytes.length); - } - } else { - out.writeByte((byte)bits); - } - if (bit0 > 0) { // not all-zero case - for (int pos = 0; pos < longsSize; pos++) { - out.writeVLong(data.longs[pos]); - } - } - if (bit1 > 0) { // bytes exists - out.writeBytes(data.bytes, 0, data.bytes.length); - } - if (bit2 > 0) { // stats exist - if (hasPos) { - if (data.docFreq == data.totalTermFreq) { - out.writeVInt((data.docFreq << 1) | 1); - } else { - out.writeVInt((data.docFreq << 1)); - out.writeVLong(data.totalTermFreq - data.docFreq); - } - } else { - out.writeVInt(data.docFreq); - } - } - } - - @Override - public TermData read(DataInput in) throws IOException { - long[] longs = new long[longsSize]; - byte[] bytes = null; - int docFreq = 0; - long totalTermFreq = -1; - int bits = in.readByte() & 0xff; - int bit0 = bits & 1; - int bit1 = bits & 2; - int bit2 = bits & 4; - int bytesSize = (bits >>> 3); - if (bit1 > 0 && bytesSize == 0) { // determine extra length - bytesSize = in.readVInt(); - } - if (bit0 > 0) { // not all-zero case - for (int pos = 0; pos < longsSize; pos++) { - longs[pos] = in.readVLong(); - } - } - if (bit1 > 0) { // bytes exists - bytes = new byte[bytesSize]; - in.readBytes(bytes, 0, bytesSize); - } - if (bit2 > 0) { // stats exist - int code = in.readVInt(); - if (hasPos) { - totalTermFreq = docFreq = code >>> 1; - if ((code & 1) == 0) { - totalTermFreq += in.readVLong(); - } - } else { - docFreq = code; - } - } - return new TermData(longs, bytes, docFreq, totalTermFreq); - } - - - @Override - public void skipOutput(DataInput in) throws IOException { - int bits = in.readByte() & 0xff; - int bit0 = bits & 1; - int bit1 = bits & 2; - int bit2 = bits & 4; - int bytesSize = (bits >>> 3); - if (bit1 > 0 && bytesSize == 0) { // determine extra length - bytesSize = in.readVInt(); - } - if (bit0 > 0) { // not all-zero case - for (int pos = 0; pos < longsSize; pos++) { - in.readVLong(); - } - } - if (bit1 > 0) { // bytes exists - in.skipBytes(bytesSize); - } - if (bit2 > 0) { // stats exist - int code = in.readVInt(); - if (hasPos && (code & 1) == 0) { - in.readVLong(); - } - } - } - - @Override - public TermData getNoOutput() { - return NO_OUTPUT; - } - - @Override - public String outputToString(TermData data) { - return data.toString(); - } - - static boolean statsEqual(final TermData t1, final TermData t2) { - return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq; - } - static boolean bytesEqual(final TermData t1, final TermData t2) { - if (t1.bytes == null && t2.bytes == null) { - return true; - } - return t1.bytes != null && t2.bytes != null && Arrays.equals(t1.bytes, t2.bytes); - } - static boolean longsEqual(final TermData t1, final TermData t2) { - if (t1.longs == null && t2.longs == null) { - return true; - } - return t1.longs != null && t2.longs != null && Arrays.equals(t1.longs, t2.longs); - } - static boolean allZero(final long[] l) { - for (int i = 0; i < l.length; i++) { - if (l[i] != 0) { - return false; - } - } - return true; - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java deleted file mode 100644 index 33084766424..00000000000 --- 
a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java +++ /dev/null @@ -1,785 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.memory; - - -import java.io.IOException; -import java.util.ArrayList; -import java.util.BitSet; -import java.util.Collection; -import java.util.Collections; -import java.util.Iterator; -import java.util.List; -import java.util.TreeMap; - -import org.apache.lucene.codecs.BlockTermState; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.index.CorruptIndexException; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.ImpactsEnum; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.SegmentInfo; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.TermState; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.ByteArrayDataInput; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.util.Accountable; -import org.apache.lucene.util.Accountables; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.automaton.ByteRunAutomaton; -import org.apache.lucene.util.automaton.CompiledAutomaton; -import org.apache.lucene.util.fst.BytesRefFSTEnum; -import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Outputs; -import org.apache.lucene.util.fst.Util; - -/** - * FST-based terms dictionary reader. - * - * The FST directly maps each term and its metadata, - * it is memory resident. 
- * - * @lucene.experimental - */ - -public class FSTTermsReader extends FieldsProducer { - final TreeMap fields = new TreeMap<>(); - final PostingsReaderBase postingsReader; - //static boolean TEST = false; - - public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException { - final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION); - - this.postingsReader = postingsReader; - final IndexInput in = state.directory.openInput(termsFileName, state.context); - - boolean success = false; - try { - CodecUtil.checkIndexHeader(in, FSTTermsWriter.TERMS_CODEC_NAME, - FSTTermsWriter.TERMS_VERSION_START, - FSTTermsWriter.TERMS_VERSION_CURRENT, - state.segmentInfo.getId(), state.segmentSuffix); - CodecUtil.checksumEntireFile(in); - this.postingsReader.init(in, state); - seekDir(in); - - final FieldInfos fieldInfos = state.fieldInfos; - final int numFields = in.readVInt(); - for (int i = 0; i < numFields; i++) { - int fieldNumber = in.readVInt(); - FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); - long numTerms = in.readVLong(); - long sumTotalTermFreq = in.readVLong(); - // if frequencies are omitted, sumTotalTermFreq=sumDocFreq and we only write one value - long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : in.readVLong(); - int docCount = in.readVInt(); - int longsSize = in.readVInt(); - TermsReader current = new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize); - TermsReader previous = fields.put(fieldInfo.name, current); - checkFieldSummary(state.segmentInfo, in, current, previous); - } - success = true; - } finally { - if (success) { - IOUtils.close(in); - } else { - IOUtils.closeWhileHandlingException(in); - } - } - } - - private void seekDir(IndexInput in) throws IOException { - in.seek(in.length() - CodecUtil.footerLength() - 8); - in.seek(in.readLong()); - } - private void checkFieldSummary(SegmentInfo info, IndexInput in, TermsReader field, TermsReader previous) throws IOException { - // #docs with field must be <= #docs - if (field.docCount < 0 || field.docCount > info.maxDoc()) { - throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.maxDoc(), in); - } - // #postings must be >= #docs with field - if (field.sumDocFreq < field.docCount) { - throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount, in); - } - // #positions must be >= #postings - if (field.sumTotalTermFreq < field.sumDocFreq) { - throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq, in); - } - if (previous != null) { - throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name, in); - } - } - - @Override - public Iterator iterator() { - return Collections.unmodifiableSet(fields.keySet()).iterator(); - } - - @Override - public Terms terms(String field) throws IOException { - assert field != null; - return fields.get(field); - } - - @Override - public int size() { - return fields.size(); - } - - @Override - public void close() throws IOException { - try { - IOUtils.close(postingsReader); - } finally { - fields.clear(); - } - } - - private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermsReader.class); - final class TermsReader extends Terms implements Accountable { - - final FieldInfo fieldInfo; - final long numTerms; - 
final long sumTotalTermFreq; - final long sumDocFreq; - final int docCount; - final int longsSize; - final FST dict; - - TermsReader(FieldInfo fieldInfo, IndexInput in, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException { - this.fieldInfo = fieldInfo; - this.numTerms = numTerms; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - this.docCount = docCount; - this.longsSize = longsSize; - this.dict = new FST<>(in, new FSTTermOutputs(fieldInfo, longsSize)); - } - - @Override - public long ramBytesUsed() { - long bytesUsed = BASE_RAM_BYTES_USED; - if (dict != null) { - bytesUsed += dict.ramBytesUsed(); - } - return bytesUsed; - } - - @Override - public Collection getChildResources() { - if (dict == null) { - return Collections.emptyList(); - } else { - return Collections.singletonList(Accountables.namedAccountable("terms", dict)); - } - } - - @Override - public String toString() { - return "FSTTerms(terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")"; - } - - @Override - public boolean hasFreqs() { - return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - } - - @Override - public boolean hasOffsets() { - return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; - } - - @Override - public boolean hasPositions() { - return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; - } - - @Override - public boolean hasPayloads() { - return fieldInfo.hasPayloads(); - } - - @Override - public long size() { - return numTerms; - } - - @Override - public long getSumTotalTermFreq() { - return sumTotalTermFreq; - } - - @Override - public long getSumDocFreq() throws IOException { - return sumDocFreq; - } - - @Override - public int getDocCount() throws IOException { - return docCount; - } - - @Override - public TermsEnum iterator() throws IOException { - return new SegmentTermsEnum(); - } - - @Override - public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { - throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); - } - return new IntersectTermsEnum(compiled, startTerm); - } - - // Only wraps common operations for PBF interact - abstract class BaseTermsEnum extends org.apache.lucene.index.BaseTermsEnum { - - /* Current term stats + decoded metadata (customized by PBF) */ - final BlockTermState state; - - /* Current term stats + undecoded metadata (long[] & byte[]) */ - FSTTermOutputs.TermData meta; - ByteArrayDataInput bytesReader; - - /** Decodes metadata into customized term state */ - abstract void decodeMetaData() throws IOException; - - BaseTermsEnum() throws IOException { - this.state = postingsReader.newTermState(); - this.bytesReader = new ByteArrayDataInput(); - // NOTE: metadata will only be initialized in child class - } - - @Override - public TermState termState() throws IOException { - decodeMetaData(); - return state.clone(); - } - - @Override - public int docFreq() throws IOException { - return state.docFreq; - } - - @Override - public long totalTermFreq() throws IOException { - return state.totalTermFreq == -1 ? 
state.docFreq : state.totalTermFreq; - } - - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { - decodeMetaData(); - return postingsReader.postings(fieldInfo, state, reuse, flags); - } - - @Override - public ImpactsEnum impacts(int flags) throws IOException { - decodeMetaData(); - return postingsReader.impacts(fieldInfo, state, flags); - } - - @Override - public void seekExact(long ord) throws IOException { - throw new UnsupportedOperationException(); - } - - @Override - public long ord() { - throw new UnsupportedOperationException(); - } - } - - - // Iterates through all terms in this field - private final class SegmentTermsEnum extends BaseTermsEnum { - /* Current term, null when enum ends or unpositioned */ - BytesRef term; - final BytesRefFSTEnum fstEnum; - - /* True when current term's metadata is decoded */ - boolean decoded; - - /* True when current enum is 'positioned' by seekExact(TermState) */ - boolean seekPending; - - SegmentTermsEnum() throws IOException { - super(); - this.fstEnum = new BytesRefFSTEnum<>(dict); - this.decoded = false; - this.seekPending = false; - this.meta = null; - } - - @Override - public BytesRef term() throws IOException { - return term; - } - - // Let PBF decode metadata from long[] and byte[] - @Override - void decodeMetaData() throws IOException { - if (!decoded && !seekPending) { - if (meta.bytes != null) { - bytesReader.reset(meta.bytes, 0, meta.bytes.length); - } - postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state, true); - decoded = true; - } - } - - // Update current enum according to FSTEnum - void updateEnum(final InputOutput pair) { - if (pair == null) { - term = null; - } else { - term = pair.input; - meta = pair.output; - state.docFreq = meta.docFreq; - state.totalTermFreq = meta.totalTermFreq; - } - decoded = false; - seekPending = false; - } - - @Override - public BytesRef next() throws IOException { - if (seekPending) { // previously positioned, but termOutputs not fetched - seekPending = false; - SeekStatus status = seekCeil(term); - assert status == SeekStatus.FOUND; // must positioned on valid term - } - updateEnum(fstEnum.next()); - return term; - } - - @Override - public boolean seekExact(BytesRef target) throws IOException { - updateEnum(fstEnum.seekExact(target)); - return term != null; - } - - @Override - public SeekStatus seekCeil(BytesRef target) throws IOException { - updateEnum(fstEnum.seekCeil(target)); - if (term == null) { - return SeekStatus.END; - } else { - return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND; - } - } - - @Override - public void seekExact(BytesRef target, TermState otherState) { - if (!target.equals(term)) { - state.copyFrom(otherState); - term = BytesRef.deepCopyOf(target); - seekPending = true; - } - } - } - - // Iterates intersect result with automaton (cannot seek!) 
- private final class IntersectTermsEnum extends BaseTermsEnum { - /* Current term, null when enum ends or unpositioned */ - BytesRefBuilder term; - /* True when current term's metadata is decoded */ - boolean decoded; - - /* True when there is pending term when calling next() */ - boolean pending; - - /* stack to record how current term is constructed, - * used to accumulate metadata or rewind term: - * level == term.length + 1, - * == 0 when term is null */ - Frame[] stack; - int level; - - /* to which level the metadata is accumulated - * so that we can accumulate metadata lazily */ - int metaUpto; - - /* term dict fst */ - final FST fst; - final FST.BytesReader fstReader; - final Outputs fstOutputs; - - /* query automaton to intersect with */ - final ByteRunAutomaton fsa; - - private final class Frame { - /* fst stats */ - FST.Arc fstArc; - - FSTTermOutputs.TermData output; - - /* automaton stats */ - int fsaState; - - Frame() { - this.fstArc = new FST.Arc<>(); - this.fsaState = -1; - } - - public String toString() { - return "arc=" + fstArc + " state=" + fsaState; - } - } - - IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - super(); - //if (TEST) System.out.println("Enum init, startTerm=" + startTerm); - this.fst = dict; - this.fstReader = fst.getBytesReader(); - this.fstOutputs = dict.outputs; - this.fsa = compiled.runAutomaton; - this.level = -1; - this.stack = new Frame[16]; - for (int i = 0 ; i < stack.length; i++) { - this.stack[i] = new Frame(); - } - - loadVirtualFrame(newFrame()); - this.level++; - pushFrame(loadFirstFrame(newFrame())); - - this.meta = null; - this.metaUpto = 1; - this.decoded = false; - this.pending = false; - - if (startTerm == null) { - pending = isAccept(topFrame()); - } else { - doSeekCeil(startTerm); - pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame()); - } - } - - @Override - public BytesRef term() throws IOException { - return term == null ? null : term.get(); - } - - @Override - void decodeMetaData() throws IOException { - assert term != null; - if (!decoded) { - if (meta.bytes != null) { - bytesReader.reset(meta.bytes, 0, meta.bytes.length); - } - postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state, true); - decoded = true; - } - } - - /** Lazily accumulate meta data, when we got a accepted term */ - void loadMetaData() { - Frame last, next; - last = stack[metaUpto]; - while (metaUpto != level) { - metaUpto++; - next = stack[metaUpto]; - next.output = fstOutputs.add(next.output, last.output); - last = next; - } - if (last.fstArc.isFinal()) { - meta = fstOutputs.add(last.output, last.fstArc.nextFinalOutput()); - } else { - meta = last.output; - } - state.docFreq = meta.docFreq; - state.totalTermFreq = meta.totalTermFreq; - } - - @Override - public SeekStatus seekCeil(BytesRef target) throws IOException { - decoded = false; - doSeekCeil(target); - loadMetaData(); - if (term == null) { - return SeekStatus.END; - } else { - return term.equals(target) ? 
SeekStatus.FOUND : SeekStatus.NOT_FOUND; - } - } - - @Override - public BytesRef next() throws IOException { - //if (TEST) System.out.println("Enum next()"); - if (pending) { - pending = false; - loadMetaData(); - return term(); - } - decoded = false; - DFS: - while (level > 0) { - Frame frame = newFrame(); - if (loadExpandFrame(topFrame(), frame) != null) { // has valid target - pushFrame(frame); - if (isAccept(frame)) { // gotcha - break; - } - continue; // check next target - } - frame = popFrame(); - while(level > 0) { - if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling - pushFrame(frame); - if (isAccept(frame)) { // gotcha - break DFS; - } - continue DFS; // check next target - } - frame = popFrame(); - } - return null; - } - loadMetaData(); - return term(); - } - - private BytesRef doSeekCeil(BytesRef target) throws IOException { - //if (TEST) System.out.println("Enum doSeekCeil()"); - Frame frame= null; - int label, upto = 0, limit = target.length; - while (upto < limit) { // to target prefix, or ceil label (rewind prefix) - frame = newFrame(); - label = target.bytes[upto] & 0xff; - frame = loadCeilFrame(label, topFrame(), frame); - if (frame == null || frame.fstArc.label() != label) { - break; - } - assert isValid(frame); // target must be fetched from automaton - pushFrame(frame); - upto++; - } - if (upto == limit) { // got target - return term(); - } - if (frame != null) { // got larger term('s prefix) - pushFrame(frame); - return isAccept(frame) ? term() : next(); - } - while (level > 0) { // got target's prefix, advance to larger term - frame = popFrame(); - while (level > 0 && !canRewind(frame)) { - frame = popFrame(); - } - if (loadNextFrame(topFrame(), frame) != null) { - pushFrame(frame); - return isAccept(frame) ? 
term() : next(); - } - } - return null; - } - - /** Virtual frame, never pop */ - Frame loadVirtualFrame(Frame frame) { - frame.output = fstOutputs.getNoOutput(); - frame.fsaState = -1; - return frame; - } - - /** Load frame for start arc(node) on fst */ - Frame loadFirstFrame(Frame frame) throws IOException { - frame.fstArc = fst.getFirstArc(frame.fstArc); - frame.output = frame.fstArc.output(); - frame.fsaState = 0; - return frame; - } - - /** Load frame for target arc(node) on fst */ - Frame loadExpandFrame(Frame top, Frame frame) throws IOException { - if (!canGrow(top)) { - return null; - } - frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target(), frame.fstArc, fstReader); - frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); - //if (TEST) System.out.println(" loadExpand frame="+frame); - if (frame.fsaState == -1) { - return loadNextFrame(top, frame); - } - frame.output = frame.fstArc.output(); - return frame; - } - - /** Load frame for sibling arc(node) on fst */ - Frame loadNextFrame(Frame top, Frame frame) throws IOException { - if (!canRewind(frame)) { - return null; - } - while (!frame.fstArc.isLast()) { - frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader); - frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); - if (frame.fsaState != -1) { - break; - } - } - //if (TEST) System.out.println(" loadNext frame="+frame); - if (frame.fsaState == -1) { - return null; - } - frame.output = frame.fstArc.output(); - return frame; - } - - /** Load frame for target arc(node) on fst, so that - * arc.label >= label and !fsa.reject(arc.label) */ - Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { - FST.Arc arc = frame.fstArc; - arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader); - if (arc == null) { - return null; - } - frame.fsaState = fsa.step(top.fsaState, arc.label()); - //if (TEST) System.out.println(" loadCeil frame="+frame); - if (frame.fsaState == -1) { - return loadNextFrame(top, frame); - } - frame.output = frame.fstArc.output(); - return frame; - } - - boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts - return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal(); - } - boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject - return /*frame != null &&*/ frame.fsaState != -1; - } - boolean canGrow(Frame frame) { // can walk forward on both fst&fsa - return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc); - } - boolean canRewind(Frame frame) { // can jump to sibling - return !frame.fstArc.isLast(); - } - - void pushFrame(Frame frame) { - term = grow(frame.fstArc.label()); - level++; - //if (TEST) System.out.println(" term=" + term + " level=" + level); - } - - Frame popFrame() { - term = shrink(); - level--; - metaUpto = metaUpto > level ? 
level : metaUpto; - //if (TEST) System.out.println(" term=" + term + " level=" + level); - return stack[level+1]; - } - - Frame newFrame() { - if (level+1 == stack.length) { - final Frame[] temp = new Frame[ArrayUtil.oversize(level+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(stack, 0, temp, 0, stack.length); - for (int i = stack.length; i < temp.length; i++) { - temp[i] = new Frame(); - } - stack = temp; - } - return stack[level+1]; - } - - Frame topFrame() { - return stack[level]; - } - - BytesRefBuilder grow(int label) { - if (term == null) { - term = new BytesRefBuilder(); - } else { - term.append((byte)label); - } - return term; - } - - BytesRefBuilder shrink() { - if (term.length() == 0) { - term = null; - } else { - term.setLength(term.length() - 1); - } - return term; - } - } - } - - static void walk(FST fst) throws IOException { - final ArrayList> queue = new ArrayList<>(); - final BitSet seen = new BitSet(); - final FST.BytesReader reader = fst.getBytesReader(); - final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); - queue.add(startArc); - while (!queue.isEmpty()) { - final FST.Arc arc = queue.remove(0); - final long node = arc.target(); - //System.out.println(arc); - if (FST.targetHasArcs(arc) && !seen.get((int) node)) { - seen.set((int) node); - fst.readFirstRealTargetArc(node, arc, reader); - while (true) { - queue.add(new FST.Arc().copyFrom(arc)); - if (arc.isLast()) { - break; - } else { - fst.readNextRealArc(arc, reader); - } - } - } - } - } - - @Override - public long ramBytesUsed() { - long ramBytesUsed = postingsReader.ramBytesUsed(); - for (TermsReader r : fields.values()) { - ramBytesUsed += r.ramBytesUsed(); - } - return ramBytesUsed; - } - - @Override - public Collection getChildResources() { - List resources = new ArrayList<>(Accountables.namedAccountables("field", fields)); - resources.add(Accountables.namedAccountable("delegate", postingsReader)); - return Collections.unmodifiableCollection(resources); - } - - @Override - public String toString() { - return getClass().getSimpleName() + "(fields=" + fields.size() + ",delegate=" + postingsReader + ")"; - } - - @Override - public void checkIntegrity() throws IOException { - postingsReader.checkIntegrity(); - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java deleted file mode 100644 index 2ef15651041..00000000000 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.codecs.memory; - - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; - -import org.apache.lucene.codecs.BlockTermState; -import org.apache.lucene.codecs.CodecUtil; -import org.apache.lucene.codecs.FieldsConsumer; -import org.apache.lucene.codecs.NormsProducer; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.store.ByteBuffersDataOutput; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.FSTCompiler; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.Util; - -/** - * FST-based term dict, using metadata as FST output. - * - * The FST directly holds the mapping between <term, metadata>. - * - * Term metadata consists of three parts: - * 1. term statistics: docFreq, totalTermFreq; - * 2. monotonic long[], e.g. the pointer to the postings list for that term; - * 3. generic byte[], e.g. other information need by postings reader. - * - *

- * File:
- *   .tst: Term Dictionary
- *
- * Term Dictionary
- *
- * The .tst contains a list of FSTs, one for each field.
- * The FST maps a term to its corresponding statistics (e.g. docfreq)
- * and metadata (e.g. information for postings list reader like file pointer
- * to postings list).
- *
- * Typically the metadata is separated into two parts:
- *   - Monotonical long array: Some metadata will always be ascending in order
- *     with the corresponding term. This part is used by FST to share outputs between arcs.
- *   - Generic byte array: Used to store non-monotonic metadata.
- *
- * File format:
- *   - TermsDict(.tst) --> Header, PostingsHeader, FieldSummary, DirOffset
- *   - FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?,
- *     SumDocFreq, DocCount, LongsSize, TermFST>^NumFields
- *   - TermFST --> {@link FST FST<TermData>}
- *   - TermData --> Flag, BytesSize?, LongDelta^LongsSize?, Byte^BytesSize?,
- *     < DocFreq[Same?], (TotalTermFreq-DocFreq) > ?
- *   - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
- *   - DirOffset --> {@link DataOutput#writeLong Uint64}
- *   - DocFreq, LongsSize, BytesSize, NumFields,
- *     FieldNumber, DocCount --> {@link DataOutput#writeVInt VInt}
- *   - TotalTermFreq, NumTerms, SumTotalTermFreq, SumDocFreq, LongDelta -->
- *     {@link DataOutput#writeVLong VLong}
- *
- * Notes:
- *   - The format of PostingsHeader and generic meta bytes are customized by the specific postings implementation:
- *     they contain arbitrary per-file data (such as parameters or versioning information), and per-term data
- *     (non-monotonic ones like pulsed postings data).
- *   - The format of TermData is determined by FST, typically monotonic metadata will be dense around shallow arcs,
- *     while in deeper arcs only generic bytes and term statistics exist.
- *   - The byte Flag is used to indicate which part of metadata exists on current arc. Specially the monotonic part
- *     is omitted when it is an array of 0s.
- *   - Since LongsSize is per-field fixed, it is only written once in field summary.
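For illustration, here is a minimal, hypothetical sketch (not the removed FSTTermsWriter/FSTTermOutputs code and not part of this patch) of how one term's statistics, a delta-coded monotonic pointer, and the generic byte[] part could be serialized with the VInt/VLong primitives referenced in the format description above:

import java.io.IOException;
import org.apache.lucene.store.ByteBuffersDataOutput;

// Hypothetical helper, for illustration only: serialize one term's metadata
// using the same primitives the format description above refers to.
final class TermMetaSketch {
  static byte[] encode(int docFreq, long totalTermFreq,
                       long postingsPointerDelta, byte[] genericBytes) throws IOException {
    ByteBuffersDataOutput out = ByteBuffersDataOutput.newResettableInstance();
    out.writeVInt(docFreq);                               // DocFreq --> VInt
    out.writeVLong(totalTermFreq - docFreq);              // (TotalTermFreq-DocFreq) --> VLong
    out.writeVLong(postingsPointerDelta);                 // monotonic part, delta-coded --> VLong
    out.writeVInt(genericBytes.length);                   // BytesSize --> VInt
    out.writeBytes(genericBytes, 0, genericBytes.length); // generic byte[] metadata
    return out.toArrayCopy();
  }
}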
- * - * @lucene.experimental - */ - -public class FSTTermsWriter extends FieldsConsumer { - static final String TERMS_EXTENSION = "tfp"; - static final String TERMS_CODEC_NAME = "FSTTerms"; - public static final int TERMS_VERSION_START = 2; - public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START; - - final PostingsWriterBase postingsWriter; - final FieldInfos fieldInfos; - IndexOutput out; - final int maxDoc; - final List fields = new ArrayList<>(); - - public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { - final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); - - this.postingsWriter = postingsWriter; - this.fieldInfos = state.fieldInfos; - this.out = state.directory.createOutput(termsFileName, state.context); - this.maxDoc = state.segmentInfo.maxDoc(); - - boolean success = false; - try { - CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT, - state.segmentInfo.getId(), state.segmentSuffix); - - this.postingsWriter.init(out, state); - success = true; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(out); - } - } - } - - private void writeTrailer(IndexOutput out, long dirStart) throws IOException { - out.writeLong(dirStart); - } - - @Override - public void write(Fields fields, NormsProducer norms) throws IOException { - for(String field : fields) { - Terms terms = fields.terms(field); - if (terms == null) { - continue; - } - FieldInfo fieldInfo = fieldInfos.fieldInfo(field); - boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; - TermsEnum termsEnum = terms.iterator(); - TermsWriter termsWriter = new TermsWriter(fieldInfo); - - long sumTotalTermFreq = 0; - long sumDocFreq = 0; - FixedBitSet docsSeen = new FixedBitSet(maxDoc); - - while (true) { - BytesRef term = termsEnum.next(); - if (term == null) { - break; - } - - BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen, norms); - if (termState != null) { - termsWriter.finishTerm(term, termState); - sumTotalTermFreq += termState.totalTermFreq; - sumDocFreq += termState.docFreq; - } - } - - termsWriter.finish(hasFreq ? 
sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality()); - } - } - - @Override - public void close() throws IOException { - if (out != null) { - boolean success = false; - try { - // write field summary - final long dirStart = out.getFilePointer(); - - out.writeVInt(fields.size()); - for (FieldMetaData field : fields) { - out.writeVInt(field.fieldInfo.number); - out.writeVLong(field.numTerms); - if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) { - out.writeVLong(field.sumTotalTermFreq); - } - out.writeVLong(field.sumDocFreq); - out.writeVInt(field.docCount); - out.writeVInt(field.longsSize); - field.dict.save(out); - } - writeTrailer(out, dirStart); - CodecUtil.writeFooter(out); - success = true; - } finally { - if (success) { - IOUtils.close(out, postingsWriter); - } else { - IOUtils.closeWhileHandlingException(out, postingsWriter); - } - out = null; - } - } - } - - private static class FieldMetaData { - public final FieldInfo fieldInfo; - public final long numTerms; - public final long sumTotalTermFreq; - public final long sumDocFreq; - public final int docCount; - public final int longsSize; - public final FST dict; - - public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST fst) { - this.fieldInfo = fieldInfo; - this.numTerms = numTerms; - this.sumTotalTermFreq = sumTotalTermFreq; - this.sumDocFreq = sumDocFreq; - this.docCount = docCount; - this.longsSize = longsSize; - this.dict = fst; - } - } - - final class TermsWriter { - private final FSTCompiler fstCompiler; - private final FSTTermOutputs outputs; - private final FieldInfo fieldInfo; - private final int longsSize; - private long numTerms; - - private final IntsRefBuilder scratchTerm = new IntsRefBuilder(); - private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); - - TermsWriter(FieldInfo fieldInfo) { - this.numTerms = 0; - this.fieldInfo = fieldInfo; - this.longsSize = postingsWriter.setField(fieldInfo); - this.outputs = new FSTTermOutputs(fieldInfo, longsSize); - this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); - } - - public void finishTerm(BytesRef text, BlockTermState state) throws IOException { - // write term meta data into fst - final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData(); - meta.longs = new long[longsSize]; - meta.bytes = null; - meta.docFreq = state.docFreq; - meta.totalTermFreq = state.totalTermFreq; - postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true); - if (metaWriter.size() > 0) { - meta.bytes = metaWriter.toArrayCopy(); - metaWriter.reset(); - } - fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta); - numTerms++; - } - - public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { - // save FST dict - if (numTerms > 0) { - final FST fst = fstCompiler.compile(); - fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst)); - } - } - } -} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java index 52c7465d093..ec73ddcb838 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java @@ -94,7 +94,7 @@ public class DeltaBaseTermStateSerializer 
implements Accountable { /** * Writes a {@link BlockTermState} to the provided {@link DataOutput}. *

- * Simpler variant of {@link Lucene84PostingsWriter#encodeTerm(long[], DataOutput, FieldInfo, BlockTermState, boolean)}. + * Simpler variant of {@link Lucene84PostingsWriter#encodeTerm(DataOutput, FieldInfo, BlockTermState, boolean)}. */ public void writeTermState(DataOutput termStatesOutput, FieldInfo fieldInfo, BlockTermState termState) throws IOException { IndexOptions indexOptions = fieldInfo.getIndexOptions(); @@ -143,7 +143,7 @@ public class DeltaBaseTermStateSerializer implements Accountable { /** * Reads a {@link BlockTermState} from the provided {@link DataInput}. *

- * Simpler variant of {@link Lucene84PostingsReader#decodeTerm(long[], DataInput, FieldInfo, BlockTermState, boolean)}. + * Simpler variant of {@link Lucene84PostingsReader#decodeTerm(DataInput, FieldInfo, BlockTermState, boolean)}. * * @param reuse {@link BlockTermState} to reuse; or null to create a new one. */ diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 55b8a48e3ef..ec315c0f642 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -16,7 +16,5 @@ org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat org.apache.lucene.codecs.memory.DirectPostingsFormat -org.apache.lucene.codecs.memory.FSTOrdPostingsFormat -org.apache.lucene.codecs.memory.FSTPostingsFormat org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTOrdPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTOrdPostingsFormat.java deleted file mode 100644 index ec860859a85..00000000000 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTOrdPostingsFormat.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.memory; - - -import org.apache.lucene.codecs.Codec; -import org.apache.lucene.index.BasePostingsFormatTestCase; -import org.apache.lucene.util.TestUtil; - -/** - * Tests FSTOrdPostingsFormat - */ -public class TestFSTOrdPostingsFormat extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new FSTOrdPostingsFormat()); - - @Override - protected Codec getCodec() { - return codec; - } -} diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTPostingsFormat.java deleted file mode 100644 index 939c5e33fe5..00000000000 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTPostingsFormat.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.codecs.memory; - - -import org.apache.lucene.codecs.Codec; -import org.apache.lucene.index.BasePostingsFormatTestCase; -import org.apache.lucene.util.TestUtil; - -/** - * Tests FSTPostingsFormat - */ -public class TestFSTPostingsFormat extends BasePostingsFormatTestCase { - private final Codec codec = TestUtil.alwaysPostingsFormat(new FSTPostingsFormat()); - - @Override - protected Codec getCodec() { - return codec; - } -} diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestTermBytesComparator.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestTermBytesComparator.java index a77e7820152..8ef246fbb30 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestTermBytesComparator.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestTermBytesComparator.java @@ -159,7 +159,7 @@ public class TestTermBytesComparator extends LuceneTestCase { } @Override - public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) { + public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) { } @Override diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java index f63d63643fa..6d09fe36e16 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java @@ -268,7 +268,7 @@ public class STBlockReaderTest extends LuceneTestCase { } @Override - public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) { + public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) { } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java index 4fed1a07e7a..a1244ca7686 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java @@ -61,7 +61,7 @@ public abstract class PostingsReaderBase implements Closeable, Accountable { /** Actually decode metadata for next term * @see PostingsWriterBase#encodeTerm */ - public abstract void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; + public abstract void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; /** Must fully consume state, since after this call that * TermState may be reused. 
*/ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java index 48c6027b286..a8f8ed42aa8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java @@ -68,21 +68,12 @@ public abstract class PostingsWriterBase implements Closeable { * Usually elements in {@code longs} are file pointers, so each one always * increases when a new term is consumed. {@code out} is used to write generic * bytes, which are not monotonic. - * - * NOTE: sometimes long[] might contain "don't care" values that are unused, e.g. - * the pointer to postings list may not be defined for some terms but is defined - * for others, if it is designed to inline some postings data in term dictionary. - * In this case, the postings writer should always use the last value, so that each - * element in metadata long[] remains monotonic. */ - public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; + public abstract void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; /** - * Sets the current field for writing, and returns the - * fixed length of long[] metadata (which is fixed per - * field), called when the writing switches to another field. */ - // TODO: better name? - public abstract int setField(FieldInfo fieldInfo); + * Sets the current field for writing. */ + public abstract void setField(FieldInfo fieldInfo); @Override public abstract void close() throws IOException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java index f9770869f24..f51f0c6f967 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java @@ -87,7 +87,7 @@ public abstract class PushPostingsWriterBase extends PostingsWriterBase { * fixed length of long[] metadata (which is fixed per * field), called when the writing switches to another field. */ @Override - public int setField(FieldInfo fieldInfo) { + public void setField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; indexOptions = fieldInfo.getIndexOptions(); @@ -113,8 +113,6 @@ public abstract class PushPostingsWriterBase extends PostingsWriterBase { enumFlags = PostingsEnum.OFFSETS; } } - - return 0; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java index 0a0cd31c857..b9dc0bb436b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java @@ -128,8 +128,11 @@ public final class BlockTreeTermsReader extends FieldsProducer { /** Auto-prefix terms have been superseded by points. */ public static final int VERSION_AUTO_PREFIX_TERMS_REMOVED = 3; + /** The long[] + byte[] metadata has been replaced with a single byte[]. */ + public static final int VERSION_META_LONGS_REMOVED = 4; + /** Current terms format. 
*/ - public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS_REMOVED; + public static final int VERSION_CURRENT = VERSION_META_LONGS_REMOVED; /** Extension of terms index file */ static final String TERMS_INDEX_EXTENSION = "tip"; @@ -212,9 +215,11 @@ public final class BlockTreeTermsReader extends FieldsProducer { // when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written. final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong(); final int docCount = termsIn.readVInt(); - final int longsSize = termsIn.readVInt(); - if (longsSize < 0) { - throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn); + if (version < VERSION_META_LONGS_REMOVED) { + final int longsSize = termsIn.readVInt(); + if (longsSize < 0) { + throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn); + } } BytesRef minTerm = readBytesRef(termsIn); BytesRef maxTerm = readBytesRef(termsIn); @@ -231,7 +236,7 @@ public final class BlockTreeTermsReader extends FieldsProducer { final long indexStartFP = indexIn.readVLong(); FieldReader previous = fieldMap.put(fieldInfo.name, new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, - indexStartFP, longsSize, indexIn, minTerm, maxTerm, state.openedFromWriter, perFieldLoadMode)); + indexStartFP, indexIn, minTerm, maxTerm, state.openedFromWriter, perFieldLoadMode)); if (previous != null) { throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java index deece0b5266..380cf799a4d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java @@ -224,11 +224,10 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { public final long sumTotalTermFreq; public final long sumDocFreq; public final int docCount; - private final int longsSize; public final BytesRef minTerm; public final BytesRef maxTerm; - public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, + public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, BytesRef minTerm, BytesRef maxTerm) { assert numTerms > 0; this.fieldInfo = fieldInfo; @@ -239,7 +238,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; - this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; } @@ -509,7 +507,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { class TermsWriter { private final FieldInfo fieldInfo; - private final int longsSize; private long numTerms; final FixedBitSet docsSeen; long sumTotalTermFreq; @@ -524,8 +521,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { private final BytesRefBuilder lastTerm = new BytesRefBuilder(); private int[] prefixStarts = new int[8]; - private final long[] longs; - // Pending stack of terms and blocks. 
As terms arrive (in sorted order) // we append to this stack, and once the top of the stack has enough // terms starting with a common prefix, we write a new block with @@ -720,13 +715,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { } // Write term meta data - postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); - for (int pos = 0; pos < longsSize; pos++) { - assert longs[pos] >= 0; - metaWriter.writeVLong(longs[pos]); - } - bytesWriter.copyTo(metaWriter); - bytesWriter.reset(); + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); absolute = false; } } else { @@ -771,13 +760,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { // separate anymore: // Write term meta data - postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); - for (int pos = 0; pos < longsSize; pos++) { - assert longs[pos] >= 0; - metaWriter.writeVLong(longs[pos]); - } - bytesWriter.copyTo(metaWriter); - bytesWriter.reset(); + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); absolute = false; } else { PendingBlock block = (PendingBlock) ent; @@ -845,9 +828,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { this.fieldInfo = fieldInfo; assert fieldInfo.getIndexOptions() != IndexOptions.NONE; docsSeen = new FixedBitSet(maxDoc); - - this.longsSize = postingsWriter.setField(fieldInfo); - this.longs = new long[longsSize]; + postingsWriter.setField(fieldInfo); } /** Writes one term's worth of postings. */ @@ -964,7 +945,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { sumTotalTermFreq, sumDocFreq, docsSeen.cardinality(), - longsSize, minTerm, maxTerm)); } else { assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1; @@ -976,7 +956,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { private final ByteBuffersDataOutput suffixWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); - private final ByteBuffersDataOutput bytesWriter = ByteBuffersDataOutput.newResettableInstance(); } private boolean closed; @@ -1009,7 +988,6 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { } termsOut.writeVLong(field.sumDocFreq); termsOut.writeVInt(field.docCount); - termsOut.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); writeBytesRef(termsOut, field.minTerm); writeBytesRef(termsOut, field.maxTerm); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java index 9189b63d366..c185cbcb733 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java @@ -58,7 +58,6 @@ public final class FieldReader extends Terms implements Accountable { final BytesRef rootCode; final BytesRef minTerm; final BytesRef maxTerm; - final int longsSize; final BlockTreeTermsReader parent; final FST index; @@ -66,7 +65,7 @@ public final class FieldReader extends Terms implements Accountable { //private boolean DEBUG; FieldReader(BlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, - long indexStartFP, int longsSize, IndexInput indexIn, 
BytesRef minTerm, BytesRef maxTerm, boolean openedFromWriter, BlockTreeTermsReader.FSTLoadMode fstLoadMode) throws IOException { + long indexStartFP, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm, boolean openedFromWriter, BlockTreeTermsReader.FSTLoadMode fstLoadMode) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); @@ -77,7 +76,6 @@ public final class FieldReader extends Terms implements Accountable { this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; - this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; // if (DEBUG) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java index b1cfa7c04d2..d64a4aa8232 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java @@ -80,11 +80,8 @@ final class IntersectTermsEnumFrame { FST.Arc arc; final BlockTermState termState; - - // metadata buffer, holding monotonic values - final long[] longs; - // metadata buffer, holding general values + // metadata buffer byte[] bytes = new byte[32]; final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); @@ -102,7 +99,6 @@ final class IntersectTermsEnumFrame { this.ord = ord; this.termState = ite.fr.parent.postingsReader.newTermState(); this.termState.totalTermFreq = -1; - this.longs = new long[ite.fr.longsSize]; } void loadNextFloorBlock() throws IOException { @@ -278,11 +274,8 @@ final class IntersectTermsEnumFrame { } else { termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); } - // metadata - for (int i = 0; i < ite.fr.longsSize; i++) { - longs[i] = bytesReader.readVLong(); - } - ite.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ite.fr.fieldInfo, termState, absolute); + // metadata + ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java index fdb4cc6955b..1e9e6245a39 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java @@ -85,9 +85,7 @@ final class SegmentTermsEnumFrame { final BlockTermState state; - // metadata buffer, holding monotonic values - final long[] longs; - // metadata buffer, holding general values + // metadata buffer byte[] bytes = new byte[32]; final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); @@ -98,7 +96,6 @@ final class SegmentTermsEnumFrame { this.ord = ord; this.state = ste.fr.parent.postingsReader.newTermState(); this.state.totalTermFreq = -1; - this.longs = new long[ste.fr.longsSize]; } public void setFloorData(ByteArrayDataInput in, BytesRef source) { @@ -424,11 +421,8 @@ final class SegmentTermsEnumFrame { state.totalTermFreq = state.docFreq + statsReader.readVLong(); //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); } - // metadata - for (int i = 0; i < ste.fr.longsSize; i++) { - longs[i] = bytesReader.readVLong(); - } - ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); + // 
metadata + ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java index b0620997726..895db33f0b4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java @@ -166,7 +166,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { } @Override - public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IntBlockTermState termState = (IntBlockTermState) _termState; final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; @@ -179,11 +179,11 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { termState.payStartFP = 0; } - termState.docStartFP += longs[0]; + termState.docStartFP += in.readVLong(); if (fieldHasPositions) { - termState.posStartFP += longs[1]; + termState.posStartFP += in.readVLong(); if (fieldHasOffsets || fieldHasPayloads) { - termState.payStartFP += longs[2]; + termState.payStartFP += in.readVLong(); } } if (termState.docFreq == 1) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsWriter.java index e42669af415..29d812e59c8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsWriter.java @@ -190,20 +190,11 @@ public final class Lucene84PostingsWriter extends PushPostingsWriterBase { } @Override - public int setField(FieldInfo fieldInfo) { + public void setField(FieldInfo fieldInfo) { super.setField(fieldInfo); skipWriter.setField(writePositions, writeOffsets, writePayloads); lastState = emptyState; fieldHasNorms = fieldInfo.hasNorms(); - if (writePositions) { - if (writePayloads || writeOffsets) { - return 3; // doc + pos + pay FP - } else { - return 2; // doc + pos FP - } - } else { - return 1; // doc FP - } } @Override @@ -466,16 +457,16 @@ public final class Lucene84PostingsWriter extends PushPostingsWriterBase { } @Override - public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + public void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { IntBlockTermState state = (IntBlockTermState)_state; if (absolute) { lastState = emptyState; } - longs[0] = state.docStartFP - lastState.docStartFP; + out.writeVLong(state.docStartFP - lastState.docStartFP); if (writePositions) { - longs[1] = state.posStartFP - lastState.posStartFP; + out.writeVLong(state.posStartFP - lastState.posStartFP); if (writePayloads || writeOffsets) { - longs[2] = state.payStartFP - lastState.payStartFP; + out.writeVLong(state.payStartFP - lastState.payStartFP); } } if (state.singletonDocID != -1) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java 
b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java index e7f4c4c7730..3ecd4734b33 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java @@ -50,7 +50,7 @@ final class IDVersionPostingsReader extends PostingsReaderBase { } @Override - public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IDVersionTermState termState = (IDVersionTermState) _termState; termState.docID = in.readVInt(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java index 30e19807b31..2ac451fcc08 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java @@ -46,7 +46,6 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { private long lastVersion; private final Bits liveDocs; - private String segment; public IDVersionPostingsWriter(Bits liveDocs) { this.liveDocs = liveDocs; @@ -60,11 +59,10 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { @Override public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { CodecUtil.writeIndexHeader(termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); - segment = state.segmentInfo.name; } @Override - public int setField(FieldInfo fieldInfo) { + public void setField(FieldInfo fieldInfo) { super.setField(fieldInfo); if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { throw new IllegalArgumentException("field must be index using IndexOptions.DOCS_AND_FREQS_AND_POSITIONS"); @@ -75,7 +73,6 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { throw new IllegalArgumentException("field cannot index term vectors: CheckIndex will report this as index corruption"); } lastState = emptyState; - return 0; } @Override @@ -154,7 +151,7 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { private long lastEncodedVersion; @Override - public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + public void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { IDVersionTermState state = (IDVersionTermState) _state; out.writeVInt(state.docID); if (absolute) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java index 6d260773353..5b1ea64c405 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java @@ -83,9 +83,7 @@ final class IDVersionSegmentTermsEnumFrame { final BlockTermState state; - // metadata buffer, holding monotonic values - public long[] longs; - // metadata buffer, holding general values + // metadata public byte[] bytes; ByteArrayDataInput bytesReader; @@ -96,7 +94,6 
@@ final class IDVersionSegmentTermsEnumFrame { this.ord = ord; this.state = ste.fr.parent.postingsReader.newTermState(); this.state.totalTermFreq = -1; - this.longs = new long[ste.fr.longsSize]; } public void setFloorData(ByteArrayDataInput in, BytesRef source) { @@ -396,11 +393,8 @@ final class IDVersionSegmentTermsEnumFrame { state.docFreq = 1; state.totalTermFreq = 1; //if (DEBUG) System.out.println(" dF=" + state.docFreq); - // metadata - for (int i = 0; i < ste.fr.longsSize; i++) { - longs[i] = bytesReader.readVLong(); - } - ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); + // metadata + ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java index 8001a22d9ff..ff5d6ec83b9 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java @@ -127,7 +127,6 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer { final long sumDocFreq = numTerms; assert numTerms <= Integer.MAX_VALUE; final int docCount = (int) numTerms; - final int longsSize = in.readVInt(); BytesRef minTerm = readBytesRef(in); BytesRef maxTerm = readBytesRef(in); @@ -143,7 +142,7 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer { final long indexStartFP = indexIn.readVLong(); VersionFieldReader previous = fields.put(fieldInfo.name, new VersionFieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, - indexStartFP, longsSize, indexIn, minTerm, maxTerm)); + indexStartFP, indexIn, minTerm, maxTerm)); if (previous != null) { throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java index 9e2f7549f4a..b9c57491a88 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -143,11 +143,10 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { public final Pair rootCode; public final long numTerms; public final long indexStartFP; - private final int longsSize; public final BytesRef minTerm; public final BytesRef maxTerm; - public FieldMetaData(FieldInfo fieldInfo, Pair rootCode, long numTerms, long indexStartFP, int longsSize, + public FieldMetaData(FieldInfo fieldInfo, Pair rootCode, long numTerms, long indexStartFP, BytesRef minTerm, BytesRef maxTerm) { assert numTerms > 0; this.fieldInfo = fieldInfo; @@ -155,7 +154,6 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { this.rootCode = rootCode; this.indexStartFP = indexStartFP; this.numTerms = numTerms; - this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; } @@ -403,7 +401,6 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { class TermsWriter { private final FieldInfo fieldInfo; - private final int longsSize; private long numTerms; final FixedBitSet docsSeen; long indexStartFP; @@ -416,8 +413,6 @@ public final 
class VersionBlockTreeTermsWriter extends FieldsConsumer { private final BytesRefBuilder lastTerm = new BytesRefBuilder(); private int[] prefixStarts = new int[8]; - private final long[] longs; - // Pending stack of terms and blocks. As terms arrive (in sorted order) // we append to this stack, and once the top of the stack has enough // terms starting with a common prefix, we write a new block with @@ -605,13 +600,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel; // Write term meta data - postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); - for (int pos = 0; pos < longsSize; pos++) { - assert longs[pos] >= 0; - metaWriter.writeVLong(longs[pos]); - } - bytesWriter.copyTo(metaWriter); - bytesWriter.reset(); + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); absolute = false; } } else { @@ -648,13 +637,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { // separate anymore: // Write term meta data - postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); - for (int pos = 0; pos < longsSize; pos++) { - assert longs[pos] >= 0; - metaWriter.writeVLong(longs[pos]); - } - bytesWriter.copyTo(metaWriter); - bytesWriter.reset(); + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); absolute = false; } else { PendingBlock block = (PendingBlock) ent; @@ -720,8 +703,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { this.fieldInfo = fieldInfo; docsSeen = new FixedBitSet(maxDoc); - this.longsSize = postingsWriter.setField(fieldInfo); - this.longs = new long[longsSize]; + postingsWriter.setField(fieldInfo); } /** Writes one term's worth of postings. 
*/ @@ -818,7 +800,6 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { ((PendingBlock) pending.get(0)).index.getEmptyOutput(), numTerms, indexStartFP, - longsSize, minTerm, maxTerm)); } else { // cannot assert this: we skip deleted docIDs in the postings: @@ -828,7 +809,6 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { private final ByteBuffersDataOutput suffixWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); - private final ByteBuffersDataOutput bytesWriter = ByteBuffersDataOutput.newResettableInstance(); } private boolean closed; @@ -856,7 +836,6 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { out.writeVInt(field.rootCode.output1.length); out.writeBytes(field.rootCode.output1.bytes, field.rootCode.output1.offset, field.rootCode.output1.length); out.writeVLong(field.rootCode.output2); - out.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); writeBytesRef(out, field.minTerm); writeBytesRef(out, field.maxTerm); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java index 581201f9ea4..93888ae589d 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java @@ -45,14 +45,13 @@ final class VersionFieldReader extends Terms implements Accountable { final Pair rootCode; final BytesRef minTerm; final BytesRef maxTerm; - final int longsSize; final VersionBlockTreeTermsReader parent; final FST> index; //private boolean DEBUG; VersionFieldReader(VersionBlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, Pair rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, - long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { + long indexStartFP, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); @@ -63,7 +62,6 @@ final class VersionFieldReader extends Terms implements Accountable { this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; - this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; // if (DEBUG) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java index e55eb8747ae..028827ebede 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java @@ -41,10 +41,6 @@ import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsReader; import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsWriter; import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader; import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter; -import org.apache.lucene.codecs.memory.FSTOrdTermsReader; -import org.apache.lucene.codecs.memory.FSTOrdTermsWriter; -import org.apache.lucene.codecs.memory.FSTTermsReader; -import org.apache.lucene.codecs.memory.FSTTermsWriter; import 
org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; @@ -122,29 +118,9 @@ public final class MockRandomPostingsFormat extends PostingsFormat { PostingsWriterBase postingsWriter = new Lucene84PostingsWriter(state); final FieldsConsumer fields; - final int t1 = random.nextInt(5); + final int t1 = random.nextInt(3); - if (t1 == 0) { - boolean success = false; - try { - fields = new FSTTermsWriter(state, postingsWriter); - success = true; - } finally { - if (!success) { - postingsWriter.close(); - } - } - } else if (t1 == 1) { - boolean success = false; - try { - fields = new FSTOrdTermsWriter(state, postingsWriter); - success = true; - } finally { - if (!success) { - postingsWriter.close(); - } - } - } else if (t1 == 2) { + if (t1 == 0) { // Use BlockTree terms dict if (LuceneTestCase.VERBOSE) { @@ -165,7 +141,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { postingsWriter.close(); } } - } else if (t1 == 3) { + } else if (t1 == 1) { if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: writing Block terms dict"); @@ -235,7 +211,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { } } } - } else if (t1 == 4) { + } else if (t1 == 2) { // Use OrdsBlockTree terms dict if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: writing OrdsBlockTree"); @@ -287,28 +263,8 @@ public final class MockRandomPostingsFormat extends PostingsFormat { PostingsReaderBase postingsReader = new Lucene84PostingsReader(state); final FieldsProducer fields; - final int t1 = random.nextInt(5); + final int t1 = random.nextInt(3); if (t1 == 0) { - boolean success = false; - try { - fields = new FSTTermsReader(state, postingsReader); - success = true; - } finally { - if (!success) { - postingsReader.close(); - } - } - } else if (t1 == 1) { - boolean success = false; - try { - fields = new FSTOrdTermsReader(state, postingsReader); - success = true; - } finally { - if (!success) { - postingsReader.close(); - } - } - } else if (t1 == 2) { // Use BlockTree terms dict if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading BlockTree terms dict"); @@ -323,7 +279,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { postingsReader.close(); } } - } else if (t1 == 3) { + } else if (t1 == 1) { if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading Block terms dict"); @@ -374,7 +330,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { } } } - } else if (t1 == 4) { + } else if (t1 == 2) { // Use OrdsBlockTree terms dict if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading OrdsBlockTree terms dict"); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index 8bb9a070268..a3b7da996ea 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -45,8 +45,6 @@ import org.apache.lucene.codecs.bloom.TestBloomFilteredLucenePostings; import org.apache.lucene.codecs.lucene60.Lucene60PointsReader; import org.apache.lucene.codecs.lucene60.Lucene60PointsWriter; import org.apache.lucene.codecs.memory.DirectPostingsFormat; -import org.apache.lucene.codecs.memory.FSTOrdPostingsFormat; -import org.apache.lucene.codecs.memory.FSTPostingsFormat; import 
org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.store.Directory; @@ -189,8 +187,6 @@ public class RandomCodec extends AssertingCodec { add(avoidCodecs, TestUtil.getDefaultPostingsFormat(minItemsPerBlock, maxItemsPerBlock, RandomPicks.randomFrom(random, BlockTreeTermsReader.FSTLoadMode.values())), - new FSTPostingsFormat(), - new FSTOrdPostingsFormat(), new DirectPostingsFormat(LuceneTestCase.rarely(random) ? 1 : (LuceneTestCase.rarely(random) ? Integer.MAX_VALUE : maxItemsPerBlock), LuceneTestCase.rarely(random) ? 1 : (LuceneTestCase.rarely(random) ? Integer.MAX_VALUE : lowFreqCutoff)), //TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucenePostings to be constructed From 3b660d6b842894d7ddafea07059125c3f9fe19d1 Mon Sep 17 00:00:00 2001 From: noble Date: Fri, 10 Jan 2020 01:17:50 +1100 Subject: [PATCH 6/8] SOLR-14165: SolrResponse serialVersionUID has changed in a backward incompatible way --- solr/CHANGES.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index ca5ac0f97a3..32781e830dc 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -224,6 +224,25 @@ Other Changes * SOLR-14169: Fix 20 Resource Leak warnings in SolrJ's apache/solr/common (Andras Salamon via Tomás Fernández Löbbe) +================== 8.4.1 ================== + +Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. + +Versions of Major Components +--------------------- +Apache Tika 1.19.1 +Carrot2 3.16.0 +Velocity 2.0 and Velocity Tools 3.0 +Apache ZooKeeper 3.5.5 +Jetty 9.4.19.v20190610 + + +Bug Fixes +---------------------- + +* SOLR-14165: SolrResponse serialVersionUID has changed in a backward incompatible way (Andy Webb via noble) + + ================== 8.4.0 ================== Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. 
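As context for the SOLR-14165 entry above, a small hypothetical snippet (not part of any patch in this series, assuming solr-solrj is on the classpath) that prints the serialVersionUID Java actually uses for SolrResponse; if this value differs between two builds, objects serialized by one build cannot be deserialized by the other:

import java.io.ObjectStreamClass;
import org.apache.solr.client.solrj.SolrResponse;

// Illustration only: inspect the effective serialVersionUID of a Serializable class.
public class SerialUidCheck {
  public static void main(String[] args) {
    ObjectStreamClass osc = ObjectStreamClass.lookup(SolrResponse.class);
    System.out.println(SolrResponse.class.getName()
        + " serialVersionUID=" + osc.getSerialVersionUID());
  }
}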
From 22155bf7a7cba8ddb9652d1b30aa1429c233fcc6 Mon Sep 17 00:00:00 2001 From: Kevin Risden Date: Fri, 3 Jan 2020 15:34:55 -0500 Subject: [PATCH 7/8] SOLR-14163: SOLR_SSL_CLIENT_HOSTNAME_VERIFICATION needs to work with Jetty server/client SSL contexts Closes #1147 Signed-off-by: Kevin Risden --- solr/CHANGES.txt | 2 ++ solr/bin/solr | 7 +++--- solr/bin/solr.cmd | 4 ++++ solr/server/etc/jetty-ssl.xml | 1 - solr/solr-ref-guide/src/enabling-ssl.adoc | 10 +++------ .../solr/client/solrj/embedded/SSLConfig.java | 22 ++++++++++++++++++- .../client/solrj/impl/Http2SolrClient.java | 5 ++++- .../solrj/impl/Http2SolrClientTest.java | 11 ++++++++++ 8 files changed, 48 insertions(+), 14 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 32781e830dc..ce20f8527fa 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -210,6 +210,8 @@ Bug Fixes * SOLR-13089: Fix lsof edge cases in the solr CLI script (Martijn Koster via janhoy) +* SOLR-14163: SOLR_SSL_CLIENT_HOSTNAME_VERIFICATION needs to work with Jetty server/client SSL contexts (Kevin Risden) + Other Changes --------------------- diff --git a/solr/bin/solr b/solr/bin/solr index 869a9ff6be1..4faf61b772b 100755 --- a/solr/bin/solr +++ b/solr/bin/solr @@ -206,14 +206,13 @@ if [ "$SOLR_SSL_ENABLED" == "true" ]; then SOLR_SSL_OPTS+=" -Dsolr.jetty.truststore.type=$SOLR_SSL_TRUST_STORE_TYPE" fi - if [ -n "$SOLR_SSL_NEED_CLIENT_AUTH" ]; then - SOLR_SSL_OPTS+=" -Dsolr.jetty.ssl.needClientAuth=$SOLR_SSL_NEED_CLIENT_AUTH" - fi - if [ -z "$SOLR_SSL_CLIENT_HOSTNAME_VERIFICATION" ] ; then SOLR_SSL_OPTS+=" -Dsolr.jetty.ssl.verifyClientHostName=HTTPS" fi + if [ -n "$SOLR_SSL_NEED_CLIENT_AUTH" ]; then + SOLR_SSL_OPTS+=" -Dsolr.jetty.ssl.needClientAuth=$SOLR_SSL_NEED_CLIENT_AUTH" + fi if [ -n "$SOLR_SSL_WANT_CLIENT_AUTH" ]; then SOLR_SSL_OPTS+=" -Dsolr.jetty.ssl.wantClientAuth=$SOLR_SSL_WANT_CLIENT_AUTH" fi diff --git a/solr/bin/solr.cmd b/solr/bin/solr.cmd index fe816947ba1..05a7472c6c3 100755 --- a/solr/bin/solr.cmd +++ b/solr/bin/solr.cmd @@ -82,6 +82,10 @@ IF "%SOLR_SSL_ENABLED%"=="true" ( set "SOLR_SSL_OPTS=!SOLR_SSL_OPTS! -Dsolr.jetty.truststore.type=%SOLR_SSL_TRUST_STORE_TYPE%" ) + IF NOT DEFINED SOLR_SSL_CLIENT_HOSTNAME_VERIFICATION ( + set "SOLR_SSL_OPTS=!SOLR_SSL_OPTS! -Dsolr.jetty.ssl.verifyClientHostName=HTTPS" + ) + IF DEFINED SOLR_SSL_NEED_CLIENT_AUTH ( set "SOLR_SSL_OPTS=!SOLR_SSL_OPTS! -Dsolr.jetty.ssl.needClientAuth=%SOLR_SSL_NEED_CLIENT_AUTH%" ) diff --git a/solr/server/etc/jetty-ssl.xml b/solr/server/etc/jetty-ssl.xml index 3688c4c7159..53e0ec1109b 100644 --- a/solr/server/etc/jetty-ssl.xml +++ b/solr/server/etc/jetty-ssl.xml @@ -17,7 +17,6 @@ - diff --git a/solr/solr-ref-guide/src/enabling-ssl.adoc b/solr/solr-ref-guide/src/enabling-ssl.adoc index ccfbf008d20..5840e3f35e0 100644 --- a/solr/solr-ref-guide/src/enabling-ssl.adoc +++ b/solr/solr-ref-guide/src/enabling-ssl.adoc @@ -66,7 +66,7 @@ NOTE: If you setup Solr as a service on Linux using the steps outlined in < Date: Thu, 9 Jan 2020 17:37:54 +0100 Subject: [PATCH 8/8] Revert "LUCENE-9116: Remove long[] from `PostingsWriterBase#encodeTerm`. (#1149)" This reverts commit d0b4a166e06521757cc3be25dafd7705e1eeecdc. 
--- lucene/CHANGES.txt | 6 - .../lucene50/Lucene50PostingsReader.java | 8 +- .../lucene50/Lucene50PostingsWriter.java | 19 +- .../codecs/blockterms/BlockTermsReader.java | 14 +- .../codecs/blockterms/BlockTermsWriter.java | 20 +- .../OrdsBlockTreeTermsReader.java | 3 +- .../OrdsBlockTreeTermsWriter.java | 29 +- .../codecs/blocktreeords/OrdsFieldReader.java | 4 +- .../OrdsIntersectTermsEnumFrame.java | 12 +- .../OrdsSegmentTermsEnumFrame.java | 12 +- .../codecs/memory/FSTOrdPostingsFormat.java | 78 ++ .../codecs/memory/FSTOrdTermsReader.java | 884 ++++++++++++++++++ .../codecs/memory/FSTOrdTermsWriter.java | 386 ++++++++ .../codecs/memory/FSTPostingsFormat.java | 78 ++ .../lucene/codecs/memory/FSTTermOutputs.java | 383 ++++++++ .../lucene/codecs/memory/FSTTermsReader.java | 785 ++++++++++++++++ .../lucene/codecs/memory/FSTTermsWriter.java | 291 ++++++ .../DeltaBaseTermStateSerializer.java | 4 +- .../org.apache.lucene.codecs.PostingsFormat | 2 + .../memory/TestFSTOrdPostingsFormat.java | 34 + .../codecs/memory/TestFSTPostingsFormat.java | 34 + .../uniformsplit/TestTermBytesComparator.java | 2 +- .../sharedterms/STBlockReaderTest.java | 2 +- .../lucene/codecs/PostingsReaderBase.java | 2 +- .../lucene/codecs/PostingsWriterBase.java | 15 +- .../lucene/codecs/PushPostingsWriterBase.java | 4 +- .../blocktree/BlockTreeTermsReader.java | 15 +- .../blocktree/BlockTreeTermsWriter.java | 30 +- .../lucene/codecs/blocktree/FieldReader.java | 4 +- .../blocktree/IntersectTermsEnumFrame.java | 13 +- .../blocktree/SegmentTermsEnumFrame.java | 12 +- .../lucene84/Lucene84PostingsReader.java | 8 +- .../lucene84/Lucene84PostingsWriter.java | 19 +- .../idversion/IDVersionPostingsReader.java | 2 +- .../idversion/IDVersionPostingsWriter.java | 7 +- .../IDVersionSegmentTermsEnumFrame.java | 12 +- .../VersionBlockTreeTermsReader.java | 3 +- .../VersionBlockTreeTermsWriter.java | 29 +- .../codecs/idversion/VersionFieldReader.java | 4 +- .../mockrandom/MockRandomPostingsFormat.java | 58 +- .../org/apache/lucene/index/RandomCodec.java | 4 + 41 files changed, 3239 insertions(+), 92 deletions(-) create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java create mode 100644 lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java create mode 100644 lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTOrdPostingsFormat.java create mode 100644 lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTPostingsFormat.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1b1f250547e..fcc359f7326 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -88,12 +88,6 @@ API Changes yield Passages sized a little different due to the fact that the sizing pivot is now the center of the first match and not its left edge. -* LUCENE-9116: PostingsWriterBase and PostingsReaderBase no longer support - setting a field's metadata via a `long[]`. (Adrien Grand) - -* LUCENE-9116: The FST and FSTOrd postings formats have been removed. 
- (Adrien Grand) - New Features --------------------- (No changes) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java index adae891c4ab..0ea8c802cd2 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene50/Lucene50PostingsReader.java @@ -154,7 +154,7 @@ public final class Lucene50PostingsReader extends PostingsReaderBase { } @Override - public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IntBlockTermState termState = (IntBlockTermState) _termState; final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; @@ -167,11 +167,11 @@ public final class Lucene50PostingsReader extends PostingsReaderBase { termState.payStartFP = 0; } - termState.docStartFP += in.readVLong(); + termState.docStartFP += longs[0]; if (fieldHasPositions) { - termState.posStartFP += in.readVLong(); + termState.posStartFP += longs[1]; if (fieldHasOffsets || fieldHasPayloads) { - termState.payStartFP += in.readVLong(); + termState.payStartFP += longs[2]; } } if (termState.docFreq == 1) { diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java index 8f425a2036c..a600e61fb32 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/codecs/lucene50/Lucene50PostingsWriter.java @@ -187,11 +187,20 @@ public final class Lucene50PostingsWriter extends PushPostingsWriterBase { } @Override - public void setField(FieldInfo fieldInfo) { + public int setField(FieldInfo fieldInfo) { super.setField(fieldInfo); skipWriter.setField(writePositions, writeOffsets, writePayloads); lastState = emptyState; fieldHasNorms = fieldInfo.hasNorms(); + if (writePositions) { + if (writePayloads || writeOffsets) { + return 3; // doc + pos + pay FP + } else { + return 2; // doc + pos FP + } + } else { + return 1; // doc FP + } } @Override @@ -454,16 +463,16 @@ public final class Lucene50PostingsWriter extends PushPostingsWriterBase { } @Override - public void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { IntBlockTermState state = (IntBlockTermState)_state; if (absolute) { lastState = emptyState; } - out.writeVLong(state.docStartFP - lastState.docStartFP); + longs[0] = state.docStartFP - lastState.docStartFP; if (writePositions) { - out.writeVLong(state.posStartFP - lastState.posStartFP); + longs[1] = state.posStartFP - lastState.posStartFP; if (writePayloads || writeOffsets) { - out.writeVLong(state.payStartFP - lastState.payStartFP); + longs[2] = state.payStartFP - lastState.payStartFP; } } if (state.singletonDocID != -1) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java 
b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java index 480f5fde271..964f616c6ff 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsReader.java @@ -145,6 +145,7 @@ public class BlockTermsReader extends FieldsProducer { // when frequencies are omitted, sumDocFreq=totalTermFreq and we only write one value final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : in.readVLong(); final int docCount = in.readVInt(); + final int longsSize = in.readVInt(); if (docCount < 0 || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs throw new CorruptIndexException("invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), in); } @@ -154,7 +155,7 @@ public class BlockTermsReader extends FieldsProducer { if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings throw new CorruptIndexException("invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, in); } - FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount)); + FieldReader previous = fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq, docCount, longsSize)); if (previous != null) { throw new CorruptIndexException("duplicate fields: " + fieldInfo.name, in); } @@ -222,8 +223,9 @@ public class BlockTermsReader extends FieldsProducer { final long sumTotalTermFreq; final long sumDocFreq; final int docCount; + final int longsSize; - FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) { + FieldReader(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.numTerms = numTerms; @@ -231,6 +233,7 @@ public class BlockTermsReader extends FieldsProducer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; + this.longsSize = longsSize; } @Override @@ -323,6 +326,7 @@ public class BlockTermsReader extends FieldsProducer { private final ByteArrayDataInput freqReader = new ByteArrayDataInput(); private int metaDataUpto; + private long[] longs; private byte[] bytes; private ByteArrayDataInput bytesReader; @@ -339,6 +343,7 @@ public class BlockTermsReader extends FieldsProducer { termSuffixes = new byte[128]; docFreqBytes = new byte[64]; //System.out.println("BTR.enum init this=" + this + " postingsReader=" + postingsReader); + longs = new long[longsSize]; } // TODO: we may want an alternate mode here which is @@ -821,7 +826,10 @@ public class BlockTermsReader extends FieldsProducer { //System.out.println(" totTF=" + state.totalTermFreq); } // metadata - postingsReader.decodeTerm(bytesReader, fieldInfo, state, absolute); + for (int i = 0; i < longs.length; i++) { + longs[i] = bytesReader.readVLong(); + } + postingsReader.decodeTerm(longs, bytesReader, fieldInfo, state, absolute); metaDataUpto++; absolute = false; } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java index e064aa1ecf2..f620bd83d0f 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java 
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blockterms/BlockTermsWriter.java @@ -81,8 +81,9 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { public final long sumTotalTermFreq; public final long sumDocFreq; public final int docCount; + public final int longsSize; - public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount) { + public FieldMetaData(FieldInfo fieldInfo, long numTerms, long termsStartPointer, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) { assert numTerms > 0; this.fieldInfo = fieldInfo; this.termsStartPointer = termsStartPointer; @@ -90,6 +91,7 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; + this.longsSize = longsSize; } } @@ -174,6 +176,7 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { } out.writeVLong(field.sumDocFreq); out.writeVInt(field.docCount); + out.writeVInt(field.longsSize); } writeTrailer(dirStart); CodecUtil.writeFooter(out); @@ -203,6 +206,7 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { long sumTotalTermFreq; long sumDocFreq; int docCount; + int longsSize; private TermEntry[] pendingTerms; @@ -222,7 +226,7 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { } termsStartPointer = out.getFilePointer(); this.postingsWriter = postingsWriter; - postingsWriter.setField(fieldInfo); + this.longsSize = postingsWriter.setField(fieldInfo); } private final BytesRefBuilder lastPrevTerm = new BytesRefBuilder(); @@ -281,7 +285,8 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { termsStartPointer, fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0 ? sumTotalTermFreq : -1, sumDocFreq, - docsSeen.cardinality())); + docsSeen.cardinality(), + longsSize)); } } @@ -302,6 +307,7 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { } private final ByteBuffersDataOutput bytesWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput bufferWriter = ByteBuffersDataOutput.newResettableInstance(); private void flushBlock() throws IOException { //System.out.println("BTW.flushBlock seg=" + segment + " pendingCount=" + pendingCount + " fp=" + out.getFilePointer()); @@ -347,10 +353,16 @@ public class BlockTermsWriter extends FieldsConsumer implements Closeable { bytesWriter.reset(); // 4th pass: write the metadata + long[] longs = new long[longsSize]; boolean absolute = true; for(int termCount=0;termCount 0; this.fieldInfo = fieldInfo; @@ -158,6 +159,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; + this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; } @@ -422,6 +424,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { class TermsWriter { private final FieldInfo fieldInfo; + private final int longsSize; private long numTerms; final FixedBitSet docsSeen; long sumTotalTermFreq; @@ -436,6 +439,8 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { private final BytesRefBuilder lastTerm = new BytesRefBuilder(); private int[] prefixStarts = new int[8]; + private final long[] longs; + // Pending stack of terms and blocks. 
As terms arrive (in sorted order) // we append to this stack, and once the top of the stack has enough // terms starting with a common prefix, we write a new block with @@ -628,7 +633,13 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { } // Write term meta data - postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.copyTo(metaWriter); + bytesWriter.reset(); absolute = false; } totalTermCount = end-start; @@ -673,7 +684,13 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { // separate anymore: // Write term meta data - postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.copyTo(metaWriter); + bytesWriter.reset(); absolute = false; totalTermCount++; @@ -746,7 +763,8 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { TermsWriter(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; docsSeen = new FixedBitSet(maxDoc); - postingsWriter.setField(fieldInfo); + this.longsSize = postingsWriter.setField(fieldInfo); + this.longs = new long[longsSize]; } /** Writes one term's worth of postings. */ @@ -856,6 +874,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { sumTotalTermFreq, sumDocFreq, docsSeen.cardinality(), + longsSize, minTerm, maxTerm)); } else { assert docsSeen.cardinality() == 0; @@ -865,6 +884,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { private final ByteBuffersDataOutput suffixWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput bytesWriter = ByteBuffersDataOutput.newResettableInstance(); } private boolean closed; @@ -896,6 +916,7 @@ public final class OrdsBlockTreeTermsWriter extends FieldsConsumer { } out.writeVLong(field.sumDocFreq); out.writeVInt(field.docCount); + out.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); writeBytesRef(out, field.minTerm); writeBytesRef(out, field.maxTerm); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java index 54954e85d3d..5d02258837d 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsFieldReader.java @@ -46,6 +46,7 @@ final class OrdsFieldReader extends Terms implements Accountable { final Output rootCode; final BytesRef minTerm; final BytesRef maxTerm; + final int longsSize; final OrdsBlockTreeTermsReader parent; final FST index; @@ -53,7 +54,7 @@ final class OrdsFieldReader extends Terms implements Accountable { OrdsFieldReader(OrdsBlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, Output rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, - long indexStartFP, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { + long indexStartFP, int longsSize, IndexInput indexIn, 
BytesRef minTerm, BytesRef maxTerm) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); @@ -64,6 +65,7 @@ final class OrdsFieldReader extends Terms implements Accountable { this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; + this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; // if (DEBUG) { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java index ab7eab73427..a34f0fda1d0 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsIntersectTermsEnumFrame.java @@ -84,7 +84,9 @@ final class OrdsIntersectTermsEnumFrame { final BlockTermState termState; - // metadata + // metadata buffer, holding monotonic values + public long[] longs; + // metadata buffer, holding general values public byte[] bytes; ByteArrayDataInput bytesReader; @@ -101,6 +103,7 @@ final class OrdsIntersectTermsEnumFrame { this.ord = ord; this.termState = ite.fr.parent.postingsReader.newTermState(); this.termState.totalTermFreq = -1; + this.longs = new long[ite.fr.longsSize]; } void loadNextFloorBlock() throws IOException { @@ -295,8 +298,11 @@ final class OrdsIntersectTermsEnumFrame { termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); } - // metadata - ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute); + // metadata + for (int i = 0; i < ite.fr.longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + ite.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ite.fr.fieldInfo, termState, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java index 240e781c7cc..ee3782f29cd 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java @@ -97,7 +97,9 @@ final class OrdsSegmentTermsEnumFrame { final BlockTermState state; - // metadata + // metadata buffer, holding monotonic values + public long[] longs; + // metadata buffer, holding general values public byte[] bytes; ByteArrayDataInput bytesReader; @@ -108,6 +110,7 @@ final class OrdsSegmentTermsEnumFrame { this.ord = ord; this.state = ste.fr.parent.postingsReader.newTermState(); this.state.totalTermFreq = -1; + this.longs = new long[ste.fr.longsSize]; } public void setFloorData(ByteArrayDataInput in, BytesRef source) { @@ -504,8 +507,11 @@ final class OrdsSegmentTermsEnumFrame { } //if (DEBUG) System.out.println(" longsSize=" + ste.fr.longsSize); - // metadata - ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); + // metadata + for (int i = 0; i < ste.fr.longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java 
b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java new file mode 100644 index 00000000000..0ce12178a90 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdPostingsFormat.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.memory; + + + +import java.io.IOException; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader; +import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** + * FSTOrd term dict + Lucene50PBF + */ + +public final class FSTOrdPostingsFormat extends PostingsFormat { + public FSTOrdPostingsFormat() { + super("FSTOrd50"); + } + + @Override + public String toString() { + return getName(); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene84PostingsWriter(state); + + boolean success = false; + try { + FieldsConsumer ret = new FSTOrdTermsWriter(state, postingsWriter); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene84PostingsReader(state); + boolean success = false; + try { + FieldsProducer ret = new FSTOrdTermsReader(state, postingsReader); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java new file mode 100644 index 00000000000..7ecf19cc9f6 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsReader.java @@ -0,0 +1,884 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.memory; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.TreeMap; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Accountables; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.fst.BytesRefFSTEnum; +import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; + +/** + * FST-based terms dictionary reader. + * + * The FST index maps each term and its ord, and during seek + * the ord is used to fetch metadata from a single block. + * The term dictionary is fully memory resident. 
+ * + * @lucene.experimental + */ +public class FSTOrdTermsReader extends FieldsProducer { + static final int INTERVAL = FSTOrdTermsWriter.SKIP_INTERVAL; + final TreeMap fields = new TreeMap<>(); + final PostingsReaderBase postingsReader; + //static final boolean TEST = false; + + public FSTOrdTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException { + final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTOrdTermsWriter.TERMS_INDEX_EXTENSION); + final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTOrdTermsWriter.TERMS_BLOCK_EXTENSION); + + this.postingsReader = postingsReader; + ChecksumIndexInput indexIn = null; + IndexInput blockIn = null; + boolean success = false; + try { + indexIn = state.directory.openChecksumInput(termsIndexFileName, state.context); + blockIn = state.directory.openInput(termsBlockFileName, state.context); + int version = CodecUtil.checkIndexHeader(indexIn, FSTOrdTermsWriter.TERMS_INDEX_CODEC_NAME, + FSTOrdTermsWriter.VERSION_START, + FSTOrdTermsWriter.VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + int version2 = CodecUtil.checkIndexHeader(blockIn, FSTOrdTermsWriter.TERMS_CODEC_NAME, + FSTOrdTermsWriter.VERSION_START, + FSTOrdTermsWriter.VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + + if (version != version2) { + throw new CorruptIndexException("Format versions mismatch: index=" + version + ", terms=" + version2, blockIn); + } + + CodecUtil.checksumEntireFile(blockIn); + + this.postingsReader.init(blockIn, state); + seekDir(blockIn); + + final FieldInfos fieldInfos = state.fieldInfos; + final int numFields = blockIn.readVInt(); + for (int i = 0; i < numFields; i++) { + FieldInfo fieldInfo = fieldInfos.fieldInfo(blockIn.readVInt()); + boolean hasFreq = fieldInfo.getIndexOptions() != IndexOptions.DOCS; + long numTerms = blockIn.readVLong(); + long sumTotalTermFreq = blockIn.readVLong(); + // if freqs are omitted, sumDocFreq=sumTotalTermFreq and we only write one value + long sumDocFreq = hasFreq ? 
blockIn.readVLong() : sumTotalTermFreq; + int docCount = blockIn.readVInt(); + int longsSize = blockIn.readVInt(); + FST index = new FST<>(indexIn, PositiveIntOutputs.getSingleton()); + + TermsReader current = new TermsReader(fieldInfo, blockIn, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, index); + TermsReader previous = fields.put(fieldInfo.name, current); + checkFieldSummary(state.segmentInfo, indexIn, blockIn, current, previous); + } + CodecUtil.checkFooter(indexIn); + success = true; + } finally { + if (success) { + IOUtils.close(indexIn, blockIn); + } else { + IOUtils.closeWhileHandlingException(indexIn, blockIn); + } + } + } + + private void seekDir(IndexInput in) throws IOException { + in.seek(in.length() - CodecUtil.footerLength() - 8); + in.seek(in.readLong()); + } + private void checkFieldSummary(SegmentInfo info, IndexInput indexIn, IndexInput blockIn, TermsReader field, TermsReader previous) throws IOException { + // #docs with field must be <= #docs + if (field.docCount < 0 || field.docCount > info.maxDoc()) { + throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.maxDoc() + " (blockIn=" + blockIn + ")", indexIn); + } + // #postings must be >= #docs with field + if (field.sumDocFreq < field.docCount) { + throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount + " (blockIn=" + blockIn + ")", indexIn); + } + // #positions must be >= #postings + if (field.sumTotalTermFreq < field.sumDocFreq) { + throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq + " (blockIn=" + blockIn + ")", indexIn); + } + if (previous != null) { + throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name + " (blockIn=" + blockIn + ")", indexIn); + } + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableSet(fields.keySet()).iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + assert field != null; + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + + @Override + public void close() throws IOException { + try { + IOUtils.close(postingsReader); + } finally { + fields.clear(); + } + } + + final class TermsReader extends Terms implements Accountable { + final FieldInfo fieldInfo; + final long numTerms; + final long sumTotalTermFreq; + final long sumDocFreq; + final int docCount; + final int longsSize; + final FST index; + + final int numSkipInfo; + final long[] skipInfo; + final byte[] statsBlock; + final byte[] metaLongsBlock; + final byte[] metaBytesBlock; + + TermsReader(FieldInfo fieldInfo, IndexInput blockIn, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST index) throws IOException { + this.fieldInfo = fieldInfo; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.longsSize = longsSize; + this.index = index; + + assert (numTerms & (~0xffffffffL)) == 0; + final int numBlocks = (int)(numTerms + INTERVAL - 1) / INTERVAL; + this.numSkipInfo = longsSize + 3; + this.skipInfo = new long[numBlocks * numSkipInfo]; + this.statsBlock = new byte[(int)blockIn.readVLong()]; + this.metaLongsBlock = new byte[(int)blockIn.readVLong()]; + this.metaBytesBlock = new byte[(int)blockIn.readVLong()]; + + int last = 0, next = 0; + for (int i = 1; i < numBlocks; i++) { + next = numSkipInfo * i; + for (int j = 
0; j < numSkipInfo; j++) { + skipInfo[next + j] = skipInfo[last + j] + blockIn.readVLong(); + } + last = next; + } + blockIn.readBytes(statsBlock, 0, statsBlock.length); + blockIn.readBytes(metaLongsBlock, 0, metaLongsBlock.length); + blockIn.readBytes(metaBytesBlock, 0, metaBytesBlock.length); + } + + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public long size() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } + + @Override + public int getDocCount() throws IOException { + return docCount; + } + + @Override + public TermsEnum iterator() throws IOException { + return new SegmentTermsEnum(); + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new IntersectTermsEnum(compiled, startTerm); + } + + @Override + public long ramBytesUsed() { + long ramBytesUsed = 0; + if (index != null) { + ramBytesUsed += index.ramBytesUsed(); + ramBytesUsed += RamUsageEstimator.sizeOf(metaBytesBlock); + ramBytesUsed += RamUsageEstimator.sizeOf(metaLongsBlock); + ramBytesUsed += RamUsageEstimator.sizeOf(skipInfo); + ramBytesUsed += RamUsageEstimator.sizeOf(statsBlock); + } + return ramBytesUsed; + } + + @Override + public Collection getChildResources() { + if (index == null) { + return Collections.emptyList(); + } else { + return Collections.singletonList(Accountables.namedAccountable("terms", index)); + } + } + + @Override + public String toString() { + return "FSTOrdTerms(terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")"; + } + + // Only wraps common operations for PBF interact + abstract class BaseTermsEnum extends org.apache.lucene.index.BaseTermsEnum { + + /* Current term's ord, starts from 0 */ + long ord; + + /* Current term stats + decoded metadata (customized by PBF) */ + final BlockTermState state; + + /* Datainput to load stats & metadata */ + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + final ByteArrayDataInput metaLongsReader = new ByteArrayDataInput(); + final ByteArrayDataInput metaBytesReader = new ByteArrayDataInput(); + + /* To which block is buffered */ + int statsBlockOrd; + int metaBlockOrd; + + /* Current buffered metadata (long[] & byte[]) */ + long[][] longs; + int[] bytesStart; + int[] bytesLength; + + /* Current buffered stats (df & ttf) */ + int[] docFreq; + long[] totalTermFreq; + + BaseTermsEnum() throws IOException { + this.state = postingsReader.newTermState(); + this.statsReader.reset(statsBlock); + this.metaLongsReader.reset(metaLongsBlock); + this.metaBytesReader.reset(metaBytesBlock); + + this.longs = new long[INTERVAL][longsSize]; + this.bytesStart = new int[INTERVAL]; + this.bytesLength = new int[INTERVAL]; + this.docFreq = new int[INTERVAL]; + 
this.totalTermFreq = new long[INTERVAL]; + this.statsBlockOrd = -1; + this.metaBlockOrd = -1; + } + + /** Decodes stats data into term state */ + void decodeStats() throws IOException { + final int upto = (int)ord % INTERVAL; + final int oldBlockOrd = statsBlockOrd; + statsBlockOrd = (int)ord / INTERVAL; + if (oldBlockOrd != statsBlockOrd) { + refillStats(); + } + state.docFreq = docFreq[upto]; + state.totalTermFreq = totalTermFreq[upto]; + } + + /** Let PBF decode metadata */ + void decodeMetaData() throws IOException { + final int upto = (int)ord % INTERVAL; + final int oldBlockOrd = metaBlockOrd; + metaBlockOrd = (int)ord / INTERVAL; + if (metaBlockOrd != oldBlockOrd) { + refillMetadata(); + } + metaBytesReader.setPosition(bytesStart[upto]); + postingsReader.decodeTerm(longs[upto], metaBytesReader, fieldInfo, state, true); + } + + /** Load current stats shard */ + final void refillStats() throws IOException { + final int offset = statsBlockOrd * numSkipInfo; + final int statsFP = (int)skipInfo[offset]; + statsReader.setPosition(statsFP); + for (int i = 0; i < INTERVAL && !statsReader.eof(); i++) { + int code = statsReader.readVInt(); + if (hasFreqs()) { + docFreq[i] = (code >>> 1); + if ((code & 1) == 1) { + totalTermFreq[i] = docFreq[i]; + } else { + totalTermFreq[i] = docFreq[i] + statsReader.readVLong(); + } + } else { + docFreq[i] = code; + totalTermFreq[i] = code; + } + } + } + + /** Load current metadata shard */ + final void refillMetadata() throws IOException { + final int offset = metaBlockOrd * numSkipInfo; + final int metaLongsFP = (int)skipInfo[offset + 1]; + final int metaBytesFP = (int)skipInfo[offset + 2]; + metaLongsReader.setPosition(metaLongsFP); + for (int j = 0; j < longsSize; j++) { + longs[0][j] = skipInfo[offset + 3 + j] + metaLongsReader.readVLong(); + } + bytesStart[0] = metaBytesFP; + bytesLength[0] = (int)metaLongsReader.readVLong(); + for (int i = 1; i < INTERVAL && !metaLongsReader.eof(); i++) { + for (int j = 0; j < longsSize; j++) { + longs[i][j] = longs[i-1][j] + metaLongsReader.readVLong(); + } + bytesStart[i] = bytesStart[i-1] + bytesLength[i-1]; + bytesLength[i] = (int)metaLongsReader.readVLong(); + } + } + + @Override + public TermState termState() throws IOException { + decodeMetaData(); + return state.clone(); + } + + @Override + public int docFreq() throws IOException { + return state.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + return state.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + decodeMetaData(); + return postingsReader.postings(fieldInfo, state, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + decodeMetaData(); + return postingsReader.impacts(fieldInfo, state, flags); + } + + // TODO: this can be achieved by making use of Util.getByOutput() + // and should have related tests + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + } + + // Iterates through all terms in this field + private final class SegmentTermsEnum extends BaseTermsEnum { + final BytesRefFSTEnum fstEnum; + /* Current term, null when enum ends or unpositioned */ + BytesRef term; + + /* True when current term's metadata is decoded */ + boolean decoded; + + /* True when current enum is 'positioned' by seekExact(TermState) */ + boolean seekPending; + + SegmentTermsEnum() throws 
IOException { + this.fstEnum = new BytesRefFSTEnum<>(index); + this.decoded = false; + this.seekPending = false; + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + @Override + void decodeMetaData() throws IOException { + if (!decoded && !seekPending) { + super.decodeMetaData(); + decoded = true; + } + } + + // Update current enum according to FSTEnum + void updateEnum(final InputOutput pair) throws IOException { + if (pair == null) { + term = null; + } else { + term = pair.input; + ord = pair.output; + decodeStats(); + } + decoded = false; + seekPending = false; + } + + @Override + public BytesRef next() throws IOException { + if (seekPending) { // previously positioned, but termOutputs not fetched + seekPending = false; + SeekStatus status = seekCeil(term); + assert status == SeekStatus.FOUND; // must positioned on valid term + } + updateEnum(fstEnum.next()); + return term; + } + + @Override + public boolean seekExact(BytesRef target) throws IOException { + updateEnum(fstEnum.seekExact(target)); + return term != null; + } + + @Override + public SeekStatus seekCeil(BytesRef target) throws IOException { + updateEnum(fstEnum.seekCeil(target)); + if (term == null) { + return SeekStatus.END; + } else { + return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND; + } + } + + @Override + public void seekExact(BytesRef target, TermState otherState) { + if (!target.equals(term)) { + state.copyFrom(otherState); + term = BytesRef.deepCopyOf(target); + seekPending = true; + } + } + } + + // Iterates intersect result with automaton (cannot seek!) + private final class IntersectTermsEnum extends BaseTermsEnum { + /* Current term, null when enum ends or unpositioned */ + BytesRefBuilder term; + + /* True when current term's metadata is decoded */ + boolean decoded; + + /* True when there is pending term when calling next() */ + boolean pending; + + /* stack to record how current term is constructed, + * used to accumulate metadata or rewind term: + * level == term.length + 1, + * == 0 when term is null */ + Frame[] stack; + int level; + + /* term dict fst */ + final FST fst; + final FST.BytesReader fstReader; + final Outputs fstOutputs; + + /* query automaton to intersect with */ + final ByteRunAutomaton fsa; + + private final class Frame { + /* fst stats */ + FST.Arc arc; + + Long output; + + /* automaton stats */ + int state; + + Frame() { + this.arc = new FST.Arc<>(); + this.state = -1; + } + + public String toString() { + return "arc=" + arc + " state=" + state; + } + } + + IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + //if (TEST) System.out.println("Enum init, startTerm=" + startTerm); + this.fst = index; + this.fstReader = fst.getBytesReader(); + this.fstOutputs = index.outputs; + this.fsa = compiled.runAutomaton; + this.level = -1; + this.stack = new Frame[16]; + for (int i = 0 ; i < stack.length; i++) { + this.stack[i] = new Frame(); + } + + Frame frame; + frame = loadVirtualFrame(newFrame()); + this.level++; + frame = loadFirstFrame(newFrame()); + pushFrame(frame); + + this.decoded = false; + this.pending = false; + + if (startTerm == null) { + pending = isAccept(topFrame()); + } else { + doSeekCeil(startTerm); + pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame()); + } + } + + @Override + public BytesRef term() throws IOException { + return term == null ? 
null : term.get(); + } + + @Override + void decodeMetaData() throws IOException { + if (!decoded) { + super.decodeMetaData(); + decoded = true; + } + } + + @Override + void decodeStats() throws IOException { + ord = topFrame().output; + super.decodeStats(); + } + + @Override + public SeekStatus seekCeil(BytesRef target) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public BytesRef next() throws IOException { + //if (TEST) System.out.println("Enum next()"); + if (pending) { + pending = false; + decodeStats(); + return term(); + } + decoded = false; + DFS: + while (level > 0) { + Frame frame = newFrame(); + if (loadExpandFrame(topFrame(), frame) != null) { // has valid target + pushFrame(frame); + if (isAccept(frame)) { // gotcha + break; + } + continue; // check next target + } + frame = popFrame(); + while(level > 0) { + if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling + pushFrame(frame); + if (isAccept(frame)) { // gotcha + break DFS; + } + continue DFS; // check next target + } + frame = popFrame(); + } + return null; + } + decodeStats(); + return term(); + } + + BytesRef doSeekCeil(BytesRef target) throws IOException { + //if (TEST) System.out.println("Enum doSeekCeil()"); + Frame frame= null; + int label, upto = 0, limit = target.length; + while (upto < limit) { // to target prefix, or ceil label (rewind prefix) + frame = newFrame(); + label = target.bytes[upto] & 0xff; + frame = loadCeilFrame(label, topFrame(), frame); + if (frame == null || frame.arc.label() != label) { + break; + } + assert isValid(frame); // target must be fetched from automaton + pushFrame(frame); + upto++; + } + if (upto == limit) { // got target + return term(); + } + if (frame != null) { // got larger term('s prefix) + pushFrame(frame); + return isAccept(frame) ? term() : next(); + } + while (level > 0) { // got target's prefix, advance to larger term + frame = popFrame(); + while (level > 0 && !canRewind(frame)) { + frame = popFrame(); + } + if (loadNextFrame(topFrame(), frame) != null) { + pushFrame(frame); + return isAccept(frame) ? 
term() : next(); + } + } + return null; + } + + /** Virtual frame, never pop */ + Frame loadVirtualFrame(Frame frame) { + frame.output = fstOutputs.getNoOutput(); + frame.state = -1; + return frame; + } + + /** Load frame for start arc(node) on fst */ + Frame loadFirstFrame(Frame frame) { + frame.arc = fst.getFirstArc(frame.arc); + frame.output = frame.arc.output(); + frame.state = 0; + return frame; + } + + /** Load frame for target arc(node) on fst */ + Frame loadExpandFrame(Frame top, Frame frame) throws IOException { + if (!canGrow(top)) { + return null; + } + frame.arc = fst.readFirstRealTargetArc(top.arc.target(), frame.arc, fstReader); + frame.state = fsa.step(top.state, frame.arc.label()); + frame.output = frame.arc.output(); + //if (TEST) System.out.println(" loadExpand frame="+frame); + if (frame.state == -1) { + return loadNextFrame(top, frame); + } + return frame; + } + + /** Load frame for sibling arc(node) on fst */ + Frame loadNextFrame(Frame top, Frame frame) throws IOException { + if (!canRewind(frame)) { + return null; + } + while (!frame.arc.isLast()) { + frame.arc = fst.readNextRealArc(frame.arc, fstReader); + frame.output = frame.arc.output(); + frame.state = fsa.step(top.state, frame.arc.label()); + if (frame.state != -1) { + break; + } + } + //if (TEST) System.out.println(" loadNext frame="+frame); + if (frame.state == -1) { + return null; + } + return frame; + } + + /** Load frame for target arc(node) on fst, so that + * arc.label >= label and !fsa.reject(arc.label) */ + Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { + FST.Arc arc = frame.arc; + arc = Util.readCeilArc(label, fst, top.arc, arc, fstReader); + if (arc == null) { + return null; + } + frame.state = fsa.step(top.state, arc.label()); + //if (TEST) System.out.println(" loadCeil frame="+frame); + if (frame.state == -1) { + return loadNextFrame(top, frame); + } + frame.output = arc.output(); + return frame; + } + + boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts + return fsa.isAccept(frame.state) && frame.arc.isFinal(); + } + boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject + return /*frame != null &&*/ frame.state != -1; + } + boolean canGrow(Frame frame) { // can walk forward on both fst&fsa + return frame.state != -1 && FST.targetHasArcs(frame.arc); + } + boolean canRewind(Frame frame) { // can jump to sibling + return !frame.arc.isLast(); + } + + void pushFrame(Frame frame) { + final FST.Arc arc = frame.arc; + frame.output = fstOutputs.add(topFrame().output, frame.output); + term = grow(arc.label()); + level++; + assert frame == stack[level]; + } + + Frame popFrame() { + term = shrink(); + return stack[level--]; + } + + Frame newFrame() { + if (level+1 == stack.length) { + final Frame[] temp = new Frame[ArrayUtil.oversize(level+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, temp, 0, stack.length); + for (int i = stack.length; i < temp.length; i++) { + temp[i] = new Frame(); + } + stack = temp; + } + return stack[level+1]; + } + + Frame topFrame() { + return stack[level]; + } + + BytesRefBuilder grow(int label) { + if (term == null) { + term = new BytesRefBuilder(); + } else { + term.append((byte) label); + } + return term; + } + + BytesRefBuilder shrink() { + if (term.length() == 0) { + term = null; + } else { + term.setLength(term.length() - 1); + } + return term; + } + } + } + + static void walk(FST fst) throws IOException { + final ArrayList> queue = new ArrayList<>(); + final BitSet seen = new 
BitSet(); + final FST.BytesReader reader = fst.getBytesReader(); + final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); + queue.add(startArc); + while (!queue.isEmpty()) { + final FST.Arc arc = queue.remove(0); + final long node = arc.target(); + //System.out.println(arc); + if (FST.targetHasArcs(arc) && !seen.get((int) node)) { + seen.set((int) node); + fst.readFirstRealTargetArc(node, arc, reader); + while (true) { + queue.add(new FST.Arc().copyFrom(arc)); + if (arc.isLast()) { + break; + } else { + fst.readNextRealArc(arc, reader); + } + } + } + } + } + + @Override + public long ramBytesUsed() { + long ramBytesUsed = postingsReader.ramBytesUsed(); + for (TermsReader r : fields.values()) { + ramBytesUsed += r.ramBytesUsed(); + } + return ramBytesUsed; + } + + @Override + public Collection getChildResources() { + List resources = new ArrayList<>(Accountables.namedAccountables("field", fields)); + resources.add(Accountables.namedAccountable("delegate", postingsReader)); + return Collections.unmodifiableList(resources); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(fields=" + fields.size() + ",delegate=" + postingsReader + ")"; + } + + @Override + public void checkIntegrity() throws IOException { + postingsReader.checkIntegrity(); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java new file mode 100644 index 00000000000..a31a2f940b3 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTOrdTermsWriter.java @@ -0,0 +1,386 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.memory; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; + +/** + * FST-based term dict, using ord as FST output. + * + * The FST holds the mapping between <term, ord>, and + * term's metadata is delta encoded into a single byte block. + * + * Typically the byte block consists of four parts: + * 1. term statistics: docFreq, totalTermFreq; + * 2. monotonic long[], e.g. the pointer to the postings list for that term; + * 3. generic byte[], e.g. other information customized by postings base. + * 4. single-level skip list to speed up metadata decoding by ord. + * + *

+ * Files:
+ *  - .tix: Term Index
+ *  - .tbk: Term Block
+ *
+ * Term Index
+ *
+ * The .tix contains a list of FSTs, one for each field.
+ * The FST maps a term to its corresponding order in current field.
+ *
+ *  - TermIndex(.tix) --> Header, TermFST^NumFields, Footer
+ *  - TermFST --> {@link FST FST<long>}
+ *  - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ *  - Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ * Notes:
+ *  - Since terms are already sorted before writing to Term Block,
+ *    their ords can directly used to seek term metadata from term block.
+ *
+ * Term Block
+ *
+ * The .tbk contains all the statistics and metadata for terms, along with field summary (e.g.
+ * per-field data like number of documents in current field). For each field, there are four blocks:
+ *  - statistics bytes block: contains term statistics;
+ *  - metadata longs block: delta-encodes monotonic part of metadata;
+ *  - metadata bytes block: encodes other parts of metadata;
+ *  - skip block: contains skip data, to speed up metadata seeking and decoding
+ *
+ * File Format:
+ *  - TermBlock(.tbk) --> Header, PostingsHeader, FieldSummary, DirOffset
+ *  - FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
+ *    DocCount, LongsSize, DataBlock>^NumFields, Footer
+ *  - DataBlock --> StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
+ *    SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock
+ *  - SkipBlock --> <StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
+ *    MetaLongsSkipDelta^LongsSize>^NumTerms
+ *  - StatsBlock --> <DocFreq[Same?], (TotalTermFreq-DocFreq)?>^NumTerms
+ *  - MetaLongsBlock --> <LongDelta^LongsSize, BytesSize>^NumTerms
+ *  - MetaBytesBlock --> Byte^MetaBytesBlockLength
+ *  - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ *  - DirOffset --> {@link DataOutput#writeLong Uint64}
+ *  - NumFields, FieldNumber, DocCount, DocFreq, LongsSize,
+ *    FieldNumber, DocCount --> {@link DataOutput#writeVInt VInt}
+ *  - NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
+ *    StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta, MetaLongsSkipStart, TotalTermFreq,
+ *    LongDelta --> {@link DataOutput#writeVLong VLong}
+ *  - Footer --> {@link CodecUtil#writeFooter CodecFooter}
+ *
+ * Notes:
+ *  - The format of PostingsHeader and MetaBytes are customized by the specific postings implementation:
+ *    they contain arbitrary per-file data (such as parameters or versioning information), and per-term data
+ *    (non-monotonic ones like pulsed postings data).
+ *  - During initialization the reader will load all the blocks into memory. SkipBlock will be decoded, so that during seek
+ *    term dict can lookup file pointers directly. StatsFPDelta, MetaLongsSkipFPDelta, etc. are file offset
+ *    for every SkipInterval's term. MetaLongsSkipDelta is the difference from previous one, which indicates
+ *    the value of preceding metadata longs for every SkipInterval's term.
+ *  - DocFreq is the count of documents which contain the term. TotalTermFreq is the total number of occurrences of the term.
+ *    Usually these two values are the same for long tail terms, therefore one bit is stole from DocFreq to check this case,
+ *    so that encoding of TotalTermFreq may be omitted.
+ * + * @lucene.experimental + */ + +public class FSTOrdTermsWriter extends FieldsConsumer { + static final String TERMS_INDEX_EXTENSION = "tix"; + static final String TERMS_BLOCK_EXTENSION = "tbk"; + static final String TERMS_CODEC_NAME = "FSTOrdTerms"; + static final String TERMS_INDEX_CODEC_NAME = "FSTOrdIndex"; + + public static final int VERSION_START = 2; + public static final int VERSION_CURRENT = VERSION_START; + public static final int SKIP_INTERVAL = 8; + + final PostingsWriterBase postingsWriter; + final FieldInfos fieldInfos; + final int maxDoc; + final List fields = new ArrayList<>(); + IndexOutput blockOut = null; + IndexOutput indexOut = null; + + public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { + final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); + final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION); + + this.postingsWriter = postingsWriter; + this.fieldInfos = state.fieldInfos; + this.maxDoc = state.segmentInfo.maxDoc(); + + boolean success = false; + try { + this.indexOut = state.directory.createOutput(termsIndexFileName, state.context); + this.blockOut = state.directory.createOutput(termsBlockFileName, state.context); + CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + this.postingsWriter.init(blockOut, state); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(indexOut, blockOut); + } + } + } + + @Override + public void write(Fields fields, NormsProducer norms) throws IOException { + for(String field : fields) { + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + TermsEnum termsEnum = terms.iterator(); + TermsWriter termsWriter = new TermsWriter(fieldInfo); + + long sumTotalTermFreq = 0; + long sumDocFreq = 0; + FixedBitSet docsSeen = new FixedBitSet(maxDoc); + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen, norms); + if (termState != null) { + termsWriter.finishTerm(term, termState); + sumTotalTermFreq += termState.totalTermFreq; + sumDocFreq += termState.docFreq; + } + } + + termsWriter.finish(hasFreq ? 
sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality()); + } + } + + @Override + public void close() throws IOException { + if (blockOut != null) { + boolean success = false; + try { + final long blockDirStart = blockOut.getFilePointer(); + + // write field summary + blockOut.writeVInt(fields.size()); + for (FieldMetaData field : fields) { + blockOut.writeVInt(field.fieldInfo.number); + blockOut.writeVLong(field.numTerms); + if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) { + blockOut.writeVLong(field.sumTotalTermFreq); + } + blockOut.writeVLong(field.sumDocFreq); + blockOut.writeVInt(field.docCount); + blockOut.writeVInt(field.longsSize); + blockOut.writeVLong(field.statsOut.size()); + blockOut.writeVLong(field.metaLongsOut.size()); + blockOut.writeVLong(field.metaBytesOut.size()); + + field.skipOut.copyTo(blockOut); + field.statsOut.copyTo(blockOut); + field.metaLongsOut.copyTo(blockOut); + field.metaBytesOut.copyTo(blockOut); + field.dict.save(indexOut); + } + writeTrailer(blockOut, blockDirStart); + CodecUtil.writeFooter(indexOut); + CodecUtil.writeFooter(blockOut); + success = true; + } finally { + if (success) { + IOUtils.close(blockOut, indexOut, postingsWriter); + } else { + IOUtils.closeWhileHandlingException(blockOut, indexOut, postingsWriter); + } + blockOut = null; + } + } + } + + private void writeTrailer(IndexOutput out, long dirStart) throws IOException { + out.writeLong(dirStart); + } + + private static class FieldMetaData { + public FieldInfo fieldInfo; + public long numTerms; + public long sumTotalTermFreq; + public long sumDocFreq; + public int docCount; + public int longsSize; + public FST dict; + + // TODO: block encode each part + + // vint encode next skip point (fully decoded when reading) + public ByteBuffersDataOutput skipOut; + // vint encode df, (ttf-df) + public ByteBuffersDataOutput statsOut; + // vint encode monotonic long[] and length for corresponding byte[] + public ByteBuffersDataOutput metaLongsOut; + // generic byte[] + public ByteBuffersDataOutput metaBytesOut; + } + + final class TermsWriter { + private final FSTCompiler fstCompiler; + private final PositiveIntOutputs outputs; + private final FieldInfo fieldInfo; + private final int longsSize; + private long numTerms; + + private final IntsRefBuilder scratchTerm = new IntsRefBuilder(); + private final ByteBuffersDataOutput statsOut = new ByteBuffersDataOutput(); + private final ByteBuffersDataOutput metaLongsOut = new ByteBuffersDataOutput(); + private final ByteBuffersDataOutput metaBytesOut = new ByteBuffersDataOutput(); + private final ByteBuffersDataOutput skipOut = new ByteBuffersDataOutput(); + private long lastBlockStatsFP; + private long lastBlockMetaLongsFP; + private long lastBlockMetaBytesFP; + private long[] lastBlockLongs; + + private long[] lastLongs; + private long lastMetaBytesFP; + + TermsWriter(FieldInfo fieldInfo) { + this.numTerms = 0; + this.fieldInfo = fieldInfo; + this.longsSize = postingsWriter.setField(fieldInfo); + this.outputs = PositiveIntOutputs.getSingleton(); + this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); + + this.lastBlockStatsFP = 0; + this.lastBlockMetaLongsFP = 0; + this.lastBlockMetaBytesFP = 0; + this.lastBlockLongs = new long[longsSize]; + + this.lastLongs = new long[longsSize]; + this.lastMetaBytesFP = 0; + } + + public void finishTerm(BytesRef text, BlockTermState state) throws IOException { + if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) { + bufferSkip(); + } + // write term meta data into fst + final long longs[] = 
new long[longsSize]; + final long delta = state.totalTermFreq - state.docFreq; + if (state.totalTermFreq > 0) { + if (delta == 0) { + statsOut.writeVInt(state.docFreq<<1|1); + } else { + statsOut.writeVInt(state.docFreq<<1); + statsOut.writeVLong(state.totalTermFreq-state.docFreq); + } + } else { + statsOut.writeVInt(state.docFreq); + } + postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true); + for (int i = 0; i < longsSize; i++) { + metaLongsOut.writeVLong(longs[i] - lastLongs[i]); + lastLongs[i] = longs[i]; + } + metaLongsOut.writeVLong(metaBytesOut.size() - lastMetaBytesFP); + + fstCompiler.add(Util.toIntsRef(text, scratchTerm), numTerms); + numTerms++; + + lastMetaBytesFP = metaBytesOut.size(); + } + + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { + if (numTerms > 0) { + final FieldMetaData metadata = new FieldMetaData(); + metadata.fieldInfo = fieldInfo; + metadata.numTerms = numTerms; + metadata.sumTotalTermFreq = sumTotalTermFreq; + metadata.sumDocFreq = sumDocFreq; + metadata.docCount = docCount; + metadata.longsSize = longsSize; + metadata.skipOut = skipOut; + metadata.statsOut = statsOut; + metadata.metaLongsOut = metaLongsOut; + metadata.metaBytesOut = metaBytesOut; + metadata.dict = fstCompiler.compile(); + fields.add(metadata); + } + } + + private void bufferSkip() throws IOException { + skipOut.writeVLong(statsOut.size() - lastBlockStatsFP); + skipOut.writeVLong(metaLongsOut.size() - lastBlockMetaLongsFP); + skipOut.writeVLong(metaBytesOut.size() - lastBlockMetaBytesFP); + for (int i = 0; i < longsSize; i++) { + skipOut.writeVLong(lastLongs[i] - lastBlockLongs[i]); + } + lastBlockStatsFP = statsOut.size(); + lastBlockMetaLongsFP = metaLongsOut.size(); + lastBlockMetaBytesFP = metaBytesOut.size(); + System.arraycopy(lastLongs, 0, lastBlockLongs, 0, longsSize); + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java new file mode 100644 index 00000000000..97a799604e0 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.memory; + + + +import java.io.IOException; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader; +import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** + * FST term dict + Lucene50PBF + */ + +public final class FSTPostingsFormat extends PostingsFormat { + public FSTPostingsFormat() { + super("FST50"); + } + + @Override + public String toString() { + return getName(); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene84PostingsWriter(state); + + boolean success = false; + try { + FieldsConsumer ret = new FSTTermsWriter(state, postingsWriter); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene84PostingsReader(state); + boolean success = false; + try { + FieldsProducer ret = new FSTTermsReader(state, postingsReader); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java new file mode 100644 index 00000000000..3695fe872e5 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermOutputs.java @@ -0,0 +1,383 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.memory; + + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.fst.Outputs; + +/** + * An FST {@link Outputs} implementation for + * {@link FSTTermsWriter}. 
+ * + * @lucene.experimental + */ + +// NOTE: outputs should be per-field, since +// longsSize is fixed for each field +class FSTTermOutputs extends Outputs { + private final static TermData NO_OUTPUT = new TermData(); + //private static boolean TEST = false; + private final boolean hasPos; + private final int longsSize; + + /** + * Represents the metadata for one term. + * On an FST, only long[] part is 'shared' and pushed towards root. + * byte[] and term stats will be kept on deeper arcs. + */ + static class TermData implements Accountable { + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermData.class); + long[] longs; + byte[] bytes; + int docFreq; + long totalTermFreq; + TermData() { + this.longs = null; + this.bytes = null; + this.docFreq = 0; + this.totalTermFreq = -1; + } + TermData(long[] longs, byte[] bytes, int docFreq, long totalTermFreq) { + this.longs = longs; + this.bytes = bytes; + this.docFreq = docFreq; + this.totalTermFreq = totalTermFreq; + } + + @Override + public long ramBytesUsed() { + long ramBytesUsed = BASE_RAM_BYTES_USED; + if (longs != null) { + ramBytesUsed += RamUsageEstimator.sizeOf(longs); + } + if (bytes != null) { + ramBytesUsed += RamUsageEstimator.sizeOf(bytes); + } + return ramBytesUsed; + } + + // NOTE: actually, FST nodes are seldom + // identical when outputs on their arcs + // aren't NO_OUTPUTs. + @Override + public int hashCode() { + int hash = 0; + if (longs != null) { + final int end = longs.length; + for (int i = 0; i < end; i++) { + hash -= longs[i]; + } + } + if (bytes != null) { + hash = -hash; + final int end = bytes.length; + for (int i = 0; i < end; i++) { + hash += bytes[i]; + } + } + hash += docFreq + totalTermFreq; + return hash; + } + + @Override + public String toString() { + return "FSTTermOutputs$TermData longs=" + Arrays.toString(longs) + " bytes=" + Arrays.toString(bytes) + " docFreq=" + docFreq + " totalTermFreq=" + totalTermFreq; + } + + @Override + public boolean equals(Object other_) { + if (other_ == this) { + return true; + } else if (!(other_ instanceof FSTTermOutputs.TermData)) { + return false; + } + TermData other = (TermData) other_; + return statsEqual(this, other) && + longsEqual(this, other) && + bytesEqual(this, other); + + } + } + + protected FSTTermOutputs(FieldInfo fieldInfo, int longsSize) { + this.hasPos = fieldInfo.getIndexOptions() != IndexOptions.DOCS; + this.longsSize = longsSize; + } + + @Override + public long ramBytesUsed(TermData output) { + return output.ramBytesUsed(); + } + + @Override + // + // The return value will be the smaller one, when these two are + // 'comparable', i.e. + // 1. every value in t1 is not larger than in t2, or + // 2. every value in t1 is not smaller than t2. 
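+  // For example (an illustrative sketch assuming longsSize == 2, showing only the long[] part):
+  //   common([4, 7], [6, 9]) -> [4, 7]     (every element of the first is <= the second, so the
+  //                                          smaller longs are shared; bytes and stats are dropped)
+  //   common([4, 9], [6, 7]) -> NO_OUTPUT  (neither side dominates the other, so nothing is shared)
+  //   common([0, 0], [0, 5]) -> NO_OUTPUT  (an all-zero long[] is never shared)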
+ // + public TermData common(TermData t1, TermData t2) { + //if (TEST) System.out.print("common("+t1+", "+t2+") = "); + if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) { + //if (TEST) System.out.println("ret:"+NO_OUTPUT); + return NO_OUTPUT; + } + assert t1.longs.length == t2.longs.length; + + long[] min = t1.longs, max = t2.longs; + int pos = 0; + TermData ret; + + while (pos < longsSize && min[pos] == max[pos]) { + pos++; + } + if (pos < longsSize) { // unequal long[] + if (min[pos] > max[pos]) { + min = t2.longs; + max = t1.longs; + } + // check whether strictly smaller + while (pos < longsSize && min[pos] <= max[pos]) { + pos++; + } + if (pos < longsSize || allZero(min)) { // not comparable or all-zero + ret = NO_OUTPUT; + } else { + ret = new TermData(min, null, 0, -1); + } + } else { // equal long[] + if (statsEqual(t1, t2) && bytesEqual(t1, t2)) { + ret = t1; + } else if (allZero(min)) { + ret = NO_OUTPUT; + } else { + ret = new TermData(min, null, 0, -1); + } + } + //if (TEST) System.out.println("ret:"+ret); + return ret; + } + + @Override + public TermData subtract(TermData t1, TermData t2) { + //if (TEST) System.out.print("subtract("+t1+", "+t2+") = "); + if (t2 == NO_OUTPUT) { + //if (TEST) System.out.println("ret:"+t1); + return t1; + } + assert t1.longs.length == t2.longs.length; + + int pos = 0; + long diff = 0; + long[] share = new long[longsSize]; + + while (pos < longsSize) { + share[pos] = t1.longs[pos] - t2.longs[pos]; + diff += share[pos]; + pos++; + } + + TermData ret; + if (diff == 0 && statsEqual(t1, t2) && bytesEqual(t1, t2)) { + ret = NO_OUTPUT; + } else { + ret = new TermData(share, t1.bytes, t1.docFreq, t1.totalTermFreq); + } + //if (TEST) System.out.println("ret:"+ret); + return ret; + } + + // TODO: if we refactor a 'addSelf(TermData other)', + // we can gain about 5~7% for fuzzy queries, however this also + // means we are putting too much stress on FST Outputs decoding? + @Override + public TermData add(TermData t1, TermData t2) { + //if (TEST) System.out.print("add("+t1+", "+t2+") = "); + if (t1 == NO_OUTPUT) { + //if (TEST) System.out.println("ret:"+t2); + return t2; + } else if (t2 == NO_OUTPUT) { + //if (TEST) System.out.println("ret:"+t1); + return t1; + } + assert t1.longs.length == t2.longs.length; + + int pos = 0; + long[] accum = new long[longsSize]; + + while (pos < longsSize) { + accum[pos] = t1.longs[pos] + t2.longs[pos]; + pos++; + } + + TermData ret; + if (t2.bytes != null || t2.docFreq > 0) { + ret = new TermData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq); + } else { + ret = new TermData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq); + } + //if (TEST) System.out.println("ret:"+ret); + return ret; + } + + @Override + public void write(TermData data, DataOutput out) throws IOException { + assert hasPos || data.totalTermFreq == -1; + int bit0 = allZero(data.longs) ? 0 : 1; + int bit1 = ((data.bytes == null || data.bytes.length == 0) ? 0 : 1) << 1; + int bit2 = ((data.docFreq == 0) ? 
0 : 1) << 2; + int bits = bit0 | bit1 | bit2; + if (bit1 > 0) { // determine extra length + if (data.bytes.length < 32) { + bits |= (data.bytes.length << 3); + out.writeByte((byte)bits); + } else { + out.writeByte((byte)bits); + out.writeVInt(data.bytes.length); + } + } else { + out.writeByte((byte)bits); + } + if (bit0 > 0) { // not all-zero case + for (int pos = 0; pos < longsSize; pos++) { + out.writeVLong(data.longs[pos]); + } + } + if (bit1 > 0) { // bytes exists + out.writeBytes(data.bytes, 0, data.bytes.length); + } + if (bit2 > 0) { // stats exist + if (hasPos) { + if (data.docFreq == data.totalTermFreq) { + out.writeVInt((data.docFreq << 1) | 1); + } else { + out.writeVInt((data.docFreq << 1)); + out.writeVLong(data.totalTermFreq - data.docFreq); + } + } else { + out.writeVInt(data.docFreq); + } + } + } + + @Override + public TermData read(DataInput in) throws IOException { + long[] longs = new long[longsSize]; + byte[] bytes = null; + int docFreq = 0; + long totalTermFreq = -1; + int bits = in.readByte() & 0xff; + int bit0 = bits & 1; + int bit1 = bits & 2; + int bit2 = bits & 4; + int bytesSize = (bits >>> 3); + if (bit1 > 0 && bytesSize == 0) { // determine extra length + bytesSize = in.readVInt(); + } + if (bit0 > 0) { // not all-zero case + for (int pos = 0; pos < longsSize; pos++) { + longs[pos] = in.readVLong(); + } + } + if (bit1 > 0) { // bytes exists + bytes = new byte[bytesSize]; + in.readBytes(bytes, 0, bytesSize); + } + if (bit2 > 0) { // stats exist + int code = in.readVInt(); + if (hasPos) { + totalTermFreq = docFreq = code >>> 1; + if ((code & 1) == 0) { + totalTermFreq += in.readVLong(); + } + } else { + docFreq = code; + } + } + return new TermData(longs, bytes, docFreq, totalTermFreq); + } + + + @Override + public void skipOutput(DataInput in) throws IOException { + int bits = in.readByte() & 0xff; + int bit0 = bits & 1; + int bit1 = bits & 2; + int bit2 = bits & 4; + int bytesSize = (bits >>> 3); + if (bit1 > 0 && bytesSize == 0) { // determine extra length + bytesSize = in.readVInt(); + } + if (bit0 > 0) { // not all-zero case + for (int pos = 0; pos < longsSize; pos++) { + in.readVLong(); + } + } + if (bit1 > 0) { // bytes exists + in.skipBytes(bytesSize); + } + if (bit2 > 0) { // stats exist + int code = in.readVInt(); + if (hasPos && (code & 1) == 0) { + in.readVLong(); + } + } + } + + @Override + public TermData getNoOutput() { + return NO_OUTPUT; + } + + @Override + public String outputToString(TermData data) { + return data.toString(); + } + + static boolean statsEqual(final TermData t1, final TermData t2) { + return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq; + } + static boolean bytesEqual(final TermData t1, final TermData t2) { + if (t1.bytes == null && t2.bytes == null) { + return true; + } + return t1.bytes != null && t2.bytes != null && Arrays.equals(t1.bytes, t2.bytes); + } + static boolean longsEqual(final TermData t1, final TermData t2) { + if (t1.longs == null && t2.longs == null) { + return true; + } + return t1.longs != null && t2.longs != null && Arrays.equals(t1.longs, t2.longs); + } + static boolean allZero(final long[] l) { + for (int i = 0; i < l.length; i++) { + if (l[i] != 0) { + return false; + } + } + return true; + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java new file mode 100644 index 00000000000..33084766424 --- /dev/null +++ 
b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsReader.java @@ -0,0 +1,785 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.memory; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.TreeMap; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Accountable; +import org.apache.lucene.util.Accountables; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.automaton.ByteRunAutomaton; +import org.apache.lucene.util.automaton.CompiledAutomaton; +import org.apache.lucene.util.fst.BytesRefFSTEnum; +import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Outputs; +import org.apache.lucene.util.fst.Util; + +/** + * FST-based terms dictionary reader. + * + * The FST directly maps each term and its metadata, + * it is memory resident. 
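+ * Opening a segment deserializes every field's FST (terms plus their metadata outputs) onto the heap,
+ * so term lookups afterwards never touch the terms file; only the delegated postings reader performs
+ * further file I/O.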
+ * + * @lucene.experimental + */ + +public class FSTTermsReader extends FieldsProducer { + final TreeMap fields = new TreeMap<>(); + final PostingsReaderBase postingsReader; + //static boolean TEST = false; + + public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException { + final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION); + + this.postingsReader = postingsReader; + final IndexInput in = state.directory.openInput(termsFileName, state.context); + + boolean success = false; + try { + CodecUtil.checkIndexHeader(in, FSTTermsWriter.TERMS_CODEC_NAME, + FSTTermsWriter.TERMS_VERSION_START, + FSTTermsWriter.TERMS_VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.checksumEntireFile(in); + this.postingsReader.init(in, state); + seekDir(in); + + final FieldInfos fieldInfos = state.fieldInfos; + final int numFields = in.readVInt(); + for (int i = 0; i < numFields; i++) { + int fieldNumber = in.readVInt(); + FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); + long numTerms = in.readVLong(); + long sumTotalTermFreq = in.readVLong(); + // if frequencies are omitted, sumTotalTermFreq=sumDocFreq and we only write one value + long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : in.readVLong(); + int docCount = in.readVInt(); + int longsSize = in.readVInt(); + TermsReader current = new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize); + TermsReader previous = fields.put(fieldInfo.name, current); + checkFieldSummary(state.segmentInfo, in, current, previous); + } + success = true; + } finally { + if (success) { + IOUtils.close(in); + } else { + IOUtils.closeWhileHandlingException(in); + } + } + } + + private void seekDir(IndexInput in) throws IOException { + in.seek(in.length() - CodecUtil.footerLength() - 8); + in.seek(in.readLong()); + } + private void checkFieldSummary(SegmentInfo info, IndexInput in, TermsReader field, TermsReader previous) throws IOException { + // #docs with field must be <= #docs + if (field.docCount < 0 || field.docCount > info.maxDoc()) { + throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.maxDoc(), in); + } + // #postings must be >= #docs with field + if (field.sumDocFreq < field.docCount) { + throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount, in); + } + // #positions must be >= #postings + if (field.sumTotalTermFreq < field.sumDocFreq) { + throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq, in); + } + if (previous != null) { + throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name, in); + } + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableSet(fields.keySet()).iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + assert field != null; + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + + @Override + public void close() throws IOException { + try { + IOUtils.close(postingsReader); + } finally { + fields.clear(); + } + } + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermsReader.class); + final class TermsReader extends Terms implements Accountable { + + final FieldInfo fieldInfo; + final long numTerms; + 
final long sumTotalTermFreq; + final long sumDocFreq; + final int docCount; + final int longsSize; + final FST dict; + + TermsReader(FieldInfo fieldInfo, IndexInput in, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException { + this.fieldInfo = fieldInfo; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.longsSize = longsSize; + this.dict = new FST<>(in, new FSTTermOutputs(fieldInfo, longsSize)); + } + + @Override + public long ramBytesUsed() { + long bytesUsed = BASE_RAM_BYTES_USED; + if (dict != null) { + bytesUsed += dict.ramBytesUsed(); + } + return bytesUsed; + } + + @Override + public Collection getChildResources() { + if (dict == null) { + return Collections.emptyList(); + } else { + return Collections.singletonList(Accountables.namedAccountable("terms", dict)); + } + } + + @Override + public String toString() { + return "FSTTerms(terms=" + numTerms + ",postings=" + sumDocFreq + ",positions=" + sumTotalTermFreq + ",docs=" + docCount + ")"; + } + + @Override + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + } + + @Override + public boolean hasOffsets() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public long size() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + @Override + public long getSumDocFreq() throws IOException { + return sumDocFreq; + } + + @Override + public int getDocCount() throws IOException { + return docCount; + } + + @Override + public TermsEnum iterator() throws IOException { + return new SegmentTermsEnum(); + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new IntersectTermsEnum(compiled, startTerm); + } + + // Only wraps common operations for PBF interact + abstract class BaseTermsEnum extends org.apache.lucene.index.BaseTermsEnum { + + /* Current term stats + decoded metadata (customized by PBF) */ + final BlockTermState state; + + /* Current term stats + undecoded metadata (long[] & byte[]) */ + FSTTermOutputs.TermData meta; + ByteArrayDataInput bytesReader; + + /** Decodes metadata into customized term state */ + abstract void decodeMetaData() throws IOException; + + BaseTermsEnum() throws IOException { + this.state = postingsReader.newTermState(); + this.bytesReader = new ByteArrayDataInput(); + // NOTE: metadata will only be initialized in child class + } + + @Override + public TermState termState() throws IOException { + decodeMetaData(); + return state.clone(); + } + + @Override + public int docFreq() throws IOException { + return state.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + return state.totalTermFreq == -1 ? 
state.docFreq : state.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + decodeMetaData(); + return postingsReader.postings(fieldInfo, state, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + decodeMetaData(); + return postingsReader.impacts(fieldInfo, state, flags); + } + + @Override + public void seekExact(long ord) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + } + + + // Iterates through all terms in this field + private final class SegmentTermsEnum extends BaseTermsEnum { + /* Current term, null when enum ends or unpositioned */ + BytesRef term; + final BytesRefFSTEnum fstEnum; + + /* True when current term's metadata is decoded */ + boolean decoded; + + /* True when current enum is 'positioned' by seekExact(TermState) */ + boolean seekPending; + + SegmentTermsEnum() throws IOException { + super(); + this.fstEnum = new BytesRefFSTEnum<>(dict); + this.decoded = false; + this.seekPending = false; + this.meta = null; + } + + @Override + public BytesRef term() throws IOException { + return term; + } + + // Let PBF decode metadata from long[] and byte[] + @Override + void decodeMetaData() throws IOException { + if (!decoded && !seekPending) { + if (meta.bytes != null) { + bytesReader.reset(meta.bytes, 0, meta.bytes.length); + } + postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state, true); + decoded = true; + } + } + + // Update current enum according to FSTEnum + void updateEnum(final InputOutput pair) { + if (pair == null) { + term = null; + } else { + term = pair.input; + meta = pair.output; + state.docFreq = meta.docFreq; + state.totalTermFreq = meta.totalTermFreq; + } + decoded = false; + seekPending = false; + } + + @Override + public BytesRef next() throws IOException { + if (seekPending) { // previously positioned, but termOutputs not fetched + seekPending = false; + SeekStatus status = seekCeil(term); + assert status == SeekStatus.FOUND; // must positioned on valid term + } + updateEnum(fstEnum.next()); + return term; + } + + @Override + public boolean seekExact(BytesRef target) throws IOException { + updateEnum(fstEnum.seekExact(target)); + return term != null; + } + + @Override + public SeekStatus seekCeil(BytesRef target) throws IOException { + updateEnum(fstEnum.seekCeil(target)); + if (term == null) { + return SeekStatus.END; + } else { + return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND; + } + } + + @Override + public void seekExact(BytesRef target, TermState otherState) { + if (!target.equals(term)) { + state.copyFrom(otherState); + term = BytesRef.deepCopyOf(target); + seekPending = true; + } + } + } + + // Iterates intersect result with automaton (cannot seek!) 
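+  //
+  // Each Frame pairs one FST arc with the automaton state reached via that arc's label; the stack of
+  // frames spells out the current term (level == term.length + 1). next() walks the FST depth-first:
+  // it first tries to expand the top frame to a child arc (loadExpandFrame), otherwise it advances to
+  // the next accepted sibling (loadNextFrame), popping frames until one can be advanced. A term is
+  // returned when the FST arc is final and the automaton state is accepting (isAccept); its metadata
+  // is accumulated lazily along the stack in loadMetaData().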
+ private final class IntersectTermsEnum extends BaseTermsEnum { + /* Current term, null when enum ends or unpositioned */ + BytesRefBuilder term; + /* True when current term's metadata is decoded */ + boolean decoded; + + /* True when there is pending term when calling next() */ + boolean pending; + + /* stack to record how current term is constructed, + * used to accumulate metadata or rewind term: + * level == term.length + 1, + * == 0 when term is null */ + Frame[] stack; + int level; + + /* to which level the metadata is accumulated + * so that we can accumulate metadata lazily */ + int metaUpto; + + /* term dict fst */ + final FST fst; + final FST.BytesReader fstReader; + final Outputs fstOutputs; + + /* query automaton to intersect with */ + final ByteRunAutomaton fsa; + + private final class Frame { + /* fst stats */ + FST.Arc fstArc; + + FSTTermOutputs.TermData output; + + /* automaton stats */ + int fsaState; + + Frame() { + this.fstArc = new FST.Arc<>(); + this.fsaState = -1; + } + + public String toString() { + return "arc=" + fstArc + " state=" + fsaState; + } + } + + IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + super(); + //if (TEST) System.out.println("Enum init, startTerm=" + startTerm); + this.fst = dict; + this.fstReader = fst.getBytesReader(); + this.fstOutputs = dict.outputs; + this.fsa = compiled.runAutomaton; + this.level = -1; + this.stack = new Frame[16]; + for (int i = 0 ; i < stack.length; i++) { + this.stack[i] = new Frame(); + } + + loadVirtualFrame(newFrame()); + this.level++; + pushFrame(loadFirstFrame(newFrame())); + + this.meta = null; + this.metaUpto = 1; + this.decoded = false; + this.pending = false; + + if (startTerm == null) { + pending = isAccept(topFrame()); + } else { + doSeekCeil(startTerm); + pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame()); + } + } + + @Override + public BytesRef term() throws IOException { + return term == null ? null : term.get(); + } + + @Override + void decodeMetaData() throws IOException { + assert term != null; + if (!decoded) { + if (meta.bytes != null) { + bytesReader.reset(meta.bytes, 0, meta.bytes.length); + } + postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state, true); + decoded = true; + } + } + + /** Lazily accumulate meta data, when we got a accepted term */ + void loadMetaData() { + Frame last, next; + last = stack[metaUpto]; + while (metaUpto != level) { + metaUpto++; + next = stack[metaUpto]; + next.output = fstOutputs.add(next.output, last.output); + last = next; + } + if (last.fstArc.isFinal()) { + meta = fstOutputs.add(last.output, last.fstArc.nextFinalOutput()); + } else { + meta = last.output; + } + state.docFreq = meta.docFreq; + state.totalTermFreq = meta.totalTermFreq; + } + + @Override + public SeekStatus seekCeil(BytesRef target) throws IOException { + decoded = false; + doSeekCeil(target); + loadMetaData(); + if (term == null) { + return SeekStatus.END; + } else { + return term.equals(target) ? 
SeekStatus.FOUND : SeekStatus.NOT_FOUND; + } + } + + @Override + public BytesRef next() throws IOException { + //if (TEST) System.out.println("Enum next()"); + if (pending) { + pending = false; + loadMetaData(); + return term(); + } + decoded = false; + DFS: + while (level > 0) { + Frame frame = newFrame(); + if (loadExpandFrame(topFrame(), frame) != null) { // has valid target + pushFrame(frame); + if (isAccept(frame)) { // gotcha + break; + } + continue; // check next target + } + frame = popFrame(); + while(level > 0) { + if (loadNextFrame(topFrame(), frame) != null) { // has valid sibling + pushFrame(frame); + if (isAccept(frame)) { // gotcha + break DFS; + } + continue DFS; // check next target + } + frame = popFrame(); + } + return null; + } + loadMetaData(); + return term(); + } + + private BytesRef doSeekCeil(BytesRef target) throws IOException { + //if (TEST) System.out.println("Enum doSeekCeil()"); + Frame frame= null; + int label, upto = 0, limit = target.length; + while (upto < limit) { // to target prefix, or ceil label (rewind prefix) + frame = newFrame(); + label = target.bytes[upto] & 0xff; + frame = loadCeilFrame(label, topFrame(), frame); + if (frame == null || frame.fstArc.label() != label) { + break; + } + assert isValid(frame); // target must be fetched from automaton + pushFrame(frame); + upto++; + } + if (upto == limit) { // got target + return term(); + } + if (frame != null) { // got larger term('s prefix) + pushFrame(frame); + return isAccept(frame) ? term() : next(); + } + while (level > 0) { // got target's prefix, advance to larger term + frame = popFrame(); + while (level > 0 && !canRewind(frame)) { + frame = popFrame(); + } + if (loadNextFrame(topFrame(), frame) != null) { + pushFrame(frame); + return isAccept(frame) ? 
term() : next(); + } + } + return null; + } + + /** Virtual frame, never pop */ + Frame loadVirtualFrame(Frame frame) { + frame.output = fstOutputs.getNoOutput(); + frame.fsaState = -1; + return frame; + } + + /** Load frame for start arc(node) on fst */ + Frame loadFirstFrame(Frame frame) throws IOException { + frame.fstArc = fst.getFirstArc(frame.fstArc); + frame.output = frame.fstArc.output(); + frame.fsaState = 0; + return frame; + } + + /** Load frame for target arc(node) on fst */ + Frame loadExpandFrame(Frame top, Frame frame) throws IOException { + if (!canGrow(top)) { + return null; + } + frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target(), frame.fstArc, fstReader); + frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); + //if (TEST) System.out.println(" loadExpand frame="+frame); + if (frame.fsaState == -1) { + return loadNextFrame(top, frame); + } + frame.output = frame.fstArc.output(); + return frame; + } + + /** Load frame for sibling arc(node) on fst */ + Frame loadNextFrame(Frame top, Frame frame) throws IOException { + if (!canRewind(frame)) { + return null; + } + while (!frame.fstArc.isLast()) { + frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader); + frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label()); + if (frame.fsaState != -1) { + break; + } + } + //if (TEST) System.out.println(" loadNext frame="+frame); + if (frame.fsaState == -1) { + return null; + } + frame.output = frame.fstArc.output(); + return frame; + } + + /** Load frame for target arc(node) on fst, so that + * arc.label >= label and !fsa.reject(arc.label) */ + Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException { + FST.Arc arc = frame.fstArc; + arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader); + if (arc == null) { + return null; + } + frame.fsaState = fsa.step(top.fsaState, arc.label()); + //if (TEST) System.out.println(" loadCeil frame="+frame); + if (frame.fsaState == -1) { + return loadNextFrame(top, frame); + } + frame.output = frame.fstArc.output(); + return frame; + } + + boolean isAccept(Frame frame) { // reach a term both fst&fsa accepts + return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal(); + } + boolean isValid(Frame frame) { // reach a prefix both fst&fsa won't reject + return /*frame != null &&*/ frame.fsaState != -1; + } + boolean canGrow(Frame frame) { // can walk forward on both fst&fsa + return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc); + } + boolean canRewind(Frame frame) { // can jump to sibling + return !frame.fstArc.isLast(); + } + + void pushFrame(Frame frame) { + term = grow(frame.fstArc.label()); + level++; + //if (TEST) System.out.println(" term=" + term + " level=" + level); + } + + Frame popFrame() { + term = shrink(); + level--; + metaUpto = metaUpto > level ? 
level : metaUpto; + //if (TEST) System.out.println(" term=" + term + " level=" + level); + return stack[level+1]; + } + + Frame newFrame() { + if (level+1 == stack.length) { + final Frame[] temp = new Frame[ArrayUtil.oversize(level+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, temp, 0, stack.length); + for (int i = stack.length; i < temp.length; i++) { + temp[i] = new Frame(); + } + stack = temp; + } + return stack[level+1]; + } + + Frame topFrame() { + return stack[level]; + } + + BytesRefBuilder grow(int label) { + if (term == null) { + term = new BytesRefBuilder(); + } else { + term.append((byte)label); + } + return term; + } + + BytesRefBuilder shrink() { + if (term.length() == 0) { + term = null; + } else { + term.setLength(term.length() - 1); + } + return term; + } + } + } + + static void walk(FST fst) throws IOException { + final ArrayList> queue = new ArrayList<>(); + final BitSet seen = new BitSet(); + final FST.BytesReader reader = fst.getBytesReader(); + final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); + queue.add(startArc); + while (!queue.isEmpty()) { + final FST.Arc arc = queue.remove(0); + final long node = arc.target(); + //System.out.println(arc); + if (FST.targetHasArcs(arc) && !seen.get((int) node)) { + seen.set((int) node); + fst.readFirstRealTargetArc(node, arc, reader); + while (true) { + queue.add(new FST.Arc().copyFrom(arc)); + if (arc.isLast()) { + break; + } else { + fst.readNextRealArc(arc, reader); + } + } + } + } + } + + @Override + public long ramBytesUsed() { + long ramBytesUsed = postingsReader.ramBytesUsed(); + for (TermsReader r : fields.values()) { + ramBytesUsed += r.ramBytesUsed(); + } + return ramBytesUsed; + } + + @Override + public Collection getChildResources() { + List resources = new ArrayList<>(Accountables.namedAccountables("field", fields)); + resources.add(Accountables.namedAccountable("delegate", postingsReader)); + return Collections.unmodifiableCollection(resources); + } + + @Override + public String toString() { + return getClass().getSimpleName() + "(fields=" + fields.size() + ",delegate=" + postingsReader + ")"; + } + + @Override + public void checkIntegrity() throws IOException { + postingsReader.checkIntegrity(); + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java new file mode 100644 index 00000000000..2ef15651041 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTTermsWriter.java @@ -0,0 +1,291 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.memory; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.fst.FSTCompiler; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.Util; + +/** + * FST-based term dict, using metadata as FST output. + * + * The FST directly holds the mapping between <term, metadata>. + * + * Term metadata consists of three parts: + * 1. term statistics: docFreq, totalTermFreq; + * 2. monotonic long[], e.g. the pointer to the postings list for that term; + * 3. generic byte[], e.g. other information need by postings reader. + * + *

+ *
+ * File:
+ *   - .tst: Term Dictionary
+ *
+ * Term Dictionary
+ *
+ * The .tst contains a list of FSTs, one for each field.
+ * The FST maps a term to its corresponding statistics (e.g. docfreq)
+ * and metadata (e.g. information for postings list reader like file pointer
+ * to postings list).
+ *
+ * Typically the metadata is separated into two parts:
+ *   - Monotonical long array: Some metadata will always be ascending in order
+ *     with the corresponding term. This part is used by FST to share outputs between arcs.
+ *   - Generic byte array: Used to store non-monotonic metadata.
+ *
+ * File format:
+ *   - TermsDict(.tst) --> Header, PostingsHeader, FieldSummary, DirOffset
+ *   - FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?,
+ *     SumDocFreq, DocCount, LongsSize, TermFST>^NumFields
+ *   - TermFST --> {@link FST FST<TermData>}
+ *   - TermData --> Flag, BytesSize?, LongDelta^LongsSize?, Byte^BytesSize?,
+ *     <DocFreq[Same?], (TotalTermFreq-DocFreq)>?
+ *   - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
+ *   - DirOffset --> {@link DataOutput#writeLong Uint64}
+ *   - DocFreq, LongsSize, BytesSize, NumFields,
+ *     FieldNumber, DocCount --> {@link DataOutput#writeVInt VInt}
+ *   - TotalTermFreq, NumTerms, SumTotalTermFreq, SumDocFreq, LongDelta -->
+ *     {@link DataOutput#writeVLong VLong}
+ *
+ * Notes:
+ *   - The format of PostingsHeader and generic meta bytes are customized by the specific postings
+ *     implementation: they contain arbitrary per-file data (such as parameters or versioning
+ *     information), and per-term data (non-monotonic ones like pulsed postings data).
+ *   - The format of TermData is determined by FST, typically monotonic metadata will be dense around
+ *     shallow arcs, while in deeper arcs only generic bytes and term statistics exist.
+ *   - The byte Flag is used to indicate which part of metadata exists on current arc. Specially the
+ *     monotonic part is omitted when it is an array of 0s.
+ *   - Since LongsSize is per-field fixed, it is only written once in field summary.
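+ *
+ * For example (the exact layout is up to the postings implementation): with a file-pointer based
+ * postings writer, the monotonic long[] of a term would typically hold pointers into the postings
+ * files, which only grow from one term to the next, while the generic byte[] carries the remaining
+ * non-monotonic details.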
+ * + * @lucene.experimental + */ + +public class FSTTermsWriter extends FieldsConsumer { + static final String TERMS_EXTENSION = "tfp"; + static final String TERMS_CODEC_NAME = "FSTTerms"; + public static final int TERMS_VERSION_START = 2; + public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START; + + final PostingsWriterBase postingsWriter; + final FieldInfos fieldInfos; + IndexOutput out; + final int maxDoc; + final List fields = new ArrayList<>(); + + public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { + final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); + + this.postingsWriter = postingsWriter; + this.fieldInfos = state.fieldInfos; + this.out = state.directory.createOutput(termsFileName, state.context); + this.maxDoc = state.segmentInfo.maxDoc(); + + boolean success = false; + try { + CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT, + state.segmentInfo.getId(), state.segmentSuffix); + + this.postingsWriter.init(out, state); + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(out); + } + } + } + + private void writeTrailer(IndexOutput out, long dirStart) throws IOException { + out.writeLong(dirStart); + } + + @Override + public void write(Fields fields, NormsProducer norms) throws IOException { + for(String field : fields) { + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + TermsEnum termsEnum = terms.iterator(); + TermsWriter termsWriter = new TermsWriter(fieldInfo); + + long sumTotalTermFreq = 0; + long sumDocFreq = 0; + FixedBitSet docsSeen = new FixedBitSet(maxDoc); + + while (true) { + BytesRef term = termsEnum.next(); + if (term == null) { + break; + } + + BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen, norms); + if (termState != null) { + termsWriter.finishTerm(term, termState); + sumTotalTermFreq += termState.totalTermFreq; + sumDocFreq += termState.docFreq; + } + } + + termsWriter.finish(hasFreq ? 
sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality()); + } + } + + @Override + public void close() throws IOException { + if (out != null) { + boolean success = false; + try { + // write field summary + final long dirStart = out.getFilePointer(); + + out.writeVInt(fields.size()); + for (FieldMetaData field : fields) { + out.writeVInt(field.fieldInfo.number); + out.writeVLong(field.numTerms); + if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) { + out.writeVLong(field.sumTotalTermFreq); + } + out.writeVLong(field.sumDocFreq); + out.writeVInt(field.docCount); + out.writeVInt(field.longsSize); + field.dict.save(out); + } + writeTrailer(out, dirStart); + CodecUtil.writeFooter(out); + success = true; + } finally { + if (success) { + IOUtils.close(out, postingsWriter); + } else { + IOUtils.closeWhileHandlingException(out, postingsWriter); + } + out = null; + } + } + } + + private static class FieldMetaData { + public final FieldInfo fieldInfo; + public final long numTerms; + public final long sumTotalTermFreq; + public final long sumDocFreq; + public final int docCount; + public final int longsSize; + public final FST dict; + + public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST fst) { + this.fieldInfo = fieldInfo; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.longsSize = longsSize; + this.dict = fst; + } + } + + final class TermsWriter { + private final FSTCompiler fstCompiler; + private final FSTTermOutputs outputs; + private final FieldInfo fieldInfo; + private final int longsSize; + private long numTerms; + + private final IntsRefBuilder scratchTerm = new IntsRefBuilder(); + private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); + + TermsWriter(FieldInfo fieldInfo) { + this.numTerms = 0; + this.fieldInfo = fieldInfo; + this.longsSize = postingsWriter.setField(fieldInfo); + this.outputs = new FSTTermOutputs(fieldInfo, longsSize); + this.fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); + } + + public void finishTerm(BytesRef text, BlockTermState state) throws IOException { + // write term meta data into fst + final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData(); + meta.longs = new long[longsSize]; + meta.bytes = null; + meta.docFreq = state.docFreq; + meta.totalTermFreq = state.totalTermFreq; + postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true); + if (metaWriter.size() > 0) { + meta.bytes = metaWriter.toArrayCopy(); + metaWriter.reset(); + } + fstCompiler.add(Util.toIntsRef(text, scratchTerm), meta); + numTerms++; + } + + public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { + // save FST dict + if (numTerms > 0) { + final FST fst = fstCompiler.compile(); + fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst)); + } + } + } +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java index ec73ddcb838..52c7465d093 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java @@ -94,7 +94,7 @@ public class DeltaBaseTermStateSerializer 
implements Accountable { /** * Writes a {@link BlockTermState} to the provided {@link DataOutput}. *

- * Simpler variant of {@link Lucene84PostingsWriter#encodeTerm(DataOutput, FieldInfo, BlockTermState, boolean)}. + * Simpler variant of {@link Lucene84PostingsWriter#encodeTerm(long[], DataOutput, FieldInfo, BlockTermState, boolean)}. */ public void writeTermState(DataOutput termStatesOutput, FieldInfo fieldInfo, BlockTermState termState) throws IOException { IndexOptions indexOptions = fieldInfo.getIndexOptions(); @@ -143,7 +143,7 @@ public class DeltaBaseTermStateSerializer implements Accountable { /** * Reads a {@link BlockTermState} from the provided {@link DataInput}. *

- * Simpler variant of {@link Lucene84PostingsReader#decodeTerm(DataInput, FieldInfo, BlockTermState, boolean)}. + * Simpler variant of {@link Lucene84PostingsReader#decodeTerm(long[], DataInput, FieldInfo, BlockTermState, boolean)}. * * @param reuse {@link BlockTermState} to reuse; or null to create a new one. */ diff --git a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index ec315c0f642..55b8a48e3ef 100644 --- a/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -16,5 +16,7 @@ org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat org.apache.lucene.codecs.memory.DirectPostingsFormat +org.apache.lucene.codecs.memory.FSTOrdPostingsFormat +org.apache.lucene.codecs.memory.FSTPostingsFormat org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat org.apache.lucene.codecs.uniformsplit.sharedterms.STUniformSplitPostingsFormat diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTOrdPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTOrdPostingsFormat.java new file mode 100644 index 00000000000..ec860859a85 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTOrdPostingsFormat.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.memory; + + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.BasePostingsFormatTestCase; +import org.apache.lucene.util.TestUtil; + +/** + * Tests FSTOrdPostingsFormat + */ +public class TestFSTOrdPostingsFormat extends BasePostingsFormatTestCase { + private final Codec codec = TestUtil.alwaysPostingsFormat(new FSTOrdPostingsFormat()); + + @Override + protected Codec getCodec() { + return codec; + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTPostingsFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTPostingsFormat.java new file mode 100644 index 00000000000..939c5e33fe5 --- /dev/null +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/memory/TestFSTPostingsFormat.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.memory; + + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.BasePostingsFormatTestCase; +import org.apache.lucene.util.TestUtil; + +/** + * Tests FSTPostingsFormat + */ +public class TestFSTPostingsFormat extends BasePostingsFormatTestCase { + private final Codec codec = TestUtil.alwaysPostingsFormat(new FSTPostingsFormat()); + + @Override + protected Codec getCodec() { + return codec; + } +} diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestTermBytesComparator.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestTermBytesComparator.java index 8ef246fbb30..a77e7820152 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestTermBytesComparator.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestTermBytesComparator.java @@ -159,7 +159,7 @@ public class TestTermBytesComparator extends LuceneTestCase { } @Override - public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) { + public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) { } @Override diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java index 6d09fe36e16..f63d63643fa 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/STBlockReaderTest.java @@ -268,7 +268,7 @@ public class STBlockReaderTest extends LuceneTestCase { } @Override - public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) { + public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) { } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java index a1244ca7686..4fed1a07e7a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsReaderBase.java @@ -61,7 +61,7 @@ public abstract class PostingsReaderBase implements Closeable, Accountable { /** Actually decode metadata for next term * @see PostingsWriterBase#encodeTerm */ - public abstract void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; + public abstract void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; /** Must fully consume state, since after this call that * TermState may be reused. 
*/ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java index a8f8ed42aa8..48c6027b286 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java @@ -68,12 +68,21 @@ public abstract class PostingsWriterBase implements Closeable { * Usually elements in {@code longs} are file pointers, so each one always * increases when a new term is consumed. {@code out} is used to write generic * bytes, which are not monotonic. + * + * NOTE: sometimes long[] might contain "don't care" values that are unused, e.g. + * the pointer to postings list may not be defined for some terms but is defined + * for others, if it is designed to inline some postings data in term dictionary. + * In this case, the postings writer should always use the last value, so that each + * element in metadata long[] remains monotonic. */ - public abstract void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; + public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException; /** - * Sets the current field for writing. */ - public abstract void setField(FieldInfo fieldInfo); + * Sets the current field for writing, and returns the + * fixed length of long[] metadata (which is fixed per + * field), called when the writing switches to another field. */ + // TODO: better name? + public abstract int setField(FieldInfo fieldInfo); @Override public abstract void close() throws IOException; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java index f51f0c6f967..f9770869f24 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PushPostingsWriterBase.java @@ -87,7 +87,7 @@ public abstract class PushPostingsWriterBase extends PostingsWriterBase { * fixed length of long[] metadata (which is fixed per * field), called when the writing switches to another field. */ @Override - public void setField(FieldInfo fieldInfo) { + public int setField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; indexOptions = fieldInfo.getIndexOptions(); @@ -113,6 +113,8 @@ public abstract class PushPostingsWriterBase extends PostingsWriterBase { enumFlags = PostingsEnum.OFFSETS; } } + + return 0; } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java index b9dc0bb436b..0a0cd31c857 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsReader.java @@ -128,11 +128,8 @@ public final class BlockTreeTermsReader extends FieldsProducer { /** Auto-prefix terms have been superseded by points. */ public static final int VERSION_AUTO_PREFIX_TERMS_REMOVED = 3; - /** The long[] + byte[] metadata has been replaced with a single byte[]. */ - public static final int VERSION_META_LONGS_REMOVED = 4; - /** Current terms format. 
*/ - public static final int VERSION_CURRENT = VERSION_META_LONGS_REMOVED; + public static final int VERSION_CURRENT = VERSION_AUTO_PREFIX_TERMS_REMOVED; /** Extension of terms index file */ static final String TERMS_INDEX_EXTENSION = "tip"; @@ -215,11 +212,9 @@ public final class BlockTreeTermsReader extends FieldsProducer { // when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is written. final long sumDocFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS ? sumTotalTermFreq : termsIn.readVLong(); final int docCount = termsIn.readVInt(); - if (version < VERSION_META_LONGS_REMOVED) { - final int longsSize = termsIn.readVInt(); - if (longsSize < 0) { - throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn); - } + final int longsSize = termsIn.readVInt(); + if (longsSize < 0) { + throw new CorruptIndexException("invalid longsSize for field: " + fieldInfo.name + ", longsSize=" + longsSize, termsIn); } BytesRef minTerm = readBytesRef(termsIn); BytesRef maxTerm = readBytesRef(termsIn); @@ -236,7 +231,7 @@ public final class BlockTreeTermsReader extends FieldsProducer { final long indexStartFP = indexIn.readVLong(); FieldReader previous = fieldMap.put(fieldInfo.name, new FieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, - indexStartFP, indexIn, minTerm, maxTerm, state.openedFromWriter, perFieldLoadMode)); + indexStartFP, longsSize, indexIn, minTerm, maxTerm, state.openedFromWriter, perFieldLoadMode)); if (previous != null) { throw new CorruptIndexException("duplicate field: " + fieldInfo.name, termsIn); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java index 380cf799a4d..deece0b5266 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java @@ -224,10 +224,11 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { public final long sumTotalTermFreq; public final long sumDocFreq; public final int docCount; + private final int longsSize; public final BytesRef minTerm; public final BytesRef maxTerm; - public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, + public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long indexStartFP, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, BytesRef minTerm, BytesRef maxTerm) { assert numTerms > 0; this.fieldInfo = fieldInfo; @@ -238,6 +239,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; + this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; } @@ -507,6 +509,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { class TermsWriter { private final FieldInfo fieldInfo; + private final int longsSize; private long numTerms; final FixedBitSet docsSeen; long sumTotalTermFreq; @@ -521,6 +524,8 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { private final BytesRefBuilder lastTerm = new BytesRefBuilder(); private int[] prefixStarts = new int[8]; + private final long[] longs; + // Pending stack of terms and blocks. 
As terms arrive (in sorted order) // we append to this stack, and once the top of the stack has enough // terms starting with a common prefix, we write a new block with @@ -715,7 +720,13 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { } // Write term meta data - postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.copyTo(metaWriter); + bytesWriter.reset(); absolute = false; } } else { @@ -760,7 +771,13 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { // separate anymore: // Write term meta data - postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.copyTo(metaWriter); + bytesWriter.reset(); absolute = false; } else { PendingBlock block = (PendingBlock) ent; @@ -828,7 +845,9 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { this.fieldInfo = fieldInfo; assert fieldInfo.getIndexOptions() != IndexOptions.NONE; docsSeen = new FixedBitSet(maxDoc); - postingsWriter.setField(fieldInfo); + + this.longsSize = postingsWriter.setField(fieldInfo); + this.longs = new long[longsSize]; } /** Writes one term's worth of postings. */ @@ -945,6 +964,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { sumTotalTermFreq, sumDocFreq, docsSeen.cardinality(), + longsSize, minTerm, maxTerm)); } else { assert sumTotalTermFreq == 0 || fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1; @@ -956,6 +976,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { private final ByteBuffersDataOutput suffixWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput bytesWriter = ByteBuffersDataOutput.newResettableInstance(); } private boolean closed; @@ -988,6 +1009,7 @@ public final class BlockTreeTermsWriter extends FieldsConsumer { } termsOut.writeVLong(field.sumDocFreq); termsOut.writeVInt(field.docCount); + termsOut.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); writeBytesRef(termsOut, field.minTerm); writeBytesRef(termsOut, field.maxTerm); diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java index c185cbcb733..9189b63d366 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/FieldReader.java @@ -58,6 +58,7 @@ public final class FieldReader extends Terms implements Accountable { final BytesRef rootCode; final BytesRef minTerm; final BytesRef maxTerm; + final int longsSize; final BlockTreeTermsReader parent; final FST index; @@ -65,7 +66,7 @@ public final class FieldReader extends Terms implements Accountable { //private boolean DEBUG; FieldReader(BlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, BytesRef rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, - long indexStartFP, IndexInput indexIn, BytesRef minTerm, 
BytesRef maxTerm, boolean openedFromWriter, BlockTreeTermsReader.FSTLoadMode fstLoadMode) throws IOException { + long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm, boolean openedFromWriter, BlockTreeTermsReader.FSTLoadMode fstLoadMode) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); @@ -76,6 +77,7 @@ public final class FieldReader extends Terms implements Accountable { this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; + this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; // if (DEBUG) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java index d64a4aa8232..b1cfa7c04d2 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/IntersectTermsEnumFrame.java @@ -80,8 +80,11 @@ final class IntersectTermsEnumFrame { FST.Arc arc; final BlockTermState termState; + + // metadata buffer, holding monotonic values + final long[] longs; - // metadata buffer + // metadata buffer, holding general values byte[] bytes = new byte[32]; final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); @@ -99,6 +102,7 @@ final class IntersectTermsEnumFrame { this.ord = ord; this.termState = ite.fr.parent.postingsReader.newTermState(); this.termState.totalTermFreq = -1; + this.longs = new long[ite.fr.longsSize]; } void loadNextFloorBlock() throws IOException { @@ -274,8 +278,11 @@ final class IntersectTermsEnumFrame { } else { termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); } - // metadata - ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute); + // metadata + for (int i = 0; i < ite.fr.longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + ite.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ite.fr.fieldInfo, termState, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java index 1e9e6245a39..fdb4cc6955b 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/SegmentTermsEnumFrame.java @@ -85,7 +85,9 @@ final class SegmentTermsEnumFrame { final BlockTermState state; - // metadata buffer + // metadata buffer, holding monotonic values + final long[] longs; + // metadata buffer, holding general values byte[] bytes = new byte[32]; final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); @@ -96,6 +98,7 @@ final class SegmentTermsEnumFrame { this.ord = ord; this.state = ste.fr.parent.postingsReader.newTermState(); this.state.totalTermFreq = -1; + this.longs = new long[ste.fr.longsSize]; } public void setFloorData(ByteArrayDataInput in, BytesRef source) { @@ -421,8 +424,11 @@ final class SegmentTermsEnumFrame { state.totalTermFreq = state.docFreq + statsReader.readVLong(); //if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq); } - // metadata - ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); + // metadata + for (int i = 0; i < ste.fr.longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + 
ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java index 895db33f0b4..b0620997726 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsReader.java @@ -166,7 +166,7 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { } @Override - public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IntBlockTermState termState = (IntBlockTermState) _termState; final boolean fieldHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; @@ -179,11 +179,11 @@ public final class Lucene84PostingsReader extends PostingsReaderBase { termState.payStartFP = 0; } - termState.docStartFP += in.readVLong(); + termState.docStartFP += longs[0]; if (fieldHasPositions) { - termState.posStartFP += in.readVLong(); + termState.posStartFP += longs[1]; if (fieldHasOffsets || fieldHasPayloads) { - termState.payStartFP += in.readVLong(); + termState.payStartFP += longs[2]; } } if (termState.docFreq == 1) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsWriter.java index 29d812e59c8..e42669af415 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsWriter.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/Lucene84PostingsWriter.java @@ -190,11 +190,20 @@ public final class Lucene84PostingsWriter extends PushPostingsWriterBase { } @Override - public void setField(FieldInfo fieldInfo) { + public int setField(FieldInfo fieldInfo) { super.setField(fieldInfo); skipWriter.setField(writePositions, writeOffsets, writePayloads); lastState = emptyState; fieldHasNorms = fieldInfo.hasNorms(); + if (writePositions) { + if (writePayloads || writeOffsets) { + return 3; // doc + pos + pay FP + } else { + return 2; // doc + pos FP + } + } else { + return 1; // doc FP + } } @Override @@ -457,16 +466,16 @@ public final class Lucene84PostingsWriter extends PushPostingsWriterBase { } @Override - public void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { IntBlockTermState state = (IntBlockTermState)_state; if (absolute) { lastState = emptyState; } - out.writeVLong(state.docStartFP - lastState.docStartFP); + longs[0] = state.docStartFP - lastState.docStartFP; if (writePositions) { - out.writeVLong(state.posStartFP - lastState.posStartFP); + longs[1] = state.posStartFP - lastState.posStartFP; if (writePayloads || writeOffsets) { - out.writeVLong(state.payStartFP - lastState.payStartFP); + longs[2] = state.payStartFP - lastState.payStartFP; } } if (state.singletonDocID != -1) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java 
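The hunks above restore the long[]-based metadata contract between the terms dictionary and the postings codec: the terms enum frames read longsSize vlongs into longs[] before handing the remaining bytes to PostingsReaderBase#decodeTerm, and Lucene84PostingsWriter/Reader treat those values as monotonic deltas of the doc/pos/pay start file pointers. The following standalone sketch illustrates that round trip under simplified assumptions; TermState, encodeTerm and decodeTerm here are illustrative stand-ins, not the real Lucene classes, and no index I/O is performed.

import java.util.Arrays;

/** Hypothetical sketch of the long[] metadata round trip this patch restores. */
public class LongsMetadataSketch {

  /** Simplified stand-in for IntBlockTermState: just the three file pointers. */
  static final class TermState {
    long docStartFP, posStartFP, payStartFP;
  }

  /** Writer side: fill longs[] with deltas against the last emitted state. */
  static void encodeTerm(long[] longs, TermState state, TermState last, boolean absolute) {
    TermState base = absolute ? new TermState() : last;   // absolute => deltas taken from zero
    longs[0] = state.docStartFP - base.docStartFP;
    longs[1] = state.posStartFP - base.posStartFP;
    longs[2] = state.payStartFP - base.payStartFP;
    assert Arrays.stream(longs).allMatch(v -> v >= 0);    // the monotonic contract
  }

  /** Reader side: accumulate the deltas back onto the running term state. */
  static void decodeTerm(long[] longs, TermState state, boolean absolute) {
    if (absolute) {
      state.docStartFP = 0;
      state.posStartFP = 0;
      state.payStartFP = 0;
    }
    state.docStartFP += longs[0];
    state.posStartFP += longs[1];
    state.payStartFP += longs[2];
  }

  public static void main(String[] args) {
    long[] longs = new long[3];                            // longsSize == 3: doc + pos + pay FP
    TermState last = new TermState();
    TermState reader = new TermState();

    long[][] fps = {{100, 40, 10}, {260, 95, 22}};         // two terms' absolute file pointers
    boolean absolute = true;
    for (long[] fp : fps) {
      TermState state = new TermState();
      state.docStartFP = fp[0];
      state.posStartFP = fp[1];
      state.payStartFP = fp[2];
      encodeTerm(longs, state, last, absolute);
      decodeTerm(longs, reader, absolute);
      System.out.println("decoded doc/pos/pay FP = "
          + reader.docStartFP + "/" + reader.posStartFP + "/" + reader.payStartFP);
      last = state;
      absolute = false;                                    // subsequent terms are delta-encoded
    }
  }
}

With absolute == true the deltas are taken from zero (the first term of a metadata block); afterwards each term stores only the increment over the previous one, which is what keeps the encoded vlongs small.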
index 3ecd4734b33..e7f4c4c7730 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsReader.java @@ -50,7 +50,7 @@ final class IDVersionPostingsReader extends PostingsReaderBase { } @Override - public void decodeTerm(DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) throws IOException { final IDVersionTermState termState = (IDVersionTermState) _termState; termState.docID = in.readVInt(); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java index 2ac451fcc08..30e19807b31 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionPostingsWriter.java @@ -46,6 +46,7 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { private long lastVersion; private final Bits liveDocs; + private String segment; public IDVersionPostingsWriter(Bits liveDocs) { this.liveDocs = liveDocs; @@ -59,10 +60,11 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { @Override public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { CodecUtil.writeIndexHeader(termsOut, TERMS_CODEC, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); + segment = state.segmentInfo.name; } @Override - public void setField(FieldInfo fieldInfo) { + public int setField(FieldInfo fieldInfo) { super.setField(fieldInfo); if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { throw new IllegalArgumentException("field must be index using IndexOptions.DOCS_AND_FREQS_AND_POSITIONS"); @@ -73,6 +75,7 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { throw new IllegalArgumentException("field cannot index term vectors: CheckIndex will report this as index corruption"); } lastState = emptyState; + return 0; } @Override @@ -151,7 +154,7 @@ final class IDVersionPostingsWriter extends PushPostingsWriterBase { private long lastEncodedVersion; @Override - public void encodeTerm(DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { + public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { IDVersionTermState state = (IDVersionTermState) _state; out.writeVInt(state.docID); if (absolute) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java index 5b1ea64c405..6d260773353 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/IDVersionSegmentTermsEnumFrame.java @@ -83,7 +83,9 @@ final class IDVersionSegmentTermsEnumFrame { final BlockTermState state; - // metadata + // metadata buffer, holding monotonic values + public long[] longs; + // metadata buffer, holding general values public byte[] bytes; ByteArrayDataInput bytesReader; @@ -94,6 +96,7 @@ final class IDVersionSegmentTermsEnumFrame { this.ord = ord; this.state = 
ste.fr.parent.postingsReader.newTermState(); this.state.totalTermFreq = -1; + this.longs = new long[ste.fr.longsSize]; } public void setFloorData(ByteArrayDataInput in, BytesRef source) { @@ -393,8 +396,11 @@ final class IDVersionSegmentTermsEnumFrame { state.docFreq = 1; state.totalTermFreq = 1; //if (DEBUG) System.out.println(" dF=" + state.docFreq); - // metadata - ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); + // metadata + for (int i = 0; i < ste.fr.longsSize; i++) { + longs[i] = bytesReader.readVLong(); + } + ste.fr.parent.postingsReader.decodeTerm(longs, bytesReader, ste.fr.fieldInfo, state, absolute); metaDataUpto++; absolute = false; diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java index ff5d6ec83b9..8001a22d9ff 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsReader.java @@ -127,6 +127,7 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer { final long sumDocFreq = numTerms; assert numTerms <= Integer.MAX_VALUE; final int docCount = (int) numTerms; + final int longsSize = in.readVInt(); BytesRef minTerm = readBytesRef(in); BytesRef maxTerm = readBytesRef(in); @@ -142,7 +143,7 @@ public final class VersionBlockTreeTermsReader extends FieldsProducer { final long indexStartFP = indexIn.readVLong(); VersionFieldReader previous = fields.put(fieldInfo.name, new VersionFieldReader(this, fieldInfo, numTerms, rootCode, sumTotalTermFreq, sumDocFreq, docCount, - indexStartFP, indexIn, minTerm, maxTerm)); + indexStartFP, longsSize, indexIn, minTerm, maxTerm)); if (previous != null) { throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java index b9c57491a88..9e2f7549f4a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -143,10 +143,11 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { public final Pair rootCode; public final long numTerms; public final long indexStartFP; + private final int longsSize; public final BytesRef minTerm; public final BytesRef maxTerm; - public FieldMetaData(FieldInfo fieldInfo, Pair rootCode, long numTerms, long indexStartFP, + public FieldMetaData(FieldInfo fieldInfo, Pair rootCode, long numTerms, long indexStartFP, int longsSize, BytesRef minTerm, BytesRef maxTerm) { assert numTerms > 0; this.fieldInfo = fieldInfo; @@ -154,6 +155,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { this.rootCode = rootCode; this.indexStartFP = indexStartFP; this.numTerms = numTerms; + this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; } @@ -401,6 +403,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { class TermsWriter { private final FieldInfo fieldInfo; + private final int longsSize; private long numTerms; final FixedBitSet docsSeen; long indexStartFP; @@ -413,6 +416,8 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { private final 
BytesRefBuilder lastTerm = new BytesRefBuilder(); private int[] prefixStarts = new int[8]; + private final long[] longs; + // Pending stack of terms and blocks. As terms arrive (in sorted order) // we append to this stack, and once the top of the stack has enough // terms starting with a common prefix, we write a new block with @@ -600,7 +605,13 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel; // Write term meta data - postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.copyTo(metaWriter); + bytesWriter.reset(); absolute = false; } } else { @@ -637,7 +648,13 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { // separate anymore: // Write term meta data - postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + postingsWriter.encodeTerm(longs, bytesWriter, fieldInfo, state, absolute); + for (int pos = 0; pos < longsSize; pos++) { + assert longs[pos] >= 0; + metaWriter.writeVLong(longs[pos]); + } + bytesWriter.copyTo(metaWriter); + bytesWriter.reset(); absolute = false; } else { PendingBlock block = (PendingBlock) ent; @@ -703,7 +720,8 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { this.fieldInfo = fieldInfo; docsSeen = new FixedBitSet(maxDoc); - postingsWriter.setField(fieldInfo); + this.longsSize = postingsWriter.setField(fieldInfo); + this.longs = new long[longsSize]; } /** Writes one term's worth of postings. */ @@ -800,6 +818,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { ((PendingBlock) pending.get(0)).index.getEmptyOutput(), numTerms, indexStartFP, + longsSize, minTerm, maxTerm)); } else { // cannot assert this: we skip deleted docIDs in the postings: @@ -809,6 +828,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { private final ByteBuffersDataOutput suffixWriter = ByteBuffersDataOutput.newResettableInstance(); private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput bytesWriter = ByteBuffersDataOutput.newResettableInstance(); } private boolean closed; @@ -836,6 +856,7 @@ public final class VersionBlockTreeTermsWriter extends FieldsConsumer { out.writeVInt(field.rootCode.output1.length); out.writeBytes(field.rootCode.output1.bytes, field.rootCode.output1.offset, field.rootCode.output1.length); out.writeVLong(field.rootCode.output2); + out.writeVInt(field.longsSize); indexOut.writeVLong(field.indexStartFP); writeBytesRef(out, field.minTerm); writeBytesRef(out, field.maxTerm); diff --git a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java index 93888ae589d..581201f9ea4 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java +++ b/lucene/sandbox/src/java/org/apache/lucene/codecs/idversion/VersionFieldReader.java @@ -45,13 +45,14 @@ final class VersionFieldReader extends Terms implements Accountable { final Pair rootCode; final BytesRef minTerm; final BytesRef maxTerm; + final int longsSize; final VersionBlockTreeTermsReader parent; final FST> index; //private boolean DEBUG; 
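As in BlockTreeTermsWriter above, VersionBlockTreeTermsWriter now writes each term's metadata as longsSize vlongs (asserted non-negative) followed by the postings writer's opaque bytes copied from bytesWriter, and persists longsSize per field so the reader knows how many vlongs to strip off before calling decodeTerm. Below is a plain-Java sketch of that record layout, using hand-rolled vlong helpers in the same 7-bits-per-byte style as DataOutput#writeVLong; all names here are illustrative, none of this is Lucene API.

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

/** Illustrative sketch of the per-term metadata layout: longsSize vlongs, then opaque bytes. */
public class TermMetadataLayoutSketch {

  // Same 7-bits-per-byte scheme as Lucene's DataOutput#writeVLong; expects v >= 0,
  // which mirrors the writer's "assert longs[pos] >= 0" before encoding.
  static void writeVLong(ByteArrayOutputStream out, long v) {
    while ((v & ~0x7FL) != 0) {
      out.write((int) ((v & 0x7FL) | 0x80L));   // low 7 bits, continuation bit set
      v >>>= 7;
    }
    out.write((int) v);                          // final byte, continuation bit clear
  }

  static long readVLong(byte[] b, int[] pos) {
    long value = 0;
    for (int shift = 0; ; shift += 7) {
      byte cur = b[pos[0]++];
      value |= (long) (cur & 0x7F) << shift;
      if ((cur & 0x80) == 0) {
        return value;
      }
    }
  }

  public static void main(String[] args) {
    int longsSize = 3;                                     // per field, as returned by setField()
    long[] longs = {160, 55, 12};                          // one term's monotonic FP deltas
    byte[] opaque = "generic-bytes".getBytes(StandardCharsets.UTF_8); // postings writer's own payload

    // Writer side: vlongs first, then the opaque bytes (metaWriter, then bytesWriter.copyTo).
    ByteArrayOutputStream meta = new ByteArrayOutputStream();
    for (int i = 0; i < longsSize; i++) {
      writeVLong(meta, longs[i]);
    }
    meta.write(opaque, 0, opaque.length);
    byte[] record = meta.toByteArray();

    // Reader side: consume longsSize vlongs, hand the remainder to decodeTerm.
    int[] pos = {0};
    long[] decoded = new long[longsSize];
    for (int i = 0; i < longsSize; i++) {
      decoded[i] = readVLong(record, pos);
    }
    String rest = new String(record, pos[0], record.length - pos[0], StandardCharsets.UTF_8);
    System.out.println("longs = " + Arrays.toString(decoded) + ", opaque = \"" + rest + "\"");
  }
}

In this 7-bit scheme a negative value would always need the maximum number of continuation bytes, so keeping every longs[pos] a non-negative delta of a monotonic file pointer is what makes the encoding compact.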
VersionFieldReader(VersionBlockTreeTermsReader parent, FieldInfo fieldInfo, long numTerms, Pair rootCode, long sumTotalTermFreq, long sumDocFreq, int docCount, - long indexStartFP, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { + long indexStartFP, int longsSize, IndexInput indexIn, BytesRef minTerm, BytesRef maxTerm) throws IOException { assert numTerms > 0; this.fieldInfo = fieldInfo; //DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); @@ -62,6 +63,7 @@ final class VersionFieldReader extends Terms implements Accountable { this.docCount = docCount; this.indexStartFP = indexStartFP; this.rootCode = rootCode; + this.longsSize = longsSize; this.minTerm = minTerm; this.maxTerm = maxTerm; // if (DEBUG) { diff --git a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java index 028827ebede..e55eb8747ae 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java +++ b/lucene/test-framework/src/java/org/apache/lucene/codecs/mockrandom/MockRandomPostingsFormat.java @@ -41,6 +41,10 @@ import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsReader; import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsWriter; import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader; import org.apache.lucene.codecs.lucene84.Lucene84PostingsWriter; +import org.apache.lucene.codecs.memory.FSTOrdTermsReader; +import org.apache.lucene.codecs.memory.FSTOrdTermsWriter; +import org.apache.lucene.codecs.memory.FSTTermsReader; +import org.apache.lucene.codecs.memory.FSTTermsWriter; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentReadState; @@ -118,9 +122,29 @@ public final class MockRandomPostingsFormat extends PostingsFormat { PostingsWriterBase postingsWriter = new Lucene84PostingsWriter(state); final FieldsConsumer fields; - final int t1 = random.nextInt(3); + final int t1 = random.nextInt(5); - if (t1 == 0) { + if (t1 == 0) { + boolean success = false; + try { + fields = new FSTTermsWriter(state, postingsWriter); + success = true; + } finally { + if (!success) { + postingsWriter.close(); + } + } + } else if (t1 == 1) { + boolean success = false; + try { + fields = new FSTOrdTermsWriter(state, postingsWriter); + success = true; + } finally { + if (!success) { + postingsWriter.close(); + } + } + } else if (t1 == 2) { // Use BlockTree terms dict if (LuceneTestCase.VERBOSE) { @@ -141,7 +165,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { postingsWriter.close(); } } - } else if (t1 == 1) { + } else if (t1 == 3) { if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: writing Block terms dict"); @@ -211,7 +235,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { } } } - } else if (t1 == 2) { + } else if (t1 == 4) { // Use OrdsBlockTree terms dict if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: writing OrdsBlockTree"); @@ -263,8 +287,28 @@ public final class MockRandomPostingsFormat extends PostingsFormat { PostingsReaderBase postingsReader = new Lucene84PostingsReader(state); final FieldsProducer fields; - final int t1 = random.nextInt(3); + final int t1 = random.nextInt(5); if (t1 == 0) { + boolean success = false; + try { + fields = new FSTTermsReader(state, postingsReader); + success = true; + } finally { + if 
(!success) { + postingsReader.close(); + } + } + } else if (t1 == 1) { + boolean success = false; + try { + fields = new FSTOrdTermsReader(state, postingsReader); + success = true; + } finally { + if (!success) { + postingsReader.close(); + } + } + } else if (t1 == 2) { // Use BlockTree terms dict if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading BlockTree terms dict"); @@ -279,7 +323,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { postingsReader.close(); } } - } else if (t1 == 1) { + } else if (t1 == 3) { if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading Block terms dict"); @@ -330,7 +374,7 @@ public final class MockRandomPostingsFormat extends PostingsFormat { } } } - } else if (t1 == 2) { + } else if (t1 == 4) { // Use OrdsBlockTree terms dict if (LuceneTestCase.VERBOSE) { System.out.println("MockRandomCodec: reading OrdsBlockTree terms dict"); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java index a3b7da996ea..8bb9a070268 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java @@ -45,6 +45,8 @@ import org.apache.lucene.codecs.bloom.TestBloomFilteredLucenePostings; import org.apache.lucene.codecs.lucene60.Lucene60PointsReader; import org.apache.lucene.codecs.lucene60.Lucene60PointsWriter; import org.apache.lucene.codecs.memory.DirectPostingsFormat; +import org.apache.lucene.codecs.memory.FSTOrdPostingsFormat; +import org.apache.lucene.codecs.memory.FSTPostingsFormat; import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat; import org.apache.lucene.index.PointValues.IntersectVisitor; import org.apache.lucene.store.Directory; @@ -187,6 +189,8 @@ public class RandomCodec extends AssertingCodec { add(avoidCodecs, TestUtil.getDefaultPostingsFormat(minItemsPerBlock, maxItemsPerBlock, RandomPicks.randomFrom(random, BlockTreeTermsReader.FSTLoadMode.values())), + new FSTPostingsFormat(), + new FSTOrdPostingsFormat(), new DirectPostingsFormat(LuceneTestCase.rarely(random) ? 1 : (LuceneTestCase.rarely(random) ? Integer.MAX_VALUE : maxItemsPerBlock), LuceneTestCase.rarely(random) ? 1 : (LuceneTestCase.rarely(random) ? Integer.MAX_VALUE : lowFreqCutoff)), //TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucenePostings to be constructed