mirror of https://github.com/apache/lucene.git
LUCENE-10035: Simple text codec add multi level skip list data (#224)
This commit is contained in:
parent
e470535072
commit
6ade29c71a
|
@ -15,6 +15,9 @@ New Features
|
||||||
|
|
||||||
* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
|
* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
|
||||||
|
|
||||||
|
* LUCENE-10035: The SimpleText codec now writes skip lists.
|
||||||
|
(wuda via Adrien Grand)
|
||||||
|
|
||||||
System Requirements
|
System Requirements
|
||||||
|
|
||||||
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
||||||
|
|
|
@ -25,6 +25,7 @@ import static org.apache.lucene.codecs.simpletext.SimpleTextFieldsWriter.PAYLOAD
|
||||||
import static org.apache.lucene.codecs.simpletext.SimpleTextFieldsWriter.POS;
|
import static org.apache.lucene.codecs.simpletext.SimpleTextFieldsWriter.POS;
|
||||||
import static org.apache.lucene.codecs.simpletext.SimpleTextFieldsWriter.START_OFFSET;
|
import static org.apache.lucene.codecs.simpletext.SimpleTextFieldsWriter.START_OFFSET;
|
||||||
import static org.apache.lucene.codecs.simpletext.SimpleTextFieldsWriter.TERM;
|
import static org.apache.lucene.codecs.simpletext.SimpleTextFieldsWriter.TERM;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.SKIP_LIST;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
@ -38,6 +39,7 @@ import org.apache.lucene.codecs.FieldsProducer;
|
||||||
import org.apache.lucene.index.BaseTermsEnum;
|
import org.apache.lucene.index.BaseTermsEnum;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.FieldInfos;
|
import org.apache.lucene.index.FieldInfos;
|
||||||
|
import org.apache.lucene.index.Impacts;
|
||||||
import org.apache.lucene.index.ImpactsEnum;
|
import org.apache.lucene.index.ImpactsEnum;
|
||||||
import org.apache.lucene.index.IndexOptions;
|
import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
|
@ -45,6 +47,7 @@ import org.apache.lucene.index.SegmentReadState;
|
||||||
import org.apache.lucene.index.SlowImpactsEnum;
|
import org.apache.lucene.index.SlowImpactsEnum;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
import org.apache.lucene.store.BufferedChecksumIndexInput;
|
import org.apache.lucene.store.BufferedChecksumIndexInput;
|
||||||
import org.apache.lucene.store.ChecksumIndexInput;
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
@ -120,11 +123,15 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
private int docFreq;
|
private int docFreq;
|
||||||
private long totalTermFreq;
|
private long totalTermFreq;
|
||||||
private long docsStart;
|
private long docsStart;
|
||||||
|
private long skipPointer;
|
||||||
private boolean ended;
|
private boolean ended;
|
||||||
private final BytesRefFSTEnum<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> fstEnum;
|
private final BytesRefFSTEnum<
|
||||||
|
PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>>>
|
||||||
|
fstEnum;
|
||||||
|
|
||||||
public SimpleTextTermsEnum(
|
public SimpleTextTermsEnum(
|
||||||
FST<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> fst, IndexOptions indexOptions) {
|
FST<PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>>> fst,
|
||||||
|
IndexOptions indexOptions) {
|
||||||
this.indexOptions = indexOptions;
|
this.indexOptions = indexOptions;
|
||||||
fstEnum = new BytesRefFSTEnum<>(fst);
|
fstEnum = new BytesRefFSTEnum<>(fst);
|
||||||
}
|
}
|
||||||
|
@ -132,12 +139,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
@Override
|
@Override
|
||||||
public boolean seekExact(BytesRef text) throws IOException {
|
public boolean seekExact(BytesRef text) throws IOException {
|
||||||
|
|
||||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>
|
final BytesRefFSTEnum.InputOutput<
|
||||||
|
PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>>>
|
||||||
result = fstEnum.seekExact(text);
|
result = fstEnum.seekExact(text);
|
||||||
if (result != null) {
|
if (result != null) {
|
||||||
PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>> pair1 = result.output;
|
PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> pair =
|
||||||
PairOutputs.Pair<Long, Long> pair2 = pair1.output2;
|
result.output;
|
||||||
|
PairOutputs.Pair<Long, Long> pair1 = pair.output1;
|
||||||
|
PairOutputs.Pair<Long, Long> pair2 = pair.output2;
|
||||||
docsStart = pair1.output1;
|
docsStart = pair1.output1;
|
||||||
|
skipPointer = pair1.output2;
|
||||||
docFreq = pair2.output1.intValue();
|
docFreq = pair2.output1.intValue();
|
||||||
totalTermFreq = pair2.output2;
|
totalTermFreq = pair2.output2;
|
||||||
return true;
|
return true;
|
||||||
|
@ -150,16 +161,20 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
public SeekStatus seekCeil(BytesRef text) throws IOException {
|
public SeekStatus seekCeil(BytesRef text) throws IOException {
|
||||||
|
|
||||||
// System.out.println("seek to text=" + text.utf8ToString());
|
// System.out.println("seek to text=" + text.utf8ToString());
|
||||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>
|
final BytesRefFSTEnum.InputOutput<
|
||||||
|
PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>>>
|
||||||
result = fstEnum.seekCeil(text);
|
result = fstEnum.seekCeil(text);
|
||||||
if (result == null) {
|
if (result == null) {
|
||||||
// System.out.println(" end");
|
// System.out.println(" end");
|
||||||
return SeekStatus.END;
|
return SeekStatus.END;
|
||||||
} else {
|
} else {
|
||||||
// System.out.println(" got text=" + term.utf8ToString());
|
// System.out.println(" got text=" + term.utf8ToString());
|
||||||
PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>> pair1 = result.output;
|
PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> pair =
|
||||||
PairOutputs.Pair<Long, Long> pair2 = pair1.output2;
|
result.output;
|
||||||
|
PairOutputs.Pair<Long, Long> pair1 = pair.output1;
|
||||||
|
PairOutputs.Pair<Long, Long> pair2 = pair.output2;
|
||||||
docsStart = pair1.output1;
|
docsStart = pair1.output1;
|
||||||
|
skipPointer = pair1.output2;
|
||||||
docFreq = pair2.output1.intValue();
|
docFreq = pair2.output1.intValue();
|
||||||
totalTermFreq = pair2.output2;
|
totalTermFreq = pair2.output2;
|
||||||
|
|
||||||
|
@ -176,12 +191,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
@Override
|
@Override
|
||||||
public BytesRef next() throws IOException {
|
public BytesRef next() throws IOException {
|
||||||
assert !ended;
|
assert !ended;
|
||||||
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>
|
final BytesRefFSTEnum.InputOutput<
|
||||||
|
PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>>>
|
||||||
result = fstEnum.next();
|
result = fstEnum.next();
|
||||||
if (result != null) {
|
if (result != null) {
|
||||||
PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>> pair1 = result.output;
|
PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> pair =
|
||||||
PairOutputs.Pair<Long, Long> pair2 = pair1.output2;
|
result.output;
|
||||||
|
PairOutputs.Pair<Long, Long> pair1 = pair.output1;
|
||||||
|
PairOutputs.Pair<Long, Long> pair2 = pair.output2;
|
||||||
docsStart = pair1.output1;
|
docsStart = pair1.output1;
|
||||||
|
skipPointer = pair1.output2;
|
||||||
docFreq = pair2.output1.intValue();
|
docFreq = pair2.output1.intValue();
|
||||||
totalTermFreq = pair2.output2;
|
totalTermFreq = pair2.output2;
|
||||||
return result.input;
|
return result.input;
|
||||||
|
@ -229,7 +248,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
} else {
|
} else {
|
||||||
docsAndPositionsEnum = new SimpleTextPostingsEnum();
|
docsAndPositionsEnum = new SimpleTextPostingsEnum();
|
||||||
}
|
}
|
||||||
return docsAndPositionsEnum.reset(docsStart, indexOptions, docFreq);
|
return docsAndPositionsEnum.reset(docsStart, indexOptions, docFreq, skipPointer);
|
||||||
}
|
}
|
||||||
|
|
||||||
SimpleTextDocsEnum docsEnum;
|
SimpleTextDocsEnum docsEnum;
|
||||||
|
@ -240,16 +259,20 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
} else {
|
} else {
|
||||||
docsEnum = new SimpleTextDocsEnum();
|
docsEnum = new SimpleTextDocsEnum();
|
||||||
}
|
}
|
||||||
return docsEnum.reset(docsStart, indexOptions == IndexOptions.DOCS, docFreq);
|
return docsEnum.reset(docsStart, indexOptions == IndexOptions.DOCS, docFreq, skipPointer);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public ImpactsEnum impacts(int flags) throws IOException {
|
public ImpactsEnum impacts(int flags) throws IOException {
|
||||||
return new SlowImpactsEnum(postings(null, flags));
|
if (docFreq <= SimpleTextSkipWriter.BLOCK_SIZE) {
|
||||||
|
// no skip data
|
||||||
|
return new SlowImpactsEnum(postings(null, flags));
|
||||||
|
}
|
||||||
|
return (ImpactsEnum) postings(null, flags);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class SimpleTextDocsEnum extends PostingsEnum {
|
private class SimpleTextDocsEnum extends ImpactsEnum {
|
||||||
private final IndexInput inStart;
|
private final IndexInput inStart;
|
||||||
private final IndexInput in;
|
private final IndexInput in;
|
||||||
private boolean omitTF;
|
private boolean omitTF;
|
||||||
|
@ -259,21 +282,31 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
private final CharsRefBuilder scratchUTF16 = new CharsRefBuilder();
|
private final CharsRefBuilder scratchUTF16 = new CharsRefBuilder();
|
||||||
private int cost;
|
private int cost;
|
||||||
|
|
||||||
|
// for skip list data
|
||||||
|
private SimpleTextSkipReader skipReader;
|
||||||
|
private int nextSkipDoc = 0;
|
||||||
|
private long seekTo = -1;
|
||||||
|
|
||||||
public SimpleTextDocsEnum() {
|
public SimpleTextDocsEnum() {
|
||||||
this.inStart = SimpleTextFieldsReader.this.in;
|
this.inStart = SimpleTextFieldsReader.this.in;
|
||||||
this.in = this.inStart.clone();
|
this.in = this.inStart.clone();
|
||||||
|
this.skipReader = new SimpleTextSkipReader(this.inStart.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean canReuse(IndexInput in) {
|
public boolean canReuse(IndexInput in) {
|
||||||
return in == inStart;
|
return in == inStart;
|
||||||
}
|
}
|
||||||
|
|
||||||
public SimpleTextDocsEnum reset(long fp, boolean omitTF, int docFreq) throws IOException {
|
public SimpleTextDocsEnum reset(long fp, boolean omitTF, int docFreq, long skipPointer)
|
||||||
|
throws IOException {
|
||||||
in.seek(fp);
|
in.seek(fp);
|
||||||
this.omitTF = omitTF;
|
this.omitTF = omitTF;
|
||||||
docID = -1;
|
docID = -1;
|
||||||
tf = 1;
|
tf = 1;
|
||||||
cost = docFreq;
|
cost = docFreq;
|
||||||
|
skipReader.reset(skipPointer, docFreq);
|
||||||
|
nextSkipDoc = 0;
|
||||||
|
seekTo = -1;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -309,6 +342,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextDoc() throws IOException {
|
public int nextDoc() throws IOException {
|
||||||
|
return advance(docID + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int readDoc() throws IOException {
|
||||||
if (docID == NO_MORE_DOCS) {
|
if (docID == NO_MORE_DOCS) {
|
||||||
return docID;
|
return docID;
|
||||||
}
|
}
|
||||||
|
@ -341,7 +378,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
} else if (StringHelper.startsWith(scratch.get(), PAYLOAD)) {
|
} else if (StringHelper.startsWith(scratch.get(), PAYLOAD)) {
|
||||||
// skip
|
// skip
|
||||||
} else {
|
} else {
|
||||||
assert StringHelper.startsWith(scratch.get(), TERM)
|
assert StringHelper.startsWith(scratch.get(), SimpleTextSkipWriter.SKIP_LIST)
|
||||||
|
|| StringHelper.startsWith(scratch.get(), TERM)
|
||||||
|| StringHelper.startsWith(scratch.get(), FIELD)
|
|| StringHelper.startsWith(scratch.get(), FIELD)
|
||||||
|| StringHelper.startsWith(scratch.get(), END)
|
|| StringHelper.startsWith(scratch.get(), END)
|
||||||
: "scratch=" + scratch.get().utf8ToString();
|
: "scratch=" + scratch.get().utf8ToString();
|
||||||
|
@ -357,19 +395,50 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int advanceTarget(int target) throws IOException {
|
||||||
|
if (seekTo > 0) {
|
||||||
|
in.seek(seekTo);
|
||||||
|
seekTo = -1;
|
||||||
|
}
|
||||||
|
assert docID() < target;
|
||||||
|
int doc;
|
||||||
|
do {
|
||||||
|
doc = readDoc();
|
||||||
|
} while (doc < target);
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int advance(int target) throws IOException {
|
public int advance(int target) throws IOException {
|
||||||
// Naive -- better to index skip data
|
advanceShallow(target);
|
||||||
return slowAdvance(target);
|
return advanceTarget(target);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long cost() {
|
public long cost() {
|
||||||
return cost;
|
return cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void advanceShallow(int target) throws IOException {
|
||||||
|
if (target > nextSkipDoc) {
|
||||||
|
skipReader.skipTo(target);
|
||||||
|
if (skipReader.getNextSkipDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
|
seekTo = skipReader.getNextSkipDocFP();
|
||||||
|
}
|
||||||
|
nextSkipDoc = skipReader.getNextSkipDoc();
|
||||||
|
}
|
||||||
|
assert nextSkipDoc >= target;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Impacts getImpacts() throws IOException {
|
||||||
|
advanceShallow(docID);
|
||||||
|
return skipReader.getImpacts();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class SimpleTextPostingsEnum extends PostingsEnum {
|
private class SimpleTextPostingsEnum extends ImpactsEnum {
|
||||||
private final IndexInput inStart;
|
private final IndexInput inStart;
|
||||||
private final IndexInput in;
|
private final IndexInput in;
|
||||||
private int docID = -1;
|
private int docID = -1;
|
||||||
|
@ -387,16 +456,23 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
private int endOffset;
|
private int endOffset;
|
||||||
private int cost;
|
private int cost;
|
||||||
|
|
||||||
|
// for skip list data
|
||||||
|
private SimpleTextSkipReader skipReader;
|
||||||
|
private int nextSkipDoc = 0;
|
||||||
|
private long seekTo = -1;
|
||||||
|
|
||||||
public SimpleTextPostingsEnum() {
|
public SimpleTextPostingsEnum() {
|
||||||
this.inStart = SimpleTextFieldsReader.this.in;
|
this.inStart = SimpleTextFieldsReader.this.in;
|
||||||
this.in = inStart.clone();
|
this.in = inStart.clone();
|
||||||
|
this.skipReader = new SimpleTextSkipReader(this.inStart.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean canReuse(IndexInput in) {
|
public boolean canReuse(IndexInput in) {
|
||||||
return in == inStart;
|
return in == inStart;
|
||||||
}
|
}
|
||||||
|
|
||||||
public SimpleTextPostingsEnum reset(long fp, IndexOptions indexOptions, int docFreq) {
|
public SimpleTextPostingsEnum reset(
|
||||||
|
long fp, IndexOptions indexOptions, int docFreq, long skipPointer) throws IOException {
|
||||||
nextDocStart = fp;
|
nextDocStart = fp;
|
||||||
docID = -1;
|
docID = -1;
|
||||||
readPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
readPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
|
@ -407,6 +483,9 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
endOffset = -1;
|
endOffset = -1;
|
||||||
}
|
}
|
||||||
cost = docFreq;
|
cost = docFreq;
|
||||||
|
skipReader.reset(skipPointer, docFreq);
|
||||||
|
nextSkipDoc = 0;
|
||||||
|
seekTo = -1;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -422,6 +501,10 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int nextDoc() throws IOException {
|
public int nextDoc() throws IOException {
|
||||||
|
return advance(docID + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int readDoc() throws IOException {
|
||||||
boolean first = true;
|
boolean first = true;
|
||||||
in.seek(nextDocStart);
|
in.seek(nextDocStart);
|
||||||
long posStart = 0;
|
long posStart = 0;
|
||||||
|
@ -452,7 +535,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
} else if (StringHelper.startsWith(scratch.get(), PAYLOAD)) {
|
} else if (StringHelper.startsWith(scratch.get(), PAYLOAD)) {
|
||||||
// skip
|
// skip
|
||||||
} else {
|
} else {
|
||||||
assert StringHelper.startsWith(scratch.get(), TERM)
|
assert StringHelper.startsWith(scratch.get(), SimpleTextSkipWriter.SKIP_LIST)
|
||||||
|
|| StringHelper.startsWith(scratch.get(), TERM)
|
||||||
|| StringHelper.startsWith(scratch.get(), FIELD)
|
|| StringHelper.startsWith(scratch.get(), FIELD)
|
||||||
|| StringHelper.startsWith(scratch.get(), END);
|
|| StringHelper.startsWith(scratch.get(), END);
|
||||||
if (!first) {
|
if (!first) {
|
||||||
|
@ -465,10 +549,23 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private int advanceTarget(int target) throws IOException {
|
||||||
|
if (seekTo > 0) {
|
||||||
|
nextDocStart = seekTo;
|
||||||
|
seekTo = -1;
|
||||||
|
}
|
||||||
|
assert docID() < target;
|
||||||
|
int doc;
|
||||||
|
do {
|
||||||
|
doc = readDoc();
|
||||||
|
} while (doc < target);
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int advance(int target) throws IOException {
|
public int advance(int target) throws IOException {
|
||||||
// Naive -- better to index skip data
|
advanceShallow(target);
|
||||||
return slowAdvance(target);
|
return advanceTarget(target);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -533,6 +630,24 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
public long cost() {
|
public long cost() {
|
||||||
return cost;
|
return cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void advanceShallow(int target) throws IOException {
|
||||||
|
if (target > nextSkipDoc) {
|
||||||
|
skipReader.skipTo(target);
|
||||||
|
if (skipReader.getNextSkipDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
|
seekTo = skipReader.getNextSkipDocFP();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nextSkipDoc = skipReader.getNextSkipDoc();
|
||||||
|
assert nextSkipDoc >= target;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Impacts getImpacts() throws IOException {
|
||||||
|
advanceShallow(docID);
|
||||||
|
return skipReader.getImpacts();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final long TERMS_BASE_RAM_BYTES_USED =
|
private static final long TERMS_BASE_RAM_BYTES_USED =
|
||||||
|
@ -547,7 +662,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
private long sumTotalTermFreq;
|
private long sumTotalTermFreq;
|
||||||
private long sumDocFreq;
|
private long sumDocFreq;
|
||||||
private int docCount;
|
private int docCount;
|
||||||
private FST<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> fst;
|
private FST<PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>>> fst;
|
||||||
private int termCount;
|
private int termCount;
|
||||||
private final BytesRefBuilder scratch = new BytesRefBuilder();
|
private final BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
private final CharsRefBuilder scratchUTF16 = new CharsRefBuilder();
|
private final CharsRefBuilder scratchUTF16 = new CharsRefBuilder();
|
||||||
|
@ -561,10 +676,13 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
|
|
||||||
private void loadTerms() throws IOException {
|
private void loadTerms() throws IOException {
|
||||||
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
|
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
|
||||||
final FSTCompiler<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> fstCompiler;
|
final FSTCompiler<
|
||||||
|
PairOutputs.Pair<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>>>
|
||||||
|
fstCompiler;
|
||||||
|
final PairOutputs<Long, Long> outputsOuter = new PairOutputs<>(posIntOutputs, posIntOutputs);
|
||||||
final PairOutputs<Long, Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
|
final PairOutputs<Long, Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
|
||||||
final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
|
final PairOutputs<PairOutputs.Pair<Long, Long>, PairOutputs.Pair<Long, Long>> outputs =
|
||||||
new PairOutputs<>(posIntOutputs, outputsInner);
|
new PairOutputs<>(outputsOuter, outputsInner);
|
||||||
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
|
||||||
IndexInput in = SimpleTextFieldsReader.this.in.clone();
|
IndexInput in = SimpleTextFieldsReader.this.in.clone();
|
||||||
in.seek(termsStart);
|
in.seek(termsStart);
|
||||||
|
@ -572,6 +690,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
long lastDocsStart = -1;
|
long lastDocsStart = -1;
|
||||||
int docFreq = 0;
|
int docFreq = 0;
|
||||||
long totalTermFreq = 0;
|
long totalTermFreq = 0;
|
||||||
|
long skipPointer = 0;
|
||||||
FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
|
FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
|
||||||
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
|
final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -581,7 +700,8 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
fstCompiler.add(
|
fstCompiler.add(
|
||||||
Util.toIntsRef(lastTerm.get(), scratchIntsRef),
|
Util.toIntsRef(lastTerm.get(), scratchIntsRef),
|
||||||
outputs.newPair(
|
outputs.newPair(
|
||||||
lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
|
outputsOuter.newPair(lastDocsStart, skipPointer),
|
||||||
|
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
||||||
sumTotalTermFreq += totalTermFreq;
|
sumTotalTermFreq += totalTermFreq;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -595,12 +715,15 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
} else if (StringHelper.startsWith(scratch.get(), FREQ)) {
|
} else if (StringHelper.startsWith(scratch.get(), FREQ)) {
|
||||||
scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length() - FREQ.length);
|
scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length() - FREQ.length);
|
||||||
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
|
totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
|
||||||
|
} else if (StringHelper.startsWith(scratch.get(), SKIP_LIST)) {
|
||||||
|
skipPointer = in.getFilePointer();
|
||||||
} else if (StringHelper.startsWith(scratch.get(), TERM)) {
|
} else if (StringHelper.startsWith(scratch.get(), TERM)) {
|
||||||
if (lastDocsStart != -1) {
|
if (lastDocsStart != -1) {
|
||||||
fstCompiler.add(
|
fstCompiler.add(
|
||||||
Util.toIntsRef(lastTerm.get(), scratchIntsRef),
|
Util.toIntsRef(lastTerm.get(), scratchIntsRef),
|
||||||
outputs.newPair(
|
outputs.newPair(
|
||||||
lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
|
outputsOuter.newPair(lastDocsStart, skipPointer),
|
||||||
|
outputsInner.newPair((long) docFreq, totalTermFreq)));
|
||||||
}
|
}
|
||||||
lastDocsStart = in.getFilePointer();
|
lastDocsStart = in.getFilePointer();
|
||||||
final int len = scratch.length() - TERM.length;
|
final int len = scratch.length() - TERM.length;
|
||||||
|
@ -611,6 +734,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
||||||
sumTotalTermFreq += totalTermFreq;
|
sumTotalTermFreq += totalTermFreq;
|
||||||
totalTermFreq = 0;
|
totalTermFreq = 0;
|
||||||
termCount++;
|
termCount++;
|
||||||
|
skipPointer = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
docCount = visitedDocs.cardinality();
|
docCount = visitedDocs.cardinality();
|
||||||
|
|
|
@ -17,11 +17,13 @@
|
||||||
package org.apache.lucene.codecs.simpletext;
|
package org.apache.lucene.codecs.simpletext;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||||
import org.apache.lucene.codecs.FieldsConsumer;
|
import org.apache.lucene.codecs.FieldsConsumer;
|
||||||
import org.apache.lucene.codecs.NormsProducer;
|
import org.apache.lucene.codecs.NormsProducer;
|
||||||
import org.apache.lucene.index.FieldInfo;
|
import org.apache.lucene.index.FieldInfo;
|
||||||
import org.apache.lucene.index.FieldInfos;
|
import org.apache.lucene.index.FieldInfos;
|
||||||
import org.apache.lucene.index.Fields;
|
import org.apache.lucene.index.Fields;
|
||||||
|
import org.apache.lucene.index.NumericDocValues;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
import org.apache.lucene.index.SegmentWriteState;
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
|
@ -37,6 +39,14 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
private final SegmentWriteState writeState;
|
private final SegmentWriteState writeState;
|
||||||
final String segment;
|
final String segment;
|
||||||
|
|
||||||
|
/** State used for writing skip data. */
|
||||||
|
private int docCount = 0;
|
||||||
|
|
||||||
|
private final SimpleTextSkipWriter skipWriter;
|
||||||
|
private final CompetitiveImpactAccumulator competitiveImpactAccumulator =
|
||||||
|
new CompetitiveImpactAccumulator();
|
||||||
|
private long lastDocFilePointer = -1;
|
||||||
|
|
||||||
static final BytesRef END = new BytesRef("END");
|
static final BytesRef END = new BytesRef("END");
|
||||||
static final BytesRef FIELD = new BytesRef("field ");
|
static final BytesRef FIELD = new BytesRef("field ");
|
||||||
static final BytesRef TERM = new BytesRef(" term ");
|
static final BytesRef TERM = new BytesRef(" term ");
|
||||||
|
@ -54,14 +64,16 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
segment = writeState.segmentInfo.name;
|
segment = writeState.segmentInfo.name;
|
||||||
out = writeState.directory.createOutput(fileName, writeState.context);
|
out = writeState.directory.createOutput(fileName, writeState.context);
|
||||||
this.writeState = writeState;
|
this.writeState = writeState;
|
||||||
|
this.skipWriter = new SimpleTextSkipWriter(writeState);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void write(Fields fields, NormsProducer norms) throws IOException {
|
public void write(Fields fields, NormsProducer norms) throws IOException {
|
||||||
write(writeState.fieldInfos, fields);
|
write(writeState.fieldInfos, fields, norms);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void write(FieldInfos fieldInfos, Fields fields) throws IOException {
|
public void write(FieldInfos fieldInfos, Fields fields, NormsProducer normsProducer)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
// for each field
|
// for each field
|
||||||
for (String field : fields) {
|
for (String field : fields) {
|
||||||
|
@ -78,6 +90,12 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
boolean hasFreqs = terms.hasFreqs();
|
boolean hasFreqs = terms.hasFreqs();
|
||||||
boolean hasPayloads = fieldInfo.hasPayloads();
|
boolean hasPayloads = fieldInfo.hasPayloads();
|
||||||
boolean hasOffsets = terms.hasOffsets();
|
boolean hasOffsets = terms.hasOffsets();
|
||||||
|
boolean fieldHasNorms = fieldInfo.hasNorms();
|
||||||
|
|
||||||
|
NumericDocValues norms = null;
|
||||||
|
if (fieldHasNorms && normsProducer != null) {
|
||||||
|
norms = normsProducer.getNorms(fieldInfo);
|
||||||
|
}
|
||||||
|
|
||||||
int flags = 0;
|
int flags = 0;
|
||||||
if (hasPositions) {
|
if (hasPositions) {
|
||||||
|
@ -103,6 +121,10 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
if (term == null) {
|
if (term == null) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
docCount = 0;
|
||||||
|
skipWriter.resetSkip();
|
||||||
|
competitiveImpactAccumulator.clear();
|
||||||
|
lastDocFilePointer = -1;
|
||||||
|
|
||||||
postingsEnum = termsEnum.postings(postingsEnum, flags);
|
postingsEnum = termsEnum.postings(postingsEnum, flags);
|
||||||
|
|
||||||
|
@ -136,7 +158,9 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
newline();
|
newline();
|
||||||
wroteTerm = true;
|
wroteTerm = true;
|
||||||
}
|
}
|
||||||
|
if (lastDocFilePointer == -1) {
|
||||||
|
lastDocFilePointer = out.getFilePointer();
|
||||||
|
}
|
||||||
write(DOC);
|
write(DOC);
|
||||||
write(Integer.toString(doc));
|
write(Integer.toString(doc));
|
||||||
newline();
|
newline();
|
||||||
|
@ -183,7 +207,19 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
competitiveImpactAccumulator.add(freq, getNorm(doc, norms));
|
||||||
|
} else {
|
||||||
|
competitiveImpactAccumulator.add(1, getNorm(doc, norms));
|
||||||
}
|
}
|
||||||
|
docCount++;
|
||||||
|
if (docCount != 0 && docCount % SimpleTextSkipWriter.BLOCK_SIZE == 0) {
|
||||||
|
skipWriter.bufferSkip(doc, lastDocFilePointer, docCount, competitiveImpactAccumulator);
|
||||||
|
competitiveImpactAccumulator.clear();
|
||||||
|
lastDocFilePointer = -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (docCount >= SimpleTextSkipWriter.BLOCK_SIZE) {
|
||||||
|
skipWriter.writeSkip(out);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -214,4 +250,15 @@ class SimpleTextFieldsWriter extends FieldsConsumer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private long getNorm(int doc, NumericDocValues norms) throws IOException {
|
||||||
|
if (norms == null) {
|
||||||
|
return 1L;
|
||||||
|
}
|
||||||
|
boolean found = norms.advanceExact(doc);
|
||||||
|
if (found == false) {
|
||||||
|
return 1L;
|
||||||
|
}
|
||||||
|
return norms.longValue();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,206 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.simpletext;
|
||||||
|
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.CHILD_POINTER;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.FREQ;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.IMPACT;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.IMPACTS;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.IMPACTS_END;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.LEVEL_LENGTH;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.NORM;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.SKIP_DOC;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.SKIP_DOC_FP;
|
||||||
|
import static org.apache.lucene.codecs.simpletext.SimpleTextSkipWriter.SKIP_LIST;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import org.apache.lucene.codecs.MultiLevelSkipListReader;
|
||||||
|
import org.apache.lucene.index.Impact;
|
||||||
|
import org.apache.lucene.index.Impacts;
|
||||||
|
import org.apache.lucene.search.DocIdSetIterator;
|
||||||
|
import org.apache.lucene.store.BufferedChecksumIndexInput;
|
||||||
|
import org.apache.lucene.store.ChecksumIndexInput;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
|
import org.apache.lucene.util.CharsRefBuilder;
|
||||||
|
import org.apache.lucene.util.StringHelper;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class reads skip lists with multiple levels.
|
||||||
|
*
|
||||||
|
* <p>See {@link SimpleTextFieldsWriter} for the information about the encoding of the multi level
|
||||||
|
* skip lists.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
class SimpleTextSkipReader extends MultiLevelSkipListReader {
|
||||||
|
|
||||||
|
private final CharsRefBuilder scratchUTF16 = new CharsRefBuilder();
|
||||||
|
private final BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
|
private Impacts impacts;
|
||||||
|
private List<List<Impact>> perLevelImpacts;
|
||||||
|
private long nextSkipDocFP = -1;
|
||||||
|
private int numLevels = 1;
|
||||||
|
private boolean hasSkipList = false;
|
||||||
|
|
||||||
|
SimpleTextSkipReader(IndexInput skipStream) {
|
||||||
|
super(
|
||||||
|
skipStream,
|
||||||
|
SimpleTextSkipWriter.maxSkipLevels,
|
||||||
|
SimpleTextSkipWriter.BLOCK_SIZE,
|
||||||
|
SimpleTextSkipWriter.skipMultiplier);
|
||||||
|
impacts =
|
||||||
|
new Impacts() {
|
||||||
|
@Override
|
||||||
|
public int numLevels() {
|
||||||
|
return numLevels;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getDocIdUpTo(int level) {
|
||||||
|
return skipDoc[level];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Impact> getImpacts(int level) {
|
||||||
|
assert level < numLevels;
|
||||||
|
return perLevelImpacts.get(level);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
init();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int skipTo(int target) throws IOException {
|
||||||
|
if (!hasSkipList) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
int result = super.skipTo(target);
|
||||||
|
if (numberOfSkipLevels > 0) {
|
||||||
|
numLevels = numberOfSkipLevels;
|
||||||
|
} else {
|
||||||
|
// End of postings don't have skip data anymore, so we fill with dummy data
|
||||||
|
// like SlowImpactsEnum.
|
||||||
|
numLevels = 1;
|
||||||
|
perLevelImpacts.add(0, Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected int readSkipData(int level, IndexInput skipStream) throws IOException {
|
||||||
|
perLevelImpacts.get(level).clear();
|
||||||
|
int skipDoc = DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
ChecksumIndexInput input = new BufferedChecksumIndexInput(skipStream);
|
||||||
|
int freq = 1;
|
||||||
|
while (true) {
|
||||||
|
SimpleTextUtil.readLine(input, scratch);
|
||||||
|
if (scratch.get().equals(SimpleTextFieldsWriter.END)) {
|
||||||
|
SimpleTextUtil.checkFooter(input);
|
||||||
|
break;
|
||||||
|
} else if (scratch.get().equals(IMPACTS_END)
|
||||||
|
|| scratch.get().equals(SimpleTextFieldsWriter.TERM)
|
||||||
|
|| scratch.get().equals(SimpleTextFieldsWriter.FIELD)) {
|
||||||
|
break;
|
||||||
|
} else if (StringHelper.startsWith(scratch.get(), SKIP_LIST)) {
|
||||||
|
// continue
|
||||||
|
} else if (StringHelper.startsWith(scratch.get(), SKIP_DOC)) {
|
||||||
|
scratchUTF16.copyUTF8Bytes(
|
||||||
|
scratch.bytes(), SKIP_DOC.length, scratch.length() - SKIP_DOC.length);
|
||||||
|
skipDoc = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
|
||||||
|
// Because the MultiLevelSkipListReader stores doc id delta,but simple text codec stores doc
|
||||||
|
// id
|
||||||
|
skipDoc = skipDoc - super.skipDoc[level];
|
||||||
|
} else if (StringHelper.startsWith(scratch.get(), SKIP_DOC_FP)) {
|
||||||
|
scratchUTF16.copyUTF8Bytes(
|
||||||
|
scratch.bytes(), SKIP_DOC_FP.length, scratch.length() - SKIP_DOC_FP.length);
|
||||||
|
nextSkipDocFP = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
|
||||||
|
} else if (StringHelper.startsWith(scratch.get(), IMPACTS)
|
||||||
|
|| StringHelper.startsWith(scratch.get(), IMPACT)) {
|
||||||
|
// continue;
|
||||||
|
} else if (StringHelper.startsWith(scratch.get(), FREQ)) {
|
||||||
|
scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length() - FREQ.length);
|
||||||
|
freq = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
|
||||||
|
} else if (StringHelper.startsWith(scratch.get(), NORM)) {
|
||||||
|
scratchUTF16.copyUTF8Bytes(scratch.bytes(), NORM.length, scratch.length() - NORM.length);
|
||||||
|
long norm = Long.parseLong(scratchUTF16.toString());
|
||||||
|
Impact impact = new Impact(freq, norm);
|
||||||
|
perLevelImpacts.get(level).add(impact);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return skipDoc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected long readLevelLength(IndexInput skipStream) throws IOException {
|
||||||
|
SimpleTextUtil.readLine(skipStream, scratch);
|
||||||
|
scratchUTF16.copyUTF8Bytes(
|
||||||
|
scratch.bytes(), LEVEL_LENGTH.length, scratch.length() - LEVEL_LENGTH.length);
|
||||||
|
return Long.parseLong(scratchUTF16.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected long readChildPointer(IndexInput skipStream) throws IOException {
|
||||||
|
SimpleTextUtil.readLine(skipStream, scratch);
|
||||||
|
scratchUTF16.copyUTF8Bytes(
|
||||||
|
scratch.bytes(), CHILD_POINTER.length, scratch.length() - CHILD_POINTER.length);
|
||||||
|
return Long.parseLong(scratchUTF16.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset(long skipPointer, int docFreq) throws IOException {
|
||||||
|
init();
|
||||||
|
if (skipPointer > 0) {
|
||||||
|
super.init(skipPointer, docFreq);
|
||||||
|
hasSkipList = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void init() {
|
||||||
|
nextSkipDocFP = -1;
|
||||||
|
numLevels = 1;
|
||||||
|
perLevelImpacts = new ArrayList<>(maxNumberOfSkipLevels);
|
||||||
|
for (int level = 0; level < maxNumberOfSkipLevels; level++) {
|
||||||
|
List<Impact> impacts = new ArrayList<>();
|
||||||
|
impacts.add(new Impact(Integer.MAX_VALUE, 1L));
|
||||||
|
perLevelImpacts.add(level, impacts);
|
||||||
|
}
|
||||||
|
hasSkipList = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
Impacts getImpacts() {
|
||||||
|
return impacts;
|
||||||
|
}
|
||||||
|
|
||||||
|
long getNextSkipDocFP() {
|
||||||
|
return nextSkipDocFP;
|
||||||
|
}
|
||||||
|
|
||||||
|
int getNextSkipDoc() {
|
||||||
|
if (!hasSkipList) {
|
||||||
|
return DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
}
|
||||||
|
return skipDoc[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean hasSkipList() {
|
||||||
|
return hasSkipList;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,157 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.codecs.simpletext;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
|
||||||
|
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
|
||||||
|
import org.apache.lucene.index.Impact;
|
||||||
|
import org.apache.lucene.index.SegmentWriteState;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* plain text skip data.
|
||||||
|
*
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
class SimpleTextSkipWriter extends MultiLevelSkipListWriter {
|
||||||
|
|
||||||
|
static final int skipMultiplier = 3;
|
||||||
|
static final int maxSkipLevels = 4;
|
||||||
|
|
||||||
|
static final int BLOCK_SIZE = 8;
|
||||||
|
private Map<Integer, Boolean> wroteHeaderPerLevelMap = new HashMap<>();
|
||||||
|
private int curDoc;
|
||||||
|
private long curDocFilePointer;
|
||||||
|
private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;
|
||||||
|
private final BytesRefBuilder scratch = new BytesRefBuilder();
|
||||||
|
|
||||||
|
static final BytesRef SKIP_LIST = new BytesRef(" skipList ");
|
||||||
|
static final BytesRef LEVEL_LENGTH = new BytesRef(" levelLength ");
|
||||||
|
static final BytesRef LEVEL = new BytesRef(" level ");
|
||||||
|
static final BytesRef SKIP_DOC = new BytesRef(" skipDoc ");
|
||||||
|
static final BytesRef SKIP_DOC_FP = new BytesRef(" skipDocFP ");
|
||||||
|
static final BytesRef IMPACTS = new BytesRef(" impacts ");
|
||||||
|
static final BytesRef IMPACT = new BytesRef(" impact ");
|
||||||
|
static final BytesRef FREQ = new BytesRef(" freq ");
|
||||||
|
static final BytesRef NORM = new BytesRef(" norm ");
|
||||||
|
static final BytesRef IMPACTS_END = new BytesRef(" impactsEnd ");
|
||||||
|
static final BytesRef CHILD_POINTER = new BytesRef(" childPointer ");
|
||||||
|
|
||||||
|
SimpleTextSkipWriter(SegmentWriteState writeState) throws IOException {
|
||||||
|
super(BLOCK_SIZE, skipMultiplier, maxSkipLevels, writeState.segmentInfo.maxDoc());
|
||||||
|
curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
|
||||||
|
for (int i = 0; i < maxSkipLevels; ++i) {
|
||||||
|
curCompetitiveFreqNorms[i] = new CompetitiveImpactAccumulator();
|
||||||
|
}
|
||||||
|
resetSkip();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeSkipData(int level, DataOutput skipBuffer) throws IOException {
|
||||||
|
Boolean wroteHeader = wroteHeaderPerLevelMap.get(level);
|
||||||
|
if (wroteHeader == null || !wroteHeader) {
|
||||||
|
SimpleTextUtil.write(skipBuffer, LEVEL);
|
||||||
|
SimpleTextUtil.write(skipBuffer, level + "", scratch);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
|
||||||
|
wroteHeaderPerLevelMap.put(level, true);
|
||||||
|
}
|
||||||
|
SimpleTextUtil.write(skipBuffer, SKIP_DOC);
|
||||||
|
SimpleTextUtil.write(skipBuffer, curDoc + "", scratch);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
|
||||||
|
SimpleTextUtil.write(skipBuffer, SKIP_DOC_FP);
|
||||||
|
SimpleTextUtil.write(skipBuffer, curDocFilePointer + "", scratch);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
|
||||||
|
CompetitiveImpactAccumulator competitiveFreqNorms = curCompetitiveFreqNorms[level];
|
||||||
|
Collection<Impact> impacts = competitiveFreqNorms.getCompetitiveFreqNormPairs();
|
||||||
|
assert impacts.size() > 0;
|
||||||
|
if (level + 1 < numberOfSkipLevels) {
|
||||||
|
curCompetitiveFreqNorms[level + 1].addAll(competitiveFreqNorms);
|
||||||
|
}
|
||||||
|
SimpleTextUtil.write(skipBuffer, IMPACTS);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
for (Impact impact : impacts) {
|
||||||
|
SimpleTextUtil.write(skipBuffer, IMPACT);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
SimpleTextUtil.write(skipBuffer, FREQ);
|
||||||
|
SimpleTextUtil.write(skipBuffer, impact.freq + "", scratch);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
SimpleTextUtil.write(skipBuffer, NORM);
|
||||||
|
SimpleTextUtil.write(skipBuffer, impact.norm + "", scratch);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
}
|
||||||
|
SimpleTextUtil.write(skipBuffer, IMPACTS_END);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
competitiveFreqNorms.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void resetSkip() {
|
||||||
|
super.resetSkip();
|
||||||
|
wroteHeaderPerLevelMap.clear();
|
||||||
|
this.curDoc = -1;
|
||||||
|
this.curDocFilePointer = -1;
|
||||||
|
for (CompetitiveImpactAccumulator acc : curCompetitiveFreqNorms) {
|
||||||
|
acc.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long writeSkip(IndexOutput output) throws IOException {
|
||||||
|
long skipOffset = output.getFilePointer();
|
||||||
|
SimpleTextUtil.write(output, SKIP_LIST);
|
||||||
|
SimpleTextUtil.writeNewline(output);
|
||||||
|
super.writeSkip(output);
|
||||||
|
return skipOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
void bufferSkip(
|
||||||
|
int doc,
|
||||||
|
long docFilePointer,
|
||||||
|
int numDocs,
|
||||||
|
final CompetitiveImpactAccumulator competitiveImpactAccumulator)
|
||||||
|
throws IOException {
|
||||||
|
assert doc > curDoc;
|
||||||
|
this.curDoc = doc;
|
||||||
|
this.curDocFilePointer = docFilePointer;
|
||||||
|
this.curCompetitiveFreqNorms[0].addAll(competitiveImpactAccumulator);
|
||||||
|
bufferSkip(numDocs);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeLevelLength(long levelLength, IndexOutput output) throws IOException {
|
||||||
|
SimpleTextUtil.write(output, LEVEL_LENGTH);
|
||||||
|
SimpleTextUtil.write(output, levelLength + "", scratch);
|
||||||
|
SimpleTextUtil.writeNewline(output);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeChildPointer(long childPointer, DataOutput skipBuffer) throws IOException {
|
||||||
|
SimpleTextUtil.write(skipBuffer, CHILD_POINTER);
|
||||||
|
SimpleTextUtil.write(skipBuffer, childPointer + "", scratch);
|
||||||
|
SimpleTextUtil.writeNewline(skipBuffer);
|
||||||
|
}
|
||||||
|
}
|
|
@ -19,7 +19,9 @@ package org.apache.lucene.codecs;
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import org.apache.lucene.store.DataOutput;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.MathUtil;
|
import org.apache.lucene.util.MathUtil;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -162,7 +164,7 @@ public abstract class MultiLevelSkipListReader implements Closeable {
|
||||||
|
|
||||||
if (level != 0) {
|
if (level != 0) {
|
||||||
// read the child pointer if we are not on the leaf level
|
// read the child pointer if we are not on the leaf level
|
||||||
childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1];
|
childPointer[level] = readChildPointer(skipStream[level]) + skipPointer[level - 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
@ -174,7 +176,7 @@ public abstract class MultiLevelSkipListReader implements Closeable {
|
||||||
numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1];
|
numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1];
|
||||||
skipDoc[level] = lastDoc;
|
skipDoc[level] = lastDoc;
|
||||||
if (level > 0) {
|
if (level > 0) {
|
||||||
childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1];
|
childPointer[level] = readChildPointer(skipStream[level]) + skipPointer[level - 1];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -221,7 +223,7 @@ public abstract class MultiLevelSkipListReader implements Closeable {
|
||||||
|
|
||||||
for (int i = numberOfSkipLevels - 1; i > 0; i--) {
|
for (int i = numberOfSkipLevels - 1; i > 0; i--) {
|
||||||
// the length of the current level
|
// the length of the current level
|
||||||
long length = skipStream[0].readVLong();
|
long length = readLevelLength(skipStream[0]);
|
||||||
|
|
||||||
// the start pointer of the current level
|
// the start pointer of the current level
|
||||||
skipPointer[i] = skipStream[0].getFilePointer();
|
skipPointer[i] = skipStream[0].getFilePointer();
|
||||||
|
@ -250,6 +252,28 @@ public abstract class MultiLevelSkipListReader implements Closeable {
|
||||||
*/
|
*/
|
||||||
protected abstract int readSkipData(int level, IndexInput skipStream) throws IOException;
|
protected abstract int readSkipData(int level, IndexInput skipStream) throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* read the length of the current level written via {@link
|
||||||
|
* MultiLevelSkipListWriter#writeLevelLength(long, IndexOutput)}.
|
||||||
|
*
|
||||||
|
* @param skipStream the IndexInput the length shall be read from
|
||||||
|
* @return level length
|
||||||
|
*/
|
||||||
|
protected long readLevelLength(IndexInput skipStream) throws IOException {
|
||||||
|
return skipStream.readVLong();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* read the child pointer written via {@link MultiLevelSkipListWriter#writeChildPointer(long,
|
||||||
|
* DataOutput)}.
|
||||||
|
*
|
||||||
|
* @param skipStream the IndexInput the child pointer shall be read from
|
||||||
|
* @return child pointer
|
||||||
|
*/
|
||||||
|
protected long readChildPointer(IndexInput skipStream) throws IOException {
|
||||||
|
return skipStream.readVLong();
|
||||||
|
}
|
||||||
|
|
||||||
/** Copies the values of the last read skip entry on this level */
|
/** Copies the values of the last read skip entry on this level */
|
||||||
protected void setLastSkipData(int level) {
|
protected void setLastSkipData(int level) {
|
||||||
lastDoc = skipDoc[level];
|
lastDoc = skipDoc[level];
|
||||||
|
|
|
@ -147,7 +147,7 @@ public abstract class MultiLevelSkipListWriter {
|
||||||
|
|
||||||
if (level != 0) {
|
if (level != 0) {
|
||||||
// store child pointers for all levels except the lowest
|
// store child pointers for all levels except the lowest
|
||||||
skipBuffer[level].writeVLong(childPointer);
|
writeChildPointer(childPointer, skipBuffer[level]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// remember the childPointer for the next level
|
// remember the childPointer for the next level
|
||||||
|
@ -169,7 +169,7 @@ public abstract class MultiLevelSkipListWriter {
|
||||||
for (int level = numberOfSkipLevels - 1; level > 0; level--) {
|
for (int level = numberOfSkipLevels - 1; level > 0; level--) {
|
||||||
long length = skipBuffer[level].size();
|
long length = skipBuffer[level].size();
|
||||||
if (length > 0) {
|
if (length > 0) {
|
||||||
output.writeVLong(length);
|
writeLevelLength(length, output);
|
||||||
skipBuffer[level].copyTo(output);
|
skipBuffer[level].copyTo(output);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -177,4 +177,24 @@ public abstract class MultiLevelSkipListWriter {
|
||||||
|
|
||||||
return skipPointer;
|
return skipPointer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes the length of a level to the given output.
|
||||||
|
*
|
||||||
|
* @param levelLength the length of a level
|
||||||
|
* @param output the IndexOutput the length shall be written to
|
||||||
|
*/
|
||||||
|
protected void writeLevelLength(long levelLength, IndexOutput output) throws IOException {
|
||||||
|
output.writeVLong(levelLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes the child pointer of a block to the given output.
|
||||||
|
*
|
||||||
|
* @param childPointer block of higher level point to the lower level
|
||||||
|
* @param skipBuffer the skip buffer to write to
|
||||||
|
*/
|
||||||
|
protected void writeChildPointer(long childPointer, DataOutput skipBuffer) throws IOException {
|
||||||
|
skipBuffer.writeVLong(childPointer);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue