mirror of https://github.com/apache/lucene.git
Cleanup TermsHashPerField (#1573)
Several classes within the IndexWriter indexing chain haven't been touched for several years. Most of these classes expose their internals through public members and are difficult to construct in tests since they depend on many other classes. This change tries to clean up TermsHashPerField and adds a dedicated standalone test for it, making the class more accessible to other developers because it becomes simpler to understand. As a result of this refactoring, the change also attempts to improve the documentation.
This commit is contained in:
parent
a7792b129b
commit
c083e5414e
|
@ -26,7 +26,6 @@ import org.apache.lucene.util.ByteBlockPool;
|
||||||
* byte[]. This is used by DocumentsWriter to hold the
|
* byte[]. This is used by DocumentsWriter to hold the
|
||||||
* posting list for many terms in RAM.
|
* posting list for many terms in RAM.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
final class ByteSliceWriter extends DataOutput {
|
final class ByteSliceWriter extends DataOutput {
|
||||||
|
|
||||||
private byte[] slice;
|
private byte[] slice;
|
||||||
|
|
|
@ -929,7 +929,7 @@ final class DefaultIndexingChain extends DocConsumer {
|
||||||
// corrupt and should not be flushed to a
|
// corrupt and should not be flushed to a
|
||||||
// new segment:
|
// new segment:
|
||||||
try {
|
try {
|
||||||
termsHashPerField.add();
|
termsHashPerField.add(invertState.termAttribute.getBytesRef(), docState.docID);
|
||||||
} catch (MaxBytesLengthExceededException e) {
|
} catch (MaxBytesLengthExceededException e) {
|
||||||
byte[] prefix = new byte[30];
|
byte[] prefix = new byte[30];
|
||||||
BytesRef bigTerm = invertState.termAttribute.getBytesRef();
|
BytesRef bigTerm = invertState.termAttribute.getBytesRef();
|
||||||
|
|
|
@ -39,7 +39,7 @@ class FreqProxFields extends Fields {
|
||||||
public FreqProxFields(List<FreqProxTermsWriterPerField> fieldList) {
|
public FreqProxFields(List<FreqProxTermsWriterPerField> fieldList) {
|
||||||
// NOTE: fields are already sorted by field name
|
// NOTE: fields are already sorted by field name
|
||||||
for(FreqProxTermsWriterPerField field : fieldList) {
|
for(FreqProxTermsWriterPerField field : fieldList) {
|
||||||
fields.put(field.fieldInfo.name, field);
|
fields.put(field.getFieldName(), field);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -55,7 +55,6 @@ class FreqProxFields extends Fields {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int size() {
|
public int size() {
|
||||||
//return fields.size();
|
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,31 +74,27 @@ class FreqProxFields extends Fields {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long size() {
|
public long size() {
|
||||||
//return terms.termsHashPerField.bytesHash.size();
|
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long getSumTotalTermFreq() {
|
public long getSumTotalTermFreq() {
|
||||||
//return terms.sumTotalTermFreq;
|
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public long getSumDocFreq() {
|
public long getSumDocFreq() {
|
||||||
//return terms.sumDocFreq;
|
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int getDocCount() {
|
public int getDocCount() {
|
||||||
//return terms.docCount;
|
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean hasFreqs() {
|
public boolean hasFreqs() {
|
||||||
return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
return terms.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -107,7 +102,7 @@ class FreqProxFields extends Fields {
|
||||||
// NOTE: the in-memory buffer may have indexed offsets
|
// NOTE: the in-memory buffer may have indexed offsets
|
||||||
// because that's what FieldInfo said when we started,
|
// because that's what FieldInfo said when we started,
|
||||||
// but during indexing this may have been downgraded:
|
// but during indexing this may have been downgraded:
|
||||||
return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
return terms.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -115,7 +110,7 @@ class FreqProxFields extends Fields {
|
||||||
// NOTE: the in-memory buffer may have indexed positions
|
// NOTE: the in-memory buffer may have indexed positions
|
||||||
// because that's what FieldInfo said when we started,
|
// because that's what FieldInfo said when we started,
|
||||||
// but during indexing this may have been downgraded:
|
// but during indexing this may have been downgraded:
|
||||||
return terms.fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
return terms.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -132,10 +127,10 @@ class FreqProxFields extends Fields {
|
||||||
final int numTerms;
|
final int numTerms;
|
||||||
int ord;
|
int ord;
|
||||||
|
|
||||||
public FreqProxTermsEnum(FreqProxTermsWriterPerField terms) {
|
FreqProxTermsEnum(FreqProxTermsWriterPerField terms) {
|
||||||
this.terms = terms;
|
this.terms = terms;
|
||||||
this.numTerms = terms.bytesHash.size();
|
this.numTerms = terms.getNumTerms();
|
||||||
sortedTermIDs = terms.sortedTermIDs;
|
sortedTermIDs = terms.getSortedTermIDs();
|
||||||
assert sortedTermIDs != null;
|
assert sortedTermIDs != null;
|
||||||
postingsArray = (FreqProxPostingsArray) terms.postingsArray;
|
postingsArray = (FreqProxPostingsArray) terms.postingsArray;
|
||||||
}
|
}
|
||||||
|
|
|
@ -75,9 +75,9 @@ final class FreqProxTermsWriter extends TermsHash {
|
||||||
|
|
||||||
for (TermsHashPerField f : fieldsToFlush.values()) {
|
for (TermsHashPerField f : fieldsToFlush.values()) {
|
||||||
final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f;
|
final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) f;
|
||||||
if (perField.bytesHash.size() > 0) {
|
if (perField.getNumTerms() > 0) {
|
||||||
perField.sortPostings();
|
perField.sortTerms();
|
||||||
assert perField.fieldInfo.getIndexOptions() != IndexOptions.NONE;
|
assert perField.indexOptions != IndexOptions.NONE;
|
||||||
allFields.add(perField);
|
allFields.add(perField);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
// TODO: break into separate freq and prox writers as
|
// TODO: break into separate freq and prox writers as
|
||||||
|
@ -28,26 +29,25 @@ import org.apache.lucene.util.BytesRef;
|
||||||
final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
|
|
||||||
private FreqProxPostingsArray freqProxPostingsArray;
|
private FreqProxPostingsArray freqProxPostingsArray;
|
||||||
|
private final FieldInvertState fieldState;
|
||||||
|
private final FieldInfo fieldInfo;
|
||||||
|
|
||||||
final boolean hasFreq;
|
final boolean hasFreq;
|
||||||
final boolean hasProx;
|
final boolean hasProx;
|
||||||
final boolean hasOffsets;
|
final boolean hasOffsets;
|
||||||
PayloadAttribute payloadAttribute;
|
PayloadAttribute payloadAttribute;
|
||||||
OffsetAttribute offsetAttribute;
|
OffsetAttribute offsetAttribute;
|
||||||
long sumTotalTermFreq;
|
TermFrequencyAttribute termFreqAtt;
|
||||||
long sumDocFreq;
|
|
||||||
|
|
||||||
// How many docs have this field:
|
|
||||||
int docCount;
|
|
||||||
|
|
||||||
/** Set to true if any token had a payload in the current
|
/** Set to true if any token had a payload in the current
|
||||||
* segment. */
|
* segment. */
|
||||||
boolean sawPayloads;
|
boolean sawPayloads;
|
||||||
|
|
||||||
public FreqProxTermsWriterPerField(FieldInvertState invertState, TermsHash termsHash, FieldInfo fieldInfo, TermsHashPerField nextPerField) {
|
FreqProxTermsWriterPerField(FieldInvertState invertState, TermsHash termsHash, FieldInfo fieldInfo, TermsHashPerField nextPerField) {
|
||||||
super(fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? 2 : 1, invertState, termsHash, nextPerField, fieldInfo);
|
super(fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 ? 2 : 1,
|
||||||
IndexOptions indexOptions = fieldInfo.getIndexOptions();
|
termsHash.intPool, termsHash.bytePool, termsHash.termBytePool, termsHash.bytesUsed, nextPerField, fieldInfo.name, fieldInfo.getIndexOptions());
|
||||||
assert indexOptions != IndexOptions.NONE;
|
this.fieldState = invertState;
|
||||||
|
this.fieldInfo = fieldInfo;
|
||||||
hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||||
hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
|
@ -56,12 +56,6 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
@Override
|
@Override
|
||||||
void finish() throws IOException {
|
void finish() throws IOException {
|
||||||
super.finish();
|
super.finish();
|
||||||
sumDocFreq += fieldState.uniqueTermCount;
|
|
||||||
sumTotalTermFreq += fieldState.length;
|
|
||||||
if (fieldState.length > 0) {
|
|
||||||
docCount++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sawPayloads) {
|
if (sawPayloads) {
|
||||||
fieldInfo.setStorePayloads();
|
fieldInfo.setStorePayloads();
|
||||||
}
|
}
|
||||||
|
@ -70,6 +64,7 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
@Override
|
@Override
|
||||||
boolean start(IndexableField f, boolean first) {
|
boolean start(IndexableField f, boolean first) {
|
||||||
super.start(f, first);
|
super.start(f, first);
|
||||||
|
termFreqAtt = fieldState.termFreqAttribute;
|
||||||
payloadAttribute = fieldState.payloadAttribute;
|
payloadAttribute = fieldState.payloadAttribute;
|
||||||
offsetAttribute = fieldState.offsetAttribute;
|
offsetAttribute = fieldState.offsetAttribute;
|
||||||
return true;
|
return true;
|
||||||
|
@ -104,18 +99,18 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
void newTerm(final int termID) {
|
void newTerm(final int termID, final int docID) {
|
||||||
// First time we're seeing this term since the last
|
// First time we're seeing this term since the last
|
||||||
// flush
|
// flush
|
||||||
final FreqProxPostingsArray postings = freqProxPostingsArray;
|
final FreqProxPostingsArray postings = freqProxPostingsArray;
|
||||||
|
|
||||||
postings.lastDocIDs[termID] = docState.docID;
|
postings.lastDocIDs[termID] = docID;
|
||||||
if (!hasFreq) {
|
if (!hasFreq) {
|
||||||
assert postings.termFreqs == null;
|
assert postings.termFreqs == null;
|
||||||
postings.lastDocCodes[termID] = docState.docID;
|
postings.lastDocCodes[termID] = docID;
|
||||||
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
|
||||||
} else {
|
} else {
|
||||||
postings.lastDocCodes[termID] = docState.docID << 1;
|
postings.lastDocCodes[termID] = docID << 1;
|
||||||
postings.termFreqs[termID] = getTermFreq();
|
postings.termFreqs[termID] = getTermFreq();
|
||||||
if (hasProx) {
|
if (hasProx) {
|
||||||
writeProx(termID, fieldState.position);
|
writeProx(termID, fieldState.position);
|
||||||
|
@ -131,25 +126,25 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
void addTerm(final int termID) {
|
void addTerm(final int termID, final int docID) {
|
||||||
final FreqProxPostingsArray postings = freqProxPostingsArray;
|
final FreqProxPostingsArray postings = freqProxPostingsArray;
|
||||||
assert !hasFreq || postings.termFreqs[termID] > 0;
|
assert !hasFreq || postings.termFreqs[termID] > 0;
|
||||||
|
|
||||||
if (!hasFreq) {
|
if (!hasFreq) {
|
||||||
assert postings.termFreqs == null;
|
assert postings.termFreqs == null;
|
||||||
if (termFreqAtt.getTermFrequency() != 1) {
|
if (termFreqAtt.getTermFrequency() != 1) {
|
||||||
throw new IllegalStateException("field \"" + fieldInfo.name + "\": must index term freq while using custom TermFrequencyAttribute");
|
throw new IllegalStateException("field \"" + getFieldName() + "\": must index term freq while using custom TermFrequencyAttribute");
|
||||||
}
|
}
|
||||||
if (docState.docID != postings.lastDocIDs[termID]) {
|
if (docID != postings.lastDocIDs[termID]) {
|
||||||
// New document; now encode docCode for previous doc:
|
// New document; now encode docCode for previous doc:
|
||||||
assert docState.docID > postings.lastDocIDs[termID];
|
assert docID > postings.lastDocIDs[termID];
|
||||||
writeVInt(0, postings.lastDocCodes[termID]);
|
writeVInt(0, postings.lastDocCodes[termID]);
|
||||||
postings.lastDocCodes[termID] = docState.docID - postings.lastDocIDs[termID];
|
postings.lastDocCodes[termID] = docID - postings.lastDocIDs[termID];
|
||||||
postings.lastDocIDs[termID] = docState.docID;
|
postings.lastDocIDs[termID] = docID;
|
||||||
fieldState.uniqueTermCount++;
|
fieldState.uniqueTermCount++;
|
||||||
}
|
}
|
||||||
} else if (docState.docID != postings.lastDocIDs[termID]) {
|
} else if (docID != postings.lastDocIDs[termID]) {
|
||||||
assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
|
assert docID > postings.lastDocIDs[termID]:"id: "+docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
|
||||||
// Term not yet seen in the current doc but previously
|
// Term not yet seen in the current doc but previously
|
||||||
// seen in other doc(s) since the last flush
|
// seen in other doc(s) since the last flush
|
||||||
|
|
||||||
|
@ -165,8 +160,8 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
// Init freq for the current document
|
// Init freq for the current document
|
||||||
postings.termFreqs[termID] = getTermFreq();
|
postings.termFreqs[termID] = getTermFreq();
|
||||||
fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
|
fieldState.maxTermFrequency = Math.max(postings.termFreqs[termID], fieldState.maxTermFrequency);
|
||||||
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
|
postings.lastDocCodes[termID] = (docID - postings.lastDocIDs[termID]) << 1;
|
||||||
postings.lastDocIDs[termID] = docState.docID;
|
postings.lastDocIDs[termID] = docID;
|
||||||
if (hasProx) {
|
if (hasProx) {
|
||||||
writeProx(termID, fieldState.position);
|
writeProx(termID, fieldState.position);
|
||||||
if (hasOffsets) {
|
if (hasOffsets) {
|
||||||
|
@ -193,7 +188,7 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
int freq = termFreqAtt.getTermFrequency();
|
int freq = termFreqAtt.getTermFrequency();
|
||||||
if (freq != 1) {
|
if (freq != 1) {
|
||||||
if (hasProx) {
|
if (hasProx) {
|
||||||
throw new IllegalStateException("field \"" + fieldInfo.name + "\": cannot index positions while using custom TermFrequencyAttribute");
|
throw new IllegalStateException("field \"" + getFieldName() + "\": cannot index positions while using custom TermFrequencyAttribute");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -207,8 +202,6 @@ final class FreqProxTermsWriterPerField extends TermsHashPerField {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
ParallelPostingsArray createPostingsArray(int size) {
|
ParallelPostingsArray createPostingsArray(int size) {
|
||||||
IndexOptions indexOptions = fieldInfo.getIndexOptions();
|
|
||||||
assert indexOptions != IndexOptions.NONE;
|
|
||||||
boolean hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
boolean hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
|
||||||
boolean hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
boolean hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
boolean hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
boolean hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
|
|
|
@ -22,14 +22,14 @@ class ParallelPostingsArray {
|
||||||
final static int BYTES_PER_POSTING = 3 * Integer.BYTES;
|
final static int BYTES_PER_POSTING = 3 * Integer.BYTES;
|
||||||
|
|
||||||
final int size;
|
final int size;
|
||||||
final int[] textStarts;
|
final int[] textStarts; // maps term ID to the term's text start in the bytesHash
|
||||||
final int[] intStarts;
|
final int[] addressOffset; // maps term ID to current stream address
|
||||||
final int[] byteStarts;
|
final int[] byteStarts; // maps term ID to stream start offset in the byte pool
|
||||||
|
|
||||||
ParallelPostingsArray(final int size) {
|
ParallelPostingsArray(final int size) {
|
||||||
this.size = size;
|
this.size = size;
|
||||||
textStarts = new int[size];
|
textStarts = new int[size];
|
||||||
intStarts = new int[size];
|
addressOffset = new int[size];
|
||||||
byteStarts = new int[size];
|
byteStarts = new int[size];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -50,7 +50,7 @@ class ParallelPostingsArray {
|
||||||
|
|
||||||
void copyTo(ParallelPostingsArray toArray, int numToCopy) {
|
void copyTo(ParallelPostingsArray toArray, int numToCopy) {
|
||||||
System.arraycopy(textStarts, 0, toArray.textStarts, 0, numToCopy);
|
System.arraycopy(textStarts, 0, toArray.textStarts, 0, numToCopy);
|
||||||
System.arraycopy(intStarts, 0, toArray.intStarts, 0, numToCopy);
|
System.arraycopy(addressOffset, 0, toArray.addressOffset, 0, numToCopy);
|
||||||
System.arraycopy(byteStarts, 0, toArray.byteStarts, 0, numToCopy);
|
System.arraycopy(byteStarts, 0, toArray.byteStarts, 0, numToCopy);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,11 +44,11 @@ class TermVectorsConsumer extends TermsHash {
|
||||||
final ByteSliceReader vectorSliceReaderOff = new ByteSliceReader();
|
final ByteSliceReader vectorSliceReaderOff = new ByteSliceReader();
|
||||||
|
|
||||||
boolean hasVectors;
|
boolean hasVectors;
|
||||||
int numVectorFields;
|
private int numVectorFields;
|
||||||
int lastDocID;
|
int lastDocID;
|
||||||
private TermVectorsConsumerPerField[] perFields = new TermVectorsConsumerPerField[1];
|
private TermVectorsConsumerPerField[] perFields = new TermVectorsConsumerPerField[1];
|
||||||
|
|
||||||
public TermVectorsConsumer(DocumentsWriterPerThread docWriter) {
|
TermVectorsConsumer(DocumentsWriterPerThread docWriter) {
|
||||||
super(docWriter, false, null);
|
super(docWriter, false, null);
|
||||||
this.docWriter = docWriter;
|
this.docWriter = docWriter;
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,27 +20,37 @@ import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
|
||||||
import org.apache.lucene.codecs.TermVectorsWriter;
|
import org.apache.lucene.codecs.TermVectorsWriter;
|
||||||
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
final class TermVectorsConsumerPerField extends TermsHashPerField {
|
final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
|
|
||||||
private TermVectorsPostingsArray termVectorsPostingsArray;
|
private TermVectorsPostingsArray termVectorsPostingsArray;
|
||||||
|
|
||||||
final TermVectorsConsumer termsWriter;
|
private final TermVectorsConsumer termsWriter;
|
||||||
|
private final FieldInvertState fieldState;
|
||||||
|
private final FieldInfo fieldInfo;
|
||||||
|
|
||||||
boolean doVectors;
|
private boolean doVectors;
|
||||||
boolean doVectorPositions;
|
private boolean doVectorPositions;
|
||||||
boolean doVectorOffsets;
|
private boolean doVectorOffsets;
|
||||||
boolean doVectorPayloads;
|
private boolean doVectorPayloads;
|
||||||
|
|
||||||
OffsetAttribute offsetAttribute;
|
private OffsetAttribute offsetAttribute;
|
||||||
PayloadAttribute payloadAttribute;
|
private PayloadAttribute payloadAttribute;
|
||||||
boolean hasPayloads; // if enabled, and we actually saw any for this field
|
private TermFrequencyAttribute termFreqAtt;
|
||||||
|
private final ByteBlockPool termBytePool;
|
||||||
|
|
||||||
public TermVectorsConsumerPerField(FieldInvertState invertState, TermVectorsConsumer termsWriter, FieldInfo fieldInfo) {
|
private boolean hasPayloads; // if enabled, and we actually saw any for this field
|
||||||
super(2, invertState, termsWriter, null, fieldInfo);
|
|
||||||
this.termsWriter = termsWriter;
|
TermVectorsConsumerPerField(FieldInvertState invertState, TermVectorsConsumer termsHash, FieldInfo fieldInfo) {
|
||||||
|
super(2, termsHash.intPool, termsHash.bytePool, termsHash.termBytePool, termsHash.bytesUsed, null, fieldInfo.name, fieldInfo.getIndexOptions());
|
||||||
|
this.termsWriter = termsHash;
|
||||||
|
this.fieldInfo = fieldInfo;
|
||||||
|
this.fieldState = invertState;
|
||||||
|
termBytePool = termsHash.termBytePool;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Called once per field per document if term vectors
|
/** Called once per field per document if term vectors
|
||||||
|
@ -48,7 +58,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
* RAMOutputStream, which is then quickly flushed to
|
* RAMOutputStream, which is then quickly flushed to
|
||||||
* the real term vectors files in the Directory. */ @Override
|
* the real term vectors files in the Directory. */ @Override
|
||||||
void finish() {
|
void finish() {
|
||||||
if (!doVectors || bytesHash.size() == 0) {
|
if (!doVectors || getNumTerms() == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
termsWriter.addFieldToFlush(this);
|
termsWriter.addFieldToFlush(this);
|
||||||
|
@ -61,7 +71,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
|
|
||||||
doVectors = false;
|
doVectors = false;
|
||||||
|
|
||||||
final int numPostings = bytesHash.size();
|
final int numPostings = getNumTerms();
|
||||||
|
|
||||||
final BytesRef flushTerm = termsWriter.flushTerm;
|
final BytesRef flushTerm = termsWriter.flushTerm;
|
||||||
|
|
||||||
|
@ -74,7 +84,8 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
||||||
final TermVectorsWriter tv = termsWriter.writer;
|
final TermVectorsWriter tv = termsWriter.writer;
|
||||||
|
|
||||||
final int[] termIDs = sortPostings();
|
sortTerms();
|
||||||
|
final int[] termIDs = getSortedTermIDs();
|
||||||
|
|
||||||
tv.startField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets, hasPayloads);
|
tv.startField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets, hasPayloads);
|
||||||
|
|
||||||
|
@ -110,18 +121,19 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
@Override
|
@Override
|
||||||
boolean start(IndexableField field, boolean first) {
|
boolean start(IndexableField field, boolean first) {
|
||||||
super.start(field, first);
|
super.start(field, first);
|
||||||
|
termFreqAtt = fieldState.termFreqAttribute;
|
||||||
assert field.fieldType().indexOptions() != IndexOptions.NONE;
|
assert field.fieldType().indexOptions() != IndexOptions.NONE;
|
||||||
|
|
||||||
if (first) {
|
if (first) {
|
||||||
|
|
||||||
if (bytesHash.size() != 0) {
|
if (getNumTerms() != 0) {
|
||||||
// Only necessary if previous doc hit a
|
// Only necessary if previous doc hit a
|
||||||
// non-aborting exception while writing vectors in
|
// non-aborting exception while writing vectors in
|
||||||
// this field:
|
// this field:
|
||||||
reset();
|
reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
bytesHash.reinit();
|
reinitHash();
|
||||||
|
|
||||||
hasPayloads = false;
|
hasPayloads = false;
|
||||||
|
|
||||||
|
@ -222,7 +234,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
void newTerm(final int termID) {
|
void newTerm(final int termID, final int docID) {
|
||||||
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
||||||
|
|
||||||
postings.freqs[termID] = getTermFreq();
|
postings.freqs[termID] = getTermFreq();
|
||||||
|
@ -233,7 +245,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
void addTerm(final int termID) {
|
void addTerm(final int termID, final int docID) {
|
||||||
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
TermVectorsPostingsArray postings = termVectorsPostingsArray;
|
||||||
|
|
||||||
postings.freqs[termID] += getTermFreq();
|
postings.freqs[termID] += getTermFreq();
|
||||||
|
@ -245,10 +257,10 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
int freq = termFreqAtt.getTermFrequency();
|
int freq = termFreqAtt.getTermFrequency();
|
||||||
if (freq != 1) {
|
if (freq != 1) {
|
||||||
if (doVectorPositions) {
|
if (doVectorPositions) {
|
||||||
throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector positions while using custom TermFrequencyAttribute");
|
throw new IllegalArgumentException("field \"" + getFieldName() + "\": cannot index term vector positions while using custom TermFrequencyAttribute");
|
||||||
}
|
}
|
||||||
if (doVectorOffsets) {
|
if (doVectorOffsets) {
|
||||||
throw new IllegalArgumentException("field \"" + fieldInfo.name + "\": cannot index term vector offsets while using custom TermFrequencyAttribute");
|
throw new IllegalArgumentException("field \"" + getFieldName() + "\": cannot index term vector offsets while using custom TermFrequencyAttribute");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -266,7 +278,7 @@ final class TermVectorsConsumerPerField extends TermsHashPerField {
|
||||||
}
|
}
|
||||||
|
|
||||||
static final class TermVectorsPostingsArray extends ParallelPostingsArray {
|
static final class TermVectorsPostingsArray extends ParallelPostingsArray {
|
||||||
public TermVectorsPostingsArray(int size) {
|
TermVectorsPostingsArray(int size) {
|
||||||
super(size);
|
super(size);
|
||||||
freqs = new int[size];
|
freqs = new int[size];
|
||||||
lastOffsets = new int[size];
|
lastOffsets = new int[size];
|
||||||
|
|
|
@ -82,7 +82,7 @@ abstract class TermsHash {
|
||||||
if (nextTermsHash != null) {
|
if (nextTermsHash != null) {
|
||||||
Map<String,TermsHashPerField> nextChildFields = new HashMap<>();
|
Map<String,TermsHashPerField> nextChildFields = new HashMap<>();
|
||||||
for (final Map.Entry<String,TermsHashPerField> entry : fieldsToFlush.entrySet()) {
|
for (final Map.Entry<String,TermsHashPerField> entry : fieldsToFlush.entrySet()) {
|
||||||
nextChildFields.put(entry.getKey(), entry.getValue().nextPerField);
|
nextChildFields.put(entry.getKey(), entry.getValue().getNextPerField());
|
||||||
}
|
}
|
||||||
nextTermsHash.flush(nextChildFields, state, sortMap, norms);
|
nextTermsHash.flush(nextChildFields, state, sortMap, norms);
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,182 +19,186 @@ package org.apache.lucene.index;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
|
||||||
import org.apache.lucene.util.ByteBlockPool;
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
|
import org.apache.lucene.util.BytesRefHash.BytesStartArray;
|
||||||
import org.apache.lucene.util.BytesRefHash;
|
import org.apache.lucene.util.BytesRefHash;
|
||||||
import org.apache.lucene.util.Counter;
|
import org.apache.lucene.util.Counter;
|
||||||
import org.apache.lucene.util.IntBlockPool;
|
import org.apache.lucene.util.IntBlockPool;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class stores streams of information per term without knowing
|
||||||
|
* the size of the stream ahead of time. Each stream typically encodes one level
|
||||||
|
* of information like term frequency per document or term proximity. Internally
|
||||||
|
* this class allocates a linked list of slices that can be read by a {@link ByteSliceReader}
|
||||||
|
* for each term. Terms are first deduplicated in a {@link BytesRefHash}; once this is done,
|
||||||
|
* internal data-structures point to the current offset of each stream that can be written to.
|
||||||
|
*/
|
||||||
abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
private static final int HASH_INIT_SIZE = 4;
|
private static final int HASH_INIT_SIZE = 4;
|
||||||
|
|
||||||
final TermsHash termsHash;
|
private final TermsHashPerField nextPerField;
|
||||||
|
private final IntBlockPool intPool;
|
||||||
final TermsHashPerField nextPerField;
|
|
||||||
protected final DocumentsWriterPerThread.DocState docState;
|
|
||||||
protected final FieldInvertState fieldState;
|
|
||||||
TermToBytesRefAttribute termAtt;
|
|
||||||
protected TermFrequencyAttribute termFreqAtt;
|
|
||||||
|
|
||||||
// Copied from our perThread
|
|
||||||
final IntBlockPool intPool;
|
|
||||||
final ByteBlockPool bytePool;
|
final ByteBlockPool bytePool;
|
||||||
final ByteBlockPool termBytePool;
|
// for each term we store an integer per stream that points into the bytePool above
|
||||||
|
// the address is updated once data is written to the stream to point to the next free offset
|
||||||
final int streamCount;
|
// in the terms stream. The start address for the stream is stored in postingsArray.byteStarts[termId]
|
||||||
final int numPostingInt;
|
// This is initialized in the #addTerm method, either to a brand new per term stream if the term is new or
|
||||||
|
// to the addresses where the term stream was written to when we saw it the last time.
|
||||||
protected final FieldInfo fieldInfo;
|
private int[] termStreamAddressBuffer;
|
||||||
|
private int streamAddressOffset;
|
||||||
final BytesRefHash bytesHash;
|
private final int streamCount;
|
||||||
|
private final String fieldName;
|
||||||
|
final IndexOptions indexOptions;
|
||||||
|
/* This stores the actual term bytes for postings and offsets into the parent hash in the case that this
|
||||||
|
* TermsHashPerField is hashing term vectors.*/
|
||||||
|
private final BytesRefHash bytesHash;
|
||||||
|
|
||||||
ParallelPostingsArray postingsArray;
|
ParallelPostingsArray postingsArray;
|
||||||
private final Counter bytesUsed;
|
private int lastDocID; // only with assert
|
||||||
|
|
||||||
/** streamCount: how many streams this field stores per term.
|
/** streamCount: how many streams this field stores per term.
|
||||||
* E.g. doc(+freq) is 1 stream, prox+offset is a second. */
|
* E.g. doc(+freq) is 1 stream, prox+offset is a second. */
|
||||||
|
TermsHashPerField(int streamCount, IntBlockPool intPool, ByteBlockPool bytePool, ByteBlockPool termBytePool,
|
||||||
public TermsHashPerField(int streamCount, FieldInvertState fieldState, TermsHash termsHash, TermsHashPerField nextPerField, FieldInfo fieldInfo) {
|
Counter bytesUsed, TermsHashPerField nextPerField, String fieldName, IndexOptions indexOptions) {
|
||||||
intPool = termsHash.intPool;
|
this.intPool = intPool;
|
||||||
bytePool = termsHash.bytePool;
|
this.bytePool = bytePool;
|
||||||
termBytePool = termsHash.termBytePool;
|
|
||||||
docState = termsHash.docState;
|
|
||||||
this.termsHash = termsHash;
|
|
||||||
bytesUsed = termsHash.bytesUsed;
|
|
||||||
this.fieldState = fieldState;
|
|
||||||
this.streamCount = streamCount;
|
this.streamCount = streamCount;
|
||||||
numPostingInt = 2*streamCount;
|
this.fieldName = fieldName;
|
||||||
this.fieldInfo = fieldInfo;
|
|
||||||
this.nextPerField = nextPerField;
|
this.nextPerField = nextPerField;
|
||||||
|
assert indexOptions != IndexOptions.NONE;
|
||||||
|
this.indexOptions = indexOptions;
|
||||||
PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed);
|
PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed);
|
||||||
bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts);
|
bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts);
|
||||||
}
|
}
|
||||||
|
|
||||||
void reset() {
|
void reset() {
|
||||||
bytesHash.clear(false);
|
bytesHash.clear(false);
|
||||||
|
sortedTermIDs = null;
|
||||||
if (nextPerField != null) {
|
if (nextPerField != null) {
|
||||||
nextPerField.reset();
|
nextPerField.reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void initReader(ByteSliceReader reader, int termID, int stream) {
|
final void initReader(ByteSliceReader reader, int termID, int stream) {
|
||||||
assert stream < streamCount;
|
assert stream < streamCount;
|
||||||
int intStart = postingsArray.intStarts[termID];
|
int streamStartOffset = postingsArray.addressOffset[termID];
|
||||||
final int[] ints = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
|
final int[] streamAddressBuffer = intPool.buffers[streamStartOffset >> IntBlockPool.INT_BLOCK_SHIFT];
|
||||||
final int upto = intStart & IntBlockPool.INT_BLOCK_MASK;
|
final int offsetInAddressBuffer = streamStartOffset & IntBlockPool.INT_BLOCK_MASK;
|
||||||
reader.init(bytePool,
|
reader.init(bytePool,
|
||||||
postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
|
postingsArray.byteStarts[termID]+stream*ByteBlockPool.FIRST_LEVEL_SIZE,
|
||||||
ints[upto+stream]);
|
streamAddressBuffer[offsetInAddressBuffer+stream]);
|
||||||
}
|
}
|
||||||
|
|
||||||
int[] sortedTermIDs;
|
private int[] sortedTermIDs;
|
||||||
|
|
||||||
/** Collapse the hash table and sort in-place; also sets
|
/** Collapse the hash table and sort in-place; also sets
|
||||||
* this.sortedTermIDs to the results */
|
* this.sortedTermIDs to the results
|
||||||
public int[] sortPostings() {
|
* This method must not be called twice unless {@link #reset()}
|
||||||
|
* or {@link #reinitHash()} was called. */
|
||||||
|
final void sortTerms() {
|
||||||
|
assert sortedTermIDs == null;
|
||||||
sortedTermIDs = bytesHash.sort();
|
sortedTermIDs = bytesHash.sort();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the sorted term IDs. {@link #sortTerms()} must be called before this method.
|
||||||
|
*/
|
||||||
|
final int[] getSortedTermIDs() {
|
||||||
|
assert sortedTermIDs != null;
|
||||||
return sortedTermIDs;
|
return sortedTermIDs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final void reinitHash() {
|
||||||
|
sortedTermIDs = null;
|
||||||
|
bytesHash.reinit();
|
||||||
|
}
|
||||||
|
|
||||||
private boolean doNextCall;
|
private boolean doNextCall;
|
||||||
|
|
||||||
// Secondary entry point (for 2nd & subsequent TermsHash),
|
// Secondary entry point (for 2nd & subsequent TermsHash),
|
||||||
// because token text has already been "interned" into
|
// because token text has already been "interned" into
|
||||||
// textStart, so we hash by textStart. term vectors use
|
// textStart, so we hash by textStart. term vectors use
|
||||||
// this API.
|
// this API.
|
||||||
public void add(int textStart) throws IOException {
|
private void add(int textStart, final int docID) throws IOException {
|
||||||
int termID = bytesHash.addByPoolOffset(textStart);
|
int termID = bytesHash.addByPoolOffset(textStart);
|
||||||
if (termID >= 0) { // New posting
|
if (termID >= 0) { // New posting
|
||||||
// First time we are seeing this token since we last
|
// First time we are seeing this token since we last
|
||||||
// flushed the hash.
|
// flushed the hash.
|
||||||
|
initStreamSlices(termID, docID);
|
||||||
|
} else {
|
||||||
|
positionStreamSlice(termID, docID);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initStreamSlices(int termID, int docID) throws IOException {
|
||||||
// Init stream slices
|
// Init stream slices
|
||||||
if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
|
// TODO: figure out why this is 2*streamCount here. streamCount should be enough?
|
||||||
|
if ((2*streamCount) + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
|
||||||
|
// can we fit all the streams in the current buffer?
|
||||||
intPool.nextBuffer();
|
intPool.nextBuffer();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
|
if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < (2*streamCount) * ByteBlockPool.FIRST_LEVEL_SIZE) {
|
||||||
|
// can we fit at least one byte per stream in the current buffer, if not allocate a new one
|
||||||
bytePool.nextBuffer();
|
bytePool.nextBuffer();
|
||||||
}
|
}
|
||||||
|
|
||||||
intUptos = intPool.buffer;
|
termStreamAddressBuffer = intPool.buffer;
|
||||||
intUptoStart = intPool.intUpto;
|
streamAddressOffset = intPool.intUpto;
|
||||||
intPool.intUpto += streamCount;
|
intPool.intUpto += streamCount; // advance the pool to reserve the N streams for this term
|
||||||
|
|
||||||
postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
|
postingsArray.addressOffset[termID] = streamAddressOffset + intPool.intOffset;
|
||||||
|
|
||||||
for(int i=0;i<streamCount;i++) {
|
for (int i = 0; i < streamCount; i++) {
|
||||||
|
// initialize each stream with a slice; we start with ByteBlockPool.FIRST_LEVEL_SIZE
|
||||||
|
// and grow as we need more space. see ByteBlockPool.LEVEL_SIZE_ARRAY
|
||||||
final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
|
final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
|
||||||
intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
|
termStreamAddressBuffer[streamAddressOffset + i] = upto + bytePool.byteOffset;
|
||||||
}
|
}
|
||||||
postingsArray.byteStarts[termID] = intUptos[intUptoStart];
|
postingsArray.byteStarts[termID] = termStreamAddressBuffer[streamAddressOffset];
|
||||||
|
newTerm(termID, docID);
|
||||||
newTerm(termID);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
termID = (-termID)-1;
|
|
||||||
int intStart = postingsArray.intStarts[termID];
|
|
||||||
intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
|
|
||||||
intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;
|
|
||||||
addTerm(termID);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean assertDocId(int docId) {
|
||||||
|
assert docId >= lastDocID : "docID must be >= " + lastDocID + " but was: " + docId;
|
||||||
|
lastDocID = docId;
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Called once per inverted token. This is the primary
|
/** Called once per inverted token. This is the primary
|
||||||
* entry point (for first TermsHash); postings use this
|
* entry point (for first TermsHash); postings use this
|
||||||
* API. */
|
* API. */
|
||||||
void add() throws IOException {
|
void add(BytesRef termBytes, final int docID) throws IOException {
|
||||||
|
assert assertDocId(docID);
|
||||||
// We are first in the chain so we must "intern" the
|
// We are first in the chain so we must "intern" the
|
||||||
// term text into textStart address
|
// term text into textStart address
|
||||||
// Get the text & hash of this term.
|
// Get the text & hash of this term.
|
||||||
int termID = bytesHash.add(termAtt.getBytesRef());
|
int termID = bytesHash.add(termBytes);
|
||||||
|
|
||||||
//System.out.println("add term=" + termBytesRef.utf8ToString() + " doc=" + docState.docID + " termID=" + termID);
|
//System.out.println("add term=" + termBytesRef.utf8ToString() + " doc=" + docState.docID + " termID=" + termID);
|
||||||
|
if (termID >= 0) { // New posting
|
||||||
if (termID >= 0) {// New posting
|
|
||||||
bytesHash.byteStart(termID);
|
|
||||||
// Init stream slices
|
// Init stream slices
|
||||||
if (numPostingInt + intPool.intUpto > IntBlockPool.INT_BLOCK_SIZE) {
|
initStreamSlices(termID, docID);
|
||||||
intPool.nextBuffer();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
|
|
||||||
bytePool.nextBuffer();
|
|
||||||
}
|
|
||||||
|
|
||||||
intUptos = intPool.buffer;
|
|
||||||
intUptoStart = intPool.intUpto;
|
|
||||||
intPool.intUpto += streamCount;
|
|
||||||
|
|
||||||
postingsArray.intStarts[termID] = intUptoStart + intPool.intOffset;
|
|
||||||
|
|
||||||
for(int i=0;i<streamCount;i++) {
|
|
||||||
final int upto = bytePool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
|
|
||||||
intUptos[intUptoStart+i] = upto + bytePool.byteOffset;
|
|
||||||
}
|
|
||||||
postingsArray.byteStarts[termID] = intUptos[intUptoStart];
|
|
||||||
|
|
||||||
newTerm(termID);
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
termID = (-termID)-1;
|
termID = positionStreamSlice(termID, docID);
|
||||||
int intStart = postingsArray.intStarts[termID];
|
|
||||||
intUptos = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
|
|
||||||
intUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK;
|
|
||||||
addTerm(termID);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (doNextCall) {
|
if (doNextCall) {
|
||||||
nextPerField.add(postingsArray.textStarts[termID]);
|
nextPerField.add(postingsArray.textStarts[termID], docID);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int[] intUptos;
|
private int positionStreamSlice(int termID, final int docID) throws IOException {
|
||||||
int intUptoStart;
|
termID = (-termID) - 1;
|
||||||
|
int intStart = postingsArray.addressOffset[termID];
|
||||||
|
termStreamAddressBuffer = intPool.buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT];
|
||||||
|
streamAddressOffset = intStart & IntBlockPool.INT_BLOCK_MASK;
|
||||||
|
addTerm(termID, docID);
|
||||||
|
return termID;
|
||||||
|
}
|
||||||
|
|
||||||
void writeByte(int stream, byte b) {
|
final void writeByte(int stream, byte b) {
|
||||||
int upto = intUptos[intUptoStart+stream];
|
int streamAddress = streamAddressOffset + stream;
|
||||||
|
int upto = termStreamAddressBuffer[streamAddress];
|
||||||
byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT];
|
byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT];
|
||||||
assert bytes != null;
|
assert bytes != null;
|
||||||
int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK;
|
int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK;
|
||||||
|
@ -202,20 +206,20 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
// End of slice; allocate a new one
|
// End of slice; allocate a new one
|
||||||
offset = bytePool.allocSlice(bytes, offset);
|
offset = bytePool.allocSlice(bytes, offset);
|
||||||
bytes = bytePool.buffer;
|
bytes = bytePool.buffer;
|
||||||
intUptos[intUptoStart+stream] = offset + bytePool.byteOffset;
|
termStreamAddressBuffer[streamAddress] = offset + bytePool.byteOffset;
|
||||||
}
|
}
|
||||||
bytes[offset] = b;
|
bytes[offset] = b;
|
||||||
(intUptos[intUptoStart+stream])++;
|
(termStreamAddressBuffer[streamAddress])++;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void writeBytes(int stream, byte[] b, int offset, int len) {
|
final void writeBytes(int stream, byte[] b, int offset, int len) {
|
||||||
// TODO: optimize
|
// TODO: optimize
|
||||||
final int end = offset + len;
|
final int end = offset + len;
|
||||||
for(int i=offset;i<end;i++)
|
for(int i=offset;i<end;i++)
|
||||||
writeByte(stream, b[i]);
|
writeByte(stream, b[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void writeVInt(int stream, int i) {
|
final void writeVInt(int stream, int i) {
|
||||||
assert stream < streamCount;
|
assert stream < streamCount;
|
||||||
while ((i & ~0x7F) != 0) {
|
while ((i & ~0x7F) != 0) {
|
||||||
writeByte(stream, (byte)((i & 0x7f) | 0x80));
|
writeByte(stream, (byte)((i & 0x7f) | 0x80));
|
||||||
|
@ -224,6 +228,14 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
writeByte(stream, (byte) i);
|
writeByte(stream, (byte) i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final TermsHashPerField getNextPerField() {
|
||||||
|
return nextPerField;
|
||||||
|
}
|
||||||
|
|
||||||
|
final String getFieldName() {
|
||||||
|
return fieldName;
|
||||||
|
}
|
||||||
|
|
||||||
private static final class PostingsBytesStartArray extends BytesStartArray {
|
private static final class PostingsBytesStartArray extends BytesStartArray {
|
||||||
|
|
||||||
private final TermsHashPerField perField;
|
private final TermsHashPerField perField;
|
||||||
|
@ -272,8 +284,8 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(TermsHashPerField other) {
|
public final int compareTo(TermsHashPerField other) {
|
||||||
return fieldInfo.name.compareTo(other.fieldInfo.name);
|
return fieldName.compareTo(other.fieldName);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Finish adding all instances of this field to the
|
/** Finish adding all instances of this field to the
|
||||||
|
@ -284,24 +296,25 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final int getNumTerms() {
|
||||||
|
return bytesHash.size();
|
||||||
|
}
|
||||||
|
|
||||||
/** Start adding a new field instance; first is true if
|
/** Start adding a new field instance; first is true if
|
||||||
* this is the first time this field name was seen in the
|
* this is the first time this field name was seen in the
|
||||||
* document. */
|
* document. */
|
||||||
boolean start(IndexableField field, boolean first) {
|
boolean start(IndexableField field, boolean first) {
|
||||||
termAtt = fieldState.termAttribute;
|
|
||||||
termFreqAtt = fieldState.termFreqAttribute;
|
|
||||||
if (nextPerField != null) {
|
if (nextPerField != null) {
|
||||||
doNextCall = nextPerField.start(field, first);
|
doNextCall = nextPerField.start(field, first);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Called when a term is seen for the first time. */
|
/** Called when a term is seen for the first time. */
|
||||||
abstract void newTerm(int termID) throws IOException;
|
abstract void newTerm(int termID, final int docID) throws IOException;
|
||||||
|
|
||||||
/** Called when a previously seen term is seen again. */
|
/** Called when a previously seen term is seen again. */
|
||||||
abstract void addTerm(int termID) throws IOException;
|
abstract void addTerm(int termID, final int docID) throws IOException;
|
||||||
|
|
||||||
/** Called when the postings array is initialized or
|
/** Called when the postings array is initialized or
|
||||||
* resized. */
|
* resized. */
|
||||||
|
|
|
@ -175,7 +175,7 @@ public final class IntBlockPool {
|
||||||
return upto;
|
return upto;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final boolean assertSliceBuffer(int[] buffer) {
|
private static boolean assertSliceBuffer(int[] buffer) {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
for (int i = 0; i < buffer.length; i++) {
|
for (int i = 0; i < buffer.length; i++) {
|
||||||
count += buffer[i]; // for slices the buffer must only have 0 values
|
count += buffer[i]; // for slices the buffer must only have 0 values
|
||||||
|
|
|
@ -0,0 +1,209 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||||
|
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
|
||||||
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.Counter;
|
||||||
|
import org.apache.lucene.util.IntBlockPool;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
public class TestTermsHashPerField extends LuceneTestCase {
|
||||||
|
|
||||||
|
private static TermsHashPerField createNewHash(AtomicInteger newCalled, AtomicInteger addCalled) {
|
||||||
|
IntBlockPool intBlockPool = new IntBlockPool();
|
||||||
|
ByteBlockPool byteBlockPool = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
|
||||||
|
ByteBlockPool termBlockPool = new ByteBlockPool(new ByteBlockPool.DirectAllocator());
|
||||||
|
|
||||||
|
TermsHashPerField hash = new TermsHashPerField(1, intBlockPool, byteBlockPool, termBlockPool, Counter.newCounter(),
|
||||||
|
null, "testfield", IndexOptions.DOCS_AND_FREQS) {
|
||||||
|
|
||||||
|
private FreqProxTermsWriterPerField.FreqProxPostingsArray freqProxPostingsArray;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
void newTerm(int termID, int docID) {
|
||||||
|
newCalled.incrementAndGet();
|
||||||
|
FreqProxTermsWriterPerField.FreqProxPostingsArray postings = freqProxPostingsArray;
|
||||||
|
postings.lastDocIDs[termID] = docID;
|
||||||
|
postings.lastDocCodes[termID] = docID << 1;
|
||||||
|
postings.termFreqs[termID] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
void addTerm(int termID, int docID) {
|
||||||
|
addCalled.incrementAndGet();
|
||||||
|
FreqProxTermsWriterPerField.FreqProxPostingsArray postings = freqProxPostingsArray;
|
||||||
|
if (docID != postings.lastDocIDs[termID]) {
|
||||||
|
if (1 == postings.termFreqs[termID]) {
|
||||||
|
writeVInt(0, postings.lastDocCodes[termID]|1);
|
||||||
|
} else {
|
||||||
|
writeVInt(0, postings.lastDocCodes[termID]);
|
||||||
|
writeVInt(0, postings.termFreqs[termID]);
|
||||||
|
}
|
||||||
|
postings.termFreqs[termID] = 1;
|
||||||
|
postings.lastDocCodes[termID] = (docID - postings.lastDocIDs[termID]) << 1;
|
||||||
|
postings.lastDocIDs[termID] = docID;
|
||||||
|
} else {
|
||||||
|
postings.termFreqs[termID] = Math.addExact(postings.termFreqs[termID], 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
void newPostingsArray() {
|
||||||
|
freqProxPostingsArray = (FreqProxTermsWriterPerField.FreqProxPostingsArray) postingsArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
ParallelPostingsArray createPostingsArray(int size) {
|
||||||
|
return new FreqProxTermsWriterPerField.FreqProxPostingsArray(size, true, false, false);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
return hash;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean assertDocAndFreq(ByteSliceReader reader, FreqProxTermsWriterPerField.FreqProxPostingsArray postingsArray, int prevDoc, int termId, int doc, int frequency) throws IOException {
|
||||||
|
int docId = prevDoc;
|
||||||
|
int freq;
|
||||||
|
boolean eof = reader.eof();
|
||||||
|
if (eof) {
|
||||||
|
docId = postingsArray.lastDocIDs[termId];
|
||||||
|
freq = postingsArray.termFreqs[termId];
|
||||||
|
} else {
|
||||||
|
int code = reader.readVInt();
|
||||||
|
docId += code >>> 1;
|
||||||
|
if ((code & 1) != 0) {
|
||||||
|
freq = 1;
|
||||||
|
} else {
|
||||||
|
freq = reader.readVInt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertEquals("docID mismatch eof: " + eof, doc, docId);
|
||||||
|
assertEquals("freq mismatch eof: " + eof, frequency, freq);
|
||||||
|
return eof;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAddAndUpdateTerm() throws IOException {
|
||||||
|
AtomicInteger newCalled = new AtomicInteger(0);
|
||||||
|
AtomicInteger addCalled = new AtomicInteger(0);
|
||||||
|
TermsHashPerField hash = createNewHash(newCalled, addCalled);
|
||||||
|
hash.start(null, true);
|
||||||
|
|
||||||
|
hash.add(new BytesRef("start"), 0); // tid = 0;
|
||||||
|
hash.add(new BytesRef("foo"), 0); // tid = 1;
|
||||||
|
hash.add(new BytesRef("bar"), 0); // tid = 2;
|
||||||
|
hash.finish();
|
||||||
|
hash.add(new BytesRef("bar"), 1);
|
||||||
|
hash.add(new BytesRef("foobar"), 1); // tid = 3;
|
||||||
|
hash.add(new BytesRef("bar"), 1);
|
||||||
|
hash.add(new BytesRef("bar"), 1);
|
||||||
|
hash.add(new BytesRef("foobar"), 1);
|
||||||
|
hash.add(new BytesRef("verylongfoobarbaz"), 1); // tid = 4;
|
||||||
|
hash.finish();
|
||||||
|
hash.add(new BytesRef("verylongfoobarbaz"), 2);
|
||||||
|
hash.add(new BytesRef("boom"), 2); // tid = 5;
|
||||||
|
hash.finish();
|
||||||
|
hash.add(new BytesRef("verylongfoobarbaz"), 3);
|
||||||
|
hash.add(new BytesRef("end"), 3); // tid = 6;
|
||||||
|
hash.finish();
|
||||||
|
|
||||||
|
assertEquals(7, newCalled.get());
|
||||||
|
assertEquals(6, addCalled.get());
|
||||||
|
final ByteSliceReader reader = new ByteSliceReader();
|
||||||
|
hash.initReader(reader, 0, 0);
|
||||||
|
assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 0, 0, 1));
|
||||||
|
hash.initReader(reader, 1, 0);
|
||||||
|
assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 1, 0, 1));
|
||||||
|
hash.initReader(reader, 2, 0);
|
||||||
|
assertFalse(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 2, 0, 1));
|
||||||
|
assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 2, 2, 1, 3));
|
||||||
|
hash.initReader(reader, 3, 0);
|
||||||
|
assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 3, 1, 2));
|
||||||
|
hash.initReader(reader, 4, 0);
|
||||||
|
assertFalse(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 4, 1, 1));
|
||||||
|
assertFalse(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 1, 4, 2, 1));
|
||||||
|
assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 2, 4, 3, 1));
|
||||||
|
hash.initReader(reader, 5, 0);
|
||||||
|
assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 5, 2, 1));
|
||||||
|
hash.initReader(reader, 6, 0);
|
||||||
|
assertTrue(assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray, 0, 6, 3, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testAddAndUpdateRandom() throws IOException {
|
||||||
|
AtomicInteger newCalled = new AtomicInteger(0);
|
||||||
|
AtomicInteger addCalled = new AtomicInteger(0);
|
||||||
|
TermsHashPerField hash = createNewHash(newCalled, addCalled);
|
||||||
|
hash.start(null, true);
|
||||||
|
class Posting {
|
||||||
|
int termId = -1;
|
||||||
|
final TreeMap<Integer, Integer> docAndFreq = new TreeMap<>();
|
||||||
|
}
|
||||||
|
Map<BytesRef, Posting> postingMap = new HashMap<>();
|
||||||
|
int numStrings = 1 + random().nextInt(200);
|
||||||
|
for (int i = 0; i < numStrings; i++) {
|
||||||
|
String randomString = RandomStrings.randomRealisticUnicodeOfCodepointLengthBetween(random(), 1, 10);
|
||||||
|
postingMap.putIfAbsent(new BytesRef(randomString), new Posting());
|
||||||
|
}
|
||||||
|
List<BytesRef> bytesRefs = Arrays.asList(postingMap.keySet().toArray(new BytesRef[0]));
|
||||||
|
Collections.sort(bytesRefs);
|
||||||
|
int numDocs = 1 + random().nextInt(200);
|
||||||
|
int termOrd = 0;
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
int numTerms = 1 + random().nextInt(200);
|
||||||
|
int doc = i;
|
||||||
|
for (int j = 0; i < numTerms; i++) {
|
||||||
|
BytesRef ref = RandomPicks.randomFrom(random(), bytesRefs);
|
||||||
|
Posting posting = postingMap.get(ref);
|
||||||
|
if (posting.termId == -1) {
|
||||||
|
posting.termId = termOrd++;
|
||||||
|
}
|
||||||
|
posting.docAndFreq.putIfAbsent(doc, 0);
|
||||||
|
posting.docAndFreq.compute(doc, (key, oldVal) -> oldVal+1);
|
||||||
|
hash.add(ref, doc);
|
||||||
|
}
|
||||||
|
hash.finish();
|
||||||
|
}
|
||||||
|
List<Posting> values = postingMap.values().stream().filter( x -> x.termId != -1)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
Collections.shuffle(values, random()); // term order doesn't matter
|
||||||
|
final ByteSliceReader reader = new ByteSliceReader();
|
||||||
|
for (Posting p : values) {
|
||||||
|
hash.initReader(reader, p.termId, 0);
|
||||||
|
boolean eof = false;
|
||||||
|
int prefDoc = 0;
|
||||||
|
for (Map.Entry<Integer, Integer> entry : p.docAndFreq.entrySet()) {
|
||||||
|
assertFalse("the reader must not be EOF here", eof);
|
||||||
|
eof = assertDocAndFreq(reader, (FreqProxTermsWriterPerField.FreqProxPostingsArray) hash.postingsArray,
|
||||||
|
prefDoc, p.termId, entry.getKey(), entry.getValue());
|
||||||
|
prefDoc = entry.getKey();
|
||||||
|
}
|
||||||
|
assertTrue("the last posting must be EOF on the reader", eof);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue