LUCENE-3706: add offsets into lucene40 postings

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1235022 13f79535-47bb-0310-9956-ffa450edef68
Author: Robert Muir
Date:   2012-01-23 22:03:08 +00:00
Parent: edd8b91895
Commit: b1da6f5041

9 changed files with 291 additions and 89 deletions


@@ -206,7 +206,7 @@ public class SegmentTermDocs {
skipListReader = new Lucene40SkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone
if (!haveSkipped) { // lazily initialize skip stream
skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads);
skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads, false);
haveSkipped = true;
}


@@ -85,11 +85,11 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
// LUCENE-3027: past indices were able to write
// storePayloads=true when omitTFAP is also true,
// which is invalid. We correct that, here:
if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
storePayloads = false;
}
hasVectors |= storeTermVector;
hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
hasProx |= isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY;
// DV Types are packed in one byte
byte val = input.readByte();
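The == to compareTo changes above (and throughout this commit) lean on FieldInfo.IndexOptions being declared in increasing order of indexed detail, so "stores positions or more" becomes an ordinal comparison. A minimal sketch of that assumption (illustrative only, not the Lucene source):

enum IndexOptions {
  DOCS_ONLY,
  DOCS_AND_FREQS,
  DOCS_AND_FREQS_AND_POSITIONS,
  DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; // the new value this commit targets

  // enum compareTo() compares declaration order, so this reads as "at least"
  boolean atLeast(IndexOptions other) {
    return this.compareTo(other) >= 0;
  }
}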


@@ -58,7 +58,7 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
output.writeVInt(FORMAT_CURRENT);
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads;
assert fi.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.storePayloads;
byte bits = 0x0;
if (fi.isIndexed) bits |= IS_INDEXED;
if (fi.storeTermVector) bits |= STORE_TERMVECTOR;


@@ -197,7 +197,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
// undefined
}
if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
if (isFirstTerm) {
termState.proxOffset = termState.bytesReader.readVLong();
} else {
@@ -245,23 +245,23 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
DocsAndPositionsEnum reuse, boolean needsOffsets)
throws IOException {
if (needsOffsets) {
// TODO: once we index offsets into postings fix this!
return null;
boolean hasOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (needsOffsets && !hasOffsets) {
return null; // not available
}
// TODO: refactor
if (fieldInfo.storePayloads) {
SegmentDocsAndPositionsAndPayloadsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentDocsAndPositionsAndPayloadsEnum)) {
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
if (fieldInfo.storePayloads || hasOffsets) {
SegmentFullPositionsEnum docsEnum;
if (reuse == null || !(reuse instanceof SegmentFullPositionsEnum)) {
docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
} else {
docsEnum = (SegmentDocsAndPositionsAndPayloadsEnum) reuse;
docsEnum = (SegmentFullPositionsEnum) reuse;
if (docsEnum.startFreqIn != freqIn) {
// If you are using ParallelReader, and pass in a
// reused DocsEnum, it could have come from another
// reader also using standard codec
docsEnum = new SegmentDocsAndPositionsAndPayloadsEnum(freqIn, proxIn);
docsEnum = new SegmentFullPositionsEnum(freqIn, proxIn);
}
}
return docsEnum.reset(fieldInfo, (StandardTermState) termState, liveDocs);
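Note the contract above: when a consumer asks for offsets (needsOffsets=true) on a field whose IndexOptions don't include them, the method still returns null rather than an enum that silently reports -1. A hypothetical caller, using the same MultiFields helper the new test below exercises ("body" and the term are made-up names):

DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(
    reader, null, "body", new BytesRef("term"), true /* needsOffsets */);
if (dp == null) {
  // term absent, or the field was not indexed with positions/offsets
}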
@@ -295,6 +295,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
protected boolean indexOmitsTF; // does current field omit term freq?
protected boolean storePayloads; // does current field store payloads?
protected boolean storeOffsets; // does current field store offsets?
protected int limit; // number of docs in this posting
protected int ord; // how many docs we've read
@@ -324,6 +325,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
DocsEnum reset(FieldInfo fieldInfo, StandardTermState termState) throws IOException {
indexOmitsTF = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY;
storePayloads = fieldInfo.storePayloads;
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
freqOffset = termState.freqOffset;
skipOffset = termState.skipOffset;
@@ -471,7 +473,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
skipper.init(freqOffset + skipOffset,
freqOffset, 0,
limit, storePayloads);
limit, storePayloads, storeOffsets);
skipped = true;
}
@@ -665,7 +667,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
// TODO specialize DocsAndPosEnum too
// Decodes docs & positions. payloads are not present.
// Decodes docs & positions. Neither payloads nor offsets are present.
private final class SegmentDocsAndPositionsEnum extends DocsAndPositionsEnum {
final IndexInput startFreqIn;
private final IndexInput freqIn;
@@ -792,7 +794,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
limit, false);
limit, false, false);
skipped = true;
}
@@ -868,8 +870,8 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
}
// Decodes docs & positions & payloads
private class SegmentDocsAndPositionsAndPayloadsEnum extends DocsAndPositionsEnum {
// Decodes docs & positions & (payloads and/or offsets)
private class SegmentFullPositionsEnum extends DocsAndPositionsEnum {
final IndexInput startFreqIn;
private final IndexInput freqIn;
private final IndexInput proxIn;
@@ -896,15 +898,23 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
private BytesRef payload;
private long lazyProxPointer;
public SegmentDocsAndPositionsAndPayloadsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
boolean storePayloads;
boolean storeOffsets;
int offsetLength;
int startOffset;
public SegmentFullPositionsEnum(IndexInput freqIn, IndexInput proxIn) throws IOException {
startFreqIn = freqIn;
this.freqIn = (IndexInput) freqIn.clone();
this.proxIn = (IndexInput) proxIn.clone();
}
public SegmentDocsAndPositionsAndPayloadsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
assert fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
assert fieldInfo.storePayloads;
public SegmentFullPositionsEnum reset(FieldInfo fieldInfo, StandardTermState termState, Bits liveDocs) throws IOException {
storeOffsets = fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
storePayloads = fieldInfo.storePayloads;
assert fieldInfo.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
assert storePayloads || storeOffsets;
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[1];
@@ -923,6 +933,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
doc = -1;
accum = 0;
position = 0;
startOffset = 0;
skipped = false;
posPendingCount = 0;
@@ -963,6 +974,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
position = 0;
startOffset = 0;
//System.out.println("StandardR.D&PE nextDoc seg=" + segment + " return doc=" + doc);
return (doc = accum);
@@ -1001,7 +1013,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
//System.out.println(" init skipper freqOffset=" + freqOffset + " skipOffset=" + skipOffset + " vs len=" + freqIn.length());
skipper.init(freqOffset+skipOffset,
freqOffset, proxOffset,
limit, true);
limit, storePayloads, storeOffsets);
skipped = true;
}
@@ -1016,8 +1028,10 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
lazyProxPointer = skipper.getProxPointer();
posPendingCount = 0;
position = 0;
startOffset = 0;
payloadPending = false;
payloadLength = skipper.getPayloadLength();
offsetLength = skipper.getOffsetLength();
}
}
@@ -1038,27 +1052,38 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
}
if (payloadPending && payloadLength > 0) {
// payload of last position as never retrieved -- skip it
// payload of last position was never retrieved -- skip it
proxIn.seek(proxIn.getFilePointer() + payloadLength);
payloadPending = false;
}
// scan over any docs that were iterated without their positions
while(posPendingCount > freq) {
final int code = proxIn.readVInt();
if (storePayloads) {
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
assert payloadLength >= 0;
}
assert payloadLength != -1;
}
if (storeOffsets) {
if ((proxIn.readVInt() & 1) != 0) {
// new offset length
offsetLength = proxIn.readVInt();
}
}
if (storePayloads) {
proxIn.seek(proxIn.getFilePointer() + payloadLength);
}
posPendingCount--;
position = 0;
startOffset = 0;
payloadPending = false;
//System.out.println("StandardR.D&PE skipPos");
}
@@ -1069,7 +1094,8 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
proxIn.seek(proxIn.getFilePointer()+payloadLength);
}
final int code = proxIn.readVInt();
int code = proxIn.readVInt();
if (storePayloads) {
if ((code & 1) != 0) {
// new payload length
payloadLength = proxIn.readVInt();
@@ -1078,7 +1104,18 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
assert payloadLength != -1;
payloadPending = true;
position += code >>> 1;
code >>>= 1;
}
position += code;
if (storeOffsets) {
int offsetCode = proxIn.readVInt();
if ((offsetCode & 1) != 0) {
// new offset length
offsetLength = proxIn.readVInt();
}
startOffset += offsetCode >>> 1;
}
posPendingCount--;
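This reader and the matching Lucene40PostingsWriter change further down agree on a simple per-position record in the .prx stream: a position delta (bit-packed with a payload-length-changed flag when payloads are stored), then, when offsets are stored, a startOffset delta whose low bit flags a new offset length. A toy round-trip under those assumptions, using plain ints instead of Lucene's VInts, with offsets on and payloads off:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class OffsetDeltaSketch {
  public static void main(String[] args) throws IOException {
    int[] positions = {3, 7, 12}, starts = {10, 25, 44}, lens = {5, 5, 7};
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bytes);
    int lastPos = 0, lastStart = 0, lastLen = -1;   // -1 forces the first length write
    for (int i = 0; i < positions.length; i++) {
      out.writeInt(positions[i] - lastPos);         // position delta (no payload bit in this toy)
      int offsetDelta = starts[i] - lastStart;
      if (lens[i] != lastLen) {
        out.writeInt(offsetDelta << 1 | 1);         // low bit set: new offset length follows
        out.writeInt(lens[i]);
      } else {
        out.writeInt(offsetDelta << 1);
      }
      lastPos = positions[i]; lastStart = starts[i]; lastLen = lens[i];
    }
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
    int pos = 0, start = 0, len = 0;
    for (int i = 0; i < positions.length; i++) {
      pos += in.readInt();
      int code = in.readInt();
      if ((code & 1) != 0) len = in.readInt();      // offset length changed
      start += code >>> 1;
      System.out.println("pos=" + pos + " offset=[" + start + "," + (start + len) + "]");
    }
  }
}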
@@ -1090,18 +1127,19 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
@Override
public int startOffset() throws IOException {
return -1;
return storeOffsets ? startOffset : -1;
}
@Override
public int endOffset() throws IOException {
return -1;
return storeOffsets ? startOffset + offsetLength : -1;
}
/** Returns the payload at this position, or null if no
* payload was indexed. */
@Override
public BytesRef getPayload() throws IOException {
if (storePayloads) {
assert lazyProxPointer == -1;
assert posPendingCount < freq;
if (!payloadPending) {
@@ -1116,6 +1154,9 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
payloadPending = false;
return payload;
} else {
throw new IOException("No payloads exist for this field!");
}
}
@Override


@@ -73,12 +73,15 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
IndexOptions indexOptions;
boolean storePayloads;
boolean storeOffsets;
// Starts a new term
long freqStart;
long proxStart;
FieldInfo fieldInfo;
int lastPayloadLength;
int lastOffsetLength;
int lastPosition;
int lastOffset;
// private String segment;
@@ -137,6 +140,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
proxStart = proxOut.getFilePointer();
// force first payload to write its length
lastPayloadLength = -1;
// force first offset to write its length
lastOffsetLength = -1;
}
skipListWriter.resetSkip();
}
@@ -155,10 +160,8 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
*/
this.fieldInfo = fieldInfo;
indexOptions = fieldInfo.indexOptions;
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
}
storeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
storePayloads = fieldInfo.storePayloads;
//System.out.println(" set init blockFreqStart=" + freqStart);
//System.out.println(" set init blockProxStart=" + proxStart);
@@ -180,7 +183,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
}
if ((++df % skipInterval) == 0) {
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, storeOffsets, lastOffsetLength);
skipListWriter.bufferSkip(df);
}
@@ -197,31 +200,26 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
}
lastPosition = 0;
lastOffset = 0;
}
/** Add a new position & payload */
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
//if (DEBUG) System.out.println("SPW: addPos pos=" + position + " payload=" + (payload == null ? "null" : (payload.length + " bytes")) + " proxFP=" + proxOut.getFilePointer());
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS: "invalid indexOptions: " + indexOptions;
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 : "invalid indexOptions: " + indexOptions;
assert proxOut != null;
// TODO: when we add offsets... often
// endOffset-startOffset will be constant or near
// constant for all docs (eg if the term wasn't stemmed
// then this will usually be the utf16 length of the
// term); would be nice to write that length once up
// front and then not encode endOffset for each
// position..
final int delta = position - lastPosition;
assert delta >= 0: "position=" + position + " lastPosition=" + lastPosition; // not quite right (if pos=0 is repeated twice we don't catch it)
lastPosition = position;
int payloadLength = 0;
if (storePayloads) {
final int payloadLength = payload == null ? 0 : payload.length;
payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
@@ -230,13 +228,28 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
} else {
proxOut.writeVInt(delta << 1);
}
} else {
proxOut.writeVInt(delta);
}
if (storeOffsets) {
// don't use startOffset - lastEndOffset, because this creates lots of negative vints for synonyms,
// and the numbers aren't that much smaller anyway.
int offsetDelta = startOffset - lastOffset;
int offsetLength = endOffset - startOffset;
if (offsetLength != lastOffsetLength) {
proxOut.writeVInt(offsetDelta << 1 | 1);
proxOut.writeVInt(offsetLength);
} else {
proxOut.writeVInt(offsetDelta << 1);
}
lastOffset = startOffset;
lastOffsetLength = offsetLength;
}
if (payloadLength > 0) {
proxOut.writeBytes(payload.bytes, payload.offset, payloadLength);
}
} else {
proxOut.writeVInt(delta);
}
}
@Override
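The "negative vints for synonyms" comment in addPosition() is easy to see with two stacked tokens: a delta against the previous end offset goes negative, while a delta against the previous start offset stays non-negative for ordinary token streams. A tiny illustration with made-up offsets:

public class SynonymOffsetDeltas {
  public static void main(String[] args) {
    int quickStart = 4, quickEnd = 9;  // "quick" at offsets [4,9)
    int fastStart = 4;                 // synonym "fast" stacked at the same start offset
    System.out.println("vs lastEndOffset:   " + (fastStart - quickEnd));   // -5: not VInt-friendly
    System.out.println("vs lastStartOffset: " + (fastStart - quickStart)); //  0: stays >= 0
  }
}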
@@ -304,7 +317,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
assert firstTerm.skipOffset > 0;
bytesWriter.writeVInt(firstTerm.skipOffset);
}
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
bytesWriter.writeVLong(firstTerm.proxStart);
}
long lastFreqStart = firstTerm.freqStart;
@@ -319,7 +332,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
assert term.skipOffset > 0;
bytesWriter.writeVInt(term.skipOffset);
}
if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) {
bytesWriter.writeVLong(term.proxStart - lastProxStart);
lastProxStart = term.proxStart;
}


@@ -30,13 +30,16 @@ import org.apache.lucene.store.IndexInput;
*/
public class Lucene40SkipListReader extends MultiLevelSkipListReader {
private boolean currentFieldStoresPayloads;
private boolean currentFieldStoresOffsets;
private long freqPointer[];
private long proxPointer[];
private int payloadLength[];
private int offsetLength[];
private long lastFreqPointer;
private long lastProxPointer;
private int lastPayloadLength;
private int lastOffsetLength;
public Lucene40SkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) {
@@ -44,17 +47,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
freqPointer = new long[maxSkipLevels];
proxPointer = new long[maxSkipLevels];
payloadLength = new int[maxSkipLevels];
offsetLength = new int[maxSkipLevels];
}
public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) {
public void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads, boolean storesOffsets) {
super.init(skipPointer, df);
this.currentFieldStoresPayloads = storesPayloads;
this.currentFieldStoresOffsets = storesOffsets;
lastFreqPointer = freqBasePointer;
lastProxPointer = proxBasePointer;
Arrays.fill(freqPointer, freqBasePointer);
Arrays.fill(proxPointer, proxBasePointer);
Arrays.fill(payloadLength, 0);
Arrays.fill(offsetLength, 0);
}
/** Returns the freq pointer of the doc to which the last call of
@@ -76,12 +82,20 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
return lastPayloadLength;
}
/** Returns the offset length (endOffset-startOffset) of the position stored just before
* the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)}
* has skipped. */
public int getOffsetLength() {
return lastOffsetLength;
}
@Override
protected void seekChild(int level) throws IOException {
super.seekChild(level);
freqPointer[level] = lastFreqPointer;
proxPointer[level] = lastProxPointer;
payloadLength[level] = lastPayloadLength;
offsetLength[level] = lastOffsetLength;
}
@Override
@@ -90,6 +104,7 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
lastFreqPointer = freqPointer[level];
lastProxPointer = proxPointer[level];
lastPayloadLength = payloadLength[level];
lastOffsetLength = offsetLength[level];
}
@@ -110,6 +125,11 @@ public class Lucene40SkipListReader extends MultiLevelSkipListReader {
} else {
delta = skipStream.readVInt();
}
if (currentFieldStoresOffsets) {
offsetLength[level] = skipStream.readVInt();
}
freqPointer[level] += skipStream.readVInt();
proxPointer[level] += skipStream.readVInt();


@@ -40,7 +40,9 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
private int curDoc;
private boolean curStorePayloads;
private boolean curStoreOffsets;
private int curPayloadLength;
private int curOffsetLength;
private long curFreqPointer;
private long curProxPointer;
@@ -58,10 +60,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
/**
* Sets the values for the current skip data.
*/
public void setSkipData(int doc, boolean storePayloads, int payloadLength) {
public void setSkipData(int doc, boolean storePayloads, int payloadLength, boolean storeOffsets, int offsetLength) {
this.curDoc = doc;
this.curStorePayloads = storePayloads;
this.curPayloadLength = payloadLength;
this.curStoreOffsets = storeOffsets;
this.curOffsetLength = offsetLength;
this.curFreqPointer = freqOutput.getFilePointer();
if (proxOutput != null)
this.curProxPointer = proxOutput.getFilePointer();
@@ -116,6 +120,12 @@ public class Lucene40SkipListWriter extends MultiLevelSkipListWriter {
// current field does not store payloads
skipBuffer.writeVInt(curDoc - lastSkipDoc[level]);
}
// TODO: not sure it really helps to shove this somewhere else if it's the same as the last skip
if (curStoreOffsets) {
skipBuffer.writeVInt(curOffsetLength);
}
skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level]));
skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level]));
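The skip reader and writer above thus agree on one skip entry per level: the doc delta (bit-packed with a payload-length-changed flag when payloads are stored, followed by the new payload length when flagged), then the offset length when the field stores offsets, then the freq and prox pointer deltas. A compact sketch of that assumed layout (field order only, not Lucene source):

final class SkipEntrySketch {
  int docDelta;         // payloads on: (delta << 1) | lengthChangedBit; else a plain delta
  int payloadLength;    // present only when the length-changed bit is set
  int offsetLength;     // new in this commit: present only when the field stores offsets
  int freqPointerDelta; // delta against the previous skip point in the .frq file
  int proxPointerDelta; // delta against the previous skip point in the .prx file
}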


@@ -137,7 +137,7 @@ class PreFlexRWFieldsWriter extends FieldsConsumer {
}
if ((++df % termsOut.skipInterval) == 0) {
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength, false, 0);
skipListWriter.bufferSkip(df);
}


@@ -22,29 +22,46 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedAnalyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockPayloadAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assume;
import org.apache.lucene.util._TestUtil;
public class TestPostingsOffsets extends LuceneTestCase {
IndexWriterConfig iwc;
public void setUp() throws Exception {
super.setUp();
// Currently only SimpleText and Lucene40 can index offsets into postings:
assumeTrue("codec does not support offsets", Codec.getDefault().getName().equals("SimpleText") || Codec.getDefault().getName().equals("Lucene40"));
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
}
}
public void testBasic() throws Exception {
// Currently only SimpleText can index offsets into postings:
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, dir);
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
Document doc = new Document();
FieldType ft = new FieldType(TextField.TYPE_UNSTORED);
@@ -95,15 +112,116 @@ public class TestPostingsOffsets extends LuceneTestCase {
dir.close();
}
public void testRandom() throws Exception {
// Currently only SimpleText can index offsets into postings:
Assume.assumeTrue(Codec.getDefault().getName().equals("SimpleText"));
public void testSkipping() throws Exception {
doTestNumbers(false);
}
public void testPayloads() throws Exception {
doTestNumbers(true);
}
public void doTestNumbers(boolean withPayloads) throws Exception {
Directory dir = newDirectory();
Analyzer analyzer = withPayloads ? new MockPayloadAnalyzer() : new MockAnalyzer(random);
iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
if (Codec.getDefault().getName().equals("Lucene40")) {
// pulsing etc are not implemented
iwc.setCodec(_TestUtil.alwaysPostingsFormat(new Lucene40PostingsFormat()));
}
iwc.setMergePolicy(newLogMergePolicy()); // will rely on docids a bit for skipping
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
if (random.nextBoolean()) {
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(random.nextBoolean());
ft.setStoreTermVectorPositions(random.nextBoolean());
}
int numDocs = atLeast(500);
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
doc.add(new Field("numbers", English.intToEnglish(i), ft));
doc.add(new Field("oddeven", (i % 2) == 0 ? "even" : "odd", ft));
doc.add(new StringField("id", "" + i));
w.addDocument(doc);
}
IndexReader reader = w.getReader();
w.close();
String terms[] = { "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred" };
for (String term : terms) {
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef(term), true);
int doc;
while((doc = dp.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
String storedNumbers = reader.document(doc).get("numbers");
int freq = dp.freq();
for (int i = 0; i < freq; i++) {
dp.nextPosition();
int start = dp.startOffset();
assert start >= 0;
int end = dp.endOffset();
assert end >= 0 && end >= start;
// check that the offsets correspond to the term in the src text
assertTrue(storedNumbers.substring(start, end).equals(term));
if (withPayloads) {
// check that we have a payload and it starts with "pos"
assertTrue(dp.hasPayload());
BytesRef payload = dp.getPayload();
assertTrue(payload.utf8ToString().startsWith("pos:"));
} // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
}
}
}
// check we can skip correctly
int numSkippingTests = atLeast(50);
for (int j = 0; j < numSkippingTests; j++) {
int num = _TestUtil.nextInt(random, 100, Math.min(numDocs-1, 999));
DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(reader, null, "numbers", new BytesRef("hundred"), true);
int doc = dp.advance(num);
assertEquals(num, doc);
int freq = dp.freq();
for (int i = 0; i < freq; i++) {
String storedNumbers = reader.document(doc).get("numbers");
dp.nextPosition();
int start = dp.startOffset();
assert start >= 0;
int end = dp.endOffset();
assert end >= 0 && end >= start;
// check that the offsets correspond to the term in the src text
assertTrue(storedNumbers.substring(start, end).equals("hundred"));
if (withPayloads) {
// check that we have a payload and it starts with "pos"
assertTrue(dp.hasPayload());
BytesRef payload = dp.getPayload();
assertTrue(payload.utf8ToString().startsWith("pos:"));
} // note: withPayloads=false doesn't necessarily mean we don't have them from MockAnalyzer!
}
}
// check that other fields (without offsets) work correctly
for (int i = 0; i < numDocs; i++) {
DocsEnum dp = MultiFields.getTermDocsEnum(reader, null, "id", new BytesRef("" + i), false);
assertEquals(i, dp.nextDoc());
assertEquals(DocIdSetIterator.NO_MORE_DOCS, dp.nextDoc());
}
reader.close();
dir.close();
}
public void testRandom() throws Exception {
// token -> docID -> tokens
final Map<String,Map<Integer,List<Token>>> actualTokens = new HashMap<String,Map<Integer,List<Token>>>();
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, dir);
RandomIndexWriter w = new RandomIndexWriter(random, dir, iwc);
final int numDocs = atLeast(20);
//final int numDocs = atLeast(5);