LUCENE-1542: properly index first token(s) with 0 position increment

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@780220 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-05-30 09:36:10 +00:00
parent 5f6d0c7bd6
commit 80a79f5bee
8 changed files with 230 additions and 7 deletions

View File

@ -71,6 +71,17 @@ Changes in runtime behavior
with SortField.FIELD_DOC (it was unnecessary as Lucene breaks ties
internally by docID). (Shai Erera via Michael McCandless)
6. LUCENE-1542: When the first token(s) have 0 position increment,
IndexWriter used to incorrectly record the position as -1, if no
payload is present, or Integer.MAX_VALUE if a payload is present.
This caused positional queries to fail to match. The bug is now
fixed, but if your app relies on the buggy behavior then you must
call IndexWriter.setAllowMinus1Position(). That API is deprecated,
so you must fix your application and rebuild your index so that
they no longer rely on this behavior by the 3.0 release of Lucene.
(Jonathan Mamou, Mark Miller via Mike McCandless)
API Changes
1. LUCENE-1419: Add expert API to set custom indexing chain. This API is
@ -186,6 +197,16 @@ Bug fixes
10. LUCENE-1647: Fix case where IndexReader.undeleteAll would cause
the segment's deletion count to be incorrect. (Mike McCandless)
11. LUCENE-1542: When the first token(s) have 0 position increment,
IndexWriter used to incorrectly record the position as -1, if no
payload is present, or Integer.MAX_VALUE if a payload is present.
This caused positional queries to fail to match. The bug is now
fixed, but if your app relies on the buggy behavior then you must
call IndexWriter.setAllowMinus1Position(). That API is deprecated,
so you must fix your application and rebuild your index so that
they no longer rely on this behavior by the 3.0 release of Lucene.
(Jonathan Mamou, Mark Miller via Mike McCandless)
New features
1. LUCENE-1411: Added expert API to open an IndexWriter on a prior

View File

@ -42,7 +42,7 @@
<property name="Name" value="Lucene"/>
<property name="dev.version" value="2.9-dev"/>
<property name="version" value="${dev.version}"/>
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090526"/>
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090530"/>
<property name="spec.version" value="${version}"/>
<property name="year" value="2000-${current.year}"/>
<property name="final.name" value="lucene-${name}-${version}"/>

View File

@ -126,6 +126,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
// reset the TokenStream to the first token
stream.reset();
// deprecated
final boolean allowMinus1Position = docState.allowMinus1Position;
try {
int offsetEnd = fieldState.offset-1;
@ -162,7 +165,11 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
}
final int posIncr = posIncrAttribute.getPositionIncrement();
fieldState.position += posIncr - 1;
fieldState.position += posIncr;
if (allowMinus1Position || fieldState.position > 0) {
fieldState.position--;
}
if (posIncr == 0)
fieldState.numOverlap++;

View File

@ -150,6 +150,9 @@ final class DocumentsWriter {
Document doc;
String maxTermPrefix;
// deprecated
boolean allowMinus1Position;
// Only called by asserts
public boolean testPoint(String name) {
return docWriter.writer.testPoint(name);
@ -298,6 +301,11 @@ final class DocumentsWriter {
threadStates[i].docState.similarity = similarity;
}
/**
 * Deprecated back-compat switch: turns on the allowMinus1Position flag in
 * the DocState of every thread state currently held in threadStates.
 */
synchronized void setAllowMinus1Position() {
  for (int i = 0; i < threadStates.length; i++) {
    threadStates[i].docState.allowMinus1Position = true;
  }
}
/** Set how much RAM we can use before flushing. */
synchronized void setRAMBufferSizeMB(double mb) {
if (mb == IndexWriter.DISABLE_AUTO_FLUSH) {

View File

@ -40,6 +40,7 @@ final class DocumentsWriterThreadState {
docState.infoStream = docWriter.infoStream;
docState.similarity = docWriter.similarity;
docState.docWriter = docWriter;
docState.allowMinus1Position = docWriter.writer.getAllowMinus1Position();
consumer = docWriter.consumer.addThread(this);
}

View File

@ -5485,6 +5485,22 @@ public class IndexWriter {
throw oom;
}
// deprecated
private boolean allowMinus1Position;
/** Emulates IndexWriter's buggy behavior when
 * first token(s) have positionIncrement==0 (ie, prior to
 * fixing LUCENE-1542). The flag is recorded on this writer
 * and also pushed down to docWriter.
 *
 * @deprecated This exists only so that applications relying on
 * the pre-LUCENE-1542 positions keep working; they are expected
 * to stop relying on that behavior by the 3.0 release. */
public void setAllowMinus1Position() {
allowMinus1Position = true;
docWriter.setAllowMinus1Position();
}
// deprecated: package-private accessor for the allowMinus1Position
// flag; returns true once setAllowMinus1Position() has been called
boolean getAllowMinus1Position() {
return allowMinus1Position;
}
// Used only by assert for testing. Current points:
// startDoFlush
// startCommitMerge

View File

@ -3594,7 +3594,7 @@ public class TestIndexWriter extends LuceneTestCase
TermPositions tps = s.getIndexReader().termPositions(new Term("field", "a"));
assertTrue(tps.next());
assertEquals(1, tps.freq());
assertEquals(-1, tps.nextPosition());
assertEquals(0, tps.nextPosition());
w.close();
assertTrue(_TestUtil.checkIndex(dir));

View File

@ -17,8 +17,11 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collection;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
@ -26,14 +29,27 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.MockRAMDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.index.Payload;
import org.apache.lucene.search.payloads.PayloadSpanUtil;
import org.apache.lucene.search.spans.PayloadSpans;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
/**
* Term position unit test.
@ -48,7 +64,7 @@ public class TestPositionIncrement extends LuceneTestCase {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new TokenStream() {
private final String[] TOKENS = {"1", "2", "3", "4", "5"};
private final int[] INCREMENTS = {1, 2, 1, 0, 1};
private final int[] INCREMENTS = {0, 2, 1, 0, 1};
private int i = 0;
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@ -67,7 +83,7 @@ public class TestPositionIncrement extends LuceneTestCase {
};
}
};
RAMDirectory store = new RAMDirectory();
Directory store = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(store, analyzer, true,
IndexWriter.MaxFieldLength.LIMITED);
Document d = new Document();
@ -76,7 +92,19 @@ public class TestPositionIncrement extends LuceneTestCase {
writer.optimize();
writer.close();
IndexSearcher searcher = new IndexSearcher(store);
TermPositions pos = searcher.getIndexReader().termPositions(new Term("field", "1"));
pos.next();
// first token should be at position 0
assertEquals(0, pos.nextPosition());
pos = searcher.getIndexReader().termPositions(new Term("field", "2"));
pos.next();
// second token should be at position 2
assertEquals(2, pos.nextPosition());
PhraseQuery q;
ScoreDoc[] hits;
@ -202,4 +230,146 @@ public class TestPositionIncrement extends LuceneTestCase {
StopFilter.setEnablePositionIncrementsDefault(dflt);
}
}
/**
 * Indexes a single document through TestPayloadAnalyzer, whose first
 * token has a position increment of 0 and carries a payload, then checks
 * the recorded positions (LUCENE-1542).
 *
 * Pass x == 0 verifies the fixed behavior via TermPositions, payload
 * spans, plain spans and PayloadSpanUtil. Pass x == 1 switches on the
 * deprecated setAllowMinus1Position() emulation and only verifies that
 * the first position comes back as Integer.MAX_VALUE.
 */
public void testPayloadsPos0() throws Exception {
  for(int x=0;x<2;x++) {
    Directory dir = new MockRAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
                                         new TestPayloadAnalyzer(), true,
                                         IndexWriter.MaxFieldLength.LIMITED);
    if (x == 1) {
      writer.setAllowMinus1Position();
    }
    Document doc = new Document();
    doc.add(new Field("content",
                      new StringReader("a a b c d e a f g h i j a b k k")));
    writer.addDocument(doc);

    IndexReader r = writer.getReader();

    TermPositions tp = r.termPositions(new Term("content", "a"));
    assertTrue(tp.next());
    // "a" occurs 4 times
    assertEquals(4, tp.freq());
    int expected;
    if (x == 1) {
      expected = Integer.MAX_VALUE;
    } else {
      expected = 0;
    }
    assertEquals(expected, tp.nextPosition());
    if (x == 1) {
      // Nothing further is checked in emulation mode, but the writer,
      // reader and directory must still be released, in the same order
      // as at the end of the regular pass.
      writer.close();
      r.close();
      dir.close();
      continue;
    }
    assertEquals(1, tp.nextPosition());
    assertEquals(3, tp.nextPosition());
    assertEquals(6, tp.nextPosition());

    // only one doc has "a"
    assertFalse(tp.next());

    IndexSearcher is = new IndexSearcher(r);

    SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
    SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
    SpanQuery[] sqs = { stq1, stq2 };
    SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);

    int count = 0;
    boolean sawZero = false;
    //System.out.println("\ngetPayloadSpans test");
    PayloadSpans pspans = snq.getPayloadSpans(is.getIndexReader());
    while (pspans.next()) {
      //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end());
      Collection payloads = pspans.getPayload();
      sawZero |= pspans.start() == 0;
      for (Iterator it = payloads.iterator(); it.hasNext();) {
        count++;
        it.next();
        //System.out.println(new String((byte[]) it.next()));
      }
    }
    assertEquals(5, count);
    assertTrue(sawZero);

    //System.out.println("\ngetSpans test");
    Spans spans = snq.getSpans(is.getIndexReader());
    count = 0;
    sawZero = false;
    while (spans.next()) {
      count++;
      sawZero |= spans.start() == 0;
      //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end());
    }
    assertEquals(4, count);
    assertTrue(sawZero);

    //System.out.println("\nPayloadSpanUtil test");
    sawZero = false;
    PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader());
    Collection pls = psu.getPayloadsForQuery(snq);
    count = pls.size();
    for (Iterator it = pls.iterator(); it.hasNext();) {
      String s = new String((byte[]) it.next());
      //System.out.println(s);
      sawZero |= s.equals("pos: 0");
    }
    assertEquals(5, count);
    assertTrue(sawZero);
    writer.close();
    is.getIndexReader().close();
    dir.close();
  }
}
}
/**
 * Test analyzer: lower-case tokenizes the reader and passes the tokens
 * through a PayloadFilter created for the given field.
 */
class TestPayloadAnalyzer extends Analyzer {

  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new PayloadFilter(new LowerCaseTokenizer(reader), fieldName);
  }
}
/**
 * Test filter that attaches a payload to every token and overrides its
 * position increment: tokens at even indexes (starting with the very
 * first one) get an increment of 0, tokens at odd indexes get 1.
 */
class PayloadFilter extends TokenFilter {
  String fieldName;

  // running position counter; starts at 0
  int pos;

  // index of the next token to be emitted; starts at 0
  int i;

  final PositionIncrementAttribute posIncrAttr;
  final PayloadAttribute payloadAttr;
  final TermAttribute termAttr;

  public PayloadFilter(TokenStream input, String fieldName) {
    super(input);
    this.fieldName = fieldName;
    posIncrAttr = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
    payloadAttr = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
    termAttr = (TermAttribute) input.addAttribute(TermAttribute.class);
  }

  /**
   * Advances the wrapped stream by one token; returns false once it is
   * exhausted. The payload text is built from the position counter as it
   * stands before this token's own increment is added to it.
   */
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }

    payloadAttr.setPayload(new Payload(("pos: " + pos).getBytes()));

    final int increment = (i % 2 == 1) ? 1 : 0;
    posIncrAttr.setPositionIncrement(increment);
    pos += increment;
    i++;
    return true;
  }
}