mirror of https://github.com/apache/lucene.git

LUCENE-1542: properly index first token(s) with 0 position increment

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@780220 13f79535-47bb-0310-9956-ffa450edef68

commit 80a79f5bee
parent 5f6d0c7bd6

CHANGES.txt

@@ -71,6 +71,17 @@ Changes in runtime behavior
     with SortField.FIELD_DOC (it was unnecessary as Lucene breaks ties
     internally by docID). (Shai Erera via Michael McCandless)
 
+ 6. LUCENE-1542: When the first token(s) have 0 position increment,
+    IndexWriter used to incorrectly record the position as -1, if no
+    payload is present, or Integer.MAX_VALUE if a payload is present.
+    This causes positional queries to fail to match. The bug is now
+    fixed, but if your app relies on the buggy behavior then you must
+    call IndexWriter.setAllowMinus1Position(). That API is deprecated
+    so you must fix your application, and rebuild your index, to not
+    rely on this behavior by the 3.0 release of Lucene. (Jonathan
+    Mamou, Mark Miller via Mike McCandless)
+
 API Changes
 
  1. LUCENE-1419: Add expert API to set custom indexing chain. This API is
@@ -186,6 +197,16 @@ Bug fixes
 10. LUCENE-1647: Fix case where IndexReader.undeleteAll would cause
     the segment's deletion count to be incorrect. (Mike McCandless)
 
+11. LUCENE-1542: When the first token(s) have 0 position increment,
+    IndexWriter used to incorrectly record the position as -1, if no
+    payload is present, or Integer.MAX_VALUE if a payload is present.
+    This causes positional queries to fail to match. The bug is now
+    fixed, but if your app relies on the buggy behavior then you must
+    call IndexWriter.setAllowMinus1Position(). That API is deprecated
+    so you must fix your application, and rebuild your index, to not
+    rely on this behavior by the 3.0 release of Lucene. (Jonathan
+    Mamou, Mark Miller via Mike McCandless)
+
 New features
 
  1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
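
The CHANGES entries above describe the user-visible change: leading tokens with a 0 position increment are now recorded at position 0, and the old positions are only reachable through a deprecated opt-in. A minimal sketch of that opt-in, assuming the Lucene 2.9-dev API this commit targets (the analyzer and directory here are arbitrary choices for illustration):

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.RAMDirectory;

    public class AllowMinus1PositionExample {
      public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true,
                                             IndexWriter.MaxFieldLength.LIMITED);
        // Deprecated escape hatch added by this commit: restores the pre-fix
        // positions (-1, or Integer.MAX_VALUE when payloads are present).
        // Applications must stop relying on it, and rebuild their index,
        // before Lucene 3.0.
        writer.setAllowMinus1Position();
        // ... add documents as before ...
        writer.close();
      }
    }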

Ant build file

@@ -42,7 +42,7 @@
   <property name="Name" value="Lucene"/>
   <property name="dev.version" value="2.9-dev"/>
   <property name="version" value="${dev.version}"/>
-  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090526"/>
+  <property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090530"/>
   <property name="spec.version" value="${version}"/>
   <property name="year" value="2000-${current.year}"/>
   <property name="final.name" value="lucene-${name}-${version}"/>

DocInverterPerField.java

@@ -126,6 +126,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
       // reset the TokenStream to the first token
       stream.reset();
 
+      // deprecated
+      final boolean allowMinus1Position = docState.allowMinus1Position;
+
       try {
         int offsetEnd = fieldState.offset-1;
 
@@ -162,7 +165,11 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
           }
 
           final int posIncr = posIncrAttribute.getPositionIncrement();
-          fieldState.position += posIncr - 1;
+          fieldState.position += posIncr;
+          if (allowMinus1Position || fieldState.position > 0) {
+            fieldState.position--;
+          }
+
           if (posIncr == 0)
             fieldState.numOverlap++;
 
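
The core of the fix is the position arithmetic in the second hunk above. The following standalone sketch (not Lucene code) compares the two formulas; it assumes, as the surrounding indexing loop appears to (that part is not shown in this hunk), that the running position is advanced by one after each token is recorded. The deprecated allowMinus1Position path simply restores the old result.

    import java.util.Arrays;

    public class FirstTokenPositionSketch {

      // Old behavior: subtract one while applying the increment.
      static int[] oldPositions(int[] increments) {
        int[] out = new int[increments.length];
        int pos = 0;
        for (int i = 0; i < increments.length; i++) {
          pos += increments[i] - 1;
          out[i] = pos;
          pos++;                 // advance for the next token (assumed)
        }
        return out;
      }

      // Fixed behavior: apply the increment, then step back only when the
      // result stays non-negative.
      static int[] newPositions(int[] increments) {
        int[] out = new int[increments.length];
        int pos = 0;
        for (int i = 0; i < increments.length; i++) {
          pos += increments[i];
          if (pos > 0) {
            pos--;
          }
          out[i] = pos;
          pos++;                 // advance for the next token (assumed)
        }
        return out;
      }

      public static void main(String[] args) {
        int[] increments = {0, 2, 1, 0, 1};  // increments used by the updated test below
        System.out.println(Arrays.toString(oldPositions(increments)));  // [-1, 1, 2, 2, 3]
        System.out.println(Arrays.toString(newPositions(increments)));  // [0, 2, 3, 3, 4]
      }
    }

In this model a leading zero-increment token not only lands at -1 under the old formula, it also drags every later position down by one; the fixed values match the assertions added to TestPositionIncrement below (token "1" at position 0, token "2" at position 2).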

DocumentsWriter.java

@@ -150,6 +150,9 @@ final class DocumentsWriter {
     Document doc;
     String maxTermPrefix;
 
+    // deprecated
+    boolean allowMinus1Position;
+
     // Only called by asserts
     public boolean testPoint(String name) {
       return docWriter.writer.testPoint(name);
@@ -298,6 +301,11 @@ final class DocumentsWriter {
       threadStates[i].docState.similarity = similarity;
   }
 
+  synchronized void setAllowMinus1Position() {
+    for(int i=0;i<threadStates.length;i++)
+      threadStates[i].docState.allowMinus1Position = true;;
+  }
+
   /** Set how much RAM we can use before flushing. */
   synchronized void setRAMBufferSizeMB(double mb) {
     if (mb == IndexWriter.DISABLE_AUTO_FLUSH) {

DocumentsWriterThreadState.java

@@ -40,6 +40,7 @@ final class DocumentsWriterThreadState {
     docState.infoStream = docWriter.infoStream;
     docState.similarity = docWriter.similarity;
     docState.docWriter = docWriter;
+    docState.allowMinus1Position = docWriter.writer.getAllowMinus1Position();
     consumer = docWriter.consumer.addThread(this);
   }
 

IndexWriter.java

@@ -5485,6 +5485,22 @@ public class IndexWriter {
       throw oom;
     }
 
+  // deprecated
+  private boolean allowMinus1Position;
+
+  /** Deprecated: emulates IndexWriter's buggy behavior when
+   *  first token(s) have positionIncrement==0 (ie, prior to
+   *  fixing LUCENE-1542) */
+  public void setAllowMinus1Position() {
+    allowMinus1Position = true;
+    docWriter.setAllowMinus1Position();
+  }
+
+  // deprecated
+  boolean getAllowMinus1Position() {
+    return allowMinus1Position;
+  }
+
   // Used only by assert for testing. Current points:
   //   startDoFlush
   //   startCommitMerge

TestIndexWriter.java

@@ -3594,7 +3594,7 @@ public class TestIndexWriter extends LuceneTestCase
     TermPositions tps = s.getIndexReader().termPositions(new Term("field", "a"));
     assertTrue(tps.next());
     assertEquals(1, tps.freq());
-    assertEquals(-1, tps.nextPosition());
+    assertEquals(0, tps.nextPosition());
     w.close();
 
     assertTrue(_TestUtil.checkIndex(dir));

TestPositionIncrement.java

@@ -17,8 +17,11 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import java.io.IOException;
 import java.io.Reader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.Iterator;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopFilter;
@@ -26,14 +29,27 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
 import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.search.payloads.PayloadSpanUtil;
+import org.apache.lucene.search.spans.PayloadSpans;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;
 
 /**
  * Term position unit test.
@@ -48,7 +64,7 @@ public class TestPositionIncrement extends LuceneTestCase {
     public TokenStream tokenStream(String fieldName, Reader reader) {
       return new TokenStream() {
         private final String[] TOKENS = {"1", "2", "3", "4", "5"};
-        private final int[] INCREMENTS = {1, 2, 1, 0, 1};
+        private final int[] INCREMENTS = {0, 2, 1, 0, 1};
         private int i = 0;
 
         PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
@@ -67,7 +83,7 @@ public class TestPositionIncrement extends LuceneTestCase {
       };
     }
   };
-    RAMDirectory store = new RAMDirectory();
+    Directory store = new MockRAMDirectory();
     IndexWriter writer = new IndexWriter(store, analyzer, true,
                                          IndexWriter.MaxFieldLength.LIMITED);
     Document d = new Document();
@@ -75,8 +91,20 @@ public class TestPositionIncrement extends LuceneTestCase {
     writer.addDocument(d);
     writer.optimize();
     writer.close();
 
     IndexSearcher searcher = new IndexSearcher(store);
+
+    TermPositions pos = searcher.getIndexReader().termPositions(new Term("field", "1"));
+    pos.next();
+    // first token should be at position 0
+    assertEquals(0, pos.nextPosition());
+
+    pos = searcher.getIndexReader().termPositions(new Term("field", "2"));
+    pos.next();
+    // second token should be at position 2
+    assertEquals(2, pos.nextPosition());
+
     PhraseQuery q;
     ScoreDoc[] hits;
 
@@ -202,4 +230,146 @@ public class TestPositionIncrement extends LuceneTestCase {
       StopFilter.setEnablePositionIncrementsDefault(dflt);
     }
   }
+
+  public void testPayloadsPos0() throws Exception {
+    for(int x=0;x<2;x++) {
+      Directory dir = new MockRAMDirectory();
+      IndexWriter writer = new IndexWriter(dir,
+                                           new TestPayloadAnalyzer(), true,
+                                           IndexWriter.MaxFieldLength.LIMITED);
+      if (x == 1) {
+        writer.setAllowMinus1Position();
+      }
+      Document doc = new Document();
+      doc.add(new Field("content",
+                        new StringReader("a a b c d e a f g h i j a b k k")));
+      writer.addDocument(doc);
+
+      IndexReader r = writer.getReader();
+
+      TermPositions tp = r.termPositions(new Term("content", "a"));
+      int count = 0;
+      assertTrue(tp.next());
+      // "a" occurs 4 times
+      assertEquals(4, tp.freq());
+      int expected;
+      if (x == 1) {
+        expected = Integer.MAX_VALUE;
+      } else {
+        expected = 0;
+      }
+      assertEquals(expected, tp.nextPosition());
+      if (x == 1) {
+        continue;
+      }
+      assertEquals(1, tp.nextPosition());
+      assertEquals(3, tp.nextPosition());
+      assertEquals(6, tp.nextPosition());
+
+      // only one doc has "a"
+      assertFalse(tp.next());
+
+      IndexSearcher is = new IndexSearcher(r);
+
+      SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
+      SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
+      SpanQuery[] sqs = { stq1, stq2 };
+      SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);
+
+      count = 0;
+      boolean sawZero = false;
+      //System.out.println("\ngetPayloadSpans test");
+      PayloadSpans pspans = snq.getPayloadSpans(is.getIndexReader());
+      while (pspans.next()) {
+        //System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end());
+        Collection payloads = pspans.getPayload();
+        sawZero |= pspans.start() == 0;
+        for (Iterator it = payloads.iterator(); it.hasNext();) {
+          count++;
+          it.next();
+          //System.out.println(new String((byte[]) it.next()));
+        }
+      }
+      assertEquals(5, count);
+      assertTrue(sawZero);
+
+      //System.out.println("\ngetSpans test");
+      Spans spans = snq.getSpans(is.getIndexReader());
+      count = 0;
+      sawZero = false;
+      while (spans.next()) {
+        count++;
+        sawZero |= spans.start() == 0;
+        //System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end());
+      }
+      assertEquals(4, count);
+      assertTrue(sawZero);
+
+      //System.out.println("\nPayloadSpanUtil test");
+
+      sawZero = false;
+      PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader());
+      Collection pls = psu.getPayloadsForQuery(snq);
+      count = pls.size();
+      for (Iterator it = pls.iterator(); it.hasNext();) {
+        String s = new String((byte[]) it.next());
+        //System.out.println(s);
+        sawZero |= s.equals("pos: 0");
+      }
+      assertEquals(5, count);
+      assertTrue(sawZero);
+      writer.close();
+      is.getIndexReader().close();
+      dir.close();
+    }
+  }
+}
+
+class TestPayloadAnalyzer extends Analyzer {
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = new LowerCaseTokenizer(reader);
+    return new PayloadFilter(result, fieldName);
+  }
+}
+
+class PayloadFilter extends TokenFilter {
+  String fieldName;
+
+  int pos;
+
+  int i;
+
+  final PositionIncrementAttribute posIncrAttr;
+  final PayloadAttribute payloadAttr;
+  final TermAttribute termAttr;
+
+  public PayloadFilter(TokenStream input, String fieldName) {
+    super(input);
+    this.fieldName = fieldName;
+    pos = 0;
+    i = 0;
+    posIncrAttr = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
+    payloadAttr = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
+    termAttr = (TermAttribute) input.addAttribute(TermAttribute.class);
+  }
+
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      payloadAttr.setPayload(new Payload(("pos: " + pos).getBytes()));
+      int posIncr;
+      if (i % 2 == 1) {
+        posIncr = 1;
+      } else {
+        posIncr = 0;
+      }
+      posIncrAttr.setPositionIncrement(posIncr);
+      pos += posIncr;
+      // System.out.println("term=" + termAttr.term() + " pos=" + pos);
+      i++;
+      return true;
+    } else {
+      return false;
+    }
+  }
 }
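
For reference, the positions asserted in testPayloadsPos0 follow from PayloadFilter's alternating increments (0, 1, 0, 1, ...) combined with the fixed first-token handling. A standalone sketch, not part of the commit, that models the fixed behavior as "start at -1, add the increment, never drop below 0" (equivalent to the DocInverterPerField change for non-negative increments):

    public class PayloadsPos0Positions {
      public static void main(String[] args) {
        String[] tokens = "a a b c d e a f g h i j a b k k".split(" ");
        int pos = -1;
        for (int i = 0; i < tokens.length; i++) {
          int incr = (i % 2 == 1) ? 1 : 0;  // same rule as PayloadFilter
          pos += incr;
          if (pos < 0) {
            pos = 0;  // fixed behavior: a leading 0-increment token stays at 0
          }
          if (tokens[i].equals("a")) {
            System.out.println("a at position " + pos);  // prints 0, 1, 3, 6
          }
        }
      }
    }

That is why the test expects "a" at positions 0, 1, 3 and 6, and why spans starting at position 0 (sawZero) must be seen. In the x == 1 pass, setAllowMinus1Position() restores the old behavior, where the first occurrence is recorded as Integer.MAX_VALUE because a payload is present, matching the CHANGES entry above.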