mirror of https://github.com/apache/lucene.git
LUCENE-1542: properly index first token(s) with 0 position increment
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@780220 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5f6d0c7bd6
commit
80a79f5bee
21
CHANGES.txt
21
CHANGES.txt
|
@ -71,6 +71,17 @@ Changes in runtime behavior
|
|||
with SortField.FIELD_DOC (it was unnecessary as Lucene breaks ties
|
||||
internally by docID). (Shai Erera via Michael McCandless)
|
||||
|
||||
6. LUCENE-1542: When the first token(s) have 0 position increment,
|
||||
IndexWriter used to incorrectly record the position as -1, if no
|
||||
payload is present, or Integer.MAX_VALUE if a payload is present.
|
||||
This causes positional queries to fail to match. The bug is now
|
||||
fixed, but if your app relies on the buggy behavior then you must
|
||||
call IndexWriter.setAllowMinus1Position(). That API is deprecated
|
||||
so you must fix your application, and rebuild your index, to not
|
||||
rely on this behavior by the 3.0 release of Lucene. (Jonathan
|
||||
Mamou, Mark Miller via Mike McCandless)
|
||||
|
||||
|
||||
API Changes
|
||||
|
||||
1. LUCENE-1419: Add expert API to set custom indexing chain. This API is
|
||||
|
@ -186,6 +197,16 @@ Bug fixes
|
|||
10. LUCENE-1647: Fix case where IndexReader.undeleteAll would cause
|
||||
the segment's deletion count to be incorrect. (Mike McCandless)
|
||||
|
||||
11. LUCENE-1542: When the first token(s) have 0 position increment,
|
||||
IndexWriter used to incorrectly record the position as -1, if no
|
||||
payload is present, or Integer.MAX_VALUE if a payload is present.
|
||||
This causes positional queries to fail to match. The bug is now
|
||||
fixed, but if your app relies on the buggy behavior then you must
|
||||
call IndexWriter.setAllowMinus1Position(). That API is deprecated
|
||||
so you must fix your application, and rebuild your index, to not
|
||||
rely on this behavior by the 3.0 release of Lucene. (Jonathan
|
||||
Mamou, Mark Miller via Mike McCandless)
|
||||
|
||||
New features
|
||||
|
||||
1. LUCENE-1411: Added expert API to open an IndexWriter on a prior
|
||||
|
|
|
@ -42,7 +42,7 @@
|
|||
<property name="Name" value="Lucene"/>
|
||||
<property name="dev.version" value="2.9-dev"/>
|
||||
<property name="version" value="${dev.version}"/>
|
||||
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090526"/>
|
||||
<property name="compatibility.tag" value="lucene_2_4_back_compat_tests_20090530"/>
|
||||
<property name="spec.version" value="${version}"/>
|
||||
<property name="year" value="2000-${current.year}"/>
|
||||
<property name="final.name" value="lucene-${name}-${version}"/>
|
||||
|
|
|
@ -126,6 +126,9 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
// reset the TokenStream to the first token
|
||||
stream.reset();
|
||||
|
||||
// deprecated
|
||||
final boolean allowMinus1Position = docState.allowMinus1Position;
|
||||
|
||||
try {
|
||||
int offsetEnd = fieldState.offset-1;
|
||||
|
||||
|
@ -162,7 +165,11 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
|||
}
|
||||
|
||||
final int posIncr = posIncrAttribute.getPositionIncrement();
|
||||
fieldState.position += posIncr - 1;
|
||||
fieldState.position += posIncr;
|
||||
if (allowMinus1Position || fieldState.position > 0) {
|
||||
fieldState.position--;
|
||||
}
|
||||
|
||||
if (posIncr == 0)
|
||||
fieldState.numOverlap++;
|
||||
|
||||
|
|
|
@ -150,6 +150,9 @@ final class DocumentsWriter {
|
|||
Document doc;
|
||||
String maxTermPrefix;
|
||||
|
||||
// deprecated
|
||||
boolean allowMinus1Position;
|
||||
|
||||
// Only called by asserts
|
||||
public boolean testPoint(String name) {
|
||||
return docWriter.writer.testPoint(name);
|
||||
|
@ -298,6 +301,11 @@ final class DocumentsWriter {
|
|||
threadStates[i].docState.similarity = similarity;
|
||||
}
|
||||
|
||||
synchronized void setAllowMinus1Position() {
|
||||
for(int i=0;i<threadStates.length;i++)
|
||||
threadStates[i].docState.allowMinus1Position = true;;
|
||||
}
|
||||
|
||||
/** Set how much RAM we can use before flushing. */
|
||||
synchronized void setRAMBufferSizeMB(double mb) {
|
||||
if (mb == IndexWriter.DISABLE_AUTO_FLUSH) {
|
||||
|
|
|
@ -40,6 +40,7 @@ final class DocumentsWriterThreadState {
|
|||
docState.infoStream = docWriter.infoStream;
|
||||
docState.similarity = docWriter.similarity;
|
||||
docState.docWriter = docWriter;
|
||||
docState.allowMinus1Position = docWriter.writer.getAllowMinus1Position();
|
||||
consumer = docWriter.consumer.addThread(this);
|
||||
}
|
||||
|
||||
|
|
|
@ -5485,6 +5485,22 @@ public class IndexWriter {
|
|||
throw oom;
|
||||
}
|
||||
|
||||
// deprecated
|
||||
private boolean allowMinus1Position;
|
||||
|
||||
/** Deprecated: emulates IndexWriter's buggy behavior when
|
||||
* first token(s) have positionIncrement==0 (ie, prior to
|
||||
* fixing LUCENE-1542) */
|
||||
public void setAllowMinus1Position() {
|
||||
allowMinus1Position = true;
|
||||
docWriter.setAllowMinus1Position();
|
||||
}
|
||||
|
||||
// deprecated
|
||||
boolean getAllowMinus1Position() {
|
||||
return allowMinus1Position;
|
||||
}
|
||||
|
||||
// Used only by assert for testing. Current points:
|
||||
// startDoFlush
|
||||
// startCommitMerge
|
||||
|
|
|
@ -3594,7 +3594,7 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
TermPositions tps = s.getIndexReader().termPositions(new Term("field", "a"));
|
||||
assertTrue(tps.next());
|
||||
assertEquals(1, tps.freq());
|
||||
assertEquals(-1, tps.nextPosition());
|
||||
assertEquals(0, tps.nextPosition());
|
||||
w.close();
|
||||
|
||||
assertTrue(_TestUtil.checkIndex(dir));
|
||||
|
|
|
@ -17,8 +17,11 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
|
@ -26,14 +29,27 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.store.MockRAMDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.analysis.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.search.payloads.PayloadSpanUtil;
|
||||
import org.apache.lucene.search.spans.PayloadSpans;
|
||||
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
|
||||
/**
|
||||
* Term position unit test.
|
||||
|
@ -48,7 +64,7 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new TokenStream() {
|
||||
private final String[] TOKENS = {"1", "2", "3", "4", "5"};
|
||||
private final int[] INCREMENTS = {1, 2, 1, 0, 1};
|
||||
private final int[] INCREMENTS = {0, 2, 1, 0, 1};
|
||||
private int i = 0;
|
||||
|
||||
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
|
@ -67,7 +83,7 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
};
|
||||
}
|
||||
};
|
||||
RAMDirectory store = new RAMDirectory();
|
||||
Directory store = new MockRAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(store, analyzer, true,
|
||||
IndexWriter.MaxFieldLength.LIMITED);
|
||||
Document d = new Document();
|
||||
|
@ -76,7 +92,19 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
|
||||
IndexSearcher searcher = new IndexSearcher(store);
|
||||
|
||||
TermPositions pos = searcher.getIndexReader().termPositions(new Term("field", "1"));
|
||||
pos.next();
|
||||
// first token should be at position 0
|
||||
assertEquals(0, pos.nextPosition());
|
||||
|
||||
pos = searcher.getIndexReader().termPositions(new Term("field", "2"));
|
||||
pos.next();
|
||||
// second token should be at position 2
|
||||
assertEquals(2, pos.nextPosition());
|
||||
|
||||
PhraseQuery q;
|
||||
ScoreDoc[] hits;
|
||||
|
||||
|
@ -202,4 +230,146 @@ public class TestPositionIncrement extends LuceneTestCase {
|
|||
StopFilter.setEnablePositionIncrementsDefault(dflt);
|
||||
}
|
||||
}
|
||||
|
||||
public void testPayloadsPos0() throws Exception {
|
||||
for(int x=0;x<2;x++) {
|
||||
Directory dir = new MockRAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir,
|
||||
new TestPayloadAnalyzer(), true,
|
||||
IndexWriter.MaxFieldLength.LIMITED);
|
||||
if (x == 1) {
|
||||
writer.setAllowMinus1Position();
|
||||
}
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("content",
|
||||
new StringReader("a a b c d e a f g h i j a b k k")));
|
||||
writer.addDocument(doc);
|
||||
|
||||
IndexReader r = writer.getReader();
|
||||
|
||||
TermPositions tp = r.termPositions(new Term("content", "a"));
|
||||
int count = 0;
|
||||
assertTrue(tp.next());
|
||||
// "a" occurs 4 times
|
||||
assertEquals(4, tp.freq());
|
||||
int expected;
|
||||
if (x == 1) {
|
||||
expected = Integer.MAX_VALUE;
|
||||
} else {
|
||||
expected = 0;
|
||||
}
|
||||
assertEquals(expected, tp.nextPosition());
|
||||
if (x == 1) {
|
||||
continue;
|
||||
}
|
||||
assertEquals(1, tp.nextPosition());
|
||||
assertEquals(3, tp.nextPosition());
|
||||
assertEquals(6, tp.nextPosition());
|
||||
|
||||
// only one doc has "a"
|
||||
assertFalse(tp.next());
|
||||
|
||||
IndexSearcher is = new IndexSearcher(r);
|
||||
|
||||
SpanTermQuery stq1 = new SpanTermQuery(new Term("content", "a"));
|
||||
SpanTermQuery stq2 = new SpanTermQuery(new Term("content", "k"));
|
||||
SpanQuery[] sqs = { stq1, stq2 };
|
||||
SpanNearQuery snq = new SpanNearQuery(sqs, 30, false);
|
||||
|
||||
count = 0;
|
||||
boolean sawZero = false;
|
||||
//System.out.println("\ngetPayloadSpans test");
|
||||
PayloadSpans pspans = snq.getPayloadSpans(is.getIndexReader());
|
||||
while (pspans.next()) {
|
||||
//System.out.println(pspans.doc() + " - " + pspans.start() + " - "+ pspans.end());
|
||||
Collection payloads = pspans.getPayload();
|
||||
sawZero |= pspans.start() == 0;
|
||||
for (Iterator it = payloads.iterator(); it.hasNext();) {
|
||||
count++;
|
||||
it.next();
|
||||
//System.out.println(new String((byte[]) it.next()));
|
||||
}
|
||||
}
|
||||
assertEquals(5, count);
|
||||
assertTrue(sawZero);
|
||||
|
||||
//System.out.println("\ngetSpans test");
|
||||
Spans spans = snq.getSpans(is.getIndexReader());
|
||||
count = 0;
|
||||
sawZero = false;
|
||||
while (spans.next()) {
|
||||
count++;
|
||||
sawZero |= spans.start() == 0;
|
||||
//System.out.println(spans.doc() + " - " + spans.start() + " - " + spans.end());
|
||||
}
|
||||
assertEquals(4, count);
|
||||
assertTrue(sawZero);
|
||||
|
||||
//System.out.println("\nPayloadSpanUtil test");
|
||||
|
||||
sawZero = false;
|
||||
PayloadSpanUtil psu = new PayloadSpanUtil(is.getIndexReader());
|
||||
Collection pls = psu.getPayloadsForQuery(snq);
|
||||
count = pls.size();
|
||||
for (Iterator it = pls.iterator(); it.hasNext();) {
|
||||
String s = new String((byte[]) it.next());
|
||||
//System.out.println(s);
|
||||
sawZero |= s.equals("pos: 0");
|
||||
}
|
||||
assertEquals(5, count);
|
||||
assertTrue(sawZero);
|
||||
writer.close();
|
||||
is.getIndexReader().close();
|
||||
dir.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class TestPayloadAnalyzer extends Analyzer {
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new LowerCaseTokenizer(reader);
|
||||
return new PayloadFilter(result, fieldName);
|
||||
}
|
||||
}
|
||||
|
||||
class PayloadFilter extends TokenFilter {
|
||||
String fieldName;
|
||||
|
||||
int pos;
|
||||
|
||||
int i;
|
||||
|
||||
final PositionIncrementAttribute posIncrAttr;
|
||||
final PayloadAttribute payloadAttr;
|
||||
final TermAttribute termAttr;
|
||||
|
||||
public PayloadFilter(TokenStream input, String fieldName) {
|
||||
super(input);
|
||||
this.fieldName = fieldName;
|
||||
pos = 0;
|
||||
i = 0;
|
||||
posIncrAttr = (PositionIncrementAttribute) input.addAttribute(PositionIncrementAttribute.class);
|
||||
payloadAttr = (PayloadAttribute) input.addAttribute(PayloadAttribute.class);
|
||||
termAttr = (TermAttribute) input.addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
payloadAttr.setPayload(new Payload(("pos: " + pos).getBytes()));
|
||||
int posIncr;
|
||||
if (i % 2 == 1) {
|
||||
posIncr = 1;
|
||||
} else {
|
||||
posIncr = 0;
|
||||
}
|
||||
posIncrAttr.setPositionIncrement(posIncr);
|
||||
pos += posIncr;
|
||||
// System.out.println("term=" + termAttr.term() + " pos=" + pos);
|
||||
i++;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue