mirror of https://github.com/apache/lucene.git

LUCENE-4139: compute offsets correctly for multivalued fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1349452 13f79535-47bb-0310-9956-ffa450edef68

parent 0057c039be
commit 362a046571
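Why this fix matters: each value of a multivalued field produces token offsets that restart at 0, while the postings delta-code each term's start offset against the last offset written for that term. Unless the per-value base offset (fieldState.offset) is added in, the second value's deltas can go negative. A standalone illustration of the arithmetic (not Lucene code; an offset gap of 1 between values is assumed, matching the analyzer default):

    // Illustration only: delta-coding token offsets across two values of one field.
    public class OffsetAccumDemo {
      public static void main(String[] args) {
        int[][][] values = {
          {{0, 1}, {2, 3}},   // token offsets of the first value "a b"
          {{0, 1}, {2, 3}},   // second value "a b": offsets restart at 0
        };
        int offsetGap = 1;    // assumed default gap between field values
        int offsetAccum = 0;  // plays the role of fieldState.offset
        int lastOffset = 0;   // plays the role of postings.lastOffsets[termID]
        for (int[][] value : values) {
          for (int[] tok : value) {
            int start = offsetAccum + tok[0];
            int end = offsetAccum + tok[1];
            int delta = start - lastOffset; // stays >= 0 only because of offsetAccum
            System.out.println("start=" + start + " end=" + end + " delta=" + delta);
            lastOffset = start;
          }
          offsetAccum += value[value.length - 1][1] + offsetGap;
        }
      }
    }

Without the accumulator, the second value's first delta would be 0 - 2 = -2; with it, every delta printed is non-negative (0, 2, 2, 2).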
Lucene40PostingsWriter.java

@@ -252,6 +252,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
       // and the numbers aren't that much smaller anyways.
       int offsetDelta = startOffset - lastOffset;
       int offsetLength = endOffset - startOffset;
+      assert offsetDelta >= 0 && offsetLength >= 0 : "startOffset=" + startOffset + ",lastOffset=" + lastOffset + ",endOffset=" + endOffset;
       if (offsetLength != lastOffsetLength) {
         proxOut.writeVInt(offsetDelta << 1 | 1);
         proxOut.writeVInt(offsetLength);
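For context, this writer stores each offset as a VInt pair: the start-offset delta shifted left by one, with the low bit flagging whether a new offset length follows. A negative delta would be silently mangled by the shift, which is exactly what the new assert catches early. A minimal sketch of the encoding (the else branch is an assumption based on the surrounding writer code, not shown in the hunk):

    import java.io.IOException;
    import org.apache.lucene.store.DataOutput;

    // Sketch of the offset encoding guarded by the new assert.
    class OffsetEncodingSketch {
      void writeOffset(DataOutput proxOut, int offsetDelta, int offsetLength, int lastOffsetLength) throws IOException {
        if (offsetLength != lastOffsetLength) {
          proxOut.writeVInt(offsetDelta << 1 | 1); // low bit set: new length follows
          proxOut.writeVInt(offsetLength);
        } else {
          proxOut.writeVInt(offsetDelta << 1);     // low bit clear: reuse last length
        }
      }
    }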
FreqProxTermsWriterPerField.java

@@ -148,15 +148,16 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
     postings.lastPositions[termID] = fieldState.position;
   }

-  void writeOffsets(final int termID, int prevOffset) {
+  void writeOffsets(final int termID, int offsetAccum) {
     assert hasOffsets;
-    final int startOffset = offsetAttribute.startOffset();
-    final int endOffset = offsetAttribute.endOffset();
+    final int startOffset = offsetAccum + offsetAttribute.startOffset();
+    final int endOffset = offsetAccum + offsetAttribute.endOffset();
     //System.out.println("writeOffsets termID=" + termID + " prevOffset=" + prevOffset + " startOff=" + startOffset + " endOff=" + endOffset);
-    termsHashPerField.writeVInt(1, startOffset - prevOffset);
+    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
+    assert startOffset - postings.lastOffsets[termID] >= 0;
+    termsHashPerField.writeVInt(1, startOffset - postings.lastOffsets[termID]);
     termsHashPerField.writeVInt(1, endOffset - startOffset);

-    FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
     postings.lastOffsets[termID] = startOffset;
   }

@@ -224,6 +225,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
       if (hasProx) {
        writeProx(termID, fieldState.position);
        if (hasOffsets) {
+         postings.lastOffsets[termID] = 0;
          writeOffsets(termID, fieldState.offset);
        }
      } else {

@@ -236,7 +238,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
        writeProx(termID, fieldState.position-postings.lastPositions[termID]);
      }
      if (hasOffsets) {
-       writeOffsets(termID, postings.lastOffsets[termID]);
+       writeOffsets(termID, fieldState.offset);
      }
    }
  }

@@ -523,14 +525,15 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
          if (readOffsets) {
            final int startOffset = offset + prox.readVInt();
            final int endOffset = startOffset + prox.readVInt();
-           offset = startOffset;
            if (writePositions) {
              if (writeOffsets) {
+               assert startOffset >=0 && endOffset >= startOffset : "startOffset=" + startOffset + ",endOffset=" + endOffset + ",offset=" + offset;
                postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset);
              } else {
                postingsConsumer.addPosition(position, thisPayload, -1, -1);
              }
            }
+           offset = startOffset;
          } else if (writePositions) {
            postingsConsumer.addPosition(position, thisPayload, -1, -1);
          }
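Net effect of the writer-side hunks: writeOffsets now receives the accumulated field offset (fieldState.offset) rather than a previous absolute offset, makes the token offsets absolute within the document, and delta-codes against postings.lastOffsets[termID], which newTerm resets to 0. A simplified, hedged sketch of that per-term state machine (class and member names invented for illustration; not the real FreqProxTermsWriterPerField):

    // Simplified model of the fixed flow.
    class OffsetStateSketch {
      int lastOffset; // stands in for postings.lastOffsets[termID]

      void newTerm(int offsetAccum, int tokenStart, int tokenEnd) {
        lastOffset = 0; // first occurrence of this term: reset the base
        writeOffsets(offsetAccum, tokenStart, tokenEnd);
      }

      void addTerm(int offsetAccum, int tokenStart, int tokenEnd) {
        writeOffsets(offsetAccum, tokenStart, tokenEnd);
      }

      private void writeOffsets(int offsetAccum, int tokenStart, int tokenEnd) {
        int startOffset = offsetAccum + tokenStart; // absolute within the document
        int endOffset = offsetAccum + tokenEnd;
        assert startOffset - lastOffset >= 0;       // delta can no longer go negative
        // writeVInt(startOffset - lastOffset); writeVInt(endOffset - startOffset);
        lastOffset = startOffset;
      }
    }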
TestPostingsOffsets.java

@@ -385,6 +385,22 @@ public class TestPostingsOffsets extends LuceneTestCase {
     dir.close();
   }

+  public void testAddFieldTwice() throws Exception {
+    Directory dir = newDirectory();
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
+    Document doc = new Document();
+    FieldType customType3 = new FieldType(TextField.TYPE_STORED);
+    customType3.setStoreTermVectors(true);
+    customType3.setStoreTermVectorPositions(true);
+    customType3.setStoreTermVectorOffsets(true);
+    customType3.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    doc.add(new Field("content3", "here is more content with aaa aaa aaa", customType3));
+    doc.add(new Field("content3", "here is more content with aaa aaa aaa", customType3));
+    iw.addDocument(doc);
+    iw.close();
+    dir.close(); // checkindex
+  }
+
   // NOTE: the next two tests aren't that good as we need an EvilToken...
   public void testNegativeOffsets() throws Exception {
     try {
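The new testAddFieldTwice adds the same value twice to one document; dir.close() then triggers CheckIndex on the RandomIndexWriter's directory, which would have tripped over the negative offset deltas before this fix. For reference, a hedged sketch of reading those offsets back (Lucene 4.x reader API assumed; expected values assume the default offset gap of 1, so the second value's offsets start at 38):

    // Hypothetical verification, inserted between iw.close() and dir.close();
    // not part of the committed test.
    IndexReader reader = DirectoryReader.open(dir);
    DocsAndPositionsEnum dp = MultiFields.getTermPositionsEnum(
        reader, MultiFields.getLiveDocs(reader), "content3", new BytesRef("aaa"));
    assertTrue(dp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    for (int i = 0; i < dp.freq(); i++) {
      dp.nextPosition();
      // first value: [26,29) [30,33) [34,37); second: [64,67) [68,71) [72,75)
      System.out.println("offset=[" + dp.startOffset() + "," + dp.endOffset() + ")");
    }
    reader.close();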
BaseTokenStreamTestCase.java

@@ -26,12 +26,22 @@ import java.io.StringReader;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Random;
 import java.util.Map;
 import java.util.HashMap;
+import java.util.Set;

 import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.Attribute;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.util.IOUtils;

@@ -384,13 +394,14 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     final boolean useCharFilter;
     final boolean simple;
     final boolean offsetsAreCorrect;
+    final RandomIndexWriter iw;

     // NOTE: not volatile because we don't want the tests to
     // add memory barriers (ie alter how threads
     // interact)... so this is just "best effort":
     public boolean failed;

-    AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) {
+    AnalysisThread(long seed, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) {
       this.seed = seed;
       this.a = a;
       this.iterations = iterations;

@@ -398,6 +409,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       this.useCharFilter = useCharFilter;
       this.simple = simple;
       this.offsetsAreCorrect = offsetsAreCorrect;
+      this.iw = iw;
     }

     @Override

@@ -406,7 +418,7 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       try {
         // see the part in checkRandomData where it replays the same text again
         // to verify reproducability/reuse: hopefully this would catch thread hazards.
-        checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
+        checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
         success = true;
       } catch (IOException e) {
         Rethrow.rethrow(e);

@@ -423,34 +435,88 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   public static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean simple, boolean offsetsAreCorrect) throws IOException {
     long seed = random.nextLong();
     boolean useCharFilter = random.nextBoolean();
-    checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
-    // now test with multiple threads: note we do the EXACT same thing we did before in each thread,
-    // so this should only really fail from another thread if its an actual thread problem
-    int numThreads = _TestUtil.nextInt(random, 2, 4);
-    AnalysisThread threads[] = new AnalysisThread[numThreads];
-    for (int i = 0; i < threads.length; i++) {
-      threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect);
+    Directory dir = null;
+    RandomIndexWriter iw = null;
+    if (rarely(random)) {
+      dir = newFSDirectory(_TestUtil.getTempDir("bttc"));
+      iw = new RandomIndexWriter(new Random(seed), dir, a);
     }
-    for (int i = 0; i < threads.length; i++) {
-      threads[i].start();
-    }
-    for (int i = 0; i < threads.length; i++) {
-      try {
-        threads[i].join();
-      } catch (InterruptedException e) {
-        throw new RuntimeException(e);
-      }
-    }
-    for (int i = 0; i < threads.length; i++) {
-      if (threads[i].failed) {
-        throw new RuntimeException("some thread(s) failed");
+    boolean success = false;
+    try {
+      checkRandomData(new Random(seed), a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+      // now test with multiple threads: note we do the EXACT same thing we did before in each thread,
+      // so this should only really fail from another thread if its an actual thread problem
+      int numThreads = _TestUtil.nextInt(random, 2, 4);
+      AnalysisThread threads[] = new AnalysisThread[numThreads];
+      for (int i = 0; i < threads.length; i++) {
+        threads[i] = new AnalysisThread(seed, a, iterations, maxWordLength, useCharFilter, simple, offsetsAreCorrect, iw);
+      }
+      for (int i = 0; i < threads.length; i++) {
+        threads[i].start();
+      }
+      for (int i = 0; i < threads.length; i++) {
+        try {
+          threads[i].join();
+        } catch (InterruptedException e) {
+          throw new RuntimeException(e);
+        }
+      }
+      for (int i = 0; i < threads.length; i++) {
+        if (threads[i].failed) {
+          throw new RuntimeException("some thread(s) failed");
+        }
+      }
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(iw, dir);
+      } else {
+        IOUtils.closeWhileHandlingException(iw, dir); // checkindex
       }
     }
   }

-  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect) throws IOException {
+  static final Set<String> doesntSupportOffsets = new HashSet<String>() {{
+    add("Lucene3x");
+    add("MockFixedIntBlock");
+    add("MockVariableIntBlock");
+    add("MockSep");
+    add("MockRandom");
+  }};
+
+  private static void checkRandomData(Random random, Analyzer a, int iterations, int maxWordLength, boolean useCharFilter, boolean simple, boolean offsetsAreCorrect, RandomIndexWriter iw) throws IOException {
+
     final LineFileDocs docs = new LineFileDocs(random);
+    Document doc = null;
+    Field field = null, currentField = null;
+    StringReader bogus = new StringReader("");
+    if (iw != null) {
+      doc = new Document();
+      FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+      if (random.nextBoolean()) {
+        ft.setStoreTermVectors(true);
+        ft.setStoreTermVectorOffsets(random.nextBoolean());
+        ft.setStoreTermVectorPositions(random.nextBoolean());
+      }
+      if (random.nextBoolean()) {
+        ft.setOmitNorms(true);
+      }
+      String pf = _TestUtil.getPostingsFormat("dummy");
+      boolean supportsOffsets = !doesntSupportOffsets.contains(pf);
+      switch(random.nextInt(4)) {
+        case 0: ft.setIndexOptions(IndexOptions.DOCS_ONLY); break;
+        case 1: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS); break;
+        case 2: ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); break;
+        default:
+          if (supportsOffsets && offsetsAreCorrect) {
+            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+          } else {
+            ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
+          }
+      }
+      currentField = field = new Field("dummy", bogus, ft);
+      doc.add(currentField);
+    }
+
     try {
       for (int i = 0; i < iterations; i++) {

@@ -481,7 +547,23 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       }

       try {
-        checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect);
+        checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, currentField);
+        if (iw != null) {
+          if (random.nextInt(7) == 0) {
+            // pile up a multivalued field
+            FieldType ft = field.fieldType();
+            currentField = new Field("dummy", bogus, ft);
+            doc.add(currentField);
+          } else {
+            iw.addDocument(doc);
+            if (doc.getFields().size() > 1) {
+              // back to 1 field
+              currentField = field;
+              doc.removeFields("dummy");
+              doc.add(currentField);
+            }
+          }
+        }
       } catch (Throwable t) {
         // TODO: really we should pass a random seed to
         // checkAnalysisConsistency then print it here too:

@@ -528,6 +610,10 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
   }

   public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect) throws IOException {
+    checkAnalysisConsistency(random, a, useCharFilter, text, offsetsAreCorrect, null);
+  }
+
+  private static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text, boolean offsetsAreCorrect, Field field) throws IOException {

     if (VERBOSE) {
       System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: get first token stream now text=" + text);

@@ -649,6 +735,8 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
     }
     reader = new StringReader(text);

+    long seed = random.nextLong();
+    random = new Random(seed);
     if (random.nextInt(30) == 7) {
       if (VERBOSE) {
         System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");

@@ -718,6 +806,20 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       assertTokenStreamContents(ts,
                                 tokens.toArray(new String[tokens.size()]));
     }
+
+    if (field != null) {
+      reader = new StringReader(text);
+      random = new Random(seed);
+      if (random.nextInt(30) == 7) {
+        if (VERBOSE) {
+          System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: indexing using spoon-feed reader");
+        }
+
+        reader = new MockReaderWrapper(random, reader);
+      }
+
+      field.setReaderValue(useCharFilter ? new MockCharFilter(reader, remainder) : reader);
+    }
   }

   private static String randomAnalysisString(Random random, int maxLength, boolean simple) {
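Callers of the public checkRandomData entry points are unaffected: the RandomIndexWriter path is chosen rarely() and closed via IOUtils whether or not the run succeeds. Typical usage in an analysis test stays as before (a sketch, not taken from this commit):

    public void testRandomStrings() throws Exception {
      // the rare indexing pass now also feeds each token stream through
      // IndexWriter, catching offset bugs like LUCENE-4139 via CheckIndex
      checkRandomData(random(), new MockAnalyzer(random()), 1000 * RANDOM_MULTIPLIER);
    }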