LUCENE-3576: add surrogates shuffle for 3.x term vectors, test surrogates in TestBackwardsCompatibility

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232974 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-01-18 17:57:05 +00:00
parent 3d66fa922a
commit 98e5328845
9 changed files with 173 additions and 80 deletions
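
Background for the change (not part of the patch itself): pre-4.0 indexes wrote term vector terms in UTF-16 code unit order, while the 4.0 APIs hand terms out in unicode code point order (the order their UTF-8 bytes compare in), and the two orders disagree once supplementary characters, i.e. surrogate pairs, are involved. A self-contained Java sketch of that disagreement; the class and variable names are illustrative only:

// SurrogateOrderDemo.java -- illustrative only, not part of this commit.
public class SurrogateOrderDemo {
  public static void main(String[] args) {
    String bmp  = "\uFB01";                               // U+FB01, a BMP char above the surrogate block
    String supp = new String(Character.toChars(0x1D50A)); // U+1D50A, stored as surrogate pair D835 DD0A

    // String.compareTo compares UTF-16 code units: the lead surrogate 0xD835 is
    // smaller than 0xFB01, so the supplementary char sorts FIRST in legacy 3.x order.
    System.out.println("UTF-16 order: supp < bmp? " + (supp.compareTo(bmp) < 0));                    // true

    // Comparing code points (the order UTF-8 bytes also give) reverses that:
    // 0x1D50A > 0xFB01, so the supplementary char sorts LAST in unicode order.
    System.out.println("code point order: supp > bmp? " + (supp.codePointAt(0) > bmp.codePointAt(0))); // true
  }
}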

TermVectorsWriter.java

@@ -19,6 +19,7 @@ package org.apache.lucene.codecs;
 import java.io.Closeable;
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.DocsEnum;
@@ -273,4 +274,8 @@ public abstract class TermVectorsWriter implements Closeable {
       assert termCount == numTerms;
     }
   }
+
+  /** Return the BytesRef Comparator used to sort terms
+   *  before feeding to this API. */
+  public abstract Comparator<BytesRef> getComparator() throws IOException;
 }

Lucene3xTermVectorsReader.java

@@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene3x;
  */
 
 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Map;
@@ -265,11 +266,13 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
   private class TVTerms extends Terms {
     private final int numTerms;
     private final long tvfFPStart;
+    private final boolean unicodeSortOrder;
 
     public TVTerms(long tvfFP) throws IOException {
       tvf.seek(tvfFP);
       numTerms = tvf.readVInt();
       tvfFPStart = tvf.getFilePointer();
+      unicodeSortOrder = sortTermsByUnicode();
     }
 
     @Override
@@ -283,7 +286,7 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
       } else {
         termsEnum = new TVTermsEnum();
       }
-      termsEnum.reset(numTerms, tvfFPStart);
+      termsEnum.reset(numTerms, tvfFPStart, unicodeSortOrder);
       return termsEnum;
     }
@@ -310,27 +313,32 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
     @Override
     public Comparator<BytesRef> getComparator() {
-      // TODO: really indexer hardwires
-      // this...? I guess codec could buffer and re-sort...
-      return BytesRef.getUTF8SortedAsUnicodeComparator();
+      if (unicodeSortOrder) {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      } else {
+        return BytesRef.getUTF8SortedAsUTF16Comparator();
+      }
     }
   }
 
+  static class TermAndPostings {
+    BytesRef term;
+    int freq;
+    int[] positions;
+    int[] startOffsets;
+    int[] endOffsets;
+  }
+
   private class TVTermsEnum extends TermsEnum {
+    private boolean unicodeSortOrder;
     private final IndexInput origTVF;
     private final IndexInput tvf;
     private int numTerms;
-    private int nextTerm;
-    private int freq;
-    private BytesRef lastTerm = new BytesRef();
-    private BytesRef term = new BytesRef();
+    private int currentTerm;
     private boolean storePositions;
     private boolean storeOffsets;
-    private long tvfFP;
-    private int[] positions;
-    private int[] startOffsets;
-    private int[] endOffsets;
+    private TermAndPostings[] termAndPostings;
 
     // NOTE: tvf is pre-positioned by caller
     public TVTermsEnum() throws IOException {
@@ -342,37 +350,81 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
       return tvf == origTVF;
     }
 
-    public void reset(int numTerms, long tvfFPStart) throws IOException {
+    public void reset(int numTerms, long tvfFPStart, boolean unicodeSortOrder) throws IOException {
       this.numTerms = numTerms;
-      nextTerm = 0;
+      currentTerm = -1;
       tvf.seek(tvfFPStart);
       final byte bits = tvf.readByte();
       storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
       storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
-      tvfFP = 1+tvfFPStart;
-      positions = null;
-      startOffsets = null;
-      endOffsets = null;
+      this.unicodeSortOrder = unicodeSortOrder;
+      readVectors();
+      if (unicodeSortOrder) {
+        Arrays.sort(termAndPostings, new Comparator<TermAndPostings>() {
+          public int compare(TermAndPostings left, TermAndPostings right) {
+            return left.term.compareTo(right.term);
+          }
+        });
+      }
+    }
+
+    private void readVectors() throws IOException {
+      termAndPostings = new TermAndPostings[numTerms];
+      BytesRef lastTerm = new BytesRef();
+      for (int i = 0; i < numTerms; i++) {
+        TermAndPostings t = new TermAndPostings();
+        BytesRef term = new BytesRef();
+        term.copyBytes(lastTerm);
+        final int start = tvf.readVInt();
+        final int deltaLen = tvf.readVInt();
+        term.length = start + deltaLen;
+        term.grow(term.length);
+        tvf.readBytes(term.bytes, start, deltaLen);
+        t.term = term;
+        int freq = tvf.readVInt();
+        t.freq = freq;
+        if (storePositions) {
+          int positions[] = new int[freq];
+          int pos = 0;
+          for(int posUpto=0;posUpto<freq;posUpto++) {
+            pos += tvf.readVInt();
+            positions[posUpto] = pos;
+          }
+          t.positions = positions;
+        }
+        if (storeOffsets) {
+          int startOffsets[] = new int[freq];
+          int endOffsets[] = new int[freq];
+          int offset = 0;
+          for(int posUpto=0;posUpto<freq;posUpto++) {
+            startOffsets[posUpto] = offset + tvf.readVInt();
+            offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt();
+          }
+          t.startOffsets = startOffsets;
+          t.endOffsets = endOffsets;
+        }
+        lastTerm.copyBytes(term);
+        termAndPostings[i] = t;
+      }
     }
 
     // NOTE: slow! (linear scan)
     @Override
-    public SeekStatus seekCeil(BytesRef text, boolean useCache)
-      throws IOException {
-      if (nextTerm != 0 && text.compareTo(term) < 0) {
-        nextTerm = 0;
-        tvf.seek(tvfFP);
-      }
-      while (next() != null) {
-        final int cmp = text.compareTo(term);
+    public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException {
+      Comparator<BytesRef> comparator = getComparator();
+      for (int i = 0; i < numTerms; i++) {
+        int cmp = comparator.compare(text, termAndPostings[i].term);
         if (cmp < 0) {
+          currentTerm = i;
           return SeekStatus.NOT_FOUND;
         } else if (cmp == 0) {
+          currentTerm = i;
           return SeekStatus.FOUND;
         }
       }
+      currentTerm = termAndPostings.length;
       return SeekStatus.END;
     }
@@ -383,47 +435,15 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
     @Override
     public BytesRef next() throws IOException {
-      if (nextTerm >= numTerms) {
+      if (++currentTerm >= numTerms) {
         return null;
       }
-      term.copyBytes(lastTerm);
-      final int start = tvf.readVInt();
-      final int deltaLen = tvf.readVInt();
-      term.length = start + deltaLen;
-      term.grow(term.length);
-      tvf.readBytes(term.bytes, start, deltaLen);
-      freq = tvf.readVInt();
-      if (storePositions) {
-        // TODO: we could maybe reuse last array, if we can
-        // somehow be careful about consumer never using two
-        // D&PEnums at once...
-        positions = new int[freq];
-        int pos = 0;
-        for(int posUpto=0;posUpto<freq;posUpto++) {
-          pos += tvf.readVInt();
-          positions[posUpto] = pos;
-        }
-      }
-      if (storeOffsets) {
-        startOffsets = new int[freq];
-        endOffsets = new int[freq];
-        int offset = 0;
-        for(int posUpto=0;posUpto<freq;posUpto++) {
-          startOffsets[posUpto] = offset + tvf.readVInt();
-          offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt();
-        }
-      }
-      lastTerm.copyBytes(term);
-      nextTerm++;
-      return term;
+      return term();
     }
 
     @Override
     public BytesRef term() {
-      return term;
+      return termAndPostings[currentTerm].term;
     }
 
     @Override
@@ -438,7 +458,7 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
     @Override
     public long totalTermFreq() {
-      return freq;
+      return termAndPostings[currentTerm].freq;
     }
 
     @Override
@@ -449,7 +469,7 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
       } else {
         docsEnum = new TVDocsEnum();
       }
-      docsEnum.reset(liveDocs, freq);
+      docsEnum.reset(liveDocs, termAndPostings[currentTerm]);
       return docsEnum;
     }
@@ -469,15 +489,17 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
       } else {
         docsAndPositionsEnum = new TVDocsAndPositionsEnum();
       }
-      docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
+      docsAndPositionsEnum.reset(liveDocs, termAndPostings[currentTerm]);
      return docsAndPositionsEnum;
     }
 
     @Override
     public Comparator<BytesRef> getComparator() {
-      // TODO: really indexer hardwires
-      // this...? I guess codec could buffer and re-sort...
-      return BytesRef.getUTF8SortedAsUnicodeComparator();
+      if (unicodeSortOrder) {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      } else {
+        return BytesRef.getUTF8SortedAsUTF16Comparator();
+      }
     }
   }
@@ -518,9 +540,9 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
       }
     }
 
-    public void reset(Bits liveDocs, int freq) {
+    public void reset(Bits liveDocs, TermAndPostings termAndPostings) {
       this.liveDocs = liveDocs;
-      this.freq = freq;
+      this.freq = termAndPostings.freq;
       this.doc = -1;
       didNext = false;
     }
@@ -569,11 +591,11 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
      }
    }
 
-    public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
+    public void reset(Bits liveDocs, TermAndPostings termAndPostings) {
       this.liveDocs = liveDocs;
-      this.positions = positions;
-      this.startOffsets = startOffsets;
-      this.endOffsets = endOffsets;
+      this.positions = termAndPostings.positions;
+      this.startOffsets = termAndPostings.startOffsets;
+      this.endOffsets = termAndPostings.endOffsets;
       this.doc = -1;
       didNext = false;
       nextPos = 0;
@@ -668,5 +690,14 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader {
       }
     }
   }
+
+  // If this returns true, we do the surrogates shuffle so that the
+  // terms are sorted by unicode sort order. This should be
+  // true when segments are used for "normal" searching;
+  // it's only false during testing, to create a pre-flex
+  // index, using the test-only PreFlexRW.
+  protected boolean sortTermsByUnicode() {
+    return true;
+  }
 }
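
Since reset() now materializes and (when asked to) re-sorts the whole termAndPostings array up front, the linear scan noted as slow in seekCeil could in principle become a binary search over the same array. A minimal sketch of that idea, not part of this commit; binarySearchSeekCeil is a hypothetical helper inside TVTermsEnum, reusing the fields and comparator shown above:

    // Hypothetical alternative to the linear-scan seekCeil above: binary search
    // the in-memory array, which is ordered by exactly the comparator we report.
    private SeekStatus binarySearchSeekCeil(BytesRef text) {
      final Comparator<BytesRef> comparator = getComparator();
      int lo = 0;
      int hi = numTerms - 1;
      while (lo <= hi) {
        final int mid = (lo + hi) >>> 1;
        final int cmp = comparator.compare(termAndPostings[mid].term, text);
        if (cmp < 0) {
          lo = mid + 1;
        } else if (cmp > 0) {
          hi = mid - 1;
        } else {
          currentTerm = mid;
          return SeekStatus.FOUND;
        }
      }
      if (lo == numTerms) {
        currentTerm = termAndPostings.length;
        return SeekStatus.END;
      }
      currentTerm = lo;  // first term greater than text
      return SeekStatus.NOT_FOUND;
    }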

Lucene40TermVectorsWriter.java

@@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene40;
  */
 
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.TermVectorsWriter;
@@ -365,4 +366,9 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
     IOUtils.close(tvx, tvd, tvf);
     tvx = tvd = tvf = null;
   }
+
+  @Override
+  public Comparator<BytesRef> getComparator() throws IOException {
+    return BytesRef.getUTF8SortedAsUnicodeComparator();
+  }
 }

SimpleTextTermVectorsWriter.java

@@ -18,6 +18,7 @@ package org.apache.lucene.codecs.simpletext;
  */
 
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.index.FieldInfo;
@@ -170,6 +171,11 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
     }
   }
 
+  @Override
+  public Comparator<BytesRef> getComparator() throws IOException {
+    return BytesRef.getUTF8SortedAsUnicodeComparator();
+  }
+
   private void write(String s) throws IOException {
     SimpleTextUtil.write(out, s, scratch);
   }

TermVectorsConsumerPerField.java

@@ -118,9 +118,7 @@ final class TermVectorsConsumerPerField extends TermsHashConsumerPerField {
     TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;
     final TermVectorsWriter tv = termsWriter.writer;
 
-    // TODO: we may want to make this sort in same order
-    // as Codec's terms dict?
-    final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
+    final int[] termIDs = termsHashPerField.sortPostings(tv.getComparator());
 
     tv.startField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets);

PreFlexRWTermVectorsFormat.java

@@ -19,10 +19,15 @@ package org.apache.lucene.codecs.preflexrw;
 import java.io.IOException;
 
+import org.apache.lucene.codecs.TermVectorsReader;
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsFormat;
+import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.LuceneTestCase;
 
 public class PreFlexRWTermVectorsFormat extends Lucene3xTermVectorsFormat {
@@ -30,4 +35,30 @@ public class PreFlexRWTermVectorsFormat extends Lucene3xTermVectorsFormat {
   public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
     return new PreFlexRWTermVectorsWriter(directory, segment, context);
   }
+
+  @Override
+  public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
+    return new Lucene3xTermVectorsReader(directory, segmentInfo, fieldInfos, context) {
+      @Override
+      protected boolean sortTermsByUnicode() {
+        // We carefully peek into the stack trace above us: if
+        // we are part of a "merge", we must sort by UTF16:
+        boolean unicodeSortOrder = true;
+
+        StackTraceElement[] trace = new Exception().getStackTrace();
+        for (int i = 0; i < trace.length; i++) {
+          //System.out.println(trace[i].getClassName());
+          if ("merge".equals(trace[i].getMethodName())) {
+            unicodeSortOrder = false;
+            if (LuceneTestCase.VERBOSE) {
+              System.out.println("NOTE: PreFlexRW codec: forcing legacy UTF16 vector term sort order");
+            }
+            break;
+          }
+        }
+
+        return unicodeSortOrder;
+      }
+    };
+  }
 }
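
The override above is the heart of the test-only codec: it peeks at the call stack to decide whether it is being driven by a merge (which must keep the legacy UTF-16 order) or by a normal read (which gets the unicode-order shuffle). The same trick in isolation, as a standalone sketch; calledFromMerge is a hypothetical name, not a method anywhere in the patch:

  // Illustrative only: true if any frame on the current call stack is a method
  // named "merge", the signal the test codec uses to keep legacy UTF-16 order.
  static boolean calledFromMerge() {
    for (StackTraceElement frame : new Exception().getStackTrace()) {
      if ("merge".equals(frame.getMethodName())) {
        return true;
      }
    }
    return false;
  }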

PreFlexRWTermVectorsWriter.java

@@ -18,6 +18,7 @@ package org.apache.lucene.codecs.preflexrw;
  */
 
 import java.io.IOException;
+import java.util.Comparator;
 
 import org.apache.lucene.codecs.TermVectorsWriter;
 import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader;
@@ -32,7 +33,6 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.StringHelper;
 
-// TODO: surrogates dance!
 public final class PreFlexRWTermVectorsWriter extends TermVectorsWriter {
   private final Directory directory;
   private final String segment;
@@ -213,4 +213,9 @@ public final class PreFlexRWTermVectorsWriter extends TermVectorsWriter {
     IOUtils.close(tvx, tvd, tvf);
     tvx = tvd = tvf = null;
   }
+
+  @Override
+  public Comparator<BytesRef> getComparator() throws IOException {
+    return BytesRef.getUTF8SortedAsUTF16Comparator();
+  }
 }

TestBackwardsCompatibility.java

@@ -735,4 +735,15 @@ public class TestBackwardsCompatibility extends LuceneTestCase {
     }
   }
 
+  public static final String surrogatesIndexName = "index.36.surrogates.zip";
+
+  public void testSurrogates() throws Exception {
+    File oldIndexDir = _TestUtil.getTempDir("surrogates");
+    _TestUtil.unzip(getDataFile(surrogatesIndexName), oldIndexDir);
+    Directory dir = newFSDirectory(oldIndexDir);
+    // TODO: more tests
+    _TestUtil.checkIndex(dir);
+    dir.close();
+  }
 }