diff --git a/lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java b/lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java index 7d6086f6163..e22e1740d0a 100644 --- a/lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java +++ b/lucene/src/java/org/apache/lucene/codecs/TermVectorsWriter.java @@ -19,6 +19,7 @@ package org.apache.lucene.codecs; import java.io.Closeable; import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; @@ -273,4 +274,8 @@ public abstract class TermVectorsWriter implements Closeable { assert termCount == numTerms; } } + + /** Return the BytesRef Comparator used to sort terms + * before feeding to this API. */ + public abstract Comparator getComparator() throws IOException; } diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java index 55b9a5c66c2..25dd1f27a0d 100644 --- a/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java +++ b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xTermVectorsReader.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene3x; */ import java.io.IOException; +import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.Map; @@ -265,11 +266,13 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader { private class TVTerms extends Terms { private final int numTerms; private final long tvfFPStart; + private final boolean unicodeSortOrder; public TVTerms(long tvfFP) throws IOException { tvf.seek(tvfFP); numTerms = tvf.readVInt(); tvfFPStart = tvf.getFilePointer(); + unicodeSortOrder = sortTermsByUnicode(); } @Override @@ -283,7 +286,7 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader { } else { termsEnum = new TVTermsEnum(); } - termsEnum.reset(numTerms, tvfFPStart); + termsEnum.reset(numTerms, tvfFPStart, unicodeSortOrder); return termsEnum; } @@ -310,27 +313,32 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader { @Override public Comparator getComparator() { - // TODO: really indexer hardwires - // this...? I guess codec could buffer and re-sort... - return BytesRef.getUTF8SortedAsUnicodeComparator(); + if (unicodeSortOrder) { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } else { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } } + static class TermAndPostings { + BytesRef term; + int freq; + int[] positions; + int[] startOffsets; + int[] endOffsets; + } + private class TVTermsEnum extends TermsEnum { + private boolean unicodeSortOrder; private final IndexInput origTVF; private final IndexInput tvf; private int numTerms; - private int nextTerm; - private int freq; - private BytesRef lastTerm = new BytesRef(); - private BytesRef term = new BytesRef(); + private int currentTerm; private boolean storePositions; private boolean storeOffsets; - private long tvfFP; - - private int[] positions; - private int[] startOffsets; - private int[] endOffsets; + + private TermAndPostings[] termAndPostings; // NOTE: tvf is pre-positioned by caller public TVTermsEnum() throws IOException { @@ -342,37 +350,81 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader { return tvf == origTVF; } - public void reset(int numTerms, long tvfFPStart) throws IOException { + public void reset(int numTerms, long tvfFPStart, boolean unicodeSortOrder) throws IOException { this.numTerms = numTerms; - nextTerm = 0; + currentTerm = -1; tvf.seek(tvfFPStart); final byte bits = tvf.readByte(); storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; - tvfFP = 1+tvfFPStart; - positions = null; - startOffsets = null; - endOffsets = null; + this.unicodeSortOrder = unicodeSortOrder; + readVectors(); + if (unicodeSortOrder) { + Arrays.sort(termAndPostings, new Comparator() { + public int compare(TermAndPostings left, TermAndPostings right) { + return left.term.compareTo(right.term); + } + }); + } + } + + private void readVectors() throws IOException { + termAndPostings = new TermAndPostings[numTerms]; + BytesRef lastTerm = new BytesRef(); + for (int i = 0; i < numTerms; i++) { + TermAndPostings t = new TermAndPostings(); + BytesRef term = new BytesRef(); + term.copyBytes(lastTerm); + final int start = tvf.readVInt(); + final int deltaLen = tvf.readVInt(); + term.length = start + deltaLen; + term.grow(term.length); + tvf.readBytes(term.bytes, start, deltaLen); + t.term = term; + int freq = tvf.readVInt(); + t.freq = freq; + + if (storePositions) { + int positions[] = new int[freq]; + int pos = 0; + for(int posUpto=0;posUpto comparator = getComparator(); + for (int i = 0; i < numTerms; i++) { + int cmp = comparator.compare(text, termAndPostings[i].term); if (cmp < 0) { + currentTerm = i; return SeekStatus.NOT_FOUND; } else if (cmp == 0) { + currentTerm = i; return SeekStatus.FOUND; } } - + currentTerm = termAndPostings.length; return SeekStatus.END; } @@ -383,47 +435,15 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader { @Override public BytesRef next() throws IOException { - if (nextTerm >= numTerms) { + if (++currentTerm >= numTerms) { return null; } - term.copyBytes(lastTerm); - final int start = tvf.readVInt(); - final int deltaLen = tvf.readVInt(); - term.length = start + deltaLen; - term.grow(term.length); - tvf.readBytes(term.bytes, start, deltaLen); - freq = tvf.readVInt(); - - if (storePositions) { - // TODO: we could maybe reuse last array, if we can - // somehow be careful about consumer never using two - // D&PEnums at once... - positions = new int[freq]; - int pos = 0; - for(int posUpto=0;posUpto getComparator() { - // TODO: really indexer hardwires - // this...? I guess codec could buffer and re-sort... - return BytesRef.getUTF8SortedAsUnicodeComparator(); + if (unicodeSortOrder) { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } else { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } } @@ -518,9 +540,9 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader { } } - public void reset(Bits liveDocs, int freq) { + public void reset(Bits liveDocs, TermAndPostings termAndPostings) { this.liveDocs = liveDocs; - this.freq = freq; + this.freq = termAndPostings.freq; this.doc = -1; didNext = false; } @@ -569,11 +591,11 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader { } } - public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) { + public void reset(Bits liveDocs, TermAndPostings termAndPostings) { this.liveDocs = liveDocs; - this.positions = positions; - this.startOffsets = startOffsets; - this.endOffsets = endOffsets; + this.positions = termAndPostings.positions; + this.startOffsets = termAndPostings.startOffsets; + this.endOffsets = termAndPostings.endOffsets; this.doc = -1; didNext = false; nextPos = 0; @@ -668,5 +690,14 @@ public class Lucene3xTermVectorsReader extends TermVectorsReader { } } } + + // If this returns, we do the surrogates shuffle so that the + // terms are sorted by unicode sort order. This should be + // true when segments are used for "normal" searching; + // it's only false during testing, to create a pre-flex + // index, using the test-only PreFlexRW. + protected boolean sortTermsByUnicode() { + return true; + } } diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java index 0ccb62efac5..fc002ceebd6 100644 --- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java +++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40TermVectorsWriter.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.lucene40; */ import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; @@ -365,4 +366,9 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter { IOUtils.close(tvx, tvd, tvf); tvx = tvd = tvf = null; } + + @Override + public Comparator getComparator() throws IOException { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } } diff --git a/lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsWriter.java b/lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsWriter.java index 7257ac1622d..cabb56a08aa 100644 --- a/lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsWriter.java +++ b/lucene/src/java/org/apache/lucene/codecs/simpletext/SimpleTextTermVectorsWriter.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.simpletext; */ import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.index.FieldInfo; @@ -170,6 +171,11 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter { } } + @Override + public Comparator getComparator() throws IOException { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } + private void write(String s) throws IOException { SimpleTextUtil.write(out, s, scratch); } diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java b/lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java index 7aa131ba9ec..729839105cf 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsConsumerPerField.java @@ -118,9 +118,7 @@ final class TermVectorsConsumerPerField extends TermsHashConsumerPerField { TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray; final TermVectorsWriter tv = termsWriter.writer; - // TODO: we may want to make this sort in same order - // as Codec's terms dict? - final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator()); + final int[] termIDs = termsHashPerField.sortPostings(tv.getComparator()); tv.startField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets); diff --git a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsFormat.java b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsFormat.java index d78e0a99d9b..8646f97c158 100644 --- a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsFormat.java +++ b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsFormat.java @@ -19,10 +19,15 @@ package org.apache.lucene.codecs.preflexrw; import java.io.IOException; +import org.apache.lucene.codecs.TermVectorsReader; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsFormat; +import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.LuceneTestCase; public class PreFlexRWTermVectorsFormat extends Lucene3xTermVectorsFormat { @@ -30,4 +35,30 @@ public class PreFlexRWTermVectorsFormat extends Lucene3xTermVectorsFormat { public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException { return new PreFlexRWTermVectorsWriter(directory, segment, context); } + + @Override + public TermVectorsReader vectorsReader(Directory directory, SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException { + return new Lucene3xTermVectorsReader(directory, segmentInfo, fieldInfos, context) { + @Override + protected boolean sortTermsByUnicode() { + // We carefully peek into stack track above us: if + // we are part of a "merge", we must sort by UTF16: + boolean unicodeSortOrder = true; + + StackTraceElement[] trace = new Exception().getStackTrace(); + for (int i = 0; i < trace.length; i++) { + //System.out.println(trace[i].getClassName()); + if ("merge".equals(trace[i].getMethodName())) { + unicodeSortOrder = false; + if (LuceneTestCase.VERBOSE) { + System.out.println("NOTE: PreFlexRW codec: forcing legacy UTF16 vector term sort order"); + } + break; + } + } + + return unicodeSortOrder; + } + }; + } } diff --git a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsWriter.java b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsWriter.java index bf30f4178ce..1bb97d32b87 100644 --- a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsWriter.java +++ b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWTermVectorsWriter.java @@ -18,6 +18,7 @@ package org.apache.lucene.codecs.preflexrw; */ import java.io.IOException; +import java.util.Comparator; import org.apache.lucene.codecs.TermVectorsWriter; import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader; @@ -32,7 +33,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.StringHelper; -// TODO: surrogates dance! public final class PreFlexRWTermVectorsWriter extends TermVectorsWriter { private final Directory directory; private final String segment; @@ -213,4 +213,9 @@ public final class PreFlexRWTermVectorsWriter extends TermVectorsWriter { IOUtils.close(tvx, tvd, tvf); tvx = tvd = tvf = null; } + + @Override + public Comparator getComparator() throws IOException { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index 78933575ea8..6def7697165 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -734,5 +734,16 @@ public class TestBackwardsCompatibility extends LuceneTestCase { dir.close(); } } + + public static final String surrogatesIndexName = "index.36.surrogates.zip"; + + public void testSurrogates() throws Exception { + File oldIndexDir = _TestUtil.getTempDir("surrogates"); + _TestUtil.unzip(getDataFile(surrogatesIndexName), oldIndexDir); + Directory dir = newFSDirectory(oldIndexDir); + // TODO: more tests + _TestUtil.checkIndex(dir); + dir.close(); + } } diff --git a/lucene/src/test/org/apache/lucene/index/index.36.surrogates.zip b/lucene/src/test/org/apache/lucene/index/index.36.surrogates.zip new file mode 100644 index 00000000000..6bd7f2027d1 Binary files /dev/null and b/lucene/src/test/org/apache/lucene/index/index.36.surrogates.zip differ