diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java index 72424d88495..f323dbab19f 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java @@ -62,7 +62,7 @@ public class PreFlexCodec extends Codec { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, true); + return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor); } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index b12eb383236..e286d981ae1 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -58,19 +58,11 @@ public class PreFlexFields extends FieldsProducer { private final Directory dir; private final int readBufferSize; private Directory cfsReader; - private final boolean unicodeSortOrder; - // If unicodeSortOrder is true, we do the surrogates dance - // so that the terms are sorted by unicode sort order. - // This should be true when segments are used for "normal" - // searching; it's only false during testing, to create a - // pre-flex index, using the preflexrw codec under - // src/test. - public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor, boolean unicodeSortOrder) + public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor) throws IOException { si = info; - this.unicodeSortOrder = unicodeSortOrder; // NOTE: we must always load terms index, even for // "sequential" scan during merging, because what is @@ -114,6 +106,15 @@ public class PreFlexFields extends FieldsProducer { this.dir = dir; } + // If this returns, we do the surrogates dance so that the + // terms are sorted by unicode sort order. This should be + // true when segments are used for "normal" searching; + // it's only false during testing, to create a pre-flex + // index, using the test-only PreFlexRW. + protected boolean sortTermsByUnicode() { + return true; + } + static void files(Directory dir, SegmentInfo info, Collection files) throws IOException { files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_EXTENSION)); files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_INDEX_EXTENSION)); @@ -241,7 +242,7 @@ public class PreFlexFields extends FieldsProducer { public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - if (unicodeSortOrder) { + if (sortTermsByUnicode()) { return BytesRef.getUTF8SortedAsUnicodeComparator(); } else { return BytesRef.getUTF8SortedAsUTF16Comparator(); @@ -692,6 +693,8 @@ public class PreFlexFields extends FieldsProducer { } } + private boolean unicodeSortOrder; + void reset(FieldInfo fieldInfo) throws IOException { //System.out.println("pff.reset te=" + termEnum); this.fieldInfo = fieldInfo; @@ -705,6 +708,8 @@ public class PreFlexFields extends FieldsProducer { } skipNext = true; + unicodeSortOrder = sortTermsByUnicode(); + final Term t = termEnum.term(); if (t != null && t.field() == fieldInfo.name) { newSuffixStart = 0; diff --git a/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java b/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java index da92b5f1692..3c0fe1c387a 100644 --- a/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java +++ b/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java @@ -25,7 +25,6 @@ import java.util.List; import java.util.Map; import java.util.Random; import org.apache.lucene.util.*; -import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; import junit.framework.Assert; @@ -263,10 +262,7 @@ public class TestStressIndexing2 extends MultiCodecTestCase { } public static void verifyEquals(Random r, IndexReader r1, Directory dir2, String idField) throws Throwable { - // When we're testing w/ PreFlex codec, we must open - // this reader with UTF16 terms since incoming NRT - // reader is sorted this way: - IndexReader r2 = IndexReader.open(dir2, null, true, _TestUtil.nextInt(r, 1, 3), _TestUtil.alwaysCodec(new PreFlexRWCodec("utf16"))); + IndexReader r2 = IndexReader.open(dir2); verifyEquals(r1, r2, idField); r2.close(); } diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java index 5ab7deb33d5..7af84d23b11 100644 --- a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java +++ b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java @@ -63,10 +63,14 @@ public class TestSurrogates extends LuceneTestCaseJ4 { private String getRandomString(Random r) { String s; - if (r.nextInt(3) == 1) { - s = makeDifficultRandomUnicodeString(r); + if (r.nextInt(5) == 1) { + if (r.nextInt(3) == 1) { + s = makeDifficultRandomUnicodeString(r); + } else { + s = _TestUtil.randomUnicodeString(r); + } } else { - s = _TestUtil.randomUnicodeString(r); + s = _TestUtil.randomRealisticUnicodeString(r); } return s; } @@ -272,7 +276,7 @@ public class TestSurrogates extends LuceneTestCaseJ4 { RandomIndexWriter w = new RandomIndexWriter(r, dir, newIndexWriterConfig(r, TEST_VERSION_CURRENT, - new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec(null)))); + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec()))); final int numField = _TestUtil.nextInt(r, 2, 5); diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java index 242e50370ab..653025e2f3a 100644 --- a/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java +++ b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java @@ -25,6 +25,7 @@ import org.apache.lucene.index.codecs.preflex.PreFlexCodec; import org.apache.lucene.index.codecs.preflex.PreFlexFields; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.util.LuceneTestCaseJ4; /** Codec, only for testing, that can write and read the * pre-flex index format. @@ -33,20 +34,14 @@ import org.apache.lucene.index.codecs.FieldsProducer; */ public class PreFlexRWCodec extends PreFlexCodec { - private final String termSortOrder; - - // termSortOrder should be null (dynamically deteremined - // by stack), "codepoint" or "utf16" - public PreFlexRWCodec(String termSortOrder) { + public PreFlexRWCodec() { // NOTE: we impersonate the PreFlex codec so that it can // read the segments we write! super(); - this.termSortOrder = termSortOrder; } @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - System.out.println("PFW"); return new PreFlexFieldsWriter(state); } @@ -56,23 +51,27 @@ public class PreFlexRWCodec extends PreFlexCodec { // Whenever IW opens readers, eg for merging, we have to // keep terms order in UTF16: - boolean unicodeSortOrder; - if (termSortOrder == null) { - unicodeSortOrder = true; + return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor) { + @Override + protected boolean sortTermsByUnicode() { + // We carefully peek into stack track above us: if + // we are part of a "merge", we must sort by UTF16: + boolean unicodeSortOrder = true; - StackTraceElement[] trace = new Exception().getStackTrace(); - for (int i = 0; i < trace.length; i++) { - //System.out.println(trace[i].getClassName()); - if ("org.apache.lucene.index.IndexWriter".equals(trace[i].getClassName())) { - unicodeSortOrder = false; - break; + StackTraceElement[] trace = new Exception().getStackTrace(); + for (int i = 0; i < trace.length; i++) { + //System.out.println(trace[i].getClassName()); + if ("merge".equals(trace[i].getMethodName())) { + unicodeSortOrder = false; + if (LuceneTestCaseJ4.VERBOSE) { + System.out.println("NOTE: PreFlexRW codec: forcing legacy UTF16 term sort order"); + } + break; + } } - } - //System.out.println("PRW: " + unicodeSortOrder); - } else { - unicodeSortOrder = termSortOrder.equals("codepoint"); - } - return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, unicodeSortOrder); + return unicodeSortOrder; + } + }; } } diff --git a/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java b/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java index eefad0347d7..ef46ef37447 100644 --- a/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java @@ -128,7 +128,7 @@ public abstract class LuceneTestCase extends TestCase { // test-only PreFlexRW codec (since core PreFlex can // only read segments): if (codec.equals("PreFlex")) { - CodecProvider.getDefault().register(new PreFlexRWCodec(null)); + CodecProvider.getDefault().register(new PreFlexRWCodec()); } CodecProvider.setDefaultCodec(codec); } @@ -158,7 +158,7 @@ public abstract class LuceneTestCase extends TestCase { BooleanQuery.setMaxClauseCount(savedBoolMaxClauseCount); // Restore read-only PreFlex codec: if (codec.equals("PreFlex")) { - CodecProvider.getDefault().unregister(new PreFlexRWCodec(null)); + CodecProvider.getDefault().unregister(new PreFlexRWCodec()); CodecProvider.getDefault().register(new PreFlexCodec()); } CodecProvider.setDefaultCodec(savedDefaultCodec); diff --git a/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java b/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java index bcfb3a2fc2a..e6d033a018a 100644 --- a/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java +++ b/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java @@ -152,7 +152,7 @@ public class LuceneTestCaseJ4 { // test-only PreFlexRW codec (since core PreFlex can // only read segments): if (codec.equals("PreFlex")) { - CodecProvider.getDefault().register(new PreFlexRWCodec(null)); + CodecProvider.getDefault().register(new PreFlexRWCodec()); } CodecProvider.setDefaultCodec(codec); } @@ -161,7 +161,7 @@ public class LuceneTestCaseJ4 { public static void afterClassLuceneTestCaseJ4() { // Restore read-only PreFlex codec: if (codec.equals("PreFlex")) { - CodecProvider.getDefault().unregister(new PreFlexRWCodec(null)); + CodecProvider.getDefault().unregister(new PreFlexRWCodec()); CodecProvider.getDefault().register(new PreFlexCodec()); } CodecProvider.setDefaultCodec(savedDefaultCodec);