diff --git a/lucene/common-build.xml b/lucene/common-build.xml index 363313421bc..752c867d9e5 100644 --- a/lucene/common-build.xml +++ b/lucene/common-build.xml @@ -56,6 +56,7 @@ + @@ -434,6 +435,8 @@ + + diff --git a/lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java b/lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java index 82fa02060c5..9a0faed552b 100644 --- a/lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java +++ b/lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TopDocs; @@ -107,8 +108,8 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase { RAMDirectory ramdir = new RAMDirectory(); Analyzer analyzer = randomAnalyzer(); - IndexWriter writer = new IndexWriter(ramdir, analyzer, - IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter writer = new IndexWriter(ramdir, + new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); Field field1 = new Field("foo", fooField.toString(), Field.Store.NO, Field.Index.ANALYZED); Field field2 = new Field("term", termField.toString(), Field.Store.NO, Field.Index.ANALYZED); diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java index 5386e51bee9..aec2481c41f 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java @@ -18,13 +18,13 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.Random; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -38,8 +38,7 @@ public class BooleanFilterTest extends LuceneTestCase { protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, new IndexWriterConfig( - TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, new MockAnalyzer(MockTokenizer.WHITESPACE, false)); //Add series of docs with filterable fields : acces rights, prices, dates and "in-stock" flags addDoc(writer, "admin guest", "010", "20040101","Y"); diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java index ee930b29902..2e4bb29cb87 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/ChainedFilterTest.java @@ -21,11 +21,9 @@ import 
java.util.Calendar; import java.util.GregorianCalendar; import java.util.Random; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; @@ -63,9 +61,7 @@ public class ChainedFilterTest extends LuceneTestCase { super.setUp(); random = newRandom(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); - + RandomIndexWriter writer = new RandomIndexWriter(random, directory); Calendar cal = new GregorianCalendar(); cal.clear(); cal.setTimeInMillis(1041397200000L); // 2003 January 01 @@ -200,8 +196,7 @@ public class ChainedFilterTest extends LuceneTestCase { public void testWithCachingFilter() throws Exception { Directory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, dir); IndexReader reader = writer.getReader(); writer.close(); diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java index c06f26849f0..e41ebd7282b 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java @@ -20,11 +20,9 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.HashSet; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.DocsEnum; @@ -44,8 +42,7 @@ public class DuplicateFilterTest extends LuceneTestCase { protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); //Add series of docs with filterable fields : url, text and dates flags addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101"); diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java index fd2881a75ea..96fa522b78b 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java @@ -25,7 +25,6 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -41,8 +40,7 @@ public class 
FuzzyLikeThisQueryTest extends LuceneTestCase { protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); //Add series of docs with misspelt names addDoc(writer, "jonathon smythe","1"); diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java index 9050d78db07..9ac12257d18 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java @@ -19,11 +19,9 @@ package org.apache.lucene.search; import java.util.HashSet; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -53,8 +51,7 @@ public class TermsFilterTest extends LuceneTestCase { public void testMissingTerms() throws Exception { String fieldName="field1"; RAMDirectory rd=new RAMDirectory(); - RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter w = new RandomIndexWriter(newRandom(), rd); for (int i = 0; i < 100; i++) { Document doc=new Document(); int term=i*10; //terms are units of 10; diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java index b4c6aad6ba8..9dad9d98edd 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/regex/TestRegexQuery.java @@ -20,10 +20,8 @@ package org.apache.lucene.search.regex; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.IndexSearcher; @@ -44,8 +42,7 @@ public class TestRegexQuery extends LuceneTestCase { protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); Document doc = new Document(); doc.add(new Field(FN, "the quick brown fox jumps over the lazy dog", Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(doc); diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java index 92f5d24e1ec..d36893a8cb7 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java +++ 
b/lucene/contrib/queries/src/test/org/apache/lucene/search/similar/TestMoreLikeThis.java @@ -28,7 +28,6 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; @@ -46,8 +45,7 @@ public class TestMoreLikeThis extends LuceneTestCase { protected void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); // Add series of docs with specific information for MoreLikeThis addDoc(writer, "lucene"); diff --git a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java index 19d1f5072db..c2a68c5ba0e 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java @@ -116,7 +116,7 @@ public final class MultiTermsEnum extends TermsEnum { // different TermComps final Comparator<BytesRef> subTermComp = termsEnumIndex.termsEnum.getComparator(); if (subTermComp != null && !subTermComp.equals(termComp)) { - throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge"); + throw new IllegalStateException("sub-readers have different BytesRef.Comparators: " + subTermComp + " vs " + termComp + "; cannot merge"); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java index 71e6c8519ea..3eb695cf6bc 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java @@ -47,13 +47,12 @@ public abstract class CodecProvider { private static String defaultCodec = "Standard"; - public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock"}; + public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock", "PreFlex"}; public void register(Codec codec) { if (codec.name == null) { throw new IllegalArgumentException("code.name is null"); } - if (!codecs.containsKey(codec.name)) { codecs.put(codec.name, codec); codec.getExtensions(knownExtensions); @@ -61,6 +60,21 @@ public abstract class CodecProvider { throw new IllegalArgumentException("codec '" + codec.name + "' is already registered as a different codec instance"); } } + + /** @lucene.internal */ + public void unregister(Codec codec) { + if (codec.name == null) { + throw new IllegalArgumentException("codec.name is null"); + } + if (codecs.containsKey(codec.name)) { + Codec c = codecs.get(codec.name); + if (codec == c) { + codecs.remove(codec.name); + } else { + throw new IllegalArgumentException("codec '" + codec.name + "' is being impersonated by a different codec instance!!!"); + } + } + } public Collection<String> getAllExtensions() { return knownExtensions; @@ -111,8 +125,5 @@ class DefaultCodecProvider extends CodecProvider { @Override public Codec getWriter(SegmentWriteState state) { return lookup(CodecProvider.getDefaultCodec()); - //return lookup("Pulsing"); - //return lookup("Sep"); - //return lookup("IntBlock"); }
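[review note] The new CodecProvider.unregister is instance-checked: it removes a codec only when the exact registered instance is passed back, which lets a test temporarily swap a codec in and then restore the provider. A minimal usage sketch, assuming getWriter(SegmentWriteState) is the provider's abstract method (as the DefaultCodecProvider override suggests) and using a hypothetical MockFixedCodec test codec:

    CodecProvider provider = new CodecProvider() {
      @Override
      public Codec getWriter(SegmentWriteState state) {
        return lookup("MockFixed"); // hypothetical codec name
      }
    };
    Codec mock = new MockFixedCodec(); // hypothetical; its codec.name is "MockFixed"
    provider.register(mock);
    try {
      // ... index and search against this provider ...
    } finally {
      provider.unregister(mock); // same instance: removed cleanly
      // provider.unregister(new MockFixedCodec()) would throw IllegalArgumentException
    }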
-} \ No newline at end of file +} diff --git a/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java index 0c0bcf86569..8389df02600 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.Fields; import org.apache.lucene.index.FieldsEnum; import java.io.IOException; +import java.io.Closeable; /** Abstract API that consumes terms, doc, freq, prox and * payloads postings. Concrete implementations of this @@ -30,7 +31,7 @@ import java.io.IOException; * * @lucene.experimental */ -public abstract class FieldsConsumer { +public abstract class FieldsConsumer implements Closeable { /** Add a new field */ public abstract TermsConsumer addField(FieldInfo field) throws IOException; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java index 0c67b81f2d1..f323dbab19f 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java @@ -40,16 +40,16 @@ import org.apache.lucene.index.codecs.FieldsProducer; public class PreFlexCodec extends Codec { /** Extension of terms file */ - static final String TERMS_EXTENSION = "tis"; + public static final String TERMS_EXTENSION = "tis"; /** Extension of terms index file */ - static final String TERMS_INDEX_EXTENSION = "tii"; + public static final String TERMS_INDEX_EXTENSION = "tii"; /** Extension of freq postings file */ - static final String FREQ_EXTENSION = "frq"; + public static final String FREQ_EXTENSION = "frq"; /** Extension of prox postings file */ - static final String PROX_EXTENSION = "prx"; + public static final String PROX_EXTENSION = "prx"; public PreFlexCodec() { name = "PreFlex"; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index f5d8e74eda3..9351a3324dc 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -40,12 +40,11 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.ArrayUtil; /** Exposes flex API on a pre-flex index, as a codec. * @lucene.experimental */ public class PreFlexFields extends FieldsProducer { - + private static final boolean DEBUG_SURROGATES = false; public TermInfosReader tis; @@ -60,7 +59,7 @@ public class PreFlexFields extends FieldsProducer { private final int readBufferSize; private Directory cfsReader; - PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor) + public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor) throws IOException { si = info; @@ -107,6 +106,15 @@ public class PreFlexFields extends FieldsProducer { this.dir = dir; } + // If this returns true, we do the surrogates dance so that the + // terms are sorted by unicode sort order. This should be + // true when segments are used for "normal" searching; + // it's only false during testing, to create a pre-flex + // index, using the test-only PreFlexRW.
+ protected boolean sortTermsByUnicode() { + return true; + } + static void files(Directory dir, SegmentInfo info, Collection files) throws IOException { files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_EXTENSION)); files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_INDEX_EXTENSION)); @@ -182,6 +190,12 @@ public class PreFlexFields extends FieldsProducer { if (cfsReader != null) { cfsReader.close(); } + if (freqStream != null) { + freqStream.close(); + } + if (proxStream != null) { + proxStream.close(); + } } private class PreFlexFieldsEnum extends FieldsEnum { @@ -228,7 +242,11 @@ public class PreFlexFields extends FieldsProducer { public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - return BytesRef.getUTF8SortedAsUnicodeComparator(); + if (sortTermsByUnicode()) { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } else { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } } @@ -238,237 +256,475 @@ public class PreFlexFields extends FieldsProducer { private boolean skipNext; private BytesRef current; - private int[] surrogateSeekPending = new int[1]; - private boolean[] surrogateDidSeekBack = new boolean[1]; - private int surrogateSeekUpto; - private char[] pendingPrefix; - private SegmentTermEnum seekTermEnum; private Term protoTerm; + + private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0; + private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee; + + // Returns true if the unicode char is "after" the + // surrogates in UTF16, ie >= U+E000 and <= U+FFFF: + private final boolean isHighBMPChar(byte[] b, int idx) { + return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD; + } + + // Returns true if the unicode char in the UTF8 byte + // sequence starting at idx encodes a char outside of + // BMP (ie what would be a surrogate pair in UTF16): + private final boolean isNonBMPChar(byte[] b, int idx) { + return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD; + } + + private final byte[] scratch = new byte[4]; + private final BytesRef prevTerm = new BytesRef(); + private final BytesRef scratchTerm = new BytesRef(); private int newSuffixStart; + // Swap in S, in place of E: + private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException { + final int savLength = term.length; + + assert term.offset == 0; + + // The 3 bytes starting at downTo make up 1 + // unicode character: + assert isHighBMPChar(term.bytes, pos); + + // NOTE: we cannot make this assert, because + // AutomatonQuery legitimately sends us malformed UTF8 + // (eg the UTF8 bytes with just 0xee) + // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString(); + + // Save the bytes && length, since we need to + // restore this if seek "back" finds no matching + // terms + if (term.bytes.length < 4+pos) { + term.grow(4+pos); + } + + scratch[0] = term.bytes[pos]; + scratch[1] = term.bytes[pos+1]; + scratch[2] = term.bytes[pos+2]; + + term.bytes[pos] = (byte) 0xf0; + term.bytes[pos+1] = (byte) 0x90; + term.bytes[pos+2] = (byte) 0x80; + term.bytes[pos+3] = (byte) 0x80; + term.length = 4+pos; + + if (DEBUG_SURROGATES) { + System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString())); + } + + // Seek "back": + getTermsDict().seekEnum(te, protoTerm.createTerm(term)); + + // Test if the term we seek'd to in fact found a + // surrogate 
pair at the same position as the E: + Term t2 = te.term(); + + // Cannot be null (or move to next field) because at + // "worst" it'd seek to the same term we are on now, + // unless we are being called from seek + if (t2 == null || t2.field() != fieldInfo.name) { + return false; + } + + if (DEBUG_SURROGATES) { + System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text())); + } + + // Now test if prefix is identical and we found + // a non-BMP char at the same position: + BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + + boolean matches; + if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) { + matches = true; + for(int i=0;i limit) { + + if (isHighBMPChar(prevTerm.bytes, downTo)) { + + if (DEBUG_SURROGATES) { + System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length); + } + + if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) { + // TODO: more efficient seek? + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + //newSuffixStart = downTo+4; + newSuffixStart = downTo; + scratchTerm.copy(termEnum.term().bytes()); + didSeek = true; + if (DEBUG_SURROGATES) { + System.out.println(" seek!"); + } + break; + } else { + if (DEBUG_SURROGATES) { + System.out.println(" no seek"); + } + } + } + + // Shorten prevTerm in place so that we don't redo + // this loop if we come back here: + if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) { + prevTerm.length = downTo; + } + + downTo--; + } + + return didSeek; + } + + // Look for seek type 3 ("pop"): if the delta from + // prev -> current was replacing an S with an E, + // we must now seek to beyond that E. This seek + // "finishes" the dance at this character + // position. + private boolean doPop() throws IOException { + + if (DEBUG_SURROGATES) { + System.out.println(" try pop"); + } + + assert newSuffixStart <= prevTerm.length; + assert newSuffixStart < scratchTerm.length || newSuffixStart == 0; + + if (prevTerm.length > newSuffixStart && + isNonBMPChar(prevTerm.bytes, newSuffixStart) && + isHighBMPChar(scratchTerm.bytes, newSuffixStart)) { + + // Seek type 2 -- put 0xFF at this position: + scratchTerm.bytes[newSuffixStart] = (byte) 0xff; + scratchTerm.length = newSuffixStart+1; + + if (DEBUG_SURROGATES) { + System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString()); + } + + // TODO: more efficient seek? can we simply swap + // the enums? 
+ getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm)); + + final Term t2 = termEnum.term(); + + // We could hit EOF or different field since this + // was a seek "forward": + if (t2 != null && t2.field() == fieldInfo.name) { + + if (DEBUG_SURROGATES) { + System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes()); + } + + final BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + + + // Set newSuffixStart -- we can't use + // termEnum's since the above seek may have + // done no scanning (eg, term was precisely + // an index term, or, was in the term seek + // cache): + scratchTerm.copy(b2); + setNewSuffixStart(prevTerm, scratchTerm); + + return true; + } else if (newSuffixStart != 0 || scratchTerm.length != 0) { + if (DEBUG_SURROGATES) { + System.out.println(" got term=null (or next field)"); + } + newSuffixStart = 0; + scratchTerm.length = 0; + return true; + } + } + + return false; + } + + // Pre-flex indices store terms in UTF16 sort order, but + // certain queries require Unicode codepoint order; this + // method carefully seeks around surrogates to handle + // this impedance mismatch. + + private void surrogateDance() throws IOException { + + if (!unicodeSortOrder) { + return; + } + + // We are invoked after TIS.next() (by UTF16 order) to + // possibly seek to a different "next" (by unicode + // order) term. + + // We scan only the "delta" from the last term to the + // current term, in UTF8 bytes. We look at 1) the bytes + // stripped from the prior term, and then 2) the bytes + // appended to that prior term's prefix. + + // We don't care about specific UTF8 sequences, just + // the "category" of the UTF16 character. Category S + // is a high/low surrogate pair (it's non-BMP). + // Category E is any BMP char > UNI_SUR_LOW_END (and <= + // U+FFFF). Category A is the rest (any unicode char + // <= UNI_SUR_HIGH_START). + + // The core issue is that pre-flex indices sort the + // characters as ASE, while flex must sort as AES. So + // when scanning, when we hit S, we must 1) seek + // forward to E and enum the terms there, then 2) seek + // back to S and enum all terms there, then 3) seek to + // after E. Three different seek points (1, 2, 3). + + // We can easily detect S in UTF8: if a byte has + // prefix 11110 (0xf0), then that byte and the + // following 3 bytes encode a single unicode codepoint + // in S. Similarly, we can detect E: if a byte has + // prefix 1110111 (0xee), then that byte and the + // following 2 bytes encode a single unicode codepoint + // in E. + + // Note that this is really a recursive process -- + // maybe the char at pos 2 needs to dance, but at any + // point in its dance, suddenly pos 4 needs to dance + // so you must finish pos 4 before returning to pos + // 2. But then during pos 4's dance maybe pos 7 needs + // to dance, etc. However, despite being recursive, + // we don't need to hold any state because the state + // can always be derived by looking at prior term & + // current term. + + // TODO: can we avoid this copy?
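[review note] A worked example of the category tests described in the comment above (a sketch only, not part of the patch; assumes the BytesRef(String) convenience constructor):

    BytesRef s = new BytesRef("\uD835\uDD0A"); // U+1D50A -> UTF8 f0 9d 94 8a
    assert (s.bytes[0] & 0xf0) == 0xf0;        // lead byte f0: category S (isNonBMPChar)
    BytesRef e = new BytesRef("\uFB01");       // U+FB01  -> UTF8 ef ac 81
    assert (e.bytes[0] & 0xee) == 0xee;        // lead byte ef: category E (isHighBMPChar)
    BytesRef a = new BytesRef("a");            // U+0061  -> UTF8 61
    assert (a.bytes[0] & 0xee) != 0xee;        // category A
    // UTF16 (pre-flex) order compares the surrogate pair d835 dd0a against fb01,
    // so the index stores: "a" < U+1D50A < U+FB01 (A S E). Codepoint (flex) order
    // needs "a" < U+FB01 < U+1D50A (A E S) -- hence the seek-around dance at each S.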
+ if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { + scratchTerm.length = 0; + } else { + scratchTerm.copy(termEnum.term().bytes()); + } + + if (DEBUG_SURROGATES) { + System.out.println(" dance"); + System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString())); + System.out.println(" " + prevTerm.toString()); + System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString())); + System.out.println(" " + scratchTerm.toString()); + } + + // This code assumes TermInfosReader/SegmentTermEnum + // always use BytesRef.offset == 0 + assert prevTerm.offset == 0; + assert scratchTerm.offset == 0; + + // Need to loop here because we may need to do multiple + // pops, and possibly a continue in the end, ie: + // + // cont + // pop, cont + // pop, pop, cont + // + // + + while(true) { + if (doContinue()) { + break; + } else { + if (!doPop()) { + break; + } + } + } + + if (DEBUG_SURROGATES) { + System.out.println(" finish bmp ends"); + } + + doPushes(); + } + + + // Look for seek type 1 ("push"): if the newly added + // suffix contains any S, we must try to seek to the + // corresponding E. If we find a match, we go there; + // else we keep looking for additional S's in the new + // suffix. This "starts" the dance, at this character + // position: + private void doPushes() throws IOException { + + int upTo = newSuffixStart; + if (DEBUG_SURROGATES) { + System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length); + } + + while(upTo < scratchTerm.length) { + if (isNonBMPChar(scratchTerm.bytes, upTo) && + (upTo > newSuffixStart || + (upTo >= prevTerm.length || + (!isNonBMPChar(prevTerm.bytes, upTo) && + !isHighBMPChar(prevTerm.bytes, upTo))))) { + + // A non-BMP char (4 bytes UTF8) starts here: + assert scratchTerm.length >= upTo + 4; + + final int savLength = scratchTerm.length; + scratch[0] = scratchTerm.bytes[upTo]; + scratch[1] = scratchTerm.bytes[upTo+1]; + scratch[2] = scratchTerm.bytes[upTo+2]; + + scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD; + scratchTerm.bytes[upTo+1] = (byte) 0x80; + scratchTerm.bytes[upTo+2] = (byte) 0x80; + scratchTerm.length = upTo+3; + + if (DEBUG_SURROGATES) { + System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length); + } + + // Seek "forward": + // TODO: more efficient seek? + getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm)); + + scratchTerm.bytes[upTo] = scratch[0]; + scratchTerm.bytes[upTo+1] = scratch[1]; + scratchTerm.bytes[upTo+2] = scratch[2]; + scratchTerm.length = savLength; + + // Did we find a match? + final Term t2 = seekTermEnum.term(); + + if (DEBUG_SURROGATES) { + if (t2 == null) { + System.out.println(" hit term=null"); + } else { + System.out.println(" hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes())); + } + } + + // Since this was a seek "forward", we could hit + // EOF or a different field: + boolean matches; + + if (t2 != null && t2.field() == fieldInfo.name) { + final BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) { + matches = true; + for(int i=0;i BMP + upTo += 3; + + // NOTE: we keep iterating, now, since this + // can easily "recurse". 
Ie, after seeking + // forward at a certain char position, we may + // find another surrogate in our [new] suffix + // and must then do another seek (recurse) + } else { + upTo++; + } + } else { + upTo++; + } + } + } + + private boolean unicodeSortOrder; + void reset(FieldInfo fieldInfo) throws IOException { + //System.out.println("pff.reset te=" + termEnum); this.fieldInfo = fieldInfo; protoTerm = new Term(fieldInfo.name); if (termEnum == null) { termEnum = getTermsDict().terms(protoTerm); seekTermEnum = getTermsDict().terms(protoTerm); + //System.out.println(" term=" + termEnum.term()); } else { getTermsDict().seekEnum(termEnum, protoTerm); } skipNext = true; - - surrogateSeekUpto = 0; - newSuffixStart = 0; - surrogatesDance(); - } + unicodeSortOrder = sortTermsByUnicode(); - private void surrogatesDance() throws IOException { - - // Tricky: prior to 4.0, Lucene index sorted terms in - // UTF16 order, but as of 4.0 we sort by Unicode code - // point order. These orders differ because of the - // surrrogates; so we have to fixup our enum, here, by - // carefully first seeking past the surrogates and - // then back again at the end. The process is - // recursive, since any given term could have multiple - // new occurrences of surrogate pairs, so we use a - // stack to record the pending seek-backs. - if (DEBUG_SURROGATES) { - System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); - } - - while(popPendingSeek()); - while(pushNewSurrogate()); - } - - // only for debugging - private String getStack() { - if (surrogateSeekUpto == 0) { - return "null"; - } else { - StringBuffer sb = new StringBuffer(); - for(int i=0;i 0) { - sb.append(' '); - } - sb.append(surrogateSeekPending[i]); - } - sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1])); - return sb.toString(); - } - } - - private boolean popPendingSeek() throws IOException { - if (DEBUG_SURROGATES) { - System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack()); - } - // if a .next() has advanced beyond the - // after-surrogates range we had last seeked to, we - // must seek back to the start and resume .next from - // there. this pops the pending seek off the stack. 
final Term t = termEnum.term(); - if (surrogateSeekUpto > 0) { - final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1]; - if (DEBUG_SURROGATES) { - System.out.println(" seekPrefix=" + seekPrefix); - } - if (newSuffixStart < seekPrefix) { - assert pendingPrefix != null; - assert pendingPrefix.length > seekPrefix; - pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; - pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START; - Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix)); - if (DEBUG_SURROGATES) { - System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); - } - getTermsDict().seekEnum(termEnum, t2); - surrogateDidSeekBack[surrogateSeekUpto-1] = true; - - // +2 because we don't want to re-check the - // surrogates we just seek'd back to - newSuffixStart = seekPrefix + 2; - return true; - } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) { - assert pendingPrefix != null; - assert pendingPrefix.length > seekPrefix; - pendingPrefix[seekPrefix] = 0xffff; - Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix)); - if (DEBUG_SURROGATES) { - System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); - } - getTermsDict().seekEnum(termEnum, t2); - if (DEBUG_SURROGATES) { - System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); - } - surrogateSeekUpto--; - - if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { - // force pop - newSuffixStart = -1; - } else { - newSuffixStart = termEnum.newSuffixStart; - } - - return true; - } + if (t != null && t.field() == fieldInfo.name) { + newSuffixStart = 0; + prevTerm.length = 0; + surrogateDance(); } - - return false; - } - - private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result(); - private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result(); - - private boolean pushNewSurrogate() throws IOException { - if (DEBUG_SURROGATES) { - System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); - } - final Term t = termEnum.term(); - if (t == null || t.field() != fieldInfo.name) { - return false; - } - - final BytesRef bytes = t.bytes(); - UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer); - - for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { - - if (DEBUG_SURROGATES) { - System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i); - } - - // the next() that we just did read in a new - // suffix, containing a surrogate pair - - // seek forward to see if there are any terms with - // this same prefix, but with characters after the - // surrogate range; if so, we must first iterate - // them, then seek back to the surrogates - - char[] testPrefix = new char[i+2]; - for(int j=0;j=0;i--) { + if (isHighBMPChar(scratchTerm.bytes, i)) { + if (DEBUG_SURROGATES) { + System.out.println(" found E pos=" + i + "; try seek"); + } + + if (seekToNonBMP(seekTermEnum, scratchTerm, i)) { + + scratchTerm.copy(seekTermEnum.term().bytes()); + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + + newSuffixStart = 1+i; + + doPushes(); + + // Found a match + // TODO: faster seek? 
+ current = termEnum.term().bytes(); + return SeekStatus.NOT_FOUND; + } + } + } + + if (DEBUG_SURROGATES) { + System.out.println(" seek END"); + } + current = null; return SeekStatus.END; } else { - current = tr; - return SeekStatus.NOT_FOUND; + + // We found a non-exact but non-null term; this one + // is fun -- just treat it like next, by pretending + // requested term was prev: + prevTerm.copy(term); + + if (DEBUG_SURROGATES) { + System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text())); + } + + final BytesRef br = t.bytes(); + assert br.offset == 0; + + setNewSuffixStart(term, br); + + surrogateDance(); + + final Term t2 = termEnum.term(); + if (t2 == null || t2.field() != fieldInfo.name) { + assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned + current = null; + return SeekStatus.END; + } else { + current = t2.bytes(); + assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString()); + return SeekStatus.NOT_FOUND; + } + } + } + + private void setNewSuffixStart(BytesRef br1, BytesRef br2) { + final int limit = Math.min(br1.length, br2.length); + int lastStart = 0; + for(int i=0;i FORMAT_MINIMUM) - throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT); - if (format < FORMAT_CURRENT) - throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT); + if (format > FORMAT_MINIMUM) + throw new IndexFormatTooOldException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT); + if (format < FORMAT_CURRENT) + throw new IndexFormatTooNewException(null, format, FORMAT_MINIMUM, FORMAT_CURRENT); size = input.readLong(); // read the size - if(format == -1){ - if (!isIndex) { - indexInterval = input.readInt(); - formatM1SkipInterval = input.readInt(); - } - // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in - // skipTo implementation of these versions - skipInterval = Integer.MAX_VALUE; - } else { - indexInterval = input.readInt(); - skipInterval = input.readInt(); - maxSkipLevels = input.readInt(); - } + indexInterval = input.readInt(); + skipInterval = input.readInt(); + maxSkipLevels = input.readInt(); assert indexInterval > 0: "indexInterval=" + indexInterval + " is negative; must be > 0"; assert skipInterval > 0: "skipInterval=" + skipInterval + " is negative; must be > 0"; } @@ -132,18 +122,21 @@ public final class SegmentTermEnum implements Cloneable { position = p; termBuffer.set(t); prevBuffer.reset(); + //System.out.println(" ste doSeek prev=" + prevBuffer.toTerm() + " this=" + this); termInfo.set(ti); } /** Increments the enumeration to the next element. 
True if one exists.*/ public final boolean next() throws IOException { + prevBuffer.set(termBuffer); + //System.out.println(" ste setPrev=" + prev() + " this=" + this); + if (position++ >= size - 1) { - prevBuffer.set(termBuffer); termBuffer.reset(); + //System.out.println(" EOF"); return false; } - prevBuffer.set(termBuffer); termBuffer.read(input, fieldInfos); newSuffixStart = termBuffer.newSuffixStart; @@ -168,6 +161,7 @@ public final class SegmentTermEnum implements Cloneable { if (isIndex) indexPointer += input.readVLong(); // read index pointer + //System.out.println(" ste ret term=" + term()); return true; } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java index 5be99a2e734..679469d76e0 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java @@ -18,9 +18,10 @@ package org.apache.lucene.index.codecs.preflex; */ import java.io.IOException; +import java.util.Comparator; + import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.index.Term; import org.apache.lucene.index.FieldInfos; @@ -28,102 +29,65 @@ final class TermBuffer implements Cloneable { private String field; private Term term; // cached - private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes) - private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); private BytesRef bytes = new BytesRef(10); - int newSuffixStart; + private static final Comparator utf8AsUTF16Comparator = BytesRef.getUTF8SortedAsUTF16Comparator(); - public final int compareTo(TermBuffer other) { + int newSuffixStart; // only valid right after .read is called + + public int compareTo(TermBuffer other) { if (field == other.field) // fields are interned - return compareChars(text.result, text.length, other.text.result, other.text.length); + return utf8AsUTF16Comparator.compare(bytes, other.bytes); else return field.compareTo(other.field); } - private static int compareChars(char[] chars1, int len1, - char[] chars2, int len2) { - final int end = len1 < len2 ? 
len1:len2; - for (int k = 0; k < end; k++) { - char c1 = chars1[k]; - char c2 = chars2[k]; - if (c1 != c2) { - return c1 - c2; - } - } - return len1 - len2; - } - - public final void read(IndexInput input, FieldInfos fieldInfos) + public void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache - int start = input.readVInt(); + newSuffixStart = input.readVInt(); int length = input.readVInt(); - int totalLength = start + length; + int totalLength = newSuffixStart + length; if (bytes.bytes.length < totalLength) { bytes.grow(totalLength); } - if (dirty) { - // Fully convert all bytes since bytes is dirty - UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); - bytes.length = totalLength; - input.readBytes(bytes.bytes, start, length); - UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text); - dirty = false; - } else { - // Incrementally convert only the UTF8 bytes that are new: - bytes.length = totalLength; - input.readBytes(bytes.bytes, start, length); - UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text); - } - - while(true) { - newSuffixStart = text.offsets[start]; - if (newSuffixStart != -1) { - break; - } - if (--start == 0) { - newSuffixStart = 0; - break; - } - } + bytes.length = totalLength; + input.readBytes(bytes.bytes, newSuffixStart, length); this.field = fieldInfos.fieldName(input.readVInt()); } - public final void set(Term term) { + public void set(Term term) { if (term == null) { reset(); return; } - - final BytesRef termBytes = term.bytes(); - UnicodeUtil.UTF8toUTF16(termBytes.bytes, termBytes.offset, termBytes.length, text); - dirty = true; + bytes.copy(term.bytes()); field = term.field(); this.term = term; } - public final void set(TermBuffer other) { - text.copyText(other.text); - dirty = true; + public void set(TermBuffer other) { field = other.field; - term = other.term; + // dangerous to copy Term over, since the underlying + // BytesRef could subsequently be modified: + term = null; + bytes.copy(other.bytes); } public void reset() { field = null; - text.setLength(0); term = null; - dirty = true; } public Term toTerm() { if (field == null) // unset return null; - if (term == null) - term = new Term(field, new BytesRef(text.result, 0, text.length), false); + if (term == null) { + term = new Term(field, new BytesRef(bytes), false); + //term = new Term(field, bytes, false); + } return term; } @@ -134,12 +98,7 @@ final class TermBuffer implements Cloneable { try { clone = (TermBuffer)super.clone(); } catch (CloneNotSupportedException e) {} - clone.dirty = true; - clone.bytes = new BytesRef(10); - clone.text = new UnicodeUtil.UTF16Result(); - clone.text.offsets = new int[text.offsets.length]; - System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length); - clone.text.copyText(text); + clone.bytes = new BytesRef(bytes); return clone; } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java index 9244ca08b09..bcc12e8fbc4 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfo.java @@ -23,30 +23,30 @@ package org.apache.lucene.index.codecs.preflex; * indexing. */ @Deprecated -class TermInfo { +public class TermInfo { /** The number of documents which contain the term. 
*/ - int docFreq = 0; + public int docFreq = 0; - long freqPointer = 0; - long proxPointer = 0; - int skipOffset; + public long freqPointer = 0; + public long proxPointer = 0; + public int skipOffset; - TermInfo() {} + public TermInfo() {} - TermInfo(int df, long fp, long pp) { + public TermInfo(int df, long fp, long pp) { docFreq = df; freqPointer = fp; proxPointer = pp; } - TermInfo(TermInfo ti) { + public TermInfo(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; skipOffset = ti.skipOffset; } - final void set(int docFreq, + public final void set(int docFreq, long freqPointer, long proxPointer, int skipOffset) { this.docFreq = docFreq; this.freqPointer = freqPointer; @@ -54,7 +54,7 @@ class TermInfo { this.skipOffset = skipOffset; } - final void set(TermInfo ti) { + public final void set(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java index a495fa5fb45..51eeae3b097 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/TermInfosReader.java @@ -119,9 +119,12 @@ public final class TermInfosReader { indexTerms = new Term[indexSize]; indexInfos = new TermInfo[indexSize]; indexPointers = new long[indexSize]; - - for (int i = 0; indexEnum.next(); i++) { + + for (int i=0;indexEnum.next(); i++) { indexTerms[i] = indexEnum.term(); + assert indexTerms[i] != null; + assert indexTerms[i].text() != null; + assert indexTerms[i].field() != null; indexInfos[i] = indexEnum.termInfo(); indexPointers[i] = indexEnum.indexPointer; @@ -160,14 +163,14 @@ public final class TermInfosReader { return origEnum.maxSkipLevels; } - final void close() throws IOException { + void close() throws IOException { if (origEnum != null) origEnum.close(); threadResources.close(); } /** Returns the number of term/value pairs in the set. 
*/ - final long size() { + long size() { return size; } @@ -183,12 +186,13 @@ public final class TermInfosReader { /** Returns the offset of the greatest index entry which is less than or equal to term.*/ - private final int getIndexOffset(Term term) { + private int getIndexOffset(Term term) { int lo = 0; // binary search indexTerms[] int hi = indexTerms.length - 1; while (hi >= lo) { int mid = (lo + hi) >>> 1; + assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid; int delta = term.compareToUTF16(indexTerms[mid]); if (delta < 0) hi = mid - 1; @@ -200,7 +204,7 @@ public final class TermInfosReader { return hi; } - private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { + private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { enumerator.seek(indexPointers[indexOffset], ((long) indexOffset * totalIndexInterval) - 1, indexTerms[indexOffset], indexInfos[indexOffset]); @@ -231,6 +235,9 @@ public final class TermInfosReader { } TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException { + if (size == 0) { + return null; + } // optimize sequential access: first try scanning cached enum w/o seeking if (enumerator.term() != null // term is at or past current @@ -242,7 +249,6 @@ public final class TermInfosReader { // no need to seek final TermInfo ti; - int numScans = enumerator.scanTo(term); if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); @@ -279,6 +285,7 @@ public final class TermInfosReader { seekEnum(enumerator, indexPos); enumerator.scanTo(term); final TermInfo ti; + if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (tiOrd == null) { @@ -294,7 +301,7 @@ public final class TermInfosReader { } // called only from asserts - private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) { + private boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) { if (ti1.docFreq != ti2.docFreq) { return false; } @@ -319,7 +326,7 @@ public final class TermInfosReader { } /** Returns the position of a Term in the set or -1. */ - final long getPosition(Term term) throws IOException { + long getPosition(Term term) throws IOException { if (size == 0) return -1; ensureIndexIsRead(); diff --git a/lucene/src/java/org/apache/lucene/util/BytesRef.java b/lucene/src/java/org/apache/lucene/util/BytesRef.java index ab0ef4e14ca..3128078cf38 100644 --- a/lucene/src/java/org/apache/lucene/util/BytesRef.java +++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java @@ -331,12 +331,17 @@ public final class BytesRef implements Comparable, Externalizable { // We know the terms are not equal, but, we may // have to carefully fixup the bytes at the // difference to match UTF16's sort order: + + // NOTE: instead of moving supplementary code points (0xee and 0xef) to the unused 0xfe and 0xff, + // we move them to the unused 0xfc and 0xfd [reserved for future 6-byte character sequences] + // this reserves 0xff for preflex's term reordering (surrogate dance), and if unicode grows such + // that 6-byte sequences are needed we have much bigger problems anyway. 
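[review note] A worked example of the adjusted fixup below (a sketch, not part of the patch): compare U+FFFF (UTF8 ef bf bf) against U+10000 (UTF8 f0 90 80 80). In UTF16, U+10000 is the surrogate pair 0xd800 0xdc00 and sorts before 0xffff, but raw UTF8 order gives 0xef < 0xf0, the opposite:

    // aByte = 0xef, bByte = 0xf0; both >= 0xee, so the fixup applies:
    //   (0xef & 0xfe) == 0xee -> aByte += 0xe -> 0xfd
    //   (0xf0 & 0xfe) != 0xee -> bByte stays 0xf0
    // 0xfd > 0xf0, so U+FFFF now sorts after U+10000, matching UTF16 order.
    // The old "+ 0x10" would map 0xef to 0xff, colliding with the 0xff byte
    // that the preflex surrogate dance uses as a seek sentinel.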
if (aByte >= 0xee && bByte >= 0xee) { if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; + aByte += 0xe; } if ((bByte&0xfe) == 0xee) { - bByte += 0x10; + bByte += 0xe; } } return aByte - bByte; @@ -346,10 +351,6 @@ public final class BytesRef implements Comparable, Externalizable { // One is a prefix of the other, or, they are equal: return a.length - b.length; } - - public boolean equals(Object other) { - return this == other; - } } public void writeExternal(ObjectOutput out) diff --git a/lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java b/lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java index da31b7bd503..fd1c4737273 100644 --- a/lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java +++ b/lucene/src/test/org/apache/lucene/document/TestBinaryDocument.java @@ -2,9 +2,7 @@ package org.apache.lucene.document; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.store.MockRAMDirectory; @@ -58,8 +56,7 @@ public class TestBinaryDocument extends LuceneTestCase { /** add the doc to a ram index */ MockRAMDirectory dir = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); writer.addDocument(doc); /** open a reader and fetch the document */ @@ -98,8 +95,7 @@ public class TestBinaryDocument extends LuceneTestCase { /** add the doc to a ram index */ MockRAMDirectory dir = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); writer.addDocument(doc); /** open a reader and fetch the document */ diff --git a/lucene/src/test/org/apache/lucene/document/TestDocument.java b/lucene/src/test/org/apache/lucene/document/TestDocument.java index 5751fb3bd95..fea8f7d0028 100644 --- a/lucene/src/test/org/apache/lucene/document/TestDocument.java +++ b/lucene/src/test/org/apache/lucene/document/TestDocument.java @@ -1,8 +1,6 @@ package org.apache.lucene.document; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; @@ -155,8 +153,7 @@ public class TestDocument extends LuceneTestCase { */ public void testGetValuesForIndexedDocument() throws Exception { RAMDirectory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); writer.addDocument(makeDocumentWithFields()); IndexReader reader = writer.getReader(); @@ -234,8 +231,7 @@ public class TestDocument extends LuceneTestCase { Field.Index.NOT_ANALYZED)); RAMDirectory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); writer.addDocument(doc); field.setValue("id2"); writer.addDocument(doc); diff 
--git a/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java b/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java index 5853dd3b126..9c9fe81bc92 100644 --- a/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java +++ b/lucene/src/test/org/apache/lucene/index/RandomIndexWriter.java @@ -17,20 +17,18 @@ package org.apache.lucene.index; * limitations under the License. */ -import java.util.Random; import java.io.Closeable; import java.io.IOException; +import java.util.Random; -import org.apache.lucene.util._TestUtil; -import org.apache.lucene.store.Directory; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; -import org.apache.lucene.index.codecs.Codec; import org.apache.lucene.index.codecs.CodecProvider; -import org.apache.lucene.index.codecs.intblock.IntBlockCodec; -import org.apache.lucene.index.codecs.preflex.PreFlexCodec; -import org.apache.lucene.index.codecs.pulsing.PulsingCodec; -import org.apache.lucene.index.codecs.sep.SepCodec; -import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCaseJ4; +import org.apache.lucene.util.Version; +import org.apache.lucene.util._TestUtil; /** Silly class that randomizes the indexing experience. EG * it may swap in a different merge policy/scheduler; may @@ -45,32 +43,48 @@ public class RandomIndexWriter implements Closeable { int docCount; int flushAt; + // Randomly calls Thread.yield so we mixup thread scheduling + private static final class MockIndexWriter extends IndexWriter { + + private final Random r; + + public MockIndexWriter(Random r,Directory dir, IndexWriterConfig conf) throws IOException { + super(dir, conf); + this.r = r; + } + + @Override + boolean testPoint(String name) { + if (r.nextInt(4) == 2) + Thread.yield(); + return true; + } + } + + /** create a RandomIndexWriter with a random config: Uses TEST_VERSION_CURRENT and MockAnalyzer */ + public RandomIndexWriter(Random r, Directory dir) throws IOException { + this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT, new MockAnalyzer())); + } + + /** create a RandomIndexWriter with a random config: Uses TEST_VERSION_CURRENT */ + public RandomIndexWriter(Random r, Directory dir, Analyzer a) throws IOException { + this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT, a)); + } + + /** create a RandomIndexWriter with a random config */ + public RandomIndexWriter(Random r, Directory dir, Version v, Analyzer a) throws IOException { + this(r, dir, LuceneTestCaseJ4.newIndexWriterConfig(r, v, a)); + } + + /** create a RandomIndexWriter with the provided config */ public RandomIndexWriter(Random r, Directory dir, IndexWriterConfig c) throws IOException { this.r = r; - if (r.nextBoolean()) { - c.setMergePolicy(new LogDocMergePolicy()); - } - if (r.nextBoolean()) { - c.setMergeScheduler(new SerialMergeScheduler()); - } - if (r.nextBoolean()) { - c.setMaxBufferedDocs(_TestUtil.nextInt(r, 2, 1000)); - } - if (r.nextBoolean()) { - c.setTermIndexInterval(_TestUtil.nextInt(r, 1, 1000)); - } - - if (c.getMergePolicy() instanceof LogMergePolicy) { - LogMergePolicy logmp = (LogMergePolicy) c.getMergePolicy(); - logmp.setUseCompoundDocStore(r.nextBoolean()); - logmp.setUseCompoundFile(r.nextBoolean()); - logmp.setCalibrateSizeByDeletes(r.nextBoolean()); - } - - c.setReaderPooling(r.nextBoolean()); - c.setCodecProvider(new 
RandomCodecProvider(r)); - w = new IndexWriter(dir, c); + w = new MockIndexWriter(r, dir, c); flushAt = _TestUtil.nextInt(r, 10, 1000); + if (LuceneTestCaseJ4.VERBOSE) { + System.out.println("RIW config=" + w.getConfig()); + System.out.println("codec default=" + CodecProvider.getDefaultCodec()); + } } public void addDocument(Document doc) throws IOException { @@ -89,14 +103,27 @@ public class RandomIndexWriter implements Closeable { w.deleteDocuments(term); } + public void commit() throws CorruptIndexException, IOException { + w.commit(); + } + public int maxDoc() { return w.maxDoc(); } public IndexReader getReader() throws IOException { - if (r.nextBoolean()) { + // If we are writing with PreFlexRW, force a full + // IndexReader.open so terms are sorted in codepoint + // order during searching: + if (!w.codecs.getWriter(null).name.equals("PreFlex") && r.nextBoolean()) { + if (LuceneTestCaseJ4.VERBOSE) { + System.out.println("RIW.getReader: use NRT reader"); + } return w.getReader(); } else { + if (LuceneTestCaseJ4.VERBOSE) { + System.out.println("RIW.getReader: open new reader"); + } w.commit(); return IndexReader.open(w.getDirectory(), new KeepOnlyLastCommitDeletionPolicy(), r.nextBoolean(), _TestUtil.nextInt(r, 1, 10)); } @@ -112,22 +139,4 @@ public class RandomIndexWriter implements Closeable { public void optimize() throws IOException { w.optimize(); } - - class RandomCodecProvider extends CodecProvider { - final String codec; - - RandomCodecProvider(Random random) { - register(new StandardCodec()); - register(new IntBlockCodec()); - register(new PreFlexCodec()); - register(new PulsingCodec()); - register(new SepCodec()); - codec = CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)]; - } - - @Override - public Codec getWriter(SegmentWriteState state) { - return lookup(codec); - } - } } diff --git a/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java b/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java index bcf0e56c2eb..7f53710530e 100755 --- a/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java +++ b/lucene/src/test/org/apache/lucene/index/TestAddIndexes.java @@ -19,7 +19,6 @@ package org.apache.lucene.index; import java.io.IOException; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; @@ -139,7 +138,6 @@ public class TestAddIndexes extends LuceneTestCase { setUpDirs(dir, aux); IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND)); - writer.addIndexes(new Directory[] {aux}); // Adds 10 docs, then replaces them with another 10 diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index 11921e3aa03..22b0ea172b7 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -493,14 +493,21 @@ public class TestCodecs extends MultiCodecTestCase { // Test random seek by ord: final int idx = TestCodecs.this.nextInt(field.terms.length); term = field.terms[idx]; - status = termsEnum.seek(idx); - assertEquals(status, TermsEnum.SeekStatus.FOUND); - assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2))); - assertEquals(term.docs.length, termsEnum.docFreq()); - if (field.omitTF) { - this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false); - } else { - 
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true); + try { + status = termsEnum.seek(idx); + } catch (UnsupportedOperationException uoe) { + // ok -- skip it + status = null; + } + if (status != null) { + assertEquals(status, TermsEnum.SeekStatus.FOUND); + assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2))); + assertEquals(term.docs.length, termsEnum.docFreq()); + if (field.omitTF) { + this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false); + } else { + this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true); + } } // Test seek to non-existent terms: @@ -520,9 +527,12 @@ public class TestCodecs extends MultiCodecTestCase { // Seek to each term by ord, backwards for(int i=field.terms.length-1;i>=0;i--) { - assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); - assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); - assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + try { + assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); + assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); + assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + } catch (UnsupportedOperationException uoe) { + } } // Seek to non-existent empty-string term diff --git a/lucene/src/test/org/apache/lucene/index/TestFlex.java b/lucene/src/test/org/apache/lucene/index/TestFlex.java index cd114a0aa99..b26538f68e3 100644 --- a/lucene/src/test/org/apache/lucene/index/TestFlex.java +++ b/lucene/src/test/org/apache/lucene/index/TestFlex.java @@ -20,6 +20,8 @@ package org.apache.lucene.index; import java.io.*; import java.util.*; import org.apache.lucene.store.*; +import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.standard.*; import org.apache.lucene.search.*; import org.apache.lucene.analysis.*; import org.apache.lucene.document.*; @@ -64,7 +66,8 @@ public class TestFlex extends LuceneTestCase { public void testTermOrd() throws Exception { Directory d = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(d, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter w = new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED)); w.addDocument(doc); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java index 29641b0910a..c8410e23b1d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java @@ -1675,7 +1675,7 @@ public class TestIndexReader extends LuceneTestCase // LUCENE-1586: getUniqueTermCount public void testUniqueTermCount() throws Exception { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("field", "a b c d e f g h i j k l m n o p 
q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); @@ -1708,7 +1708,7 @@ public class TestIndexReader extends LuceneTestCase // LUCENE-1609: don't load terms index public void testNoTermsIndex() throws Throwable { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); @@ -1725,7 +1725,7 @@ public class TestIndexReader extends LuceneTestCase } assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor()); - writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); writer.addDocument(doc); writer.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java index bda9f28a0d8..d33d7365159 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4559,7 +4559,7 @@ public class TestIndexWriter extends LuceneTestCase { dir.close(); } - // LUCENE-2095: make sure with multiple threads commit + // LUCENE-2095: make sure with multiple threads commit // doesn't return until all changes are in fact in the // index public void testCommitThreadSafety() throws Throwable { @@ -4670,16 +4670,16 @@ public class TestIndexWriter extends LuceneTestCase { } // Make sure terms, including ones with surrogate pairs, - // sort in UTF16 sort order by default + // sort in codepoint sort order by default public void testTermUTF16SortOrder() throws Throwable { + Random rnd = newRandom(); Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + RandomIndexWriter writer = new RandomIndexWriter(rnd, dir); Document d = new Document(); // Single segment Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED); d.add(f); char[] chars = new char[2]; - Random rnd = newRandom(); final Set allTerms = new HashSet(); for(int i=0;i<200*_TestUtil.getRandomMultiplier();i++) { @@ -4705,14 +4705,13 @@ public class TestIndexWriter extends LuceneTestCase { allTerms.add(s); f.setValue(s); - //System.out.println("add " + termDesc(s)); writer.addDocument(d); if ((1+i) % 42 == 0) { writer.commit(); } } - + IndexReader r = writer.getReader(); // Test each sub-segment diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java index be19393b3cd..27dfd8982f4 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java @@ -394,18 +394,18 @@ public class TestIndexWriterDelete extends LuceneTestCase { } public void testDeletesOnDiskFull() throws IOException { - testOperationsOnDiskFull(false); + 
doTestOperationsOnDiskFull(false); } public void testUpdatesOnDiskFull() throws IOException { - testOperationsOnDiskFull(true); + doTestOperationsOnDiskFull(true); } /** * Make sure if modifier tries to commit but hits disk full that modifier * remains consistent and usable. Similar to TestIndexReader.testDiskFull(). */ - private void testOperationsOnDiskFull(boolean updates) throws IOException { + private void doTestOperationsOnDiskFull(boolean updates) throws IOException { Term searchTerm = new Term("content", "aaa"); int START_COUNT = 157; @@ -700,6 +700,7 @@ public class TestIndexWriterDelete extends LuceneTestCase { try { modifier.commit(); } catch (IOException ioe) { + // expected failed = true; } diff --git a/lucene/src/test/org/apache/lucene/index/TestMultiFields.java b/lucene/src/test/org/apache/lucene/index/TestMultiFields.java index cf62dfc3e62..4b0bc2b7f25 100644 --- a/lucene/src/test/org/apache/lucene/index/TestMultiFields.java +++ b/lucene/src/test/org/apache/lucene/index/TestMultiFields.java @@ -27,11 +27,12 @@ public class TestMultiFields extends LuceneTestCase { public void testRandom() throws Exception { + Random r = newRandom(); + for(int iter=0;iter<2*_TestUtil.getRandomMultiplier();iter++) { Directory dir = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); - Random r = new Random(); + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); Map<BytesRef,List<Integer>> docs = new HashMap<BytesRef,List<Integer>>(); Set<Integer> deleted = new HashSet<Integer>(); @@ -45,7 +46,7 @@ public class TestMultiFields extends LuceneTestCase { doc.add(id); boolean onlyUniqueTerms = r.nextBoolean(); - + Set<BytesRef> uniqueTerms = new HashSet<BytesRef>(); for(int i=0;i 0) { @@ -61,6 +62,7 @@ public class TestMultiFields extends LuceneTestCase { } docs.get(term).add(i); terms.add(term); + uniqueTerms.add(term); f.setValue(s); } id.setValue(""+i); @@ -75,8 +77,18 @@ public class TestMultiFields extends LuceneTestCase { } } + if (VERBOSE) { + List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms); + Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); + System.out.println("UTF16 order:"); + for(BytesRef b : termsList) { + System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString())); + } + } + IndexReader reader = w.getReader(); w.close(); + //System.out.println("TEST reader=" + reader); Bits delDocs = MultiFields.getDeletedDocs(reader); for(int delDoc : deleted) { diff --git a/lucene/src/test/org/apache/lucene/index/TestRollback.java b/lucene/src/test/org/apache/lucene/index/TestRollback.java index e5243e6af0f..4e1370fd1c8 100644 --- a/lucene/src/test/org/apache/lucene/index/TestRollback.java +++ b/lucene/src/test/org/apache/lucene/index/TestRollback.java @@ -31,7 +31,7 @@ public class TestRollback extends LuceneTestCase { // LUCENE-2536 public void testRollbackIntegrityWithBufferFlush() throws Exception { Directory dir = new MockRAMDirectory(); - RandomIndexWriter rw = new RandomIndexWriter(newRandom(), dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter rw = new RandomIndexWriter(newRandom(), dir); for (int i = 0; i < 5; i++) { Document doc = new Document(); doc.add(new Field("pk", Integer.toString(i), Store.YES, Index.ANALYZED_NO_NORMS)); diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java b/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java index 
48a5efdd3d8..07b5cff684d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java +++ b/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -64,7 +65,7 @@ public class TestSegmentTermEnum extends LuceneTestCase { public void testPrevTermAtEnd() throws IOException { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); addDoc(writer, "aaa bbb"); writer.close(); SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); diff --git a/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java b/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java index d42e792053f..3c0fe1c387a 100644 --- a/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java +++ b/lucene/src/test/org/apache/lucene/index/TestStressIndexing2.java @@ -68,10 +68,10 @@ public class TestStressIndexing2 extends MultiCodecTestCase { // TODO: verify equals using IW.getReader DocsAndWriter dw = indexRandomIWReader(5, 3, 100, dir); - IndexReader r = dw.writer.getReader(); + IndexReader reader = dw.writer.getReader(); dw.writer.commit(); - verifyEquals(r, dir, "id"); - r.close(); + verifyEquals(r, reader, dir, "id"); + reader.close(); dw.writer.close(); dir.close(); } @@ -261,8 +261,8 @@ public class TestStressIndexing2 extends MultiCodecTestCase { w.close(); } - public static void verifyEquals(IndexReader r1, Directory dir2, String idField) throws Throwable { - IndexReader r2 = IndexReader.open(dir2, true); + public static void verifyEquals(Random r, IndexReader r1, Directory dir2, String idField) throws Throwable { + IndexReader r2 = IndexReader.open(dir2); verifyEquals(r1, r2, idField); r2.close(); } diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java index c50478b3664..7af84d23b11 100644 --- a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java +++ b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java @@ -18,8 +18,10 @@ package org.apache.lucene.index.codecs.preflex; */ import org.apache.lucene.store.*; +import org.apache.lucene.document.*; +import org.apache.lucene.analysis.*; import org.apache.lucene.index.*; -import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; import org.apache.lucene.util.*; import java.util.*; @@ -30,8 +32,6 @@ import org.junit.Test; public class TestSurrogates extends LuceneTestCaseJ4 { - // chooses from a very limited alphabet to exacerbate the - // surrogate seeking required private static String makeDifficultRandomUnicodeString(Random r) { final int end = r.nextInt(20); if (end == 0) { @@ -44,154 +44,297 @@ public class TestSurrogates extends LuceneTestCaseJ4 { if (0 == t && i < end - 1) { // hi - buffer[i++] = (char) 0xd800; + buffer[i++] = (char) (0xd800 + r.nextInt(2)); // lo - buffer[i] = (char) 0xdc00; + buffer[i] = (char) (0xdc00 + r.nextInt(2)); } else if (t <= 3) { - 
buffer[i] = 'a'; + buffer[i] = (char) ('a' + r.nextInt(2)); } else if (4 == t) { - buffer[i] = 0xe000; + buffer[i] = (char) (0xe000 + r.nextInt(2)); } } return new String(buffer, 0, end); } - private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException { - - final int numField = _TestUtil.nextInt(r, 2, 5); - - List terms = new ArrayList(); - - int tc = 0; - - for(int f=0;f() { - public int compare(Term o1, Term o2) { - return o1.compareToUTF16(o2); - } - }); - - TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128); - TermInfo ti = new TermInfo(); - String lastText = null; - int uniqueTermCount = 0; - if (VERBOSE) { - System.out.println("TEST: utf16 order:"); - } - for(Term t : terms) { - FieldInfo fi = fieldInfos.fieldInfo(t.field()); - - String text = t.text(); - if (lastText != null && lastText.equals(text)) { - continue; - } - fieldTerms.add(t); - uniqueTermCount++; - lastText = text; - - if (VERBOSE) { - System.out.println(" " + toHexString(t)); - } - w.add(fi.number, t.bytes().bytes, t.bytes().length, ti); - } - w.close(); - - Collections.sort(fieldTerms); - if (VERBOSE) { - System.out.println("\nTEST: codepoint order"); - for(Term t: fieldTerms) { - System.out.println(" " + t.field() + ":" + toHexString(t)); - } - } - - dir.createOutput(segName + ".prx").close(); - dir.createOutput(segName + ".frq").close(); - - // !!hack alert!! stuffing uniqueTermCount in as docCount - return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec); - } - private String toHexString(Term t) { return t.field() + ":" + UnicodeUtil.toHexString(t.text()); } - - @Test - public void testSurrogatesOrder() throws Exception { - Directory dir = new MockRAMDirectory(); - Codec codec = new PreFlexCodec(); + private String getRandomString(Random r) { + String s; + if (r.nextInt(5) == 1) { + if (r.nextInt(3) == 1) { + s = makeDifficultRandomUnicodeString(r); + } else { + s = _TestUtil.randomUnicodeString(r); + } + } else { + s = _TestUtil.randomRealisticUnicodeString(r); + } + return s; + } - Random r = newRandom(); - FieldInfos fieldInfos = new FieldInfos(); - List fieldTerms = new ArrayList(); - SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); + private static class SortTermAsUTF16Comparator implements Comparator { + public int compare(Term o1, Term o2) { + return o1.compareToUTF16(o2); + } + } - // hack alert!! 
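The TestSurrogates rewrite above hinges on the difference between UTF-16 code unit order (what a PreFlex index stores) and codepoint order (what the flex enums must present): String.compareTo sorts by UTF-16 code unit, so a BMP character such as U+E000 sorts after a supplementary character even though its codepoint is smaller. A self-contained sketch of that disagreement (illustration only, not part of the patch):

    public class SurrogateOrderDemo {
      public static void main(String[] args) {
        String bmp = "\uE000";                                // U+E000: one BMP char
        String supp = new String(Character.toChars(0x10000)); // U+10000: surrogate pair 0xD800 0xDC00
        // UTF-16 code unit order: 0xE000 > 0xD800, so bmp sorts after supp:
        System.out.println(bmp.compareTo(supp) > 0);          // prints true
        // codepoint order: U+E000 < U+10000, so bmp sorts first:
        System.out.println(bmp.codePointAt(0) < supp.codePointAt(0)); // prints true
      }
    }
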
- int uniqueTermCount = si.docCount; + private static final SortTermAsUTF16Comparator termAsUTF16Comparator = new SortTermAsUTF16Comparator(); - FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1)); - assertNotNull(fields); + // single straight enum + private void doTestStraightEnum(List<Term> fieldTerms, IndexReader reader, int uniqueTermCount) throws IOException { if (VERBOSE) { - System.out.println("\nTEST: now enum"); + System.out.println("\nTEST: top now enum reader=" + reader); } - FieldsEnum fieldsEnum = fields.iterator(); - String field; - UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator(); - int termCount = 0; - while((field = fieldsEnum.next()) != null) { - TermsEnum termsEnum = fieldsEnum.terms(); - BytesRef text; - BytesRef lastText = null; - while((text = termsEnum.next()) != null) { + { + // Test straight enum: + String field; + int termCount = 0; + while((field = fieldsEnum.next()) != null) { + TermsEnum termsEnum = fieldsEnum.terms(); + BytesRef text; + BytesRef lastText = null; + while((text = termsEnum.next()) != null) { + Term exp = fieldTerms.get(termCount); + if (VERBOSE) { + System.out.println(" got term=" + field + ":" + UnicodeUtil.toHexString(text.utf8ToString())); + System.out.println(" exp=" + exp.field() + ":" + UnicodeUtil.toHexString(exp.text().toString())); + System.out.println(); + } + if (lastText == null) { + lastText = new BytesRef(text); + } else { + assertTrue(lastText.compareTo(text) < 0); + lastText.copy(text); + } + assertEquals(exp.field(), field); + assertEquals(exp.bytes(), text); + termCount++; + } if (VERBOSE) { - UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); - System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length))); - System.out.println(); + System.out.println(" no more terms for field=" + field); } - if (lastText == null) { - lastText = new BytesRef(text); - } else { - assertTrue(lastText.compareTo(text) < 0); - lastText.copy(text); - } - assertEquals(fieldTerms.get(termCount).field(), field); - assertEquals(fieldTerms.get(termCount).bytes(), text); - termCount++; } + assertEquals(uniqueTermCount, termCount); + } + } + + // randomly seeks to term that we know exists, then next's + // from there + private void doTestSeekExists(Random r, List<Term> fieldTerms, IndexReader reader) throws IOException { + + final Map<String,TermsEnum> tes = new HashMap<String,TermsEnum>(); + + // Test random seek to existing term, then enum: + if (VERBOSE) { + System.out.println("\nTEST: top now seek"); + } + + for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) { + + // pick random field+term + int spot = r.nextInt(fieldTerms.size()); + Term term = fieldTerms.get(spot); + String field = term.field(); + if (VERBOSE) { - System.out.println(" no more terms for field=" + field); + System.out.println("TEST: exist seek field=" + field + " term=" + UnicodeUtil.toHexString(term.text())); + } + + // seek to it + TermsEnum te = tes.get(field); + if (te == null) { + te = MultiFields.getTerms(reader, field).iterator(); + tes.put(field, te); + } + + if (VERBOSE) { + System.out.println(" done get enum"); + } + + // seek should find the term + assertEquals(TermsEnum.SeekStatus.FOUND, + te.seek(term.bytes())); + + // now .next() this many times: + int ct = _TestUtil.nextInt(r, 5, 100); + for(int i=0;i<ct;i++) { + if (1+spot+i >= fieldTerms.size()) { + break; + } + term = fieldTerms.get(1+spot+i); + if (term.field() != field) { + 
assertNull(te.next()); + break; + } else { + BytesRef t = te.next(); + + if (VERBOSE) { + System.out.println(" got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString()))); + System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString())); + } + + assertEquals(term.bytes(), t); + } } } - assertEquals(uniqueTermCount, termCount); + } - fields.close(); + private void doTestSeekDoesNotExist(Random r, int numField, List<Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader) throws IOException { + + final Map<String,TermsEnum> tes = new HashMap<String,TermsEnum>(); + + if (VERBOSE) { + System.out.println("TEST: top random seeks"); + } + + { + for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) { + + // seek to random spot + String field = ("f" + r.nextInt(numField)).intern(); + Term tx = new Term(field, getRandomString(r)); + + int spot = Arrays.binarySearch(fieldTermsArray, tx); + + if (spot < 0) { + if (VERBOSE) { + System.out.println("TEST: non-exist seek to " + field + ":" + UnicodeUtil.toHexString(tx.text())); + } + + // term does not exist: + TermsEnum te = tes.get(field); + if (te == null) { + te = MultiFields.getTerms(reader, field).iterator(); + tes.put(field, te); + } + + if (VERBOSE) { + System.out.println(" got enum"); + } + + spot = -spot - 1; + + if (spot == fieldTerms.size() || fieldTerms.get(spot).field() != field) { + assertEquals(TermsEnum.SeekStatus.END, te.seek(tx.bytes())); + } else { + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(tx.bytes())); + + if (VERBOSE) { + System.out.println(" got term=" + UnicodeUtil.toHexString(te.term().utf8ToString())); + System.out.println(" exp term=" + UnicodeUtil.toHexString(fieldTerms.get(spot).text())); + } + + assertEquals(fieldTerms.get(spot).bytes(), + te.term()); + + // now .next() this many times: + int ct = _TestUtil.nextInt(r, 5, 100); + for(int i=0;i<ct;i++) { + if (1+spot+i >= fieldTerms.size()) { + break; + } + Term term = fieldTerms.get(1+spot+i); + if (term.field() != field) { + assertNull(te.next()); + break; + } else { + BytesRef t = te.next(); + + if (VERBOSE) { + System.out.println(" got term=" + (t == null ? 
null : UnicodeUtil.toHexString(t.utf8ToString()))); + System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString())); + } + + assertEquals(term.bytes(), t); + } + } + + } + } + } + } + } + + + @Test + public void testSurrogatesOrder() throws Exception { + Random r = newRandom(); + + Directory dir = new MockRAMDirectory(); + RandomIndexWriter w = new RandomIndexWriter(r, + dir, + newIndexWriterConfig(r, TEST_VERSION_CURRENT, + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec()))); + + final int numField = _TestUtil.nextInt(r, 2, 5); + + int uniqueTermCount = 0; + + int tc = 0; + + List fieldTerms = new ArrayList(); + + for(int f=0;f uniqueTerms = new HashSet(); + + for(int i=0;i pairs in a @@ -71,8 +74,7 @@ final class TermInfosWriter { private long lastIndexPointer; private boolean isIndex; - private byte[] lastTermBytes = new byte[10]; - private int lastTermBytesLength = 0; + private final BytesRef lastTerm = new BytesRef(); private int lastFieldNumber = -1; private TermInfosWriter other; @@ -104,13 +106,10 @@ final class TermInfosWriter { assert initUTF16Results(); } - void add(Term term, TermInfo ti) throws IOException { - add(fieldInfos.fieldNumber(term.field()), term.bytes().bytes, term.bytes().length, ti); - } - // Currently used only by assert statements UnicodeUtil.UTF16Result utf16Result1; UnicodeUtil.UTF16Result utf16Result2; + private final BytesRef scratchBytes = new BytesRef(); // Currently used only by assert statements private boolean initUTF16Results() { @@ -120,7 +119,7 @@ final class TermInfosWriter { } // Currently used only by assert statement - private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { + private int compareToLastTerm(int fieldNumber, BytesRef term) { if (lastFieldNumber != fieldNumber) { final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); @@ -132,8 +131,13 @@ final class TermInfosWriter { return cmp; } - UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); - UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); + scratchBytes.copy(term); + assert lastTerm.offset == 0; + UnicodeUtil.UTF8toUTF16(lastTerm.bytes, 0, lastTerm.length, utf16Result1); + + assert scratchBytes.offset == 0; + UnicodeUtil.UTF8toUTF16(scratchBytes.bytes, 0, scratchBytes.length, utf16Result2); + final int len; if (utf16Result1.length < utf16Result2.length) len = utf16Result1.length; @@ -152,22 +156,22 @@ final class TermInfosWriter { /** Adds a new <, TermInfo> pair to the set. Term must be lexicographically greater than all previous Terms added. 
TermInfo pointers must be positive and greater than all previous.*/ - void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) + public void add(int fieldNumber, BytesRef term, TermInfo ti) throws IOException { - assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || - (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : + assert compareToLastTerm(fieldNumber, term) < 0 || + (isIndex && term.length == 0 && lastTerm.length == 0) : "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + - " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); + " text=" + term.utf8ToString() + " lastText=" + lastTerm.utf8ToString(); assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; if (!isIndex && size % indexInterval == 0) - other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term + other.add(lastFieldNumber, lastTerm, lastTi); // add an index term - writeTerm(fieldNumber, termBytes, termBytesLength); // write term + writeTerm(fieldNumber, term); // write term output.writeVInt(ti.docFreq); // write doc freq output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers @@ -187,29 +191,27 @@ final class TermInfosWriter { size++; } - private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) + private void writeTerm(int fieldNumber, BytesRef term) throws IOException { + //System.out.println(" tiw.write field=" + fieldNumber + " term=" + term.utf8ToString()); + // TODO: UTF16toUTF8 could tell us this prefix // Compute prefix in common with last term: int start = 0; - final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; + final int limit = term.length < lastTerm.length ? term.length : lastTerm.length; while(start < limit) { - if (termBytes[start] != lastTermBytes[start]) + if (term.bytes[start+term.offset] != lastTerm.bytes[start+lastTerm.offset]) break; start++; } - final int length = termBytesLength - start; + final int length = term.length - start; output.writeVInt(start); // write shared prefix length output.writeVInt(length); // write delta length - output.writeBytes(termBytes, start, length); // write delta bytes + output.writeBytes(term.bytes, start+term.offset, length); // write delta bytes output.writeVInt(fieldNumber); // write field num - if (lastTermBytes.length < termBytesLength) { - lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); - } - System.arraycopy(termBytes, start, lastTermBytes, start, length); - lastTermBytesLength = termBytesLength; + lastTerm.copy(term); } /** Called to complete TermInfos creation. 
*/ diff --git a/lucene/src/test/org/apache/lucene/search/BaseTestRangeFilter.java b/lucene/src/test/org/apache/lucene/search/BaseTestRangeFilter.java index 7f5289c9b56..5da509a1c63 100644 --- a/lucene/src/test/org/apache/lucene/search/BaseTestRangeFilter.java +++ b/lucene/src/test/org/apache/lucene/search/BaseTestRangeFilter.java @@ -25,8 +25,6 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.store.RAMDirectory; @@ -104,7 +102,7 @@ public class BaseTestRangeFilter extends LuceneTestCase { private IndexReader build(Random random, TestIndex index) throws IOException { /* build an index */ RandomIndexWriter writer = new RandomIndexWriter(random, index.index, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()) + newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer()) .setOpenMode(OpenMode.CREATE)); for (int d = minId; d <= maxId; d++) { diff --git a/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java b/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java index 7f82af1f394..c2a323ed4dd 100644 --- a/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestAutomatonQuery.java @@ -20,11 +20,9 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Random; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; @@ -46,8 +44,7 @@ public class TestAutomatonQuery extends LuceneTestCase { super.setUp(); Random random = newRandom(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, directory); Document doc = new Document(); Field titleField = new Field("title", "some title", Field.Store.NO, Field.Index.ANALYZED); diff --git a/lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java b/lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java index a5e2c434ec2..cd604e8bad4 100644 --- a/lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java +++ b/lucene/src/test/org/apache/lucene/search/TestAutomatonQueryUnicode.java @@ -20,11 +20,9 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Random; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; @@ -49,8 +47,7 @@ public class TestAutomatonQueryUnicode extends LuceneTestCase { super.setUp(); Random random = newRandom(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, - 
new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, directory); Document doc = new Document(); Field titleField = new Field("title", "some title", Field.Store.NO, Field.Index.ANALYZED); diff --git a/lucene/src/test/org/apache/lucene/search/TestBoolean2.java b/lucene/src/test/org/apache/lucene/search/TestBoolean2.java index 705a5cab741..8ded5b019aa 100644 --- a/lucene/src/test/org/apache/lucene/search/TestBoolean2.java +++ b/lucene/src/test/org/apache/lucene/search/TestBoolean2.java @@ -54,7 +54,7 @@ public class TestBoolean2 extends LuceneTestCase { super.setUp(); rnd = newRandom(); RAMDirectory directory = new RAMDirectory(); - RandomIndexWriter writer= new RandomIndexWriter(rnd, directory, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer= new RandomIndexWriter(rnd, directory); for (int i = 0; i < docFields.length; i++) { Document doc = new Document(); doc.add(new Field(field, docFields[i], Field.Store.NO, Field.Index.ANALYZED)); @@ -71,14 +71,14 @@ public class TestBoolean2 extends LuceneTestCase { int docCount = 0; do { final Directory copy = new RAMDirectory(dir2); - RandomIndexWriter w = new RandomIndexWriter(rnd, dir2, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter w = new RandomIndexWriter(rnd, dir2); w.addIndexes(new Directory[] {copy}); docCount = w.maxDoc(); w.close(); mulFactor *= 2; } while(docCount < 3000); - RandomIndexWriter w = new RandomIndexWriter(rnd, dir2, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter w = new RandomIndexWriter(rnd, dir2); Document doc = new Document(); doc.add(new Field("field2", "xxx", Field.Store.NO, Field.Index.ANALYZED)); for(int i=0;i> docs = new ArrayList>(); Document d = new Document(); Field f = new Field("f", "", Field.Store.NO, Field.Index.ANALYZED); diff --git a/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java b/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java index 27a7fa68ac6..9a632860efa 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java +++ b/lucene/src/test/org/apache/lucene/search/TestPositionIncrement.java @@ -35,7 +35,6 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.QueryParser; @@ -91,8 +90,7 @@ public class TestPositionIncrement extends LuceneTestCase { } }; Directory store = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, - new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, analyzer); Document d = new Document(); d.add(new Field("field", "bogus", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(d); @@ -242,8 +240,7 @@ public class TestPositionIncrement extends LuceneTestCase { public void testPayloadsPos0() throws Exception { Directory dir = new MockRAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new TestPayloadAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, new TestPayloadAnalyzer()); Document doc = new Document(); 
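All of these search-test conversions rely on the convenience constructors added to RandomIndexWriter earlier in this patch. A minimal sketch of what the shorthand expands to, using only names introduced above (both writers draw their own random configuration):

    import java.util.Random;
    import org.apache.lucene.analysis.MockAnalyzer;
    import org.apache.lucene.index.RandomIndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.MockRAMDirectory;
    import org.apache.lucene.util.LuceneTestCaseJ4;

    class RandomIndexWriterSketch {
      static void sketch(Random r) throws Exception {
        Directory dir = new MockRAMDirectory();
        // short form: random config with TEST_VERSION_CURRENT and a MockAnalyzer
        RandomIndexWriter w1 = new RandomIndexWriter(r, dir);
        // equivalent long form, spelled out:
        RandomIndexWriter w2 = new RandomIndexWriter(r, dir,
            LuceneTestCaseJ4.newIndexWriterConfig(r, LuceneTestCaseJ4.TEST_VERSION_CURRENT,
                new MockAnalyzer()));
        w1.close();
        w2.close();
      }
    }
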
doc.add(new Field("content", new StringReader( "a a b c d e a f g h i j a b k k"))); diff --git a/lucene/src/test/org/apache/lucene/search/TestPrefixFilter.java b/lucene/src/test/org/apache/lucene/search/TestPrefixFilter.java index ab61ae5feca..4ce3b76ea07 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPrefixFilter.java +++ b/lucene/src/test/org/apache/lucene/search/TestPrefixFilter.java @@ -20,10 +20,8 @@ package org.apache.lucene.search; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -39,8 +37,7 @@ public class TestPrefixFilter extends LuceneTestCase { "/Computers/Mac/One", "/Computers/Mac/Two", "/Computers/Windows"}; - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); for (int i = 0; i < categories.length; i++) { Document doc = new Document(); doc.add(new Field("category", categories[i], Field.Store.YES, Field.Index.NOT_ANALYZED)); diff --git a/lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java b/lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java index 9d3a8caa742..ccc304301a8 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestPrefixInBooleanQuery.java @@ -18,11 +18,9 @@ package org.apache.lucene.search; */ import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; @@ -48,8 +46,7 @@ public class TestPrefixInBooleanQuery extends LuceneTestCase { protected void setUp() throws Exception { super.setUp(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); for (int i = 0; i < 5137; ++i) { Document doc = new Document(); diff --git a/lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java b/lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java index d9f98e4f07b..595f9a6b2d3 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestPrefixQuery.java @@ -20,10 +20,8 @@ package org.apache.lucene.search; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -38,8 +36,7 @@ public class TestPrefixQuery extends LuceneTestCase { String[] categories = new String[] {"/Computers", "/Computers/Mac", 
"/Computers/Windows"}; - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); for (int i = 0; i < categories.length; i++) { Document doc = new Document(); doc.add(new Field("category", categories[i], Field.Store.YES, Field.Index.NOT_ANALYZED)); diff --git a/lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java b/lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java index 7c977995906..5deac12cd1b 100644 --- a/lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java +++ b/lucene/src/test/org/apache/lucene/search/TestPrefixRandom.java @@ -25,7 +25,6 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.RandomIndexWriter; @@ -51,8 +50,7 @@ public class TestPrefixRandom extends LuceneTestCase { random = newRandom(); dir = new MockRAMDirectory(); // TODO: fix mocktokenizer to not extend chartokenizer, so you can have an 'empty' keyword. - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.KEYWORD, false))); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false)); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); diff --git a/lucene/src/test/org/apache/lucene/search/TestQueryWrapperFilter.java b/lucene/src/test/org/apache/lucene/search/TestQueryWrapperFilter.java index 93e0329f817..505106209e0 100644 --- a/lucene/src/test/org/apache/lucene/search/TestQueryWrapperFilter.java +++ b/lucene/src/test/org/apache/lucene/search/TestQueryWrapperFilter.java @@ -17,13 +17,11 @@ package org.apache.lucene.search; * limitations under the License. 
*/ -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; @@ -35,8 +33,7 @@ public class TestQueryWrapperFilter extends LuceneTestCase { public void testBasic() throws Exception { Directory dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), dir); Document doc = new Document(); doc.add(new Field("field", "value", Store.NO, Index.ANALYZED)); writer.addDocument(doc); diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java b/lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java index 3c928e90de2..c08b989dace 100644 --- a/lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpQuery.java @@ -20,11 +20,9 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Arrays; -import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; @@ -48,8 +46,7 @@ public class TestRegexpQuery extends LuceneTestCase { public void setUp() throws Exception { super.setUp(); directory = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(newRandom(), directory); Document doc = new Document(); doc.add(new Field(FN, "the quick brown fox jumps over the lazy ??? 
dog 493432 49344", diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java index 9d0ba6ce70c..f29751291ff 100644 --- a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom.java @@ -51,8 +51,7 @@ public class TestRegexpRandom extends LuceneTestCase { super.setUp(); random = newRandom(); dir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, dir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, dir); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); diff --git a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java index 7ddca91d521..1ec7cb16563 100644 --- a/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java +++ b/lucene/src/test/org/apache/lucene/search/TestRegexpRandom2.java @@ -19,13 +19,15 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.Random; +import java.util.Collections; +import java.util.List; +import java.util.ArrayList; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.RandomIndexWriter; @@ -58,17 +60,28 @@ public class TestRegexpRandom2 extends LuceneTestCase { // TODO: fix mocktokenizer to not extend chartokenizer, so you can have an 'empty' keyword. 
// currently, this means 'empty tokens' arent created/tested in the enumeration: // it's like having a big hairy scary monster in the basement but being upset that it doesn't have fangs - RandomIndexWriter writer = new RandomIndexWriter(random, dir, new IndexWriterConfig(TEST_VERSION_CURRENT, - new MockAnalyzer(MockTokenizer.KEYWORD, false))); + RandomIndexWriter writer = new RandomIndexWriter(random, dir, new MockAnalyzer(MockTokenizer.KEYWORD, false)); Document doc = new Document(); Field field = new Field("field", "", Field.Store.NO, Field.Index.ANALYZED); doc.add(field); - + List terms = new ArrayList(); for (int i = 0; i < 2000*_TestUtil.getRandomMultiplier(); i++) { - field.setValue(_TestUtil.randomUnicodeString(random)); + String s = _TestUtil.randomUnicodeString(random); + field.setValue(s); + terms.add(s); writer.addDocument(doc); } + + if (VERBOSE) { + // utf16 order + Collections.sort(terms); + System.out.println("UTF16 order:"); + for(String s : terms) { + System.out.println(" " + UnicodeUtil.toHexString(s)); + } + } + reader = writer.getReader(); searcher = new IndexSearcher(reader); writer.close(); @@ -122,8 +135,11 @@ public class TestRegexpRandom2 extends LuceneTestCase { /** test a bunch of random regular expressions */ public void testRegexps() throws Exception { - for (int i = 0; i < 1000*_TestUtil.getRandomMultiplier(); i++) - assertSame(AutomatonTestUtil.randomRegexp(random).toString()); + + for (int i = 0; i < 1000*_TestUtil.getRandomMultiplier(); i++) { + String reg = AutomatonTestUtil.randomRegexp(random).toString(); + assertSame(reg); + } } /** check that the # of hits is the same as from a very diff --git a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java index 4dc7270eb26..7a8feec10c1 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSimilarity.java +++ b/lucene/src/test/org/apache/lucene/search/TestSimilarity.java @@ -20,6 +20,7 @@ package org.apache.lucene.search; import org.apache.lucene.util.LuceneTestCase; import java.io.IOException; import java.util.Collection; +import java.util.Random; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriterConfig; @@ -65,8 +66,9 @@ public class TestSimilarity extends LuceneTestCase { public void testSimilarity() throws Exception { RAMDirectory store = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(newRandom(), store, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()) + Random random = newRandom(); + RandomIndexWriter writer = new RandomIndexWriter(random, store, + newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer()) .setSimilarity(new SimpleSimilarity())); Document d1 = new Document(); diff --git a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java index 5700fae0c02..f294bf3239d 100755 --- a/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java +++ b/lucene/src/test/org/apache/lucene/search/TestSloppyPhraseQuery.java @@ -25,8 +25,6 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; @@ -128,8 +126,7 @@ 
public class TestSloppyPhraseQuery extends LuceneTestCase { query.setSlop(slop); RAMDirectory ramDir = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, ramDir, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.WHITESPACE, false))); + RandomIndexWriter writer = new RandomIndexWriter(random, ramDir, new MockAnalyzer(MockTokenizer.WHITESPACE, false)); writer.addDocument(doc); IndexReader reader = writer.getReader(); diff --git a/lucene/src/test/org/apache/lucene/search/TestSort.java b/lucene/src/test/org/apache/lucene/search/TestSort.java index b1102899c1f..7ca2b8da029 100644 --- a/lucene/src/test/org/apache/lucene/search/TestSort.java +++ b/lucene/src/test/org/apache/lucene/search/TestSort.java @@ -112,8 +112,7 @@ public class TestSort extends LuceneTestCase implements Serializable { private Searcher getIndex (boolean even, boolean odd) throws IOException { RAMDirectory indexStore = new RAMDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, indexStore, - new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + RandomIndexWriter writer = new RandomIndexWriter(random, indexStore); for (int i=0; i,Object> checkedClasses = Collections.synchronizedMap(new WeakHashMap,Object>()); + // saves default codec: we do this statically as many build indexes in @beforeClass + private static String savedDefaultCodec; + private static String codec; + private static Codec preFlexSav; + + // returns current PreFlex codec + public static Codec installPreFlexRW() { + final Codec preFlex = CodecProvider.getDefault().lookup("PreFlex"); + if (preFlex != null) { + CodecProvider.getDefault().unregister(preFlex); + } + CodecProvider.getDefault().register(new PreFlexRWCodec()); + return preFlex; + } + + // returns current PreFlex codec + public static void restorePreFlex(Codec preFlex) { + Codec preFlexRW = CodecProvider.getDefault().lookup("PreFlex"); + if (preFlexRW != null) { + CodecProvider.getDefault().unregister(preFlexRW); + } + CodecProvider.getDefault().register(preFlex); + } + + @BeforeClass + public static void beforeClassLuceneTestCaseJ4() { + savedDefaultCodec = CodecProvider.getDefaultCodec(); + codec = _TestUtil.getTestCodec(); + if (codec.equals("random")) + codec = CodecProvider.CORE_CODECS[seedRnd.nextInt(CodecProvider.CORE_CODECS.length)]; + + // If we're running w/ PreFlex codec we must swap in the + // test-only PreFlexRW codec (since core PreFlex can + // only read segments): + if (codec.equals("PreFlex")) { + preFlexSav = installPreFlexRW(); + } + + CodecProvider.setDefaultCodec(codec); + } + + @AfterClass + public static void afterClassLuceneTestCaseJ4() { + // Restore read-only PreFlex codec: + if (codec.equals("PreFlex")) { + restorePreFlex(preFlexSav); + } + CodecProvider.setDefaultCodec(savedDefaultCodec); + } + // This is how we get control when errors occur. // Think of this as start/end/success/failed // events. 
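Because the hunk above makes the default codec random per test class, any test that depends on a particular postings format now pins one explicitly. The pattern, repeated throughout this patch (TestFlex, TestIndexReader, TestSegmentTermEnum, MemoryIndexTest), looks like this sketch:

    import org.apache.lucene.analysis.MockAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.util.Version;
    import org.apache.lucene.util._TestUtil;

    class PinnedCodecSketch {
      // Writes with "Standard" regardless of the randomly installed default;
      // reads still resolve through the default CodecProvider.
      static IndexWriter openStandardWriter(Directory dir, Version matchVersion) throws Exception {
        return new IndexWriter(dir,
            new IndexWriterConfig(matchVersion, new MockAnalyzer())
                .setCodecProvider(_TestUtil.alwaysCodec("Standard")));
      }
    }
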
@@ -372,6 +432,34 @@ public class LuceneTestCaseJ4 { return new Random(seed); } + /** create a new index writer config with random defaults */ + public static IndexWriterConfig newIndexWriterConfig(Random r, Version v, Analyzer a) { + IndexWriterConfig c = new IndexWriterConfig(v, a); + if (r.nextBoolean()) { + c.setMergePolicy(new LogDocMergePolicy()); + } + if (r.nextBoolean()) { + c.setMergeScheduler(new SerialMergeScheduler()); + } + if (r.nextBoolean()) { + c.setMaxBufferedDocs(_TestUtil.nextInt(r, 2, 1000)); + } + if (r.nextBoolean()) { + c.setTermIndexInterval(_TestUtil.nextInt(r, 1, 1000)); + } + + if (c.getMergePolicy() instanceof LogMergePolicy) { + LogMergePolicy logmp = (LogMergePolicy) c.getMergePolicy(); + logmp.setUseCompoundDocStore(r.nextBoolean()); + logmp.setUseCompoundFile(r.nextBoolean()); + logmp.setCalibrateSizeByDeletes(r.nextBoolean()); + logmp.setMergeFactor(_TestUtil.nextInt(r, 2, 20)); + } + + c.setReaderPooling(r.nextBoolean()); + return c; + } + public String getName() { return this.name; } @@ -395,6 +483,10 @@ public class LuceneTestCaseJ4 { System.out.println("NOTE: random static seed of testclass '" + getName() + "' was: " + staticSeed); } + if (_TestUtil.getTestCodec().equals("random")) { + System.out.println("NOTE: random codec of testcase '" + getName() + "' was: " + codec); + } + if (seed != null) { System.out.println("NOTE: random seed of testcase '" + getName() + "' was: " + seed); } @@ -407,5 +499,4 @@ public class LuceneTestCaseJ4 { private static final Random seedRnd = new Random(); private String name = ""; - } diff --git a/lucene/src/test/org/apache/lucene/util/_TestUtil.java b/lucene/src/test/org/apache/lucene/util/_TestUtil.java index 4f0ab7705b7..36c6af946b0 100644 --- a/lucene/src/test/org/apache/lucene/util/_TestUtil.java +++ b/lucene/src/test/org/apache/lucene/util/_TestUtil.java @@ -23,6 +23,9 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.CheckIndex; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.Directory; import java.io.ByteArrayOutputStream; import java.io.PrintStream; @@ -130,7 +133,7 @@ public class _TestUtil { final char[] buffer = new char[end]; for (int i = 0; i < end; i++) { int t = r.nextInt(5); - //buffer[i] = (char) (97 + r.nextInt(26)); + if (0 == t && i < end - 1) { // Make a surrogate pair // High surrogate @@ -218,4 +221,39 @@ public class _TestUtil { public static int getRandomMultiplier() { return Integer.parseInt(System.getProperty("random.multiplier", "1")); } + + /** gets the codec to run tests with */ + public static String getTestCodec() { + // by default we randomly pick a different codec for + // each test case (non-J4 tests) and each test class (J4 + // tests) + return System.getProperty("tests.codec", "random"); + } + + public static CodecProvider alwaysCodec(final Codec c) { + return new CodecProvider() { + @Override + public Codec getWriter(SegmentWriteState state) { + return c; + } + + @Override + public Codec lookup(String name) { + // can't do this until we fix PreFlexRW to not + //impersonate PreFlex: + if (name.equals(c.name)) { + return c; + } else { + return CodecProvider.getDefault().lookup(name); + } + } + }; + } + + /** Return a CodecProvider that can read any of the + * default codecs, but always writes in the specified + * codec. 
*/ + public static CodecProvider alwaysCodec(final String codec) { + return alwaysCodec(CodecProvider.getDefault().lookup(codec)); + } } diff --git a/solr/build.xml b/solr/build.xml index fb5d68a3c98..d2db702b8bd 100644 --- a/solr/build.xml +++ b/solr/build.xml @@ -435,6 +435,7 @@ > + diff --git a/solr/common-build.xml b/solr/common-build.xml index 805bb769cec..84181463521 100644 --- a/solr/common-build.xml +++ b/solr/common-build.xml @@ -44,6 +44,8 @@ + +
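The build changes close the loop: the new tests.codec system property (passed through by the test targets above and read by _TestUtil.getTestCodec()) selects the codec for a whole run, e.g. ant test -Dtests.codec=PreFlex, and defaults to "random". Roughly, the per-class selection in LuceneTestCaseJ4 then behaves like this sketch (names as in the patch; seedRnd and installPreFlexRW are the members shown earlier):

    // one codec per test class:
    String codec = System.getProperty("tests.codec", "random"); // _TestUtil.getTestCodec()
    if (codec.equals("random")) {
      codec = CodecProvider.CORE_CODECS[seedRnd.nextInt(CodecProvider.CORE_CODECS.length)];
    }
    if (codec.equals("PreFlex")) {
      // core PreFlex can only read old-format segments, so the test-only
      // PreFlexRW codec is swapped in to write them:
      installPreFlexRW();
    }
    CodecProvider.setDefaultCodec(codec);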