From af4a9deb16640733a0a031a5b5c27e9d1402dc01 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 23 Jul 2010 15:26:25 +0000 Subject: [PATCH] commit my current state git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/preflexfixes@967130 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/common-build.xml | 3 + .../lucene/index/memory/MemoryIndexTest.java | 5 +- .../apache/lucene/index/DocumentsWriter.java | 2 +- .../apache/lucene/index/MultiTermsEnum.java | 2 +- .../lucene/index/codecs/CodecProvider.java | 6 +- .../lucene/index/codecs/FieldsConsumer.java | 3 +- .../index/codecs/preflex/PreFlexCodec.java | 10 +- .../index/codecs/preflex/PreFlexFields.java | 848 +++++++++++++----- .../index/codecs/preflex/SegmentTermEnum.java | 8 +- .../index/codecs/preflex/TermBuffer.java | 89 +- .../lucene/index/codecs/preflex/TermInfo.java | 20 +- .../index/codecs/preflex/TermInfosReader.java | 25 +- .../apache/lucene/search/FieldCacheImpl.java | 1 + .../java/org/apache/lucene/util/BytesRef.java | 10 +- .../lucene/index/RandomIndexWriter.java | 20 +- .../apache/lucene/index/TestAddIndexes.java | 2 - .../org/apache/lucene/index/TestCodecs.java | 32 +- .../org/apache/lucene/index/TestFlex.java | 5 +- .../apache/lucene/index/TestIndexReader.java | 6 +- .../apache/lucene/index/TestIndexWriter.java | 11 +- .../lucene/index/TestIndexWriterDelete.java | 7 +- .../apache/lucene/index/TestMultiFields.java | 19 +- .../lucene/index/TestSegmentTermEnum.java | 3 +- .../index/codecs/preflex/TestSurrogates.java | 360 +++++--- .../codecs/preflexrw/PreFlexFieldsWriter.java | 212 +++++ .../codecs/preflexrw/PreFlexRWCodec.java | 52 ++ .../TermInfosWriter.java | 60 +- .../apache/lucene/util/LuceneTestCase.java | 15 +- .../apache/lucene/util/LuceneTestCaseJ4.java | 23 + .../org/apache/lucene/util/_TestUtil.java | 42 + 30 files changed, 1386 insertions(+), 515 deletions(-) create mode 100644 lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java create mode 100644 lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java rename lucene/src/test/org/apache/lucene/index/codecs/{preflex => preflexrw}/TermInfosWriter.java (80%) diff --git a/lucene/common-build.xml b/lucene/common-build.xml index 363313421bc..9661d103a03 100644 --- a/lucene/common-build.xml +++ b/lucene/common-build.xml @@ -56,6 +56,7 @@ + @@ -434,6 +435,8 @@ + + diff --git a/lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java b/lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java index 82fa02060c5..9a0faed552b 100644 --- a/lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java +++ b/lucene/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TopDocs; @@ -107,8 +108,8 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase { RAMDirectory ramdir = new RAMDirectory(); Analyzer analyzer = randomAnalyzer(); - IndexWriter writer = new IndexWriter(ramdir, analyzer, - IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter writer = new IndexWriter(ramdir, + new IndexWriterConfig(TEST_VERSION_CURRENT, 
analyzer).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); Field field1 = new Field("foo", fooField.toString(), Field.Store.NO, Field.Index.ANALYZED); Field field2 = new Field("term", termField.toString(), Field.Store.NO, Field.Index.ANALYZED); diff --git a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java index 1a61cfc0f74..8853a33a2fc 100644 --- a/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java @@ -1095,7 +1095,7 @@ final class DocumentsWriter { continue; } assert checkDeleteTerm(term); - + if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) { DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs); diff --git a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java index 19d1f5072db..c2a68c5ba0e 100644 --- a/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/index/MultiTermsEnum.java @@ -116,7 +116,7 @@ public final class MultiTermsEnum extends TermsEnum { // different TermComps final Comparator<BytesRef> subTermComp = termsEnumIndex.termsEnum.getComparator(); if (subTermComp != null && !subTermComp.equals(termComp)) { - throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge"); + throw new IllegalStateException("sub-readers have different BytesRef.Comparators: " + subTermComp + " vs " + termComp + "; cannot merge"); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java index 71e6c8519ea..a756e82c9b2 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java @@ -47,14 +47,14 @@ public abstract class CodecProvider { private static String defaultCodec = "Standard"; - public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock"}; + public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock", "PreFlex"}; public void register(Codec codec) { if (codec.name == null) { throw new IllegalArgumentException("codec.name is null"); } - - if (!codecs.containsKey(codec.name)) { + // nocommit + if (!codecs.containsKey(codec.name) || codec.name.equals("PreFlex")) { codecs.put(codec.name, codec); codec.getExtensions(knownExtensions); } else if (codecs.get(codec.name) != codec) { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java index 0c0bcf86569..8389df02600 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/FieldsConsumer.java @@ -22,6 +22,7 @@ import org.apache.lucene.index.Fields; import org.apache.lucene.index.FieldsEnum; import java.io.IOException; +import java.io.Closeable; /** Abstract API that consumes terms, doc, freq, prox and * payloads postings.
Concrete implementations of this @@ -30,7 +31,7 @@ import java.io.IOException; * * @lucene.experimental */ -public abstract class FieldsConsumer { +public abstract class FieldsConsumer implements Closeable { /** Add a new field */ public abstract TermsConsumer addField(FieldInfo field) throws IOException; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java index 0c67b81f2d1..72424d88495 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexCodec.java @@ -40,16 +40,16 @@ import org.apache.lucene.index.codecs.FieldsProducer; public class PreFlexCodec extends Codec { /** Extension of terms file */ - static final String TERMS_EXTENSION = "tis"; + public static final String TERMS_EXTENSION = "tis"; /** Extension of terms index file */ - static final String TERMS_INDEX_EXTENSION = "tii"; + public static final String TERMS_INDEX_EXTENSION = "tii"; /** Extension of freq postings file */ - static final String FREQ_EXTENSION = "frq"; + public static final String FREQ_EXTENSION = "frq"; /** Extension of prox postings file */ - static final String PROX_EXTENSION = "prx"; + public static final String PROX_EXTENSION = "prx"; public PreFlexCodec() { name = "PreFlex"; @@ -62,7 +62,7 @@ public class PreFlexCodec extends Codec { @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor); + return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, true); } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index f5d8e74eda3..b12eb383236 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -40,12 +40,11 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.ArrayUtil; /** Exposes flex API on a pre-flex index, as a codec. * @lucene.experimental */ public class PreFlexFields extends FieldsProducer { - + private static final boolean DEBUG_SURROGATES = false; public TermInfosReader tis; @@ -59,11 +58,19 @@ public class PreFlexFields extends FieldsProducer { private final Directory dir; private final int readBufferSize; private Directory cfsReader; + private final boolean unicodeSortOrder; - PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor) + // If unicodeSortOrder is true, we do the surrogates dance + // so that the terms are sorted by unicode sort order. + // This should be true when segments are used for "normal" + // searching; it's only false during testing, to create a + // pre-flex index, using the preflexrw codec under + // src/test. 
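+ // For example: U+10400 is the UTF16 surrogate pair 0xD801 + // 0xDC00, so in UTF16 order it sorts before U+E000 (0xD801 < + // 0xE000), but in unicode code point order it sorts after it + // (0x10400 > 0xE000).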
+ public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor, boolean unicodeSortOrder) throws IOException { si = info; + this.unicodeSortOrder = unicodeSortOrder; // NOTE: we must always load terms index, even for // "sequential" scan during merging, because what is @@ -182,6 +189,12 @@ public class PreFlexFields extends FieldsProducer { if (cfsReader != null) { cfsReader.close(); } + if (freqStream != null) { + freqStream.close(); + } + if (proxStream != null) { + proxStream.close(); + } } private class PreFlexFieldsEnum extends FieldsEnum { @@ -228,7 +241,11 @@ public class PreFlexFields extends FieldsProducer { public Comparator getComparator() { // Pre-flex indexes always sorted in UTF16 order, but // we remap on-the-fly to unicode order - return BytesRef.getUTF8SortedAsUnicodeComparator(); + if (unicodeSortOrder) { + return BytesRef.getUTF8SortedAsUnicodeComparator(); + } else { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } } } @@ -238,237 +255,473 @@ public class PreFlexFields extends FieldsProducer { private boolean skipNext; private BytesRef current; - private int[] surrogateSeekPending = new int[1]; - private boolean[] surrogateDidSeekBack = new boolean[1]; - private int surrogateSeekUpto; - private char[] pendingPrefix; - private SegmentTermEnum seekTermEnum; private Term protoTerm; + + private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0; + private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee; + + // Returns true if the unicode char is "after" the + // surrogates in UTF16, ie >= U+E000 and <= U+FFFF: + private final boolean isHighBMPChar(byte[] b, int idx) { + return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD; + } + + // Returns true if the unicode char in the UTF8 byte + // sequence starting at idx encodes a char outside of + // BMP (ie what would be a surrogate pair in UTF16): + private final boolean isNonBMPChar(byte[] b, int idx) { + return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD; + } + + private final byte[] scratch = new byte[4]; + private final BytesRef prevTerm = new BytesRef(); + private final BytesRef scratchTerm = new BytesRef(); private int newSuffixStart; + // Swap in S, in place of E: + private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException { + final int savLength = term.length; + + assert term.offset == 0; + + // The 3 bytes starting at downTo make up 1 + // unicode character: + assert isHighBMPChar(term.bytes, pos); + + // nocommit -- why does this trip? 
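+ // (Below we overwrite the E's 3-byte UTF8 sequence at pos with + // 0xf0 0x90 0x80 0x80 -- the UTF8 encoding of U+10000, the + // smallest non-BMP codepoint -- so the seek lands on the first + // term, if any, that has an S at this position.)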
+ // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3); + + // Save the bytes && length, since we need to + // restore this if seek "back" finds no matching + // terms + if (term.bytes.length < 4+pos) { + term.grow(4+pos); + } + + scratch[0] = term.bytes[pos]; + scratch[1] = term.bytes[pos+1]; + scratch[2] = term.bytes[pos+2]; + + term.bytes[pos] = (byte) 0xf0; + term.bytes[pos+1] = (byte) 0x90; + term.bytes[pos+2] = (byte) 0x80; + term.bytes[pos+3] = (byte) 0x80; + term.length = 4+pos; + + if (DEBUG_SURROGATES) { + System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString())); + } + + // Seek "back": + getTermsDict().seekEnum(te, protoTerm.createTerm(term)); + + // Test if the term we seek'd to in fact found a + // surrogate pair at the same position as the E: + Term t2 = te.term(); + + // Cannot be null (or move to next field) because at + // "worst" it'd seek to the same term we are on now, + // unless we are being called from seek + if (t2 == null || t2.field() != fieldInfo.name) { + return false; + } + + if (DEBUG_SURROGATES) { + System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text())); + } + + // Now test if prefix is identical and we found + // a non-BMP char at the same position: + BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + + boolean matches; + if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) { + matches = true; + for(int i=0;i<pos;i++) { + if (term.bytes[i] != b2.bytes[i]) { + matches = false; + break; + } + } + } else { + matches = false; + } + + // Restore the E's bytes & the original length: + term.length = savLength; + term.bytes[pos] = scratch[0]; + term.bytes[pos+1] = scratch[1]; + term.bytes[pos+2] = scratch[2]; + + return matches; + } + + // Look for seek type 2 ("continue"): scan the suffix + // that was stripped from the prior term, backwards. If + // there was an E in that part, we try to seek back to + // the corresponding S; if that seek finds a matching + // term, we go there: + private boolean doContinue() throws IOException { + + if (DEBUG_SURROGATES) { + System.out.println(" try cont"); + } + + int downTo = prevTerm.length-1; + + boolean didSeek = false; + + final int limit = Math.min(newSuffixStart, scratchTerm.length-1); + + while(downTo > limit) { + + if (isHighBMPChar(prevTerm.bytes, downTo)) { + + if (DEBUG_SURROGATES) { + System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length); + } + + if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) { + // TODO: more efficient seek? + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + //newSuffixStart = downTo+4; + newSuffixStart = downTo; + scratchTerm.copy(termEnum.term().bytes()); + didSeek = true; + if (DEBUG_SURROGATES) { + System.out.println(" seek!"); + } + break; + } else { + if (DEBUG_SURROGATES) { + System.out.println(" no seek"); + } + } + } + + // Shorten prevTerm in place so that we don't redo + // this loop if we come back here: + if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) { + prevTerm.length = downTo; + } + + downTo--; + } + + return didSeek; + } + + // Look for seek type 3 ("pop"): if the delta from + // prev -> current was replacing an S with an E, + // we must now seek to beyond that E. This seek + // "finishes" the dance at this character + // position. + private boolean doPop() throws IOException { + + if (DEBUG_SURROGATES) { + System.out.println(" try pop"); + } + + assert newSuffixStart <= prevTerm.length; + assert newSuffixStart < scratchTerm.length || newSuffixStart == 0; + + if (prevTerm.length > newSuffixStart && + isNonBMPChar(prevTerm.bytes, newSuffixStart) && + isHighBMPChar(scratchTerm.bytes, newSuffixStart)) { + + // Seek type 3 -- put U+FFFF at this position: + // nocommit -- can we somehow use 0xff??? + scratchTerm.bytes[newSuffixStart] = (byte) 0xff; + //scratchTerm.bytes[newSuffixStart] = (byte) 0xef; + scratchTerm.bytes[newSuffixStart+1] = (byte) 0xbf; + scratchTerm.bytes[newSuffixStart+2] = (byte) 0xbf; + scratchTerm.length = newSuffixStart+3; + + if (DEBUG_SURROGATES) { + System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString()); + } + + // TODO: more efficient seek? can we simply swap + // the enums?
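+ // Note: 0xef 0xbf 0xbf is the UTF8 encoding of U+FFFF, the + // largest BMP codepoint; 0xff cannot occur in valid UTF8 at + // all, so as a lead byte it likewise sorts past every real + // term with this prefix: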
+ getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm)); + + final Term t2 = termEnum.term(); + + // We could hit EOF or different field since this + // was a seek "forward": + if (t2 != null && t2.field() == fieldInfo.name) { + + if (DEBUG_SURROGATES) { + System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes()); + } + + final BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + + + // Set newSuffixStart -- we can't use + // termEnum's since the above seek may have + // done no scanning (eg, term was precisely + // an index term, or was in the term seek + // cache): + scratchTerm.copy(b2); + setNewSuffixStart(prevTerm, scratchTerm); + + return true; + } else if (newSuffixStart != 0 || scratchTerm.length != 0) { + if (DEBUG_SURROGATES) { + System.out.println(" got term=null (or next field)"); + } + newSuffixStart = 0; + scratchTerm.length = 0; + return true; + } + } + + return false; + } + + // Pre-flex indices store terms in UTF16 sort order, but + // certain queries require Unicode codepoint order; this + // method carefully seeks around surrogates to handle + // this impedance mismatch + + private void surrogateDance() throws IOException { + + if (!unicodeSortOrder) { + return; + } + + // We are invoked after TIS.next() (by UTF16 order) to + // possibly seek to a different "next" (by unicode + // order) term. + + // We scan only the "delta" from the last term to the + // current term, in UTF8 bytes. We look at 1) the bytes + // stripped from the prior term, and then 2) the bytes + // appended to that prior term's prefix. + + // We don't care about specific UTF8 sequences, just + // the "category" of the UTF16 character. Category S + // is a high/low surrogate pair (ie, non-BMP). + // Category E is any BMP char > UNI_SUR_LOW_END (and < + // U+FFFF). Category A is the rest (any unicode char + // <= UNI_SUR_HIGH_START). + + // The core issue is that pre-flex indices sort the + // characters as ASE, while flex must sort as AES. So + // when scanning, when we hit S, we must 1) seek + // forward to E and enum the terms there, then 2) seek + // back to S and enum all terms there, then 3) seek to + // after E. Three different seek points (1, 2, 3). + + // We can easily detect S in UTF8: if a byte has + // prefix 11110 (0xf0), then that byte and the + // following 3 bytes encode a single unicode codepoint + // in S. Similarly, we can detect E: if a byte has + // prefix 1110111 (0xee), then that byte and the + // following 2 bytes encode a single unicode codepoint + // in E. + + // Note that this is really a recursive process -- + // maybe the char at pos 2 needs to dance, but at any + // point in its dance, suddenly pos 4 needs to dance + // so you must finish pos 4 before returning to pos + // 2. But then during pos 4's dance maybe pos 7 needs + // to dance, etc. However, despite being recursive, + // we don't need to hold any state because the state + // can always be derived by looking at prior term & + // current term.
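+ // Concretely, with A = 'a' (U+0061), S = U+10400 (UTF16 + // 0xD801 0xDC00) and E = U+E000, a pre-flex segment enums the + // one-char terms as a, U+10400, U+E000 (ASE), while flex must + // return a, U+E000, U+10400 (AES).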
+ + // TODO: can we avoid this copy? + if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { + scratchTerm.length = 0; + } else { + scratchTerm.copy(termEnum.term().bytes()); + } + + if (DEBUG_SURROGATES) { + System.out.println(" dance"); + System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString())); + System.out.println(" " + prevTerm.toString()); + System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString())); + System.out.println(" " + scratchTerm.toString()); + } + + // This code assumes TermInfosReader/SegmentTermEnum + // always use BytesRef.offset == 0 + assert prevTerm.offset == 0; + assert scratchTerm.offset == 0; + + // Need to loop here because we may need to do multiple + // pops, and possibly a continue in the end, ie: + // + // cont + // pop, cont + // pop, pop, cont + // + // + + while(true) { + if (doContinue()) { + break; + } else { + if (!doPop()) { + break; + } + } + } + + if (DEBUG_SURROGATES) { + System.out.println(" finish bmp ends"); + } + + doPushes(); + } + + + // Look for seek type 1 ("push"): if the newly added + // suffix contains any S, we must try to seek to the + // corresponding E. If we find a match, we go there; + // else we keep looking for additional S's in the new + // suffix. This "starts" the dance, at this character + // position: + private void doPushes() throws IOException { + + int upTo = newSuffixStart; + if (DEBUG_SURROGATES) { + System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length); + } + + while(upTo < scratchTerm.length) { + if (isNonBMPChar(scratchTerm.bytes, upTo) && + (upTo > newSuffixStart || + (upTo >= prevTerm.length || + (!isNonBMPChar(prevTerm.bytes, upTo) && + !isHighBMPChar(prevTerm.bytes, upTo))))) { // nocommit -- we can't cmp to prevTerm if we'd done a seek 3 or seek 2 before? + + // A non-BMP char (4 bytes UTF8) starts here: + assert scratchTerm.length >= upTo + 4; + + final int savLength = scratchTerm.length; + scratch[0] = scratchTerm.bytes[upTo]; + scratch[1] = scratchTerm.bytes[upTo+1]; + scratch[2] = scratchTerm.bytes[upTo+2]; + + scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD; + scratchTerm.bytes[upTo+1] = (byte) 0x80; + scratchTerm.bytes[upTo+2] = (byte) 0x80; + scratchTerm.length = upTo+3; + + if (DEBUG_SURROGATES) { + System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length); + } + + // Seek "forward": + // TODO: more efficient seek? + getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm)); + + scratchTerm.bytes[upTo] = scratch[0]; + scratchTerm.bytes[upTo+1] = scratch[1]; + scratchTerm.bytes[upTo+2] = scratch[2]; + scratchTerm.length = savLength; + + // Did we find a match? + final Term t2 = seekTermEnum.term(); + + if (DEBUG_SURROGATES) { + if (t2 == null) { + System.out.println(" hit term=null"); + } else { + System.out.println(" hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes())); + } + } + + // Since this was a seek "forward", we could hit + // EOF or a different field: + boolean matches; + + if (t2 != null && t2.field() == fieldInfo.name) { + final BytesRef b2 = t2.bytes(); + assert b2.offset == 0; + if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) { + matches = true; + for(int i=0;i<upTo;i++) { + if (scratchTerm.bytes[i] != b2.bytes[i]) { + matches = false; + break; + } + } + } else { + matches = false; + } + } else { + matches = false; + } + + if (matches) { + if (DEBUG_SURROGATES) { + System.out.println(" matches!"); + } + + // OK seek "back": + // TODO: more efficient seek? + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + + scratchTerm.copy(seekTermEnum.term().bytes()); + + // +3 because we don't need to check the char + // at upTo: we know it's > BMP + upTo += 3; + + // NOTE: we keep iterating, now, since this + // can easily "recurse".
Ie, after seeking + // forward at a certain char position, we may + // find another surrogate in our [new] suffix + // and must then do another seek (recurse) + } else { + upTo++; + } + } else { + upTo++; + } + } + } + void reset(FieldInfo fieldInfo) throws IOException { + //System.out.println("pff.reset te=" + termEnum); this.fieldInfo = fieldInfo; protoTerm = new Term(fieldInfo.name); if (termEnum == null) { termEnum = getTermsDict().terms(protoTerm); seekTermEnum = getTermsDict().terms(protoTerm); + //System.out.println(" term=" + termEnum.term()); } else { getTermsDict().seekEnum(termEnum, protoTerm); } skipNext = true; - - surrogateSeekUpto = 0; - newSuffixStart = 0; - surrogatesDance(); - } - - private void surrogatesDance() throws IOException { - - // Tricky: prior to 4.0, Lucene index sorted terms in - // UTF16 order, but as of 4.0 we sort by Unicode code - // point order. These orders differ because of the - // surrrogates; so we have to fixup our enum, here, by - // carefully first seeking past the surrogates and - // then back again at the end. The process is - // recursive, since any given term could have multiple - // new occurrences of surrogate pairs, so we use a - // stack to record the pending seek-backs. - if (DEBUG_SURROGATES) { - System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); - } - - while(popPendingSeek()); - while(pushNewSurrogate()); - } - - // only for debugging - private String getStack() { - if (surrogateSeekUpto == 0) { - return "null"; - } else { - StringBuffer sb = new StringBuffer(); - for(int i=0;i 0) { - sb.append(' '); - } - sb.append(surrogateSeekPending[i]); - } - sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1])); - return sb.toString(); - } - } - - private boolean popPendingSeek() throws IOException { - if (DEBUG_SURROGATES) { - System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack()); - } - // if a .next() has advanced beyond the - // after-surrogates range we had last seeked to, we - // must seek back to the start and resume .next from - // there. this pops the pending seek off the stack. 
final Term t = termEnum.term(); - if (surrogateSeekUpto > 0) { - final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1]; - if (DEBUG_SURROGATES) { - System.out.println(" seekPrefix=" + seekPrefix); - } - if (newSuffixStart < seekPrefix) { - assert pendingPrefix != null; - assert pendingPrefix.length > seekPrefix; - pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; - pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START; - Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix)); - if (DEBUG_SURROGATES) { - System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); - } - getTermsDict().seekEnum(termEnum, t2); - surrogateDidSeekBack[surrogateSeekUpto-1] = true; - - // +2 because we don't want to re-check the - // surrogates we just seek'd back to - newSuffixStart = seekPrefix + 2; - return true; - } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) { - assert pendingPrefix != null; - assert pendingPrefix.length > seekPrefix; - pendingPrefix[seekPrefix] = 0xffff; - Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix)); - if (DEBUG_SURROGATES) { - System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); - } - getTermsDict().seekEnum(termEnum, t2); - if (DEBUG_SURROGATES) { - System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); - } - surrogateSeekUpto--; - - if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { - // force pop - newSuffixStart = -1; - } else { - newSuffixStart = termEnum.newSuffixStart; - } - - return true; - } + if (t != null && t.field() == fieldInfo.name) { + newSuffixStart = 0; + prevTerm.length = 0; + surrogateDance(); } - - return false; - } - - private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result(); - private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result(); - - private boolean pushNewSurrogate() throws IOException { - if (DEBUG_SURROGATES) { - System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); - } - final Term t = termEnum.term(); - if (t == null || t.field() != fieldInfo.name) { - return false; - } - - final BytesRef bytes = t.bytes(); - UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer); - - for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { - - if (DEBUG_SURROGATES) { - System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i); - } - - // the next() that we just did read in a new - // suffix, containing a surrogate pair - - // seek forward to see if there are any terms with - // this same prefix, but with characters after the - // surrogate range; if so, we must first iterate - // them, then seek back to the surrogates - - char[] testPrefix = new char[i+2]; - for(int j=0;j=0;i--) { + if (isHighBMPChar(scratchTerm.bytes, i)) { + if (DEBUG_SURROGATES) { + System.out.println(" found E pos=" + i + "; try seek"); + } + + if (seekToNonBMP(seekTermEnum, scratchTerm, i)) { + + scratchTerm.copy(seekTermEnum.term().bytes()); + getTermsDict().seekEnum(termEnum, seekTermEnum.term()); + + newSuffixStart = 1+i; + + doPushes(); + + // Found a match + // TODO: faster seek? 
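+ // (The enum is now positioned on a real term that sorts + // after the requested term in unicode order, so we report + // NOT_FOUND rather than END.)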
+ current = termEnum.term().bytes(); + return SeekStatus.NOT_FOUND; + } + } + } + + if (DEBUG_SURROGATES) { + System.out.println(" seek END"); + } + current = null; return SeekStatus.END; } else { - current = tr; - return SeekStatus.NOT_FOUND; + + // We found a non-exact but non-null term; this one + // is fun -- just treat it like next, by pretending + // requested term was prev: + prevTerm.copy(term); + + if (DEBUG_SURROGATES) { + System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text())); + } + + final BytesRef br = t.bytes(); + assert br.offset == 0; + + setNewSuffixStart(term, br); + + surrogateDance(); + + final Term t2 = termEnum.term(); + if (t2 == null || t2.field() != fieldInfo.name) { + assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned + current = null; + return SeekStatus.END; + } else { + current = t2.bytes(); + assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString()); + return SeekStatus.NOT_FOUND; + } + } + } + + private void setNewSuffixStart(BytesRef br1, BytesRef br2) { + final int limit = Math.min(br1.length, br2.length); + int lastStart = 0; + for(int i=0;i=0;i--) { - assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); - assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); - assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + try { + assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); + assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); + assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); + } catch (UnsupportedOperationException uoe) { + } } // Seek to non-existent empty-string term diff --git a/lucene/src/test/org/apache/lucene/index/TestFlex.java b/lucene/src/test/org/apache/lucene/index/TestFlex.java index cd114a0aa99..b26538f68e3 100644 --- a/lucene/src/test/org/apache/lucene/index/TestFlex.java +++ b/lucene/src/test/org/apache/lucene/index/TestFlex.java @@ -20,6 +20,8 @@ package org.apache.lucene.index; import java.io.*; import java.util.*; import org.apache.lucene.store.*; +import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.standard.*; import org.apache.lucene.search.*; import org.apache.lucene.analysis.*; import org.apache.lucene.document.*; @@ -64,7 +66,8 @@ public class TestFlex extends LuceneTestCase { public void testTermOrd() throws Exception { Directory d = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(d, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + IndexWriter w = new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED)); w.addDocument(doc); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java index 29641b0910a..c8410e23b1d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexReader.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexReader.java @@ -1675,7 +1675,7 @@ public class TestIndexReader extends LuceneTestCase // 
LUCENE-1586: getUniqueTermCount public void testUniqueTermCount() throws Exception { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); @@ -1708,7 +1708,7 @@ public class TestIndexReader extends LuceneTestCase // LUCENE-1609: don't load terms index public void testNoTermsIndex() throws Throwable { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document doc = new Document(); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); @@ -1725,7 +1725,7 @@ public class TestIndexReader extends LuceneTestCase } assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor()); - writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); writer.addDocument(doc); writer.close(); diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java index bda9f28a0d8..9fede4462bf 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4559,7 +4559,7 @@ public class TestIndexWriter extends LuceneTestCase { dir.close(); } - // LUCENE-2095: make sure with multiple threads commit + // LUCENE-2095: make sure with multiple threads commit // doesn't return until all changes are in fact in the // index public void testCommitThreadSafety() throws Throwable { @@ -4673,7 +4673,9 @@ public class TestIndexWriter extends LuceneTestCase { // sort in UTF16 sort order by default public void testTermUTF16SortOrder() throws Throwable { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); + // nocommit -- allow preflexrw but must force preflex + // for reading + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); Document d = new Document(); // Single segment Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED); @@ -4682,7 +4684,7 @@ public class TestIndexWriter extends LuceneTestCase { Random rnd = newRandom(); final Set allTerms = new HashSet(); - for(int i=0;i<200*_TestUtil.getRandomMultiplier();i++) { + for(int i=0;i<10*_TestUtil.getRandomMultiplier();i++) { final String s; if (rnd.nextBoolean()) { @@ -4705,14 +4707,13 @@ public class TestIndexWriter extends LuceneTestCase { allTerms.add(s); f.setValue(s); - 
//System.out.println("add " + termDesc(s)); writer.addDocument(d); if ((1+i) % 42 == 0) { writer.commit(); } } - + IndexReader r = writer.getReader(); // Test each sub-segment diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java index be19393b3cd..27dfd8982f4 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriterDelete.java @@ -394,18 +394,18 @@ public class TestIndexWriterDelete extends LuceneTestCase { } public void testDeletesOnDiskFull() throws IOException { - testOperationsOnDiskFull(false); + doTestOperationsOnDiskFull(false); } public void testUpdatesOnDiskFull() throws IOException { - testOperationsOnDiskFull(true); + doTestOperationsOnDiskFull(true); } /** * Make sure if modifier tries to commit but hits disk full that modifier * remains consistent and usable. Similar to TestIndexReader.testDiskFull(). */ - private void testOperationsOnDiskFull(boolean updates) throws IOException { + private void doTestOperationsOnDiskFull(boolean updates) throws IOException { Term searchTerm = new Term("content", "aaa"); int START_COUNT = 157; @@ -700,6 +700,7 @@ public class TestIndexWriterDelete extends LuceneTestCase { try { modifier.commit(); } catch (IOException ioe) { + // expected failed = true; } diff --git a/lucene/src/test/org/apache/lucene/index/TestMultiFields.java b/lucene/src/test/org/apache/lucene/index/TestMultiFields.java index cf62dfc3e62..e3fd00b8cb2 100644 --- a/lucene/src/test/org/apache/lucene/index/TestMultiFields.java +++ b/lucene/src/test/org/apache/lucene/index/TestMultiFields.java @@ -27,11 +27,12 @@ public class TestMultiFields extends LuceneTestCase { public void testRandom() throws Exception { + Random r = newRandom(); + for(int iter=0;iter<2*_TestUtil.getRandomMultiplier();iter++) { Directory dir = new MockRAMDirectory(); - IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); - Random r = new Random(); + IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES)); Map> docs = new HashMap>(); Set deleted = new HashSet(); @@ -45,7 +46,7 @@ public class TestMultiFields extends LuceneTestCase { doc.add(id); boolean onlyUniqueTerms = r.nextBoolean(); - + Set uniqueTerms = new HashSet(); for(int i=0;i 0) { @@ -61,6 +62,7 @@ public class TestMultiFields extends LuceneTestCase { } docs.get(term).add(i); terms.add(term); + uniqueTerms.add(term); f.setValue(s); } id.setValue(""+i); @@ -75,8 +77,19 @@ public class TestMultiFields extends LuceneTestCase { } } + if (VERBOSE) { + List termsList = new ArrayList(uniqueTerms); + Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator()); + System.out.println("UTF16 order:"); + for(BytesRef b : termsList) { + System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString())); + } + } + + // nocommit IndexReader reader = w.getReader(); w.close(); + //System.out.println("TEST reader=" + reader); Bits delDocs = MultiFields.getDeletedDocs(reader); for(int delDoc : deleted) { diff --git a/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java b/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java index 48a5efdd3d8..07b5cff684d 100644 --- a/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java +++ 
b/lucene/src/test/org/apache/lucene/index/TestSegmentTermEnum.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util._TestUtil; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -64,7 +65,7 @@ public class TestSegmentTermEnum extends LuceneTestCase { public void testPrevTermAtEnd() throws IOException { Directory dir = new MockRAMDirectory(); - IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard"))); addDoc(writer, "aaa bbb"); writer.close(); SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java index c50478b3664..4990685ecb8 100644 --- a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java +++ b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java @@ -18,8 +18,11 @@ package org.apache.lucene.index.codecs.preflex; */ import org.apache.lucene.store.*; +import org.apache.lucene.document.*; +import org.apache.lucene.analysis.*; import org.apache.lucene.index.*; import org.apache.lucene.index.codecs.*; +import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec; import org.apache.lucene.util.*; import java.util.*; @@ -30,8 +33,6 @@ import org.junit.Test; public class TestSurrogates extends LuceneTestCaseJ4 { - // chooses from a very limited alphabet to exacerbate the - // surrogate seeking required private static String makeDifficultRandomUnicodeString(Random r) { final int end = r.nextInt(20); if (end == 0) { @@ -44,154 +45,295 @@ public class TestSurrogates extends LuceneTestCaseJ4 { if (0 == t && i < end - 1) { // hi - buffer[i++] = (char) 0xd800; + buffer[i++] = (char) (0xd800 + r.nextInt(2)); // lo - buffer[i] = (char) 0xdc00; + buffer[i] = (char) (0xdc00 + r.nextInt(2)); } else if (t <= 3) { - buffer[i] = 'a'; + buffer[i] = (char) ('a' + r.nextInt(2)); } else if (4 == t) { - buffer[i] = 0xe000; + buffer[i] = (char) (0xe000 + r.nextInt(2)); } } return new String(buffer, 0, end); } - private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException { + private String toHexString(Term t) { + return t.field() + ":" + UnicodeUtil.toHexString(t.text()); + } + + private String getRandomString(Random r) { + String s; + if (r.nextInt(3) == 1) { + s = makeDifficultRandomUnicodeString(r); + } else { + s = _TestUtil.randomUnicodeString(r); + } + return s; + } + + private static class SortTermAsUTF16Comparator implements Comparator { + public int compare(Term o1, Term o2) { + return o1.compareToUTF16(o2); + } + } + + private static final SortTermAsUTF16Comparator termAsUTF16Comparator = new SortTermAsUTF16Comparator(); + + // single straight enum + private void doTestStraightEnum(List fieldTerms, IndexReader reader, int uniqueTermCount) throws IOException { + + if (VERBOSE) { + System.out.println("\nTEST: top now enum reader=" + reader); + } + FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator(); + + { + // Test straight enum: + String field; + int termCount = 0; + while((field = 
fieldsEnum.next()) != null) { + TermsEnum termsEnum = fieldsEnum.terms(); + BytesRef text; + BytesRef lastText = null; + while((text = termsEnum.next()) != null) { + Term exp = fieldTerms.get(termCount); + if (VERBOSE) { + System.out.println(" got term=" + field + ":" + UnicodeUtil.toHexString(text.utf8ToString())); + System.out.println(" exp=" + exp.field() + ":" + UnicodeUtil.toHexString(exp.text().toString())); + System.out.println(); + } + if (lastText == null) { + lastText = new BytesRef(text); + } else { + assertTrue(lastText.compareTo(text) < 0); + lastText.copy(text); + } + assertEquals(exp.field(), field); + assertEquals(exp.bytes(), text); + termCount++; + } + if (VERBOSE) { + System.out.println(" no more terms for field=" + field); + } + } + assertEquals(uniqueTermCount, termCount); + } + } + + // randomly seeks to term that we know exists, then next's + // from there + private void doTestSeekExists(Random r, List fieldTerms, IndexReader reader) throws IOException { + + final Map tes = new HashMap(); + + // Test random seek to existing term, then enum: + if (VERBOSE) { + System.out.println("\nTEST: top now seek"); + } + + for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) { + + // pick random field+term + int spot = r.nextInt(fieldTerms.size()); + Term term = fieldTerms.get(spot); + String field = term.field(); + + if (VERBOSE) { + System.out.println("TEST: exist seek field=" + field + " term=" + UnicodeUtil.toHexString(term.text())); + } + + // seek to it + TermsEnum te = tes.get(field); + if (te == null) { + te = MultiFields.getTerms(reader, field).iterator(); + tes.put(field, te); + } + + if (VERBOSE) { + System.out.println(" done get enum"); + } + + // seek should find the term + assertEquals(TermsEnum.SeekStatus.FOUND, + te.seek(term.bytes())); + + // now .next() this many times: + int ct = _TestUtil.nextInt(r, 5, 100); + for(int i=0;i= fieldTerms.size()) { + break; + } + term = fieldTerms.get(1+spot+i); + if (term.field() != field) { + assertNull(te.next()); + break; + } else { + BytesRef t = te.next(); + + if (VERBOSE) { + System.out.println(" got term=" + (t == null ? 
null : UnicodeUtil.toHexString(t.utf8ToString()))); + System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString())); + } + + assertEquals(term.bytes(), t); + } + } + } + } + + private void doTestSeekDoesNotExist(Random r, int numField, List fieldTerms, Term[] fieldTermsArray, IndexReader reader) throws IOException { + + final Map tes = new HashMap(); + + if (VERBOSE) { + System.out.println("TEST: top random seeks"); + } + + { + for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) { + + // seek to random spot + String field = ("f" + r.nextInt(numField)).intern(); + Term tx = new Term(field, getRandomString(r)); + + int spot = Arrays.binarySearch(fieldTermsArray, tx); + + if (spot < 0) { + if (VERBOSE) { + System.out.println("TEST: non-exist seek to " + field + ":" + UnicodeUtil.toHexString(tx.text())); + } + + // term does not exist: + TermsEnum te = tes.get(field); + if (te == null) { + te = MultiFields.getTerms(reader, field).iterator(); + tes.put(field, te); + } + + if (VERBOSE) { + System.out.println(" got enum"); + } + + spot = -spot - 1; + + if (spot == fieldTerms.size() || fieldTerms.get(spot).field() != field) { + assertEquals(TermsEnum.SeekStatus.END, te.seek(tx.bytes())); + } else { + assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(tx.bytes())); + + if (VERBOSE) { + System.out.println(" got term=" + UnicodeUtil.toHexString(te.term().utf8ToString())); + System.out.println(" exp term=" + UnicodeUtil.toHexString(fieldTerms.get(spot).text())); + } + + assertEquals(fieldTerms.get(spot).bytes(), + te.term()); + + // now .next() this many times: + int ct = _TestUtil.nextInt(r, 5, 100); + for(int i=0;i= fieldTerms.size()) { + break; + } + Term term = fieldTerms.get(1+spot+i); + if (term.field() != field) { + assertNull(te.next()); + break; + } else { + BytesRef t = te.next(); + + if (VERBOSE) { + System.out.println(" got term=" + (t == null ? 
null : UnicodeUtil.toHexString(t.utf8ToString()))); + System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString())); + } + + assertEquals(term.bytes(), t); + } + } + + } + } + } + } + } + + + @Test + public void testSurrogatesOrder() throws Exception { + Random r = newRandom(); + + Directory dir = new MockRAMDirectory(); + RandomIndexWriter w = new RandomIndexWriter(r, + dir, + new IndexWriterConfig(TEST_VERSION_CURRENT, + new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec()))); final int numField = _TestUtil.nextInt(r, 2, 5); - List terms = new ArrayList(); + int uniqueTermCount = 0; int tc = 0; + List fieldTerms = new ArrayList(); + for(int f=0;f uniqueTerms = new HashSet(); + for(int i=0;i() { - public int compare(Term o1, Term o2) { - return o1.compareToUTF16(o2); - } - }); - - TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128); - TermInfo ti = new TermInfo(); - String lastText = null; - int uniqueTermCount = 0; if (VERBOSE) { - System.out.println("TEST: utf16 order:"); - } - for(Term t : terms) { - FieldInfo fi = fieldInfos.fieldInfo(t.field()); + Collections.sort(fieldTerms, termAsUTF16Comparator); - String text = t.text(); - if (lastText != null && lastText.equals(text)) { - continue; - } - fieldTerms.add(t); - uniqueTermCount++; - lastText = text; - - if (VERBOSE) { + System.out.println("\nTEST: UTF16 order"); + for(Term t: fieldTerms) { System.out.println(" " + toHexString(t)); } - w.add(fi.number, t.bytes().bytes, t.bytes().length, ti); } - w.close(); + // sorts in code point order: Collections.sort(fieldTerms); + if (VERBOSE) { System.out.println("\nTEST: codepoint order"); for(Term t: fieldTerms) { - System.out.println(" " + t.field() + ":" + toHexString(t)); + System.out.println(" " + toHexString(t)); } } - dir.createOutput(segName + ".prx").close(); - dir.createOutput(segName + ".frq").close(); + Term[] fieldTermsArray = fieldTerms.toArray(new Term[fieldTerms.size()]); - // !!hack alert!! stuffing uniqueTermCount in as docCount - return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec); - } + //SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); - private String toHexString(Term t) { - return t.field() + ":" + UnicodeUtil.toHexString(t.text()); - } - - @Test - public void testSurrogatesOrder() throws Exception { - Directory dir = new MockRAMDirectory(); + //FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1)); + //assertNotNull(fields); - Codec codec = new PreFlexCodec(); + doTestStraightEnum(fieldTerms, reader, uniqueTermCount); + doTestSeekExists(r, fieldTerms, reader); + doTestSeekDoesNotExist(r, numField, fieldTerms, fieldTermsArray, reader); - Random r = newRandom(); - FieldInfos fieldInfos = new FieldInfos(); - List fieldTerms = new ArrayList(); - SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); - - // hack alert!! 
- int uniqueTermCount = si.docCount; - - FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1)); - assertNotNull(fields); - - if (VERBOSE) { - System.out.println("\nTEST: now enum"); - } - FieldsEnum fieldsEnum = fields.iterator(); - String field; - UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); - - int termCount = 0; - while((field = fieldsEnum.next()) != null) { - TermsEnum termsEnum = fieldsEnum.terms(); - BytesRef text; - BytesRef lastText = null; - while((text = termsEnum.next()) != null) { - if (VERBOSE) { - UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); - System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length))); - System.out.println(); - } - if (lastText == null) { - lastText = new BytesRef(text); - } else { - assertTrue(lastText.compareTo(text) < 0); - lastText.copy(text); - } - assertEquals(fieldTerms.get(termCount).field(), field); - assertEquals(fieldTerms.get(termCount).bytes(), text); - termCount++; - } - if (VERBOSE) { - System.out.println(" no more terms for field=" + field); - } - } - assertEquals(uniqueTermCount, termCount); - - fields.close(); + reader.close(); } } diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java new file mode 100644 index 00000000000..2b0a4167174 --- /dev/null +++ b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java @@ -0,0 +1,212 @@ +package org.apache.lucene.index.codecs.preflexrw; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.TermsConsumer; +import org.apache.lucene.index.codecs.PostingsConsumer; +import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter; +import org.apache.lucene.index.codecs.preflex.PreFlexCodec; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.codecs.preflex.TermInfo; +import org.apache.lucene.store.IndexOutput; + +import java.io.IOException; +import java.util.Comparator; + +class PreFlexFieldsWriter extends FieldsConsumer { + + private final TermInfosWriter termsOut; + private final IndexOutput freqOut; + private final IndexOutput proxOut; + private final DefaultSkipListWriter skipListWriter; + private final int totalNumDocs; + + public PreFlexFieldsWriter(SegmentWriteState state) throws IOException { + termsOut = new TermInfosWriter(state.directory, + state.segmentName, + state.fieldInfos, + state.termIndexInterval); + state.flushedFiles.add(IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.TERMS_EXTENSION)); + state.flushedFiles.add(IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.TERMS_INDEX_EXTENSION)); + + final String freqFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.FREQ_EXTENSION); + freqOut = state.directory.createOutput(freqFile); + state.flushedFiles.add(freqFile); + totalNumDocs = state.numDocs; + + if (state.fieldInfos.hasProx()) { + final String proxFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.PROX_EXTENSION); + proxOut = state.directory.createOutput(proxFile); + state.flushedFiles.add(proxFile); + } else { + proxOut = null; + } + + skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, + termsOut.maxSkipLevels, + totalNumDocs, + freqOut, + proxOut); + //System.out.println("\nw start seg=" + segment); + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + assert field.number != -1; + //System.out.println("w field=" + field.name + " storePayload=" + field.storePayloads + " number=" + field.number); + return new PreFlexTermsWriter(field); + } + + @Override + public void close() throws IOException { + termsOut.close(); + freqOut.close(); + if (proxOut != null) { + proxOut.close(); + } + } + + private class PreFlexTermsWriter extends TermsConsumer { + private final FieldInfo fieldInfo; + private final boolean omitTF; + private final boolean storePayloads; + + private final TermInfo termInfo = new TermInfo(); + private final PostingsWriter postingsWriter = new PostingsWriter(); + + public PreFlexTermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + omitTF = fieldInfo.omitTermFreqAndPositions; + storePayloads = fieldInfo.storePayloads; + } + + private class PostingsWriter extends PostingsConsumer { + private int lastDocID; + private int lastPayloadLength = -1; + private int lastPosition; + private int df; + + public PostingsWriter reset() { + df = 0; + lastDocID = 0; + lastPayloadLength = -1; + return this; + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + //System.out.println(" w doc=" + docID); + + final int delta = docID - lastDocID; + if (docID < 0 || (df > 0 && delta <= 0)) { + throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + 
" )"); + } + + if ((++df % termsOut.skipInterval) == 0) { + skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength); + skipListWriter.bufferSkip(df); + } + + lastDocID = docID; + + assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; + + if (omitTF) { + freqOut.writeVInt(delta); + } else { + final int code = delta << 1; + if (termDocFreq == 1) { + freqOut.writeVInt(code|1); + } else { + freqOut.writeVInt(code); + freqOut.writeVInt(termDocFreq); + } + } + lastPosition = 0; + } + + @Override + public void addPosition(int position, BytesRef payload) throws IOException { + assert proxOut != null; + + //System.out.println(" w pos=" + position + " payl=" + payload); + final int delta = position - lastPosition; + lastPosition = position; + + if (storePayloads) { + final int payloadLength = payload == null ? 0 : payload.length; + if (payloadLength != lastPayloadLength) { + //System.out.println(" write payload len=" + payloadLength); + lastPayloadLength = payloadLength; + proxOut.writeVInt((delta<<1)|1); + proxOut.writeVInt(payloadLength); + } else { + proxOut.writeVInt(delta << 1); + } + if (payloadLength > 0) { + proxOut.writeBytes(payload.bytes, payload.offset, payload.length); + } + } else { + proxOut.writeVInt(delta); + } + } + + @Override + public void finishDoc() throws IOException { + } + } + + @Override + public PostingsConsumer startTerm(BytesRef text) throws IOException { + //System.out.println(" w term=" + text.utf8ToString()); + skipListWriter.resetSkip(); + termInfo.freqPointer = freqOut.getFilePointer(); + if (proxOut != null) { + termInfo.proxPointer = proxOut.getFilePointer(); + } + return postingsWriter.reset(); + } + + @Override + public void finishTerm(BytesRef text, int numDocs) throws IOException { + if (numDocs > 0) { + long skipPointer = skipListWriter.writeSkip(freqOut); + termInfo.docFreq = numDocs; + termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer); + //System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number); + termsOut.add(fieldInfo.number, + text, + termInfo); + } + } + + @Override + public void finish() throws IOException { + } + + @Override + public Comparator getComparator() throws IOException { + return BytesRef.getUTF8SortedAsUTF16Comparator(); + } + } +} \ No newline at end of file diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java new file mode 100644 index 00000000000..becad2e80bc --- /dev/null +++ b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java @@ -0,0 +1,52 @@ +package org.apache.lucene.index.codecs.preflexrw; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+        if (storePayloads) {
+          final int payloadLength = payload == null ? 0 : payload.length;
+          if (payloadLength != lastPayloadLength) {
+            //System.out.println("      write payload len=" + payloadLength);
+            lastPayloadLength = payloadLength;
+            proxOut.writeVInt((delta<<1)|1);
+            proxOut.writeVInt(payloadLength);
+          } else {
+            proxOut.writeVInt(delta << 1);
+          }
+          if (payloadLength > 0) {
+            proxOut.writeBytes(payload.bytes, payload.offset, payload.length);
+          }
+        } else {
+          proxOut.writeVInt(delta);
+        }
+      }
+
+      @Override
+      public void finishDoc() throws IOException {
+      }
+    }
+
+    @Override
+    public PostingsConsumer startTerm(BytesRef text) throws IOException {
+      //System.out.println("  w term=" + text.utf8ToString());
+      skipListWriter.resetSkip();
+      termInfo.freqPointer = freqOut.getFilePointer();
+      if (proxOut != null) {
+        termInfo.proxPointer = proxOut.getFilePointer();
+      }
+      return postingsWriter.reset();
+    }
+
+    @Override
+    public void finishTerm(BytesRef text, int numDocs) throws IOException {
+      if (numDocs > 0) {
+        long skipPointer = skipListWriter.writeSkip(freqOut);
+        termInfo.docFreq = numDocs;
+        termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
+        //System.out.println("  w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
+        termsOut.add(fieldInfo.number,
+                     text,
+                     termInfo);
+      }
+    }
+
+    @Override
+    public void finish() throws IOException {
+    }
+
+    @Override
+    public Comparator<BytesRef> getComparator() throws IOException {
+      return BytesRef.getUTF8SortedAsUTF16Comparator();
+    }
+  }
+}
\ No newline at end of file
diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java
new file mode 100644
index 00000000000..becad2e80bc
--- /dev/null
+++ b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/PreFlexRWCodec.java
@@ -0,0 +1,52 @@
+package org.apache.lucene.index.codecs.preflexrw;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
+import org.apache.lucene.index.codecs.preflex.PreFlexFields;
+import org.apache.lucene.index.codecs.FieldsConsumer;
+import org.apache.lucene.index.codecs.FieldsProducer;
+
+/** Codec, only for testing, that can write and read the
+ *  pre-flex index format.
+ *
+ * @lucene.experimental
+ */
+public class PreFlexRWCodec extends PreFlexCodec {
+
+  public PreFlexRWCodec() {
+    // NOTE: we use the same name as the core PreFlex codec so that
+    // it can read the segments we write!
+    super();
+  }
+
+  @Override
+  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
+    return new PreFlexFieldsWriter(state);
+  }
+
+  @Override
+  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
+    //System.out.println("preflexrw");
+    return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, false);
+  }
+}
diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java
similarity index 80%
rename from lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
rename to lucene/src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java
index e1d15403bf9..782cd3a2a01 100644
--- a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
+++ b/lucene/src/test/org/apache/lucene/index/codecs/preflexrw/TermInfosWriter.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.index.codecs.preflex;
+package org.apache.lucene.index.codecs.preflexrw;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -19,9 +19,12 @@ package org.apache.lucene.index.codecs.preflex;
 
 import java.io.IOException;
 
-import org.apache.lucene.store.*;
-import org.apache.lucene.index.*;
-import org.apache.lucene.util.*;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.index.codecs.preflex.TermInfo;
 
 /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
@@ -71,8 +74,7 @@
   private long lastIndexPointer;
   private boolean isIndex;
-  private byte[] lastTermBytes = new byte[10];
-  private int lastTermBytesLength = 0;
+  private final BytesRef lastTerm = new BytesRef();
   private int lastFieldNumber = -1;
 
   private TermInfosWriter other;
@@ -104,13 +106,10 @@
     assert initUTF16Results();
   }
 
-  void add(Term term, TermInfo ti) throws IOException {
-    add(fieldInfos.fieldNumber(term.field()), term.bytes().bytes, term.bytes().length, ti);
-  }
-
   // Currently used only by assert statements
   UnicodeUtil.UTF16Result utf16Result1;
   UnicodeUtil.UTF16Result utf16Result2;
+  private final BytesRef scratchBytes = new BytesRef();
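+  // (scratchBytes is used only by the compareToLastTerm assert: it holds a
+  // copy of the incoming term, re-based to offset 0 for the UTF-16 conversion)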
 
   // Currently used only by assert statements
   private boolean initUTF16Results() {
@@ -120,7 +119,7 @@
   // Currently used only by assert statement
-  private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
+  private int compareToLastTerm(int fieldNumber, BytesRef term) {
     if (lastFieldNumber != fieldNumber) {
       final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
@@ -132,8 +131,13 @@
       return cmp;
     }
 
-    UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
-    UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
+    scratchBytes.copy(term);
+    assert lastTerm.offset == 0;
+    UnicodeUtil.UTF8toUTF16(lastTerm.bytes, 0, lastTerm.length, utf16Result1);
+
+    assert scratchBytes.offset == 0;
+    UnicodeUtil.UTF8toUTF16(scratchBytes.bytes, 0, scratchBytes.length, utf16Result2);
+
     final int len;
     if (utf16Result1.length < utf16Result2.length)
       len = utf16Result1.length;
@@ -152,22 +156,22 @@
 
   /** Adds a new <Term, TermInfo> pair to the set.
    Term must be lexicographically greater than all previous Terms added.
    TermInfo pointers must be positive and greater than all previous.*/
-  void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
+  public void add(int fieldNumber, BytesRef term, TermInfo ti)
     throws IOException {
-    assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
-      (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
+    assert compareToLastTerm(fieldNumber, term) < 0 ||
+      (isIndex && term.length == 0 && lastTerm.length == 0) :
       "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
         " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
-        " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
+        " text=" + term.utf8ToString() + " lastText=" + lastTerm.utf8ToString();
 
     assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
     assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
 
     if (!isIndex && size % indexInterval == 0)
-      other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi);  // add an index term
+      other.add(lastFieldNumber, lastTerm, lastTi);  // add an index term
 
-    writeTerm(fieldNumber, termBytes, termBytesLength);                        // write term
+    writeTerm(fieldNumber, term);                    // write term
     output.writeVInt(ti.docFreq);                           // write doc freq
     output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
@@ -187,29 +191,27 @@
     size++;
   }
 
-  private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
+  private void writeTerm(int fieldNumber, BytesRef term)
        throws IOException {
 
+    //System.out.println("  tiw.write field=" + fieldNumber + " term=" + term.utf8ToString());
+
+    // TODO: UTF16toUTF8 could tell us this prefix
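+    // each term is prefix-coded against the previous one: we store the length
+    // of the shared prefix, the suffix length, and then only the suffix bytes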
     // Compute prefix in common with last term:
     int start = 0;
-    final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
+    final int limit = term.length < lastTerm.length ? term.length : lastTerm.length;
     while(start < limit) {
-      if (termBytes[start] != lastTermBytes[start])
+      if (term.bytes[start+term.offset] != lastTerm.bytes[start+lastTerm.offset])
         break;
       start++;
     }
 
-    final int length = termBytesLength - start;
+    final int length = term.length - start;
 
     output.writeVInt(start);                     // write shared prefix length
     output.writeVInt(length);                    // write delta length
-    output.writeBytes(termBytes, start, length);  // write delta bytes
+    output.writeBytes(term.bytes, start+term.offset, length);  // write delta bytes
     output.writeVInt(fieldNumber); // write field num
-    if (lastTermBytes.length < termBytesLength) {
-      lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
-    }
-    System.arraycopy(termBytes, start, lastTermBytes, start, length);
-    lastTermBytesLength = termBytesLength;
+    lastTerm.copy(term);
   }
 
   /** Called to complete TermInfos creation. */
diff --git a/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java b/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java
index e214f388663..76cee88252d 100644
--- a/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java
+++ b/lucene/src/test/org/apache/lucene/util/LuceneTestCase.java
@@ -34,6 +34,8 @@ import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.search.FieldCache.CacheEntry;
 import org.apache.lucene.util.FieldCacheSanityChecker.Insanity;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
 
 /**
  * Base class for all Lucene unit tests.
@@ -72,6 +74,8 @@ public abstract class LuceneTestCase extends TestCase {
 
   private volatile Thread.UncaughtExceptionHandler savedUncaughtExceptionHandler = null;
 
+  private String savedDefaultCodec;
+
   /** Used to track if setUp and tearDown are called correctly from subclasses */
   private boolean setup;
 
@@ -110,6 +114,8 @@
     ConcurrentMergeScheduler.setTestMode();
     savedBoolMaxClauseCount = BooleanQuery.getMaxClauseCount();
+    savedDefaultCodec = CodecProvider.getDefaultCodec();
+    CodecProvider.setDefaultCodec(_TestUtil.getTestCodec());
   }
 
   /**
@@ -135,7 +141,8 @@
     assertTrue("ensure your setUp() calls super.setUp()!!!", setup);
     setup = false;
     BooleanQuery.setMaxClauseCount(savedBoolMaxClauseCount);
-
+    CodecProvider.setDefaultCodec(savedDefaultCodec);
+
     try {
       Thread.setDefaultUncaughtExceptionHandler(savedUncaughtExceptionHandler);
       if (!uncaughtExceptions.isEmpty()) {
@@ -298,4 +305,10 @@
 
   // static members
   private static final Random seedRnd = new Random();
+
+  // register preflex-rw statically.
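+  // (a static initializer runs when this class is loaded, before any test's setUp)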
+  static {
+    CodecProvider.getDefault().register(new PreFlexRWCodec());
+  }
+
 }
diff --git a/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java b/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java
index 3fa12efd2cb..a08e7ae2b88 100644
--- a/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java
+++ b/lucene/src/test/org/apache/lucene/util/LuceneTestCaseJ4.java
@@ -22,9 +22,14 @@ import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.FieldCache;
 import org.apache.lucene.search.FieldCache.CacheEntry;
 import org.apache.lucene.util.FieldCacheSanityChecker.Insanity;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
+
 import org.junit.After;
+import org.junit.AfterClass;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TestWatchman;
@@ -127,6 +132,20 @@
   private static final Map<Class<?>,Object> checkedClasses =
     Collections.synchronizedMap(new WeakHashMap<Class<?>,Object>());
 
+  // saves the default codec: we do this statically, as many tests build indexes in @BeforeClass
+  private static String savedDefaultCodec;
+
+  @BeforeClass
+  public static void beforeClassLuceneTestCaseJ4() {
+    savedDefaultCodec = CodecProvider.getDefaultCodec();
+    CodecProvider.setDefaultCodec(_TestUtil.getTestCodec());
+  }
+
+  @AfterClass
+  public static void afterClassLuceneTestCaseJ4() {
+    CodecProvider.setDefaultCodec(savedDefaultCodec);
+  }
+
   // This is how we get control when errors occur.
   // Think of this as start/end/success/failed
   // events.
@@ -405,4 +424,8 @@
 
   private String name = "";
 
+  // register PreFlexRWCodec statically
+  static {
+    CodecProvider.getDefault().register(new PreFlexRWCodec());
+  }
 }
diff --git a/lucene/src/test/org/apache/lucene/util/_TestUtil.java b/lucene/src/test/org/apache/lucene/util/_TestUtil.java
index 4f0ab7705b7..9ff400a82f6 100644
--- a/lucene/src/test/org/apache/lucene/util/_TestUtil.java
+++ b/lucene/src/test/org/apache/lucene/util/_TestUtil.java
@@ -23,6 +23,9 @@ import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.MergeScheduler;
 import org.apache.lucene.index.ConcurrentMergeScheduler;
 import org.apache.lucene.index.CheckIndex;
+import org.apache.lucene.index.codecs.CodecProvider;
+import org.apache.lucene.index.codecs.Codec;
+import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.store.Directory;
 import java.io.ByteArrayOutputStream;
 import java.io.PrintStream;
@@ -129,8 +132,24 @@
     }
     final char[] buffer = new char[end];
     for (int i = 0; i < end; i++) {
+      int t = r.nextInt(5);
+      //buffer[i] = (char) (97 + r.nextInt(26));
+
+      /*
+      if (0 == t && i < end - 1) {
+        // hi
+        buffer[i++] = (char) 0xd800;
+        // lo
+        buffer[i] = (char) 0xdc00;
+      } else if (t <= 3) {
+        buffer[i] = 'a';
+      } else if (4 == t) {
+        buffer[i] = 0xe000;
+      }
+      */
+
+      if (0 == t && i < end - 1) {
         // Make a surrogate pair
         // High surrogate
@@ -218,4 +237,27 @@
   public static int getRandomMultiplier() {
     return Integer.parseInt(System.getProperty("random.multiplier", "1"));
   }
+
+  /** Gets the codec to run tests with. */
+  public static String getTestCodec() {
+    return System.getProperty("tests.codec", "Standard");
+  }
+
+  public static CodecProvider alwaysCodec(final Codec c) {
+    return new CodecProvider() {
+      @Override
+      public Codec getWriter(SegmentWriteState state) {
+        return c;
+      }
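+
+      // every lookup resolves to the same codec instance, whatever name is asked for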
+      @Override
+      public Codec lookup(String name) {
+        return c;
+      }
+    };
+  }
+
+  public static CodecProvider alwaysCodec(final String codec) {
+    return alwaysCodec(CodecProvider.getDefault().lookup(codec));
+  }
 }