diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java index 6dac1cfb88d..611bfb5f674 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java @@ -426,7 +426,7 @@ public class InstantiatedIndexReader extends IndexReader { @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } }; } diff --git a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java index 2454643a196..24360283160 100644 --- a/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java +++ b/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java @@ -123,7 +123,7 @@ public class InstantiatedTermsEnum extends TermsEnum { @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } } diff --git a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 39834075ced..a4d1f5caefe 100644 --- a/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -808,7 +808,7 @@ public class MemoryIndex implements Serializable { @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return 
BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override @@ -903,7 +903,7 @@ public class MemoryIndex implements Serializable { @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } } diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index f69783e9ac6..1dabe02c94f 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -32,7 +32,7 @@ import java.io.PrintStream; import java.io.IOException; import java.io.File; import java.util.Collection; - +import java.util.Comparator; import java.util.List; import java.util.ArrayList; import java.util.Map; @@ -596,6 +596,10 @@ public class CheckIndex { boolean hasOrd = true; final long termCountStart = status.termCount; + BytesRef lastTerm = null; + + Comparator termComp = terms.getComparator(); + while(true) { final BytesRef term = terms.next(); @@ -603,6 +607,17 @@ public class CheckIndex { break; } + // make sure terms arrive in order according to + // the comp + if (lastTerm == null) { + lastTerm = new BytesRef(term); + } else { + if (termComp.compare(lastTerm, term) >= 0) { + throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term); + } + lastTerm.copy(term); + } + final int docFreq = terms.docFreq(); status.totFreq += docFreq; diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/src/java/org/apache/lucene/index/FieldInfos.java index c641f9afd30..bf065b2a81d 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfos.java @@ -53,7 +53,7 @@ public final class FieldInfos { private final HashMap byName = new HashMap(); private int format; - FieldInfos() { } + public FieldInfos() { } /** * Construct a FieldInfos object using the directory and the name of the 
file @@ -62,7 +62,7 @@ public final class FieldInfos { * @param name The name of the file to open the IndexInput from in the Directory * @throws IOException */ - FieldInfos(Directory d, String name) throws IOException { + public FieldInfos(Directory d, String name) throws IOException { IndexInput input = d.openInput(name); try { read(input, name); diff --git a/lucene/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/src/java/org/apache/lucene/index/IndexWriter.java index 8593fa1ecab..563446499e2 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/src/java/org/apache/lucene/index/IndexWriter.java @@ -3964,7 +3964,7 @@ public class IndexWriter implements Closeable { // commit merged deletes SegmentReader reader = merge.readers[i] = readerPool.get(info, merge.mergeDocStores, MERGE_READ_BUFFER_SIZE, - -1); + -config.getReaderTermsIndexDivisor()); // We clone the segment readers because other // deletes may come in while we're merging so we diff --git a/lucene/src/java/org/apache/lucene/index/SegmentReadState.java b/lucene/src/java/org/apache/lucene/index/SegmentReadState.java index 84db38016c3..eda0e932e36 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentReadState.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentReadState.java @@ -27,6 +27,12 @@ public class SegmentReadState { public final SegmentInfo segmentInfo; public final FieldInfos fieldInfos; public final int readBufferSize; + + // NOTE: if this is < 0, that means "defer terms index + // load until needed". 
But if the codec must load the + // terms index on init (preflex is the only one currently + // that must do so), then it should negate this value to + // get the app's terms divisor: public final int termsIndexDivisor; public SegmentReadState(Directory dir, diff --git a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java index abec3d150be..71359ead8ff 100644 --- a/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java +++ b/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java @@ -130,7 +130,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { // TODO: we may want to make this sort in same order // as Codec's terms dict? - final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator()); + final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator()); tvf.writeVInt(numPostings); byte bits = 0x0; diff --git a/lucene/src/java/org/apache/lucene/index/TermsEnum.java b/lucene/src/java/org/apache/lucene/index/TermsEnum.java index 3c571862467..722c86c1758 100644 --- a/lucene/src/java/org/apache/lucene/index/TermsEnum.java +++ b/lucene/src/java/org/apache/lucene/index/TermsEnum.java @@ -144,8 +144,7 @@ public abstract class TermsEnum { @Override public Comparator getComparator() { - // return an unused dummy to prevent NPE - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return null; } @Override diff --git a/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java index 1650aaa74c1..2b5c0a618be 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java @@ -67,7 +67,7 @@ public class IntBlockCodec extends Codec { success = 
false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -95,7 +95,7 @@ public class IntBlockCodec extends Codec { state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUTF16Comparator()); + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -111,7 +111,7 @@ public class IntBlockCodec extends Codec { state.segmentInfo.name, postingsReader, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), StandardCodec.TERMS_CACHE_SIZE); success = true; return ret; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index dc985549146..43d49214e1f 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -39,11 +39,15 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.ArrayUtil; /** Exposes flex API on a pre-flex index, as a codec. 
* @lucene.experimental */ public class PreFlexFields extends FieldsProducer { + private static final boolean DEBUG_SURROGATES = false; + public TermInfosReader tis; public final TermInfosReader tisNoIndex; @@ -60,6 +64,15 @@ public class PreFlexFields extends FieldsProducer { throws IOException { si = info; + + // NOTE: we must always load terms index, even for + // "sequential" scan during merging, because what is + // sequential to merger may not be to TermInfosReader + // since we do the surrogates dance: + if (indexDivisor < 0) { + indexDivisor = -indexDivisor; + } + TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor); if (indexDivisor == -1) { tisNoIndex = r; @@ -174,7 +187,6 @@ public class PreFlexFields extends FieldsProducer { private class PreFlexFieldsEnum extends FieldsEnum { final Iterator it; private final PreTermsEnum termsEnum; - private int count; FieldInfo current; public PreFlexFieldsEnum() throws IOException { @@ -185,7 +197,6 @@ public class PreFlexFields extends FieldsProducer { @Override public String next() { if (it.hasNext()) { - count++; current = it.next(); return current.name; } else { @@ -195,7 +206,7 @@ public class PreFlexFields extends FieldsProducer { @Override public TermsEnum terms() throws IOException { - termsEnum.reset(current, count == 1); + termsEnum.reset(current); return termsEnum; } } @@ -209,14 +220,15 @@ public class PreFlexFields extends FieldsProducer { @Override public TermsEnum iterator() throws IOException { PreTermsEnum termsEnum = new PreTermsEnum(); - termsEnum.reset(fieldInfo, false); + termsEnum.reset(fieldInfo); return termsEnum; } @Override public Comparator getComparator() { - // Pre-flex indexes always sorted in UTF16 order - return BytesRef.getUTF8SortedAsUTF16Comparator(); + // Pre-flex indexes always sorted in UTF16 order, but + // we remap on-the-fly to unicode order + return BytesRef.getUTF8SortedAsUnicodeComparator(); } } @@ -227,37 +239,229 @@ public class 
PreFlexFields extends FieldsProducer { private BytesRef current; private final BytesRef scratchBytesRef = new BytesRef(); - void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException { + private int[] surrogateSeekPending = new int[1]; + private boolean[] surrogateDidSeekBack = new boolean[1]; + private int surrogateSeekUpto; + private char[] pendingPrefix; + + private SegmentTermEnum seekTermEnum; + private Term protoTerm; + private int newSuffixStart; + + void reset(FieldInfo fieldInfo) throws IOException { this.fieldInfo = fieldInfo; + protoTerm = new Term(fieldInfo.name); if (termEnum == null) { - // First time reset is called - if (isFirstField) { - termEnum = getTermsDict().terms(); - skipNext = false; - } else { - termEnum = getTermsDict().terms(new Term(fieldInfo.name, "")); - skipNext = true; - } + termEnum = getTermsDict().terms(protoTerm); + seekTermEnum = getTermsDict().terms(protoTerm); } else { - final Term t = termEnum.term(); - if (t != null && t.field() == fieldInfo.name) { - // No need to seek -- we have already advanced onto - // this field. We must be @ first term because - // flex API will not advance this enum further, on - // seeing a different field. - } else { - assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned - final TermInfosReader tis = getTermsDict(); - tis.seekEnum(termEnum, new Term(fieldInfo.name, "")); - } - skipNext = true; + getTermsDict().seekEnum(termEnum, protoTerm); } + skipNext = true; + + surrogateSeekUpto = 0; + newSuffixStart = 0; + + surrogatesDance(); + } + + private void surrogatesDance() throws IOException { + + // Tricky: prior to 4.0, Lucene index sorted terms in + // UTF16 order, but as of 4.0 we sort by Unicode code + // point order. These orders differ because of the + // surrogates; so we have to fixup our enum, here, by + // carefully first seeking past the surrogates and + // then back again at the end. 
The process is + // recursive, since any given term could have multiple + // new occurrences of surrogate pairs, so we use a + // stack to record the pending seek-backs. + if (DEBUG_SURROGATES) { + System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); + } + + while(popPendingSeek()); + while(pushNewSurrogate()); + } + + // only for debugging + private String getStack() { + if (surrogateSeekUpto == 0) { + return "null"; + } else { + StringBuffer sb = new StringBuffer(); + for(int i=0;i 0) { + sb.append(' '); + } + sb.append(surrogateSeekPending[i]); + } + sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1])); + return sb.toString(); + } + } + + private boolean popPendingSeek() throws IOException { + if (DEBUG_SURROGATES) { + System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack()); + } + // if a .next() has advanced beyond the + // after-surrogates range we had last seeked to, we + // must seek back to the start and resume .next from + // there. this pops the pending seek off the stack. 
+ final Term t = termEnum.term(); + if (surrogateSeekUpto > 0) { + final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1]; + if (DEBUG_SURROGATES) { + System.out.println(" seekPrefix=" + seekPrefix); + } + if (newSuffixStart < seekPrefix) { + assert pendingPrefix != null; + assert pendingPrefix.length > seekPrefix; + pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; + Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); + if (DEBUG_SURROGATES) { + System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); + } + getTermsDict().seekEnum(termEnum, t2); + surrogateDidSeekBack[surrogateSeekUpto-1] = true; + + // +2 because we don't want to re-check the + // surrogates we just seek'd back to + newSuffixStart = seekPrefix + 2; + return true; + } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) { + assert pendingPrefix != null; + assert pendingPrefix.length > seekPrefix; + pendingPrefix[seekPrefix] = 0xffff; + Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); + if (DEBUG_SURROGATES) { + System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); + } + getTermsDict().seekEnum(termEnum, t2); + if (DEBUG_SURROGATES) { + System.out.println(" found term=" + (termEnum.term() == null ? 
null : UnicodeUtil.toHexString(termEnum.term().text()))); + } + surrogateSeekUpto--; + + if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { + // force pop + newSuffixStart = -1; + } else { + newSuffixStart = termEnum.newSuffixStart; + } + + return true; + } + } + + return false; + } + + private boolean pushNewSurrogate() throws IOException { + if (DEBUG_SURROGATES) { + System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); + } + final Term t = termEnum.term(); + if (t == null || t.field() != fieldInfo.name) { + return false; + } + final String text = t.text(); + final int textLen = text.length(); + + for(int i=Math.max(0,newSuffixStart);i= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { + + if (DEBUG_SURROGATES) { + System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i); + } + + // the next() that we just did read in a new + // suffix, containing a surrogate pair + + // seek forward to see if there are any terms with + // this same prefix, but with characters after the + // surrogate range; if so, we must first iterate + // them, then seek back to the surrogates + + char[] testPrefix = new char[i+1]; + for(int j=0;j 0) { in.close(); this.in = null; if (success) { @@ -173,7 +173,7 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader { // We still create the indexReader when indexDivisor // is -1, so that StandardTermsDictReader can call // isIndexTerm for each field: - if (indexDivisor != -1) { + if (indexDivisor > 0) { coreIndex = new CoreFieldIndex(indexStart, termsStart, packedIndexStart, @@ -218,7 +218,8 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader { @Override public void getIndexOffset(long ord, TermsIndexResult result) throws IOException { - // You must call loadTermsIndex if you had specified -1 for indexDivisor 
+ // You must call loadTermsIndex if you had specified + // indexDivisor < 0 to ctor if (coreIndex == null) { throw new IllegalStateException("terms index was not loaded"); } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java index 46a55f5b167..98d4f72bf9d 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java @@ -58,7 +58,7 @@ public class StandardCodec extends Codec { success = false; try { - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator()); + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; return ret; } finally { @@ -85,7 +85,7 @@ public class StandardCodec extends Codec { state.fieldInfos, state.segmentInfo.name, state.termsIndexDivisor, - BytesRef.getUTF8SortedAsUTF16Comparator()); + BytesRef.getUTF8SortedAsUnicodeComparator()); success = true; } finally { if (!success) { @@ -101,7 +101,7 @@ public class StandardCodec extends Codec { state.segmentInfo.name, postings, state.readBufferSize, - BytesRef.getUTF8SortedAsUTF16Comparator(), + BytesRef.getUTF8SortedAsUnicodeComparator(), TERMS_CACHE_SIZE); success = true; return ret; diff --git a/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java b/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java index 69ac17e6a7b..51340cca24a 100644 --- a/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java +++ b/lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java @@ -103,7 +103,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { // build a cache of sorted transitions for every state allTransitions = new Transition[runAutomaton.getSize()][]; for (State state : 
this.automaton.getNumberedStates()) { - state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order); + state.sortTransitions(Transition.CompareByMinMaxThenDest); state.trimTransitionsArray(); allTransitions[state.getNumber()] = state.transitionsArray; } @@ -158,11 +158,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { // seek to the next possible string; if (nextString()) { // reposition - - // FIXME: this is really bad to turn off - // but it cannot work correctly until terms are in utf8 order. - linear = false; - + if (linear) setLinear(infinitePosition); return seekBytesRef; @@ -188,15 +184,15 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { } for (int i = 0; i < allTransitions[state].length; i++) { Transition t = allTransitions[state][i]; - if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 && - compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) { + if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && + (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) { maxInterval = t.getMax(); break; } } - // 0xef terms don't get the optimization... not worth the trouble. - if (maxInterval != 0xef) - maxInterval = incrementUTF16(maxInterval); + // 0xff terms don't get the optimization... not worth the trouble. + if (maxInterval != 0xff) + maxInterval = incrementUTF8(maxInterval); int length = position + 1; /* position + maxTransition */ if (linearUpperBound.bytes.length < length) linearUpperBound.bytes = new byte[length]; @@ -281,7 +277,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { // if the next character is U+FFFF and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. 
- c = incrementUTF16(c); + c = incrementUTF8(c); if (c == -1) return false; } @@ -295,8 +291,8 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { for (int i = 0; i < transitions.length; i++) { Transition transition = transitions[i]; - if (compareToUTF16(transition.getMax(), c) >= 0) { - int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin(); + if (transition.getMax() >= c) { + int nextChar = Math.max(c, transition.getMin()); // append either the next sequential char, or the minimum transition seekBytesRef.grow(seekBytesRef.length + 1); seekBytesRef.length++; @@ -342,9 +338,9 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { private boolean backtrack(int position) { while (position > 0) { int nextChar = seekBytesRef.bytes[position - 1] & 0xff; - // if a character is 0xef its a dead-end too, - // because there is no higher character in UTF-16 sort order. - nextChar = incrementUTF16(nextChar); + // if a character is 0xff it's a dead-end too, + // because there is no higher character in UTF-8 sort order. 
+ nextChar = incrementUTF8(nextChar); if (nextChar != -1) { seekBytesRef.bytes[position - 1] = (byte) nextChar; seekBytesRef.length = position; @@ -355,34 +351,11 @@ public class AutomatonTermsEnum extends FilteredTermsEnum { return false; /* all solutions exhausted */ } - /* return the next utf8 byte in utf16 order, or -1 if exhausted */ - private final int incrementUTF16(int utf8) { + /* return the next utf8 byte in utf8 order, or -1 if exhausted */ + private final int incrementUTF8(int utf8) { switch(utf8) { - case 0xed: return 0xf0; - case 0xfd: return 0xee; - case 0xee: return 0xef; - case 0xef: return -1; + case 0xff: return -1; default: return utf8 + 1; } } - - int compareToUTF16(int aByte, int bByte) { - if (aByte != bByte) { - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - return 0; - } } diff --git a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java index 1e66e7d44c8..151ae1a95db 100644 --- a/lucene/src/java/org/apache/lucene/util/ArrayUtil.java +++ b/lucene/src/java/org/apache/lucene/util/ArrayUtil.java @@ -327,6 +327,29 @@ public final class ArrayUtil { return array; } + public static boolean[] grow(boolean[] array, int minSize) { + if (array.length < minSize) { + boolean[] newArray = new boolean[oversize(minSize, 1)]; + System.arraycopy(array, 0, newArray, 0, array.length); + return newArray; + } else + return array; + } + + public static boolean[] grow(boolean[] array) { + return grow(array, 1 + array.length); + } + + public static boolean[] shrink(boolean[] array, int targetSize) { + final int newSize = getShrinkSize(array.length, 
targetSize, 1); + if (newSize != array.length) { + boolean[] newArray = new boolean[newSize]; + System.arraycopy(array, 0, newArray, 0, newSize); + return newArray; + } else + return array; + } + public static char[] grow(char[] array, int minSize) { if (array.length < minSize) { char[] newArray = new char[oversize(minSize, RamUsageEstimator.NUM_BYTES_CHAR)]; diff --git a/lucene/src/java/org/apache/lucene/util/BytesRef.java b/lucene/src/java/org/apache/lucene/util/BytesRef.java index ca8466f4a2d..3e46c7b5836 100644 --- a/lucene/src/java/org/apache/lucene/util/BytesRef.java +++ b/lucene/src/java/org/apache/lucene/util/BytesRef.java @@ -217,14 +217,7 @@ public final class BytesRef implements Comparable, Externalizable { bytes = ArrayUtil.grow(bytes, newLength); } - private final static Comparator utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); - - public static Comparator getUTF8SortedAsUTF16Comparator() { - return utf8SortedAsUTF16SortOrder; - } - /** Unsigned byte order comparison */ - /* public int compareTo(BytesRef other) { if (this == other) return 0; @@ -245,52 +238,18 @@ public final class BytesRef implements Comparable, Externalizable { // One is a prefix of the other, or, they are equal: return this.length - other.length; } - */ - /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change - * in the future to unsigned byte comparison. 
*/ - public int compareTo(BytesRef other) { - if (this == other) return 0; + private final static Comparator utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator(); - final byte[] aBytes = this.bytes; - int aUpto = this.offset; - final byte[] bBytes = other.bytes; - int bUpto = other.offset; - - final int aStop = aUpto + Math.min(this.length, other.length); - - while(aUpto < aStop) { - int aByte = aBytes[aUpto++] & 0xff; - int bByte = bBytes[bUpto++] & 0xff; - if (aByte != bByte) { - - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - } - - // One is a prefix of the other, or, they are equal: - return this.length - other.length; + public static Comparator getUTF8SortedAsUnicodeComparator() { + return utf8SortedAsUnicodeSortOrder; } - private static class UTF8SortedAsUTF16Comparator implements Comparator { + private static class UTF8SortedAsUnicodeComparator implements Comparator { // Only singleton - private UTF8SortedAsUTF16Comparator() {}; + private UTF8SortedAsUnicodeComparator() {}; public int compare(BytesRef a, BytesRef b) { - final byte[] aBytes = a.bytes; int aUpto = a.offset; final byte[] bBytes = b.bytes; @@ -307,32 +266,15 @@ public final class BytesRef implements Comparable, Externalizable { int aByte = aBytes[aUpto++] & 0xff; int bByte = bBytes[bUpto++] & 0xff; - if (aByte != bByte) { - - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 
0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; + int diff = aByte - bByte; + if (diff != 0) { + return diff; } } // One is a prefix of the other, or, they are equal: return a.length - b.length; - } - - public boolean equals(Object other) { - return this == other; - } + } } public void writeExternal(ObjectOutput out) diff --git a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java index a672b244e60..d871d9dfe0b 100644 --- a/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -358,7 +358,6 @@ final public class UnicodeUtil { out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START); } } - offsets[upto] = outUpto; result.length = outUpto; } @@ -483,7 +482,7 @@ final public class UnicodeUtil { } } */ - public static final boolean validUTF16String(CharSequence s) { + public static boolean validUTF16String(CharSequence s) { final int size = s.length(); for(int i=0;i= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { @@ -559,7 +558,7 @@ final public class UnicodeUtil { /** Returns the number of code points in this utf8 * sequence. 
Behavior is undefined if the utf8 sequence * is invalid.*/ - public static final int codePointCount(BytesRef utf8) { + public static int codePointCount(BytesRef utf8) { int upto = utf8.offset; final int limit = utf8.offset + utf8.length; final byte[] bytes = utf8.bytes; @@ -673,4 +672,33 @@ final public class UnicodeUtil { } return new String(chars, 0, w); } + + // for debugging + public static String toHexString(String s) { + StringBuilder sb = new StringBuilder(); + for(int i=0;i 0) { + sb.append(' '); + } + if (ch < 128) { + sb.append(ch); + } else { + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + sb.append("H:"); + } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + sb.append("L:"); + } else if (ch > UNI_SUR_LOW_END) { + if (ch == 0xffff) { + sb.append("F:"); + } else { + sb.append("E:"); + } + } + + sb.append("0x" + Integer.toHexString(ch)); + } + } + return sb.toString(); + } } diff --git a/lucene/src/java/org/apache/lucene/util/automaton/Transition.java b/lucene/src/java/org/apache/lucene/util/automaton/Transition.java index 0058d3039f5..8cdfe76098a 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/Transition.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/Transition.java @@ -210,64 +210,4 @@ public class Transition implements Serializable, Cloneable { } public static final Comparator CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle(); - - private static class UTF8InUTF16Order { - protected int compareCodePoint(int aByte, int bByte) { - if (aByte != bByte) { - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order - - // We know the terms are not equal, but, we may - // have to carefully fixup the bytes at the - // difference to match UTF16's sort order: - if (aByte >= 0xee && bByte >= 0xee) { - if ((aByte & 0xfe) == 0xee) { - aByte += 0x10; - } - if ((bByte&0xfe) == 0xee) { - bByte += 0x10; - } - } - return aByte - bByte; - } - return 0; - } - } - - private 
static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator { - public int compare(Transition t1, Transition t2) { - if (t1.to != t2.to) { - if (t1.to == null) return -1; - else if (t2.to == null) return 1; - else if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - int minComp = compareCodePoint(t1.min, t2.min); - if (minComp != 0) return minComp; - int maxComp = compareCodePoint(t1.max, t2.max); - if (maxComp != 0) return maxComp; - return 0; - } - } - - public static final Comparator CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle(); - - private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator { - public int compare(Transition t1, Transition t2) { - int minComp = compareCodePoint(t1.min, t2.min); - if (minComp != 0) return minComp; - int maxComp = compareCodePoint(t1.max, t2.max); - if (maxComp != 0) return maxComp; - if (t1.to != t2.to) { - if (t1.to == null) return -1; - else if (t2.to == null) return 1; - else if (t1.to.number < t2.to.number) return -1; - else if (t1.to.number > t2.to.number) return 1; - } - return 0; - } - } - - public static final Comparator CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle(); - - } diff --git a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java index d70599f35d7..aa62754cb73 100644 --- a/lucene/src/test/org/apache/lucene/TestExternalCodecs.java +++ b/lucene/src/test/org/apache/lucene/TestExternalCodecs.java @@ -179,7 +179,7 @@ public class TestExternalCodecs extends LuceneTestCase { @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override @@ -263,7 +263,7 @@ public class TestExternalCodecs extends 
LuceneTestCase { @Override public Comparator getComparator() { - return BytesRef.getUTF8SortedAsUTF16Comparator(); + return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override diff --git a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java index 0cc3ec314b2..72d0ecce463 100644 --- a/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -4621,38 +4621,22 @@ public class TestIndexWriter extends LuceneTestCase { private void checkTermsOrder(IndexReader r, Set allTerms, boolean isTop) throws IOException { TermsEnum terms = MultiFields.getFields(r).terms("f").iterator(); - char[] last = new char[2]; - int lastLength = 0; + BytesRef last = new BytesRef(); Set seenTerms = new HashSet(); - UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); while(true) { final BytesRef term = terms.next(); if (term == null) { break; } - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); - assertTrue(utf16.length <= 2); - // Make sure last term comes before current one, in - // UTF16 sort order - int i = 0; - for(i=0;i pairs in a + Directory. A TermInfos can be written once, in order. */ + +final class TermInfosWriter { + /** The file format version, a negative number. */ + public static final int FORMAT = -3; + + // Changed strings to true utf8 with length-in-bytes not + // length-in-chars + public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; + + // NOTE: always change this if you switch to a new format! + public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; + + private FieldInfos fieldInfos; + private IndexOutput output; + private TermInfo lastTi = new TermInfo(); + private long size; + + // TODO: the default values for these two parameters should be settable from + // IndexWriter. 
However, once that's done, folks will start setting them to + // ridiculous values and complaining that things don't work well, as with + // mergeFactor. So, let's wait until a number of folks find that alternate + // values work better. Note that both of these values are stored in the + // segment, so that it's safe to change these w/o rebuilding all indexes. + + /** Expert: The fraction of terms in the "dictionary" which should be stored + * in RAM. Smaller values use more memory, but make searching slightly + * faster, while larger values use less memory and make searching slightly + * slower. Searching is typically not dominated by dictionary lookup, so + * tweaking this is rarely useful.*/ + int indexInterval = 128; + + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in + * smaller indexes, greater acceleration, but fewer accelerable cases, while + * smaller values result in bigger indexes, less acceleration and more + * accelerable cases. More detailed experiments would be useful here. */ + int skipInterval = 16; + + /** Expert: The maximum number of skip levels. Smaller values result in + * slightly smaller indexes, but slower skipping in big posting lists. 
+ */ + int maxSkipLevels = 10; + + private long lastIndexPointer; + private boolean isIndex; + private byte[] lastTermBytes = new byte[10]; + private int lastTermBytesLength = 0; + private int lastFieldNumber = -1; + + private TermInfosWriter other; + private BytesRef utf8Result = new BytesRef(10); + + TermInfosWriter(Directory directory, String segment, FieldInfos fis, + int interval) + throws IOException { + initialize(directory, segment, fis, interval, false); + other = new TermInfosWriter(directory, segment, fis, interval, true); + other.other = this; + } + + private TermInfosWriter(Directory directory, String segment, FieldInfos fis, + int interval, boolean isIndex) throws IOException { + initialize(directory, segment, fis, interval, isIndex); + } + + private void initialize(Directory directory, String segment, FieldInfos fis, + int interval, boolean isi) throws IOException { + indexInterval = interval; + fieldInfos = fis; + isIndex = isi; + output = directory.createOutput(segment + (isIndex ? 
".tii" : ".tis")); + output.writeInt(FORMAT_CURRENT); // write format + output.writeLong(0); // leave space for size + output.writeInt(indexInterval); // write indexInterval + output.writeInt(skipInterval); // write skipInterval + output.writeInt(maxSkipLevels); // write maxSkipLevels + assert initUTF16Results(); + } + + void add(Term term, TermInfo ti) throws IOException { + UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result); + add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti); + } + + // Currently used only by assert statements + UnicodeUtil.UTF16Result utf16Result1; + UnicodeUtil.UTF16Result utf16Result2; + + // Currently used only by assert statements + private boolean initUTF16Results() { + utf16Result1 = new UnicodeUtil.UTF16Result(); + utf16Result2 = new UnicodeUtil.UTF16Result(); + return true; + } + + // Currently used only by assert statement + private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { + + if (lastFieldNumber != fieldNumber) { + final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); + // If there is a field named "" (empty string) then we + // will get 0 on this comparison, yet, it's "OK". But + // it's not OK if two different field numbers map to + // the same name. + if (cmp != 0 || lastFieldNumber != -1) + return cmp; + } + + UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); + UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); + final int len; + if (utf16Result1.length < utf16Result2.length) + len = utf16Result1.length; + else + len = utf16Result2.length; + + for(int i=0;i, TermInfo> pair to the set. + Term must be lexicographically greater than all previous Terms added. 
+ TermInfo pointers must be positive and greater than all previous.*/ + void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) + throws IOException { + + assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || + (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : + "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + + " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); + + assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; + assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; + + if (!isIndex && size % indexInterval == 0) + other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term + + writeTerm(fieldNumber, termBytes, termBytesLength); // write term + + output.writeVInt(ti.docFreq); // write doc freq + output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers + output.writeVLong(ti.proxPointer - lastTi.proxPointer); + + if (ti.docFreq >= skipInterval) { + output.writeVInt(ti.skipOffset); + } + + if (isIndex) { + output.writeVLong(other.output.getFilePointer() - lastIndexPointer); + lastIndexPointer = other.output.getFilePointer(); // write pointer + } + + lastFieldNumber = fieldNumber; + lastTi.set(ti); + size++; + } + + private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) + throws IOException { + + // TODO: UTF16toUTF8 could tell us this prefix + // Compute prefix in common with last term: + int start = 0; + final int limit = termBytesLength < lastTermBytesLength ? 
termBytesLength : lastTermBytesLength; + while(start < limit) { + if (termBytes[start] != lastTermBytes[start]) + break; + start++; + } + + final int length = termBytesLength - start; + output.writeVInt(start); // write shared prefix length + output.writeVInt(length); // write delta length + output.writeBytes(termBytes, start, length); // write delta bytes + output.writeVInt(fieldNumber); // write field num + if (lastTermBytes.length < termBytesLength) { + lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); + } + System.arraycopy(termBytes, start, lastTermBytes, start, length); + lastTermBytesLength = termBytesLength; + } + + /** Called to complete TermInfos creation. */ + void close() throws IOException { + output.seek(4); // write size after format + output.writeLong(size); + output.close(); + + if (!isIndex) + other.close(); + } + +} diff --git a/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java new file mode 100644 index 00000000000..83bdeb5a6da --- /dev/null +++ b/lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java @@ -0,0 +1,212 @@ +package org.apache.lucene.index.codecs.preflex; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.store.*; +import org.apache.lucene.index.*; +import org.apache.lucene.index.codecs.*; +import org.apache.lucene.util.*; + +import java.util.*; +import java.io.IOException; + +public class TestSurrogates extends LuceneTestCase { + + private static final boolean DEBUG = false; + + // like Term, but uses BytesRef for text + private static class FieldAndText implements Comparable { + String field; + BytesRef text; + + public FieldAndText(Term t) { + field = t.field(); + text = new BytesRef(t.text()); + } + + public int compareTo(FieldAndText other) { + if (other.field == field) { + return text.compareTo(other.text); + } else { + return field.compareTo(other.field); + } + } + } + + // chooses from a very limited alphabet to exacerbate the + // surrogate seeking required + private static String makeDifficultRandomUnicodeString(Random r) { + final int end = r.nextInt(20); + if (end == 0) { + // allow 0 length + return ""; + } + final char[] buffer = new char[end]; + for (int i = 0; i < end; i++) { + int t = r.nextInt(5); + + if (0 == t && i < end - 1) { + // hi + buffer[i++] = (char) 0xd800; + // lo + buffer[i] = (char) 0xdc00; + } else if (t <= 3) { + buffer[i] = 'a'; + } else if (4 == t) { + buffer[i] = 0xe000; + } + } + + return new String(buffer, 0, end); + } + + private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List fieldTerms) throws IOException { + + final int numField = _TestUtil.nextInt(r, 2, 5); + + List terms = new ArrayList(); + + int tc = 0; + + for(int f=0;f fieldTerms = new ArrayList(); + SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); + + // hack alert!! 
+ int uniqueTermCount = si.docCount; + + FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1)); + assertNotNull(fields); + + if (DEBUG) { + System.out.println("\nTEST: now enum"); + } + FieldsEnum fieldsEnum = fields.iterator(); + String field; + UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); + + int termCount = 0; + while((field = fieldsEnum.next()) != null) { + TermsEnum termsEnum = fieldsEnum.terms(); + BytesRef text; + BytesRef lastText = null; + while((text = termsEnum.next()) != null) { + UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); + if (DEBUG) { + System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length))); + System.out.println(); + } + if (lastText == null) { + lastText = new BytesRef(text); + } else { + assertTrue(lastText.compareTo(text) < 0); + lastText.copy(text); + } + assertEquals(fieldTerms.get(termCount).field, field); + assertEquals(fieldTerms.get(termCount).text, text); + termCount++; + } + if (DEBUG) { + System.out.println(" no more terms for field=" + field); + } + } + assertEquals(uniqueTermCount, termCount); + + fields.close(); + } +} diff --git a/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java b/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java index e59727fcd3d..26c1a7310ef 100644 --- a/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java +++ b/lucene/src/test/org/apache/lucene/util/TestNumericUtils.java @@ -30,7 +30,7 @@ public class TestNumericUtils extends LuceneTestCase { NumericUtils.longToPrefixCoded(l, 0, act); if (last!=null) { // test if smaller - assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 ); + assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 ); assertTrue("actual bigger than last (as String)", 
last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); } // test is back and forward conversion works @@ -48,7 +48,7 @@ public class TestNumericUtils extends LuceneTestCase { NumericUtils.intToPrefixCoded(i, 0, act); if (last!=null) { // test if smaller - assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 ); + assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 ); assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); } // test is back and forward conversion works @@ -84,7 +84,7 @@ public class TestNumericUtils extends LuceneTestCase { // check sort order (prefixVals should be ascending) for (int i=1; i