mirror of https://github.com/apache/lucene.git
LUCENE-2426: change index term sort order to unicode code point order (used to be UTF16 order)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@957545 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f289f22125
commit
705dfee68c
|
@ -426,7 +426,7 @@ public class InstantiatedIndexReader extends IndexReader {
|
|||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
@ -123,7 +123,7 @@ public class InstantiatedTermsEnum extends TermsEnum {
|
|||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -808,7 +808,7 @@ public class MemoryIndex implements Serializable {
|
|||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -903,7 +903,7 @@ public class MemoryIndex implements Serializable {
|
|||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ import java.io.PrintStream;
|
|||
import java.io.IOException;
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Map;
|
||||
|
@ -596,6 +596,10 @@ public class CheckIndex {
|
|||
boolean hasOrd = true;
|
||||
final long termCountStart = status.termCount;
|
||||
|
||||
BytesRef lastTerm = null;
|
||||
|
||||
Comparator<BytesRef> termComp = terms.getComparator();
|
||||
|
||||
while(true) {
|
||||
|
||||
final BytesRef term = terms.next();
|
||||
|
@ -603,6 +607,17 @@ public class CheckIndex {
|
|||
break;
|
||||
}
|
||||
|
||||
// make sure terms arrive in order according to
|
||||
// the comp
|
||||
if (lastTerm == null) {
|
||||
lastTerm = new BytesRef(term);
|
||||
} else {
|
||||
if (termComp.compare(lastTerm, term) >= 0) {
|
||||
throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
|
||||
}
|
||||
lastTerm.copy(term);
|
||||
}
|
||||
|
||||
final int docFreq = terms.docFreq();
|
||||
status.totFreq += docFreq;
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ public final class FieldInfos {
|
|||
private final HashMap<String,FieldInfo> byName = new HashMap<String,FieldInfo>();
|
||||
private int format;
|
||||
|
||||
FieldInfos() { }
|
||||
public FieldInfos() { }
|
||||
|
||||
/**
|
||||
* Construct a FieldInfos object using the directory and the name of the file
|
||||
|
@ -62,7 +62,7 @@ public final class FieldInfos {
|
|||
* @param name The name of the file to open the IndexInput from in the Directory
|
||||
* @throws IOException
|
||||
*/
|
||||
FieldInfos(Directory d, String name) throws IOException {
|
||||
public FieldInfos(Directory d, String name) throws IOException {
|
||||
IndexInput input = d.openInput(name);
|
||||
try {
|
||||
read(input, name);
|
||||
|
|
|
@ -3964,7 +3964,7 @@ public class IndexWriter implements Closeable {
|
|||
// commit merged deletes
|
||||
SegmentReader reader = merge.readers[i] = readerPool.get(info, merge.mergeDocStores,
|
||||
MERGE_READ_BUFFER_SIZE,
|
||||
-1);
|
||||
-config.getReaderTermsIndexDivisor());
|
||||
|
||||
// We clone the segment readers because other
|
||||
// deletes may come in while we're merging so we
|
||||
|
|
|
@ -27,6 +27,12 @@ public class SegmentReadState {
|
|||
public final SegmentInfo segmentInfo;
|
||||
public final FieldInfos fieldInfos;
|
||||
public final int readBufferSize;
|
||||
|
||||
// NOTE: if this is < 0, that means "defer terms index
|
||||
// load until needed". But if the codec must load the
|
||||
// terms index on init (preflex is the only one currently
|
||||
// that must do so), then it should negate this value to
|
||||
// get the app's terms divisor:
|
||||
public final int termsIndexDivisor;
|
||||
|
||||
public SegmentReadState(Directory dir,
|
||||
|
|
|
@ -130,7 +130,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
|
|||
|
||||
// TODO: we may want to make this sort in same order
|
||||
// as Codec's terms dict?
|
||||
final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
|
||||
tvf.writeVInt(numPostings);
|
||||
byte bits = 0x0;
|
||||
|
|
|
@ -144,8 +144,7 @@ public abstract class TermsEnum {
|
|||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
// return an unused dummy to prevent NPE
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -67,7 +67,7 @@ public class IntBlockCodec extends Codec {
|
|||
|
||||
success = false;
|
||||
try {
|
||||
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
|
@ -95,7 +95,7 @@ public class IntBlockCodec extends Codec {
|
|||
state.fieldInfos,
|
||||
state.segmentInfo.name,
|
||||
state.termsIndexDivisor,
|
||||
BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
|
@ -111,7 +111,7 @@ public class IntBlockCodec extends Codec {
|
|||
state.segmentInfo.name,
|
||||
postingsReader,
|
||||
state.readBufferSize,
|
||||
BytesRef.getUTF8SortedAsUTF16Comparator(),
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator(),
|
||||
StandardCodec.TERMS_CACHE_SIZE);
|
||||
success = true;
|
||||
return ret;
|
||||
|
|
|
@ -39,11 +39,15 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
/** Exposes flex API on a pre-flex index, as a codec.
|
||||
* @lucene.experimental */
|
||||
public class PreFlexFields extends FieldsProducer {
|
||||
|
||||
private static final boolean DEBUG_SURROGATES = false;
|
||||
|
||||
public TermInfosReader tis;
|
||||
public final TermInfosReader tisNoIndex;
|
||||
|
||||
|
@ -60,6 +64,15 @@ public class PreFlexFields extends FieldsProducer {
|
|||
throws IOException {
|
||||
|
||||
si = info;
|
||||
|
||||
// NOTE: we must always load terms index, even for
|
||||
// "sequential" scan during merging, because what is
|
||||
// sequential to merger may not be to TermInfosReader
|
||||
// since we do the surrogates dance:
|
||||
if (indexDivisor < 0) {
|
||||
indexDivisor = -indexDivisor;
|
||||
}
|
||||
|
||||
TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);
|
||||
if (indexDivisor == -1) {
|
||||
tisNoIndex = r;
|
||||
|
@ -174,7 +187,6 @@ public class PreFlexFields extends FieldsProducer {
|
|||
private class PreFlexFieldsEnum extends FieldsEnum {
|
||||
final Iterator<FieldInfo> it;
|
||||
private final PreTermsEnum termsEnum;
|
||||
private int count;
|
||||
FieldInfo current;
|
||||
|
||||
public PreFlexFieldsEnum() throws IOException {
|
||||
|
@ -185,7 +197,6 @@ public class PreFlexFields extends FieldsProducer {
|
|||
@Override
|
||||
public String next() {
|
||||
if (it.hasNext()) {
|
||||
count++;
|
||||
current = it.next();
|
||||
return current.name;
|
||||
} else {
|
||||
|
@ -195,7 +206,7 @@ public class PreFlexFields extends FieldsProducer {
|
|||
|
||||
@Override
|
||||
public TermsEnum terms() throws IOException {
|
||||
termsEnum.reset(current, count == 1);
|
||||
termsEnum.reset(current);
|
||||
return termsEnum;
|
||||
}
|
||||
}
|
||||
|
@ -209,14 +220,15 @@ public class PreFlexFields extends FieldsProducer {
|
|||
@Override
|
||||
public TermsEnum iterator() throws IOException {
|
||||
PreTermsEnum termsEnum = new PreTermsEnum();
|
||||
termsEnum.reset(fieldInfo, false);
|
||||
termsEnum.reset(fieldInfo);
|
||||
return termsEnum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
// Pre-flex indexes always sorted in UTF16 order
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
// Pre-flex indexes always sorted in UTF16 order, but
|
||||
// we remap on-the-fly to unicode order
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -227,37 +239,229 @@ public class PreFlexFields extends FieldsProducer {
|
|||
private BytesRef current;
|
||||
private final BytesRef scratchBytesRef = new BytesRef();
|
||||
|
||||
void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException {
|
||||
private int[] surrogateSeekPending = new int[1];
|
||||
private boolean[] surrogateDidSeekBack = new boolean[1];
|
||||
private int surrogateSeekUpto;
|
||||
private char[] pendingPrefix;
|
||||
|
||||
private SegmentTermEnum seekTermEnum;
|
||||
private Term protoTerm;
|
||||
private int newSuffixStart;
|
||||
|
||||
void reset(FieldInfo fieldInfo) throws IOException {
|
||||
this.fieldInfo = fieldInfo;
|
||||
protoTerm = new Term(fieldInfo.name);
|
||||
if (termEnum == null) {
|
||||
// First time reset is called
|
||||
if (isFirstField) {
|
||||
termEnum = getTermsDict().terms();
|
||||
skipNext = false;
|
||||
} else {
|
||||
termEnum = getTermsDict().terms(new Term(fieldInfo.name, ""));
|
||||
skipNext = true;
|
||||
}
|
||||
termEnum = getTermsDict().terms(protoTerm);
|
||||
seekTermEnum = getTermsDict().terms(protoTerm);
|
||||
} else {
|
||||
final Term t = termEnum.term();
|
||||
if (t != null && t.field() == fieldInfo.name) {
|
||||
// No need to seek -- we have already advanced onto
|
||||
// this field. We must be @ first term because
|
||||
// flex API will not advance this enum further, on
|
||||
// seeing a different field.
|
||||
} else {
|
||||
assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned
|
||||
final TermInfosReader tis = getTermsDict();
|
||||
tis.seekEnum(termEnum, new Term(fieldInfo.name, ""));
|
||||
}
|
||||
skipNext = true;
|
||||
getTermsDict().seekEnum(termEnum, protoTerm);
|
||||
}
|
||||
skipNext = true;
|
||||
|
||||
surrogateSeekUpto = 0;
|
||||
newSuffixStart = 0;
|
||||
|
||||
surrogatesDance();
|
||||
}
|
||||
|
||||
private void surrogatesDance() throws IOException {
|
||||
|
||||
// Tricky: prior to 4.0, Lucene index sorted terms in
|
||||
// UTF16 order, but as of 4.0 we sort by Unicode code
|
||||
// point order. These orders differ because of the
|
||||
// surrogates; so we have to fix up our enum, here, by
|
||||
// carefully first seeking past the surrogates and
|
||||
// then back again at the end. The process is
|
||||
// recursive, since any given term could have multiple
|
||||
// new occurrences of surrogate pairs, so we use a
|
||||
// stack to record the pending seek-backs.
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
|
||||
}
|
||||
|
||||
while(popPendingSeek());
|
||||
while(pushNewSurrogate());
|
||||
}
|
||||
|
||||
// only for debugging
|
||||
private String getStack() {
|
||||
if (surrogateSeekUpto == 0) {
|
||||
return "null";
|
||||
} else {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
for(int i=0;i<surrogateSeekUpto;i++) {
|
||||
if (i > 0) {
|
||||
sb.append(' ');
|
||||
}
|
||||
sb.append(surrogateSeekPending[i]);
|
||||
}
|
||||
sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean popPendingSeek() throws IOException {
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
|
||||
}
|
||||
// if a .next() has advanced beyond the
|
||||
// after-surrogates range we had last seeked to, we
|
||||
// must seek back to the start and resume .next from
|
||||
// there. this pops the pending seek off the stack.
|
||||
final Term t = termEnum.term();
|
||||
if (surrogateSeekUpto > 0) {
|
||||
final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" seekPrefix=" + seekPrefix);
|
||||
}
|
||||
if (newSuffixStart < seekPrefix) {
|
||||
assert pendingPrefix != null;
|
||||
assert pendingPrefix.length > seekPrefix;
|
||||
pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
|
||||
Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
|
||||
}
|
||||
getTermsDict().seekEnum(termEnum, t2);
|
||||
surrogateDidSeekBack[surrogateSeekUpto-1] = true;
|
||||
|
||||
// +2 because we don't want to re-check the
|
||||
// surrogates we just seek'd back to
|
||||
newSuffixStart = seekPrefix + 2;
|
||||
return true;
|
||||
} else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
|
||||
assert pendingPrefix != null;
|
||||
assert pendingPrefix.length > seekPrefix;
|
||||
pendingPrefix[seekPrefix] = 0xffff;
|
||||
Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
|
||||
}
|
||||
getTermsDict().seekEnum(termEnum, t2);
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
|
||||
}
|
||||
surrogateSeekUpto--;
|
||||
|
||||
if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
|
||||
// force pop
|
||||
newSuffixStart = -1;
|
||||
} else {
|
||||
newSuffixStart = termEnum.newSuffixStart;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean pushNewSurrogate() throws IOException {
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack());
|
||||
}
|
||||
final Term t = termEnum.term();
|
||||
if (t == null || t.field() != fieldInfo.name) {
|
||||
return false;
|
||||
}
|
||||
final String text = t.text();
|
||||
final int textLen = text.length();
|
||||
|
||||
for(int i=Math.max(0,newSuffixStart);i<textLen;i++) {
|
||||
final char ch = text.charAt(i);
|
||||
if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
|
||||
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
|
||||
}
|
||||
|
||||
// the next() that we just did read in a new
|
||||
// suffix, containing a surrogate pair
|
||||
|
||||
// seek forward to see if there are any terms with
|
||||
// this same prefix, but with characters after the
|
||||
// surrogate range; if so, we must first iterate
|
||||
// them, then seek back to the surrogates
|
||||
|
||||
char[] testPrefix = new char[i+1];
|
||||
for(int j=0;j<i;j++) {
|
||||
testPrefix[j] = text.charAt(j);
|
||||
}
|
||||
testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
|
||||
|
||||
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new String(testPrefix)));
|
||||
|
||||
Term t2 = seekTermEnum.term();
|
||||
boolean isPrefix;
|
||||
if (t2 != null && t2.field() == fieldInfo.name) {
|
||||
String seekText = t2.text();
|
||||
isPrefix = true;
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" seek found " + UnicodeUtil.toHexString(seekText));
|
||||
}
|
||||
for(int j=0;j<i;j++) {
|
||||
if (testPrefix[j] != seekText.charAt(j)) {
|
||||
isPrefix = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (DEBUG_SURROGATES && !isPrefix) {
|
||||
System.out.println(" no end terms");
|
||||
}
|
||||
} else {
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" no end terms");
|
||||
}
|
||||
isPrefix = false;
|
||||
}
|
||||
|
||||
if (isPrefix) {
|
||||
// we found a term, sharing the same prefix,
|
||||
// with characters after the surrogates, so we
|
||||
// must first enum those, and then return the
|
||||
// surrogates afterwards. push that pending
|
||||
// seek on the surrogates stack now:
|
||||
pendingPrefix = testPrefix;
|
||||
|
||||
getTermsDict().seekEnum(termEnum, t2);
|
||||
|
||||
if (surrogateSeekUpto == surrogateSeekPending.length) {
|
||||
surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
|
||||
}
|
||||
if (surrogateSeekUpto == surrogateDidSeekBack.length) {
|
||||
surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
|
||||
}
|
||||
surrogateSeekPending[surrogateSeekUpto] = i;
|
||||
surrogateDidSeekBack[surrogateSeekUpto] = false;
|
||||
surrogateSeekUpto++;
|
||||
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
|
||||
}
|
||||
|
||||
newSuffixStart = i+1;
|
||||
|
||||
return true;
|
||||
} else {
|
||||
// there are no terms after the surrogates, so
|
||||
// we do nothing to the enum and just step
|
||||
// through the surrogates like normal. but we
|
||||
// must keep iterating through the term, in case
|
||||
// another surrogate pair appears later
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
// Pre-flex indexes always sorted in UTF16 order
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
// Pre-flex indexes always sorted in UTF16 order, but
|
||||
// we remap on-the-fly to unicode order
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -272,14 +476,24 @@ public class PreFlexFields extends FieldsProducer {
|
|||
|
||||
@Override
|
||||
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println("TE.seek() term=" + term.utf8ToString());
|
||||
}
|
||||
skipNext = false;
|
||||
final TermInfosReader tis = getTermsDict();
|
||||
final Term t0 = new Term(fieldInfo.name, term.utf8ToString());
|
||||
final Term t0 = protoTerm.createTerm(term.utf8ToString());
|
||||
|
||||
assert termEnum != null;
|
||||
|
||||
if (termEnum == null) {
|
||||
termEnum = tis.terms(t0);
|
||||
} else {
|
||||
tis.seekEnum(termEnum, t0);
|
||||
}
|
||||
|
||||
surrogateSeekUpto = 0;
|
||||
surrogatesDance();
|
||||
|
||||
final Term t = termEnum.term();
|
||||
|
||||
final BytesRef tr;
|
||||
|
@ -304,6 +518,9 @@ public class PreFlexFields extends FieldsProducer {
|
|||
|
||||
@Override
|
||||
public BytesRef next() throws IOException {
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println("TE.next() skipNext=" + skipNext);
|
||||
}
|
||||
if (skipNext) {
|
||||
skipNext = false;
|
||||
if (termEnum.term() == null) {
|
||||
|
@ -313,19 +530,37 @@ public class PreFlexFields extends FieldsProducer {
|
|||
return current = scratchBytesRef;
|
||||
}
|
||||
}
|
||||
if (termEnum.next()) {
|
||||
if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
|
||||
newSuffixStart = termEnum.newSuffixStart;
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" set newSuffixStart=" + newSuffixStart);
|
||||
}
|
||||
surrogatesDance();
|
||||
final Term t = termEnum.term();
|
||||
if (t.field() == fieldInfo.name) {
|
||||
if (t == null || t.field() != fieldInfo.name) {
|
||||
assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
|
||||
current = null;
|
||||
} else {
|
||||
scratchBytesRef.copy(t.text());
|
||||
current = scratchBytesRef;
|
||||
}
|
||||
return current;
|
||||
} else {
|
||||
if (DEBUG_SURROGATES) {
|
||||
System.out.println(" force pop");
|
||||
}
|
||||
// force pop
|
||||
newSuffixStart = -1;
|
||||
surrogatesDance();
|
||||
final Term t = termEnum.term();
|
||||
if (t == null || t.field() != fieldInfo.name) {
|
||||
assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
|
||||
return null;
|
||||
} else {
|
||||
scratchBytesRef.copy(t.text());
|
||||
current = scratchBytesRef;
|
||||
return current;
|
||||
} else {
|
||||
assert !t.field().equals(fieldInfo.name); // make sure field name is interned
|
||||
// Crossed into new field
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -53,6 +53,7 @@ public final class SegmentTermEnum implements Cloneable {
|
|||
long indexPointer = 0;
|
||||
int indexInterval;
|
||||
int skipInterval;
|
||||
int newSuffixStart;
|
||||
int maxSkipLevels;
|
||||
private int formatM1SkipInterval;
|
||||
|
||||
|
@ -136,6 +137,7 @@ public final class SegmentTermEnum implements Cloneable {
|
|||
|
||||
prevBuffer.set(termBuffer);
|
||||
termBuffer.read(input, fieldInfos);
|
||||
newSuffixStart = termBuffer.newSuffixStart;
|
||||
|
||||
termInfo.docFreq = input.readVInt(); // read doc freq
|
||||
termInfo.freqPointer += input.readVLong(); // read freq pointer
|
||||
|
|
|
@ -19,7 +19,6 @@ package org.apache.lucene.index.codecs.preflex;
|
|||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.index.Term;
|
||||
|
@ -34,6 +33,8 @@ final class TermBuffer implements Cloneable {
|
|||
private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
|
||||
private BytesRef bytes = new BytesRef(10);
|
||||
|
||||
int newSuffixStart;
|
||||
|
||||
public final int compareTo(TermBuffer other) {
|
||||
if (field == other.field) // fields are interned
|
||||
return compareChars(text.result, text.length, other.text.result, other.text.length);
|
||||
|
@ -60,23 +61,33 @@ final class TermBuffer implements Cloneable {
|
|||
int start = input.readVInt();
|
||||
int length = input.readVInt();
|
||||
int totalLength = start + length;
|
||||
if (bytes.bytes.length < totalLength) {
|
||||
bytes.grow(totalLength);
|
||||
}
|
||||
if (dirty) {
|
||||
// Fully convert all bytes since bytes is dirty
|
||||
UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
|
||||
if (bytes.bytes.length < totalLength)
|
||||
bytes.bytes = new byte[totalLength];
|
||||
bytes.length = totalLength;
|
||||
input.readBytes(bytes.bytes, start, length);
|
||||
UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
|
||||
dirty = false;
|
||||
} else {
|
||||
// Incrementally convert only the UTF8 bytes that are new:
|
||||
if (bytes.bytes.length < totalLength)
|
||||
bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength);
|
||||
bytes.length = totalLength;
|
||||
input.readBytes(bytes.bytes, start, length);
|
||||
UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
|
||||
}
|
||||
|
||||
while(true) {
|
||||
newSuffixStart = text.offsets[start];
|
||||
if (newSuffixStart != -1) {
|
||||
break;
|
||||
}
|
||||
if (--start == 0) {
|
||||
newSuffixStart = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
this.field = fieldInfos.fieldName(input.readVInt());
|
||||
}
|
||||
|
||||
|
@ -124,10 +135,11 @@ final class TermBuffer implements Cloneable {
|
|||
try {
|
||||
clone = (TermBuffer)super.clone();
|
||||
} catch (CloneNotSupportedException e) {}
|
||||
|
||||
clone.dirty = true;
|
||||
clone.bytes = new BytesRef(10);
|
||||
clone.text = new UnicodeUtil.UTF16Result();
|
||||
clone.text.offsets = new int[text.offsets.length];
|
||||
System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
|
||||
clone.text.copyText(text);
|
||||
return clone;
|
||||
}
|
||||
|
|
|
@ -80,7 +80,7 @@ public class PulsingCodec extends Codec {
|
|||
// Terms dict
|
||||
success = false;
|
||||
try {
|
||||
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
|
@ -111,7 +111,7 @@ public class PulsingCodec extends Codec {
|
|||
state.fieldInfos,
|
||||
state.segmentInfo.name,
|
||||
state.termsIndexDivisor,
|
||||
BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
|
@ -126,7 +126,7 @@ public class PulsingCodec extends Codec {
|
|||
state.dir, state.fieldInfos, state.segmentInfo.name,
|
||||
pulsingReader,
|
||||
state.readBufferSize,
|
||||
BytesRef.getUTF8SortedAsUTF16Comparator(),
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator(),
|
||||
StandardCodec.TERMS_CACHE_SIZE);
|
||||
success = true;
|
||||
return ret;
|
||||
|
|
|
@ -63,7 +63,7 @@ public class SepCodec extends Codec {
|
|||
|
||||
success = false;
|
||||
try {
|
||||
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
|
@ -95,7 +95,7 @@ public class SepCodec extends Codec {
|
|||
state.fieldInfos,
|
||||
state.segmentInfo.name,
|
||||
state.termsIndexDivisor,
|
||||
BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
|
@ -111,7 +111,7 @@ public class SepCodec extends Codec {
|
|||
state.segmentInfo.name,
|
||||
postingsReader,
|
||||
state.readBufferSize,
|
||||
BytesRef.getUTF8SortedAsUTF16Comparator(),
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator(),
|
||||
StandardCodec.TERMS_CACHE_SIZE);
|
||||
success = true;
|
||||
return ret;
|
||||
|
|
|
@ -104,7 +104,7 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
|
|||
indexInterval = in.readInt();
|
||||
this.indexDivisor = indexDivisor;
|
||||
|
||||
if (indexDivisor == -1) {
|
||||
if (indexDivisor < 0) {
|
||||
totalIndexInterval = indexInterval;
|
||||
} else {
|
||||
// In case terms index gets loaded, later, on demand
|
||||
|
@ -131,7 +131,7 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
|
|||
}
|
||||
success = true;
|
||||
} finally {
|
||||
if (indexDivisor != -1) {
|
||||
if (indexDivisor > 0) {
|
||||
in.close();
|
||||
this.in = null;
|
||||
if (success) {
|
||||
|
@ -173,7 +173,7 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
|
|||
// We still create the indexReader when indexDivisor
|
||||
// is -1, so that StandardTermsDictReader can call
|
||||
// isIndexTerm for each field:
|
||||
if (indexDivisor != -1) {
|
||||
if (indexDivisor > 0) {
|
||||
coreIndex = new CoreFieldIndex(indexStart,
|
||||
termsStart,
|
||||
packedIndexStart,
|
||||
|
@ -218,7 +218,8 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
|
|||
|
||||
@Override
|
||||
public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
|
||||
// You must call loadTermsIndex if you had specified -1 for indexDivisor
|
||||
// You must call loadTermsIndex if you had specified
|
||||
// indexDivisor < 0 to ctor
|
||||
if (coreIndex == null) {
|
||||
throw new IllegalStateException("terms index was not loaded");
|
||||
}
|
||||
|
|
|
@ -58,7 +58,7 @@ public class StandardCodec extends Codec {
|
|||
|
||||
success = false;
|
||||
try {
|
||||
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
success = true;
|
||||
return ret;
|
||||
} finally {
|
||||
|
@ -85,7 +85,7 @@ public class StandardCodec extends Codec {
|
|||
state.fieldInfos,
|
||||
state.segmentInfo.name,
|
||||
state.termsIndexDivisor,
|
||||
BytesRef.getUTF8SortedAsUTF16Comparator());
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator());
|
||||
success = true;
|
||||
} finally {
|
||||
if (!success) {
|
||||
|
@ -101,7 +101,7 @@ public class StandardCodec extends Codec {
|
|||
state.segmentInfo.name,
|
||||
postings,
|
||||
state.readBufferSize,
|
||||
BytesRef.getUTF8SortedAsUTF16Comparator(),
|
||||
BytesRef.getUTF8SortedAsUnicodeComparator(),
|
||||
TERMS_CACHE_SIZE);
|
||||
success = true;
|
||||
return ret;
|
||||
|
|
|
@ -103,7 +103,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
// build a cache of sorted transitions for every state
|
||||
allTransitions = new Transition[runAutomaton.getSize()][];
|
||||
for (State state : this.automaton.getNumberedStates()) {
|
||||
state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
|
||||
state.sortTransitions(Transition.CompareByMinMaxThenDest);
|
||||
state.trimTransitionsArray();
|
||||
allTransitions[state.getNumber()] = state.transitionsArray;
|
||||
}
|
||||
|
@ -158,11 +158,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
// seek to the next possible string;
|
||||
if (nextString()) {
|
||||
// reposition
|
||||
|
||||
// FIXME: this is really bad to turn off
|
||||
// but it cannot work correctly until terms are in utf8 order.
|
||||
linear = false;
|
||||
|
||||
|
||||
if (linear)
|
||||
setLinear(infinitePosition);
|
||||
return seekBytesRef;
|
||||
|
@ -188,15 +184,15 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
}
|
||||
for (int i = 0; i < allTransitions[state].length; i++) {
|
||||
Transition t = allTransitions[state][i];
|
||||
if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 &&
|
||||
compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
|
||||
if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) &&
|
||||
(seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
|
||||
maxInterval = t.getMax();
|
||||
break;
|
||||
}
|
||||
}
|
||||
// 0xef terms don't get the optimization... not worth the trouble.
|
||||
if (maxInterval != 0xef)
|
||||
maxInterval = incrementUTF16(maxInterval);
|
||||
// 0xff terms don't get the optimization... not worth the trouble.
|
||||
if (maxInterval != 0xff)
|
||||
maxInterval = incrementUTF8(maxInterval);
|
||||
int length = position + 1; /* position + maxTransition */
|
||||
if (linearUpperBound.bytes.length < length)
|
||||
linearUpperBound.bytes = new byte[length];
|
||||
|
@ -281,7 +277,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
// if the next character is U+FFFF and is not part of the useful portion,
|
||||
// then by definition it puts us in a reject state, and therefore this
|
||||
// path is dead. there cannot be any higher transitions. backtrack.
|
||||
c = incrementUTF16(c);
|
||||
c = incrementUTF8(c);
|
||||
if (c == -1)
|
||||
return false;
|
||||
}
|
||||
|
@ -295,8 +291,8 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
|
||||
for (int i = 0; i < transitions.length; i++) {
|
||||
Transition transition = transitions[i];
|
||||
if (compareToUTF16(transition.getMax(), c) >= 0) {
|
||||
int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
|
||||
if (transition.getMax() >= c) {
|
||||
int nextChar = Math.max(c, transition.getMin());
|
||||
// append either the next sequential char, or the minimum transition
|
||||
seekBytesRef.grow(seekBytesRef.length + 1);
|
||||
seekBytesRef.length++;
|
||||
|
@ -342,9 +338,9 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
private boolean backtrack(int position) {
|
||||
while (position > 0) {
|
||||
int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
|
||||
// if a character is 0xef its a dead-end too,
|
||||
// because there is no higher character in UTF-16 sort order.
|
||||
nextChar = incrementUTF16(nextChar);
|
||||
// if a character is 0xff it's a dead-end too,
|
||||
// because there is no higher character in UTF-8 sort order.
|
||||
nextChar = incrementUTF8(nextChar);
|
||||
if (nextChar != -1) {
|
||||
seekBytesRef.bytes[position - 1] = (byte) nextChar;
|
||||
seekBytesRef.length = position;
|
||||
|
@ -355,34 +351,11 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
|
|||
return false; /* all solutions exhausted */
|
||||
}
|
||||
|
||||
/* return the next utf8 byte in utf16 order, or -1 if exhausted */
|
||||
private final int incrementUTF16(int utf8) {
|
||||
/* return the next utf8 byte in utf8 order, or -1 if exhausted */
|
||||
private final int incrementUTF8(int utf8) {
|
||||
switch(utf8) {
|
||||
case 0xed: return 0xf0;
|
||||
case 0xfd: return 0xee;
|
||||
case 0xee: return 0xef;
|
||||
case 0xef: return -1;
|
||||
case 0xff: return -1;
|
||||
default: return utf8 + 1;
|
||||
}
|
||||
}
|
||||
|
||||
int compareToUTF16(int aByte, int bByte) {
|
||||
if (aByte != bByte) {
|
||||
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
|
||||
|
||||
// We know the terms are not equal, but, we may
|
||||
// have to carefully fixup the bytes at the
|
||||
// difference to match UTF16's sort order:
|
||||
if (aByte >= 0xee && bByte >= 0xee) {
|
||||
if ((aByte & 0xfe) == 0xee) {
|
||||
aByte += 0x10;
|
||||
}
|
||||
if ((bByte&0xfe) == 0xee) {
|
||||
bByte += 0x10;
|
||||
}
|
||||
}
|
||||
return aByte - bByte;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -327,6 +327,29 @@ public final class ArrayUtil {
|
|||
return array;
|
||||
}
|
||||
|
||||
public static boolean[] grow(boolean[] array, int minSize) {
|
||||
if (array.length < minSize) {
|
||||
boolean[] newArray = new boolean[oversize(minSize, 1)];
|
||||
System.arraycopy(array, 0, newArray, 0, array.length);
|
||||
return newArray;
|
||||
} else
|
||||
return array;
|
||||
}
|
||||
|
||||
public static boolean[] grow(boolean[] array) {
|
||||
return grow(array, 1 + array.length);
|
||||
}
|
||||
|
||||
public static boolean[] shrink(boolean[] array, int targetSize) {
|
||||
final int newSize = getShrinkSize(array.length, targetSize, 1);
|
||||
if (newSize != array.length) {
|
||||
boolean[] newArray = new boolean[newSize];
|
||||
System.arraycopy(array, 0, newArray, 0, newSize);
|
||||
return newArray;
|
||||
} else
|
||||
return array;
|
||||
}
|
||||
|
||||
public static char[] grow(char[] array, int minSize) {
|
||||
if (array.length < minSize) {
|
||||
char[] newArray = new char[oversize(minSize, RamUsageEstimator.NUM_BYTES_CHAR)];
|
||||
|
|
|
@ -217,14 +217,7 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
|
|||
bytes = ArrayUtil.grow(bytes, newLength);
|
||||
}
|
||||
|
||||
private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
|
||||
|
||||
public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() {
|
||||
return utf8SortedAsUTF16SortOrder;
|
||||
}
|
||||
|
||||
/** Unsigned byte order comparison */
|
||||
/*
|
||||
public int compareTo(BytesRef other) {
|
||||
if (this == other) return 0;
|
||||
|
||||
|
@ -245,52 +238,18 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
|
|||
// One is a prefix of the other, or, they are equal:
|
||||
return this.length - other.length;
|
||||
}
|
||||
*/
|
||||
|
||||
/** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change
|
||||
* in the future to unsigned byte comparison. */
|
||||
public int compareTo(BytesRef other) {
|
||||
if (this == other) return 0;
|
||||
private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();
|
||||
|
||||
final byte[] aBytes = this.bytes;
|
||||
int aUpto = this.offset;
|
||||
final byte[] bBytes = other.bytes;
|
||||
int bUpto = other.offset;
|
||||
|
||||
final int aStop = aUpto + Math.min(this.length, other.length);
|
||||
|
||||
while(aUpto < aStop) {
|
||||
int aByte = aBytes[aUpto++] & 0xff;
|
||||
int bByte = bBytes[bUpto++] & 0xff;
|
||||
if (aByte != bByte) {
|
||||
|
||||
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
|
||||
|
||||
// We know the terms are not equal, but, we may
|
||||
// have to carefully fixup the bytes at the
|
||||
// difference to match UTF16's sort order:
|
||||
if (aByte >= 0xee && bByte >= 0xee) {
|
||||
if ((aByte & 0xfe) == 0xee) {
|
||||
aByte += 0x10;
|
||||
}
|
||||
if ((bByte&0xfe) == 0xee) {
|
||||
bByte += 0x10;
|
||||
}
|
||||
}
|
||||
return aByte - bByte;
|
||||
}
|
||||
}
|
||||
|
||||
// One is a prefix of the other, or, they are equal:
|
||||
return this.length - other.length;
|
||||
public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() {
|
||||
return utf8SortedAsUnicodeSortOrder;
|
||||
}
|
||||
|
||||
private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
|
||||
private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> {
|
||||
// Only singleton
|
||||
private UTF8SortedAsUTF16Comparator() {};
|
||||
private UTF8SortedAsUnicodeComparator() {};
|
||||
|
||||
public int compare(BytesRef a, BytesRef b) {
|
||||
|
||||
final byte[] aBytes = a.bytes;
|
||||
int aUpto = a.offset;
|
||||
final byte[] bBytes = b.bytes;
|
||||
|
@ -307,32 +266,15 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
|
|||
int aByte = aBytes[aUpto++] & 0xff;
|
||||
int bByte = bBytes[bUpto++] & 0xff;
|
||||
|
||||
if (aByte != bByte) {
|
||||
|
||||
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
|
||||
|
||||
// We know the terms are not equal, but, we may
|
||||
// have to carefully fixup the bytes at the
|
||||
// difference to match UTF16's sort order:
|
||||
if (aByte >= 0xee && bByte >= 0xee) {
|
||||
if ((aByte & 0xfe) == 0xee) {
|
||||
aByte += 0x10;
|
||||
}
|
||||
if ((bByte&0xfe) == 0xee) {
|
||||
bByte += 0x10;
|
||||
}
|
||||
}
|
||||
return aByte - bByte;
|
||||
int diff = aByte - bByte;
|
||||
if (diff != 0) {
|
||||
return diff;
|
||||
}
|
||||
}
|
||||
|
||||
// One is a prefix of the other, or, they are equal:
|
||||
return a.length - b.length;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
return this == other;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void writeExternal(ObjectOutput out)
|
||||
|
|
|
@ -358,7 +358,6 @@ final public class UnicodeUtil {
|
|||
out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
|
||||
}
|
||||
}
|
||||
|
||||
offsets[upto] = outUpto;
|
||||
result.length = outUpto;
|
||||
}
|
||||
|
@ -483,7 +482,7 @@ final public class UnicodeUtil {
|
|||
}
|
||||
}
|
||||
*/
|
||||
public static final boolean validUTF16String(CharSequence s) {
|
||||
public static boolean validUTF16String(CharSequence s) {
|
||||
final int size = s.length();
|
||||
for(int i=0;i<size;i++) {
|
||||
char ch = s.charAt(i);
|
||||
|
@ -507,7 +506,7 @@ final public class UnicodeUtil {
|
|||
return true;
|
||||
}
|
||||
|
||||
public static final boolean validUTF16String(char[] s, int size) {
|
||||
public static boolean validUTF16String(char[] s, int size) {
|
||||
for(int i=0;i<size;i++) {
|
||||
char ch = s[i];
|
||||
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
|
||||
|
@ -559,7 +558,7 @@ final public class UnicodeUtil {
|
|||
/** Returns the number of code points in this utf8
|
||||
* sequence. Behavior is undefined if the utf8 sequence
|
||||
* is invalid.*/
|
||||
public static final int codePointCount(BytesRef utf8) {
|
||||
public static int codePointCount(BytesRef utf8) {
|
||||
int upto = utf8.offset;
|
||||
final int limit = utf8.offset + utf8.length;
|
||||
final byte[] bytes = utf8.bytes;
|
||||
|
@ -673,4 +672,33 @@ final public class UnicodeUtil {
|
|||
}
|
||||
return new String(chars, 0, w);
|
||||
}
|
||||
|
||||
// for debugging
|
||||
public static String toHexString(String s) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for(int i=0;i<s.length();i++) {
|
||||
char ch = s.charAt(i);
|
||||
if (i > 0) {
|
||||
sb.append(' ');
|
||||
}
|
||||
if (ch < 128) {
|
||||
sb.append(ch);
|
||||
} else {
|
||||
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
|
||||
sb.append("H:");
|
||||
} else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
|
||||
sb.append("L:");
|
||||
} else if (ch > UNI_SUR_LOW_END) {
|
||||
if (ch == 0xffff) {
|
||||
sb.append("F:");
|
||||
} else {
|
||||
sb.append("E:");
|
||||
}
|
||||
}
|
||||
|
||||
sb.append("0x" + Integer.toHexString(ch));
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -210,64 +210,4 @@ public class Transition implements Serializable, Cloneable {
|
|||
}
|
||||
|
||||
public static final Comparator<Transition> CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle();
|
||||
|
||||
private static class UTF8InUTF16Order {
|
||||
protected int compareCodePoint(int aByte, int bByte) {
|
||||
if (aByte != bByte) {
|
||||
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
|
||||
|
||||
// We know the terms are not equal, but, we may
|
||||
// have to carefully fixup the bytes at the
|
||||
// difference to match UTF16's sort order:
|
||||
if (aByte >= 0xee && bByte >= 0xee) {
|
||||
if ((aByte & 0xfe) == 0xee) {
|
||||
aByte += 0x10;
|
||||
}
|
||||
if ((bByte&0xfe) == 0xee) {
|
||||
bByte += 0x10;
|
||||
}
|
||||
}
|
||||
return aByte - bByte;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
|
||||
public int compare(Transition t1, Transition t2) {
|
||||
if (t1.to != t2.to) {
|
||||
if (t1.to == null) return -1;
|
||||
else if (t2.to == null) return 1;
|
||||
else if (t1.to.number < t2.to.number) return -1;
|
||||
else if (t1.to.number > t2.to.number) return 1;
|
||||
}
|
||||
int minComp = compareCodePoint(t1.min, t2.min);
|
||||
if (minComp != 0) return minComp;
|
||||
int maxComp = compareCodePoint(t1.max, t2.max);
|
||||
if (maxComp != 0) return maxComp;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static final Comparator<Transition> CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle();
|
||||
|
||||
private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
|
||||
public int compare(Transition t1, Transition t2) {
|
||||
int minComp = compareCodePoint(t1.min, t2.min);
|
||||
if (minComp != 0) return minComp;
|
||||
int maxComp = compareCodePoint(t1.max, t2.max);
|
||||
if (maxComp != 0) return maxComp;
|
||||
if (t1.to != t2.to) {
|
||||
if (t1.to == null) return -1;
|
||||
else if (t2.to == null) return 1;
|
||||
else if (t1.to.number < t2.to.number) return -1;
|
||||
else if (t1.to.number > t2.to.number) return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static final Comparator<Transition> CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle();
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -179,7 +179,7 @@ public class TestExternalCodecs extends LuceneTestCase {
|
|||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -263,7 +263,7 @@ public class TestExternalCodecs extends LuceneTestCase {
|
|||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUTF16Comparator();
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -4621,38 +4621,22 @@ public class TestIndexWriter extends LuceneTestCase {
|
|||
private void checkTermsOrder(IndexReader r, Set<String> allTerms, boolean isTop) throws IOException {
|
||||
TermsEnum terms = MultiFields.getFields(r).terms("f").iterator();
|
||||
|
||||
char[] last = new char[2];
|
||||
int lastLength = 0;
|
||||
BytesRef last = new BytesRef();
|
||||
|
||||
Set<String> seenTerms = new HashSet<String>();
|
||||
|
||||
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||
while(true) {
|
||||
final BytesRef term = terms.next();
|
||||
if (term == null) {
|
||||
break;
|
||||
}
|
||||
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
|
||||
assertTrue(utf16.length <= 2);
|
||||
|
||||
// Make sure last term comes before current one, in
|
||||
// UTF16 sort order
|
||||
int i = 0;
|
||||
for(i=0;i<lastLength && i<utf16.length;i++) {
|
||||
assertTrue("UTF16 code unit " + termDesc(new String(utf16.result, 0, utf16.length)) + " incorrectly sorted after code unit " + termDesc(new String(last, 0, lastLength)), last[i] <= utf16.result[i]);
|
||||
if (last[i] < utf16.result[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Terms should not have been identical
|
||||
assertTrue(lastLength != utf16.length || i < lastLength);
|
||||
assertTrue(last.compareTo(term) < 0);
|
||||
last.copy(term);
|
||||
|
||||
final String s = new String(utf16.result, 0, utf16.length);
|
||||
final String s = term.utf8ToString();
|
||||
assertTrue("term " + termDesc(s) + " was not added to index (count=" + allTerms.size() + ")", allTerms.contains(s));
|
||||
seenTerms.add(s);
|
||||
|
||||
System.arraycopy(utf16.result, 0, last, 0, utf16.length);
|
||||
lastLength = utf16.length;
|
||||
}
|
||||
|
||||
if (isTop) {
|
||||
|
|
|
@ -1,5 +1,22 @@
|
|||
package org.apache.lucene.index.codecs.intblock;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.index.codecs.sep.*;
|
||||
|
@ -34,7 +51,7 @@ public class TestIntBlockCodec extends LuceneTestCase {
|
|||
out.close();
|
||||
|
||||
IntIndexInput in = new SimpleIntBlockIndexInput(dir, "test", 128);
|
||||
IntIndexInput.Reader r = in.reader();
|
||||
in.reader();
|
||||
// read no ints
|
||||
in.close();
|
||||
dir.close();
|
||||
|
|
|
@ -0,0 +1,227 @@
|
|||
package org.apache.lucene.index.codecs.preflex;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.util.*;
|
||||
|
||||
|
||||
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
|
||||
Directory. A TermInfos can be written once, in order. */
|
||||
|
||||
final class TermInfosWriter {
|
||||
/** The file format version, a negative number. */
|
||||
public static final int FORMAT = -3;
|
||||
|
||||
// Changed strings to true utf8 with length-in-bytes not
|
||||
// length-in-chars
|
||||
public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
|
||||
|
||||
// NOTE: always change this if you switch to a new format!
|
||||
public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
|
||||
|
||||
private FieldInfos fieldInfos;
|
||||
private IndexOutput output;
|
||||
private TermInfo lastTi = new TermInfo();
|
||||
private long size;
|
||||
|
||||
// TODO: the default values for these two parameters should be settable from
|
||||
// IndexWriter. However, once that's done, folks will start setting them to
|
||||
// ridiculous values and complaining that things don't work well, as with
|
||||
// mergeFactor. So, let's wait until a number of folks find that alternate
|
||||
// values work better. Note that both of these values are stored in the
|
||||
// segment, so that it's safe to change these w/o rebuilding all indexes.
|
||||
|
||||
/** Expert: The fraction of terms in the "dictionary" which should be stored
|
||||
* in RAM. Smaller values use more memory, but make searching slightly
|
||||
* faster, while larger values use less memory and make searching slightly
|
||||
* slower. Searching is typically not dominated by dictionary lookup, so
|
||||
* tweaking this is rarely useful.*/
|
||||
int indexInterval = 128;
|
||||
|
||||
/** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
|
||||
* used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
|
||||
* smaller indexes, greater acceleration, but fewer accelerable cases, while
|
||||
* smaller values result in bigger indexes, less acceleration and more
|
||||
* accelerable cases. More detailed experiments would be useful here. */
|
||||
int skipInterval = 16;
|
||||
|
||||
/** Expert: The maximum number of skip levels. Smaller values result in
|
||||
* slightly smaller indexes, but slower skipping in big posting lists.
|
||||
*/
|
||||
int maxSkipLevels = 10;
|
||||
|
||||
private long lastIndexPointer;
|
||||
private boolean isIndex;
|
||||
private byte[] lastTermBytes = new byte[10];
|
||||
private int lastTermBytesLength = 0;
|
||||
private int lastFieldNumber = -1;
|
||||
|
||||
private TermInfosWriter other;
|
||||
private BytesRef utf8Result = new BytesRef(10);
|
||||
|
||||
TermInfosWriter(Directory directory, String segment, FieldInfos fis,
|
||||
int interval)
|
||||
throws IOException {
|
||||
initialize(directory, segment, fis, interval, false);
|
||||
other = new TermInfosWriter(directory, segment, fis, interval, true);
|
||||
other.other = this;
|
||||
}
|
||||
|
||||
/**
 * Creates a single writer without a companion; {@code isIndex} selects
 * whether it writes the ".tii" (true) or the ".tis" (false) file. The public
 * constructor uses this to build the index-side writer.
 */
private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
                        int interval, boolean isIndex) throws IOException {
  this.initialize(directory, segment, fis, interval, isIndex);
}
|
||||
|
||||
private void initialize(Directory directory, String segment, FieldInfos fis,
|
||||
int interval, boolean isi) throws IOException {
|
||||
indexInterval = interval;
|
||||
fieldInfos = fis;
|
||||
isIndex = isi;
|
||||
output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
|
||||
output.writeInt(FORMAT_CURRENT); // write format
|
||||
output.writeLong(0); // leave space for size
|
||||
output.writeInt(indexInterval); // write indexInterval
|
||||
output.writeInt(skipInterval); // write skipInterval
|
||||
output.writeInt(maxSkipLevels); // write maxSkipLevels
|
||||
assert initUTF16Results();
|
||||
}
|
||||
|
||||
void add(Term term, TermInfo ti) throws IOException {
|
||||
UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result);
|
||||
add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti);
|
||||
}
|
||||
|
||||
// Currently used only by assert statements
|
||||
UnicodeUtil.UTF16Result utf16Result1;
|
||||
UnicodeUtil.UTF16Result utf16Result2;
|
||||
|
||||
// Currently used only by assert statements
|
||||
private boolean initUTF16Results() {
|
||||
utf16Result1 = new UnicodeUtil.UTF16Result();
|
||||
utf16Result2 = new UnicodeUtil.UTF16Result();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Currently used only by assert statement
|
||||
private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
|
||||
|
||||
if (lastFieldNumber != fieldNumber) {
|
||||
final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
|
||||
// If there is a field named "" (empty string) then we
|
||||
// will get 0 on this comparison, yet, it's "OK". But
|
||||
// it's not OK if two different field numbers map to
|
||||
// the same name.
|
||||
if (cmp != 0 || lastFieldNumber != -1)
|
||||
return cmp;
|
||||
}
|
||||
|
||||
UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
|
||||
UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
|
||||
final int len;
|
||||
if (utf16Result1.length < utf16Result2.length)
|
||||
len = utf16Result1.length;
|
||||
else
|
||||
len = utf16Result2.length;
|
||||
|
||||
for(int i=0;i<len;i++) {
|
||||
final char ch1 = utf16Result1.result[i];
|
||||
final char ch2 = utf16Result2.result[i];
|
||||
if (ch1 != ch2)
|
||||
return ch1-ch2;
|
||||
}
|
||||
return utf16Result1.length - utf16Result2.length;
|
||||
}
|
||||
|
||||
/** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
|
||||
Term must be lexicographically greater than all previous Terms added.
|
||||
TermInfo pointers must be positive and greater than all previous.*/
|
||||
void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
|
||||
throws IOException {
|
||||
|
||||
assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
|
||||
(isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
|
||||
"Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
|
||||
" lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
|
||||
" text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
|
||||
|
||||
assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
|
||||
assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
|
||||
|
||||
if (!isIndex && size % indexInterval == 0)
|
||||
other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
|
||||
|
||||
writeTerm(fieldNumber, termBytes, termBytesLength); // write term
|
||||
|
||||
output.writeVInt(ti.docFreq); // write doc freq
|
||||
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
|
||||
output.writeVLong(ti.proxPointer - lastTi.proxPointer);
|
||||
|
||||
if (ti.docFreq >= skipInterval) {
|
||||
output.writeVInt(ti.skipOffset);
|
||||
}
|
||||
|
||||
if (isIndex) {
|
||||
output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
|
||||
lastIndexPointer = other.output.getFilePointer(); // write pointer
|
||||
}
|
||||
|
||||
lastFieldNumber = fieldNumber;
|
||||
lastTi.set(ti);
|
||||
size++;
|
||||
}
|
||||
|
||||
private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
|
||||
throws IOException {
|
||||
|
||||
// TODO: UTF16toUTF8 could tell us this prefix
|
||||
// Compute prefix in common with last term:
|
||||
int start = 0;
|
||||
final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
|
||||
while(start < limit) {
|
||||
if (termBytes[start] != lastTermBytes[start])
|
||||
break;
|
||||
start++;
|
||||
}
|
||||
|
||||
final int length = termBytesLength - start;
|
||||
output.writeVInt(start); // write shared prefix length
|
||||
output.writeVInt(length); // write delta length
|
||||
output.writeBytes(termBytes, start, length); // write delta bytes
|
||||
output.writeVInt(fieldNumber); // write field num
|
||||
if (lastTermBytes.length < termBytesLength) {
|
||||
lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
|
||||
}
|
||||
System.arraycopy(termBytes, start, lastTermBytes, start, length);
|
||||
lastTermBytesLength = termBytesLength;
|
||||
}
|
||||
|
||||
/** Called to complete TermInfos creation. */
|
||||
void close() throws IOException {
|
||||
output.seek(4); // write size after format
|
||||
output.writeLong(size);
|
||||
output.close();
|
||||
|
||||
if (!isIndex)
|
||||
other.close();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,212 @@
|
|||
package org.apache.lucene.index.codecs.preflex;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.codecs.*;
|
||||
import org.apache.lucene.util.*;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.IOException;
|
||||
|
||||
public class TestSurrogates extends LuceneTestCase {
|
||||
|
||||
private static final boolean DEBUG = false;
|
||||
|
||||
// like Term, but uses BytesRef for text
|
||||
private static class FieldAndText implements Comparable<FieldAndText> {
|
||||
String field;
|
||||
BytesRef text;
|
||||
|
||||
public FieldAndText(Term t) {
|
||||
field = t.field();
|
||||
text = new BytesRef(t.text());
|
||||
}
|
||||
|
||||
public int compareTo(FieldAndText other) {
|
||||
if (other.field == field) {
|
||||
return text.compareTo(other.text);
|
||||
} else {
|
||||
return field.compareTo(other.field);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds a random string of 0 to 19 chars from a very limited alphabet, to
 * exacerbate the surrogate seeking required: the surrogate pair
 * 0xd800 0xdc00, the letter 'a', and the char 0xe000.
 *
 * One random draw in [0,5) is made per slot: 0 emits the pair when at least
 * two slots remain (otherwise 'a'), 1..3 emit 'a', 4 emits 0xe000.
 */
private static String makeDifficultRandomUnicodeString(Random r) {
  final int length = r.nextInt(20);
  if (length == 0) {
    // allow 0 length
    return "";
  }

  final char[] chars = new char[length];
  int pos = 0;
  while (pos < length) {
    final int pick = r.nextInt(5);
    if (pick == 0 && pos < length - 1) {
      chars[pos] = (char) 0xd800;      // hi
      chars[pos + 1] = (char) 0xdc00;  // lo
      pos += 2;
    } else if (pick <= 3) {
      chars[pos] = 'a';
      pos++;
    } else {
      // pick == 4
      chars[pos] = 0xe000;
      pos++;
    }
  }

  return new String(chars, 0, length);
}
|
||||
|
||||
private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<FieldAndText> fieldTerms) throws IOException {
|
||||
|
||||
final int numField = _TestUtil.nextInt(r, 2, 5);
|
||||
|
||||
List<Term> terms = new ArrayList<Term>();
|
||||
|
||||
int tc = 0;
|
||||
|
||||
for(int f=0;f<numField;f++) {
|
||||
String field = "f" + f;
|
||||
Term protoTerm = new Term(field);
|
||||
|
||||
fieldInfos.add(field, true, false, false, false, false, false, false);
|
||||
final int numTerms = 10000*_TestUtil.getRandomMultiplier();
|
||||
for(int i=0;i<numTerms;i++) {
|
||||
String s;
|
||||
if (r.nextInt(3) == 1) {
|
||||
s = makeDifficultRandomUnicodeString(r);
|
||||
} else {
|
||||
s = _TestUtil.randomUnicodeString(r);
|
||||
|
||||
// The surrogate dance uses 0xffff to seek-to-end
|
||||
// of blocks. Also, pre-4.0 indices are already
|
||||
// guaranteed to not contain the char 0xffff since
|
||||
// it's mapped during indexing:
|
||||
s = s.replace((char) 0xffff, (char) 0xfffe);
|
||||
}
|
||||
terms.add(protoTerm.createTerm(s + "_" + (tc++)));
|
||||
}
|
||||
}
|
||||
|
||||
fieldInfos.write(dir, segName);
|
||||
|
||||
// sorts in UTF16 order, just like preflex:
|
||||
Collections.sort(terms);
|
||||
|
||||
TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128);
|
||||
TermInfo ti = new TermInfo();
|
||||
BytesRef utf8 = new BytesRef(10);
|
||||
String lastText = null;
|
||||
int uniqueTermCount = 0;
|
||||
if (DEBUG) {
|
||||
System.out.println("TEST: utf16 order:");
|
||||
}
|
||||
for(Term t : terms) {
|
||||
FieldInfo fi = fieldInfos.fieldInfo(t.field());
|
||||
|
||||
String text = t.text();
|
||||
if (lastText != null && lastText.equals(text)) {
|
||||
continue;
|
||||
}
|
||||
fieldTerms.add(new FieldAndText(t));
|
||||
uniqueTermCount++;
|
||||
lastText = text;
|
||||
UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println(" " + toHexString(t));
|
||||
}
|
||||
w.add(fi.number, utf8.bytes, utf8.length, ti);
|
||||
}
|
||||
w.close();
|
||||
|
||||
Collections.sort(fieldTerms);
|
||||
if (DEBUG) {
|
||||
System.out.println("\nTEST: codepoint order");
|
||||
for(FieldAndText t: fieldTerms) {
|
||||
System.out.println(" " + t.field + ":" + UnicodeUtil.toHexString(t.text.utf8ToString()));
|
||||
}
|
||||
}
|
||||
|
||||
dir.createOutput(segName + ".prx").close();
|
||||
dir.createOutput(segName + ".frq").close();
|
||||
|
||||
// !!hack alert!! stuffing uniqueTermCount in as docCount
|
||||
return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec);
|
||||
}
|
||||
|
||||
private String toHexString(Term t) {
|
||||
return t.field() + ":" + UnicodeUtil.toHexString(t.text());
|
||||
}
|
||||
|
||||
public void testSurrogatesOrder() throws Exception {
|
||||
Directory dir = new MockRAMDirectory();
|
||||
|
||||
Codec codec = new PreFlexCodec();
|
||||
|
||||
Random r = newRandom();
|
||||
FieldInfos fieldInfos = new FieldInfos();
|
||||
List<FieldAndText> fieldTerms = new ArrayList<FieldAndText>();
|
||||
SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
|
||||
|
||||
// hack alert!!
|
||||
int uniqueTermCount = si.docCount;
|
||||
|
||||
FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
|
||||
assertNotNull(fields);
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("\nTEST: now enum");
|
||||
}
|
||||
FieldsEnum fieldsEnum = fields.iterator();
|
||||
String field;
|
||||
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
|
||||
|
||||
int termCount = 0;
|
||||
while((field = fieldsEnum.next()) != null) {
|
||||
TermsEnum termsEnum = fieldsEnum.terms();
|
||||
BytesRef text;
|
||||
BytesRef lastText = null;
|
||||
while((text = termsEnum.next()) != null) {
|
||||
UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16);
|
||||
if (DEBUG) {
|
||||
System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length)));
|
||||
System.out.println();
|
||||
}
|
||||
if (lastText == null) {
|
||||
lastText = new BytesRef(text);
|
||||
} else {
|
||||
assertTrue(lastText.compareTo(text) < 0);
|
||||
lastText.copy(text);
|
||||
}
|
||||
assertEquals(fieldTerms.get(termCount).field, field);
|
||||
assertEquals(fieldTerms.get(termCount).text, text);
|
||||
termCount++;
|
||||
}
|
||||
if (DEBUG) {
|
||||
System.out.println(" no more terms for field=" + field);
|
||||
}
|
||||
}
|
||||
assertEquals(uniqueTermCount, termCount);
|
||||
|
||||
fields.close();
|
||||
}
|
||||
}
|
|
@ -30,7 +30,7 @@ public class TestNumericUtils extends LuceneTestCase {
|
|||
NumericUtils.longToPrefixCoded(l, 0, act);
|
||||
if (last!=null) {
|
||||
// test if smaller
|
||||
assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
|
||||
assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
|
||||
assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
|
||||
}
|
||||
// test is back and forward conversion works
|
||||
|
@ -48,7 +48,7 @@ public class TestNumericUtils extends LuceneTestCase {
|
|||
NumericUtils.intToPrefixCoded(i, 0, act);
|
||||
if (last!=null) {
|
||||
// test if smaller
|
||||
assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
|
||||
assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
|
||||
assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
|
||||
}
|
||||
// test is back and forward conversion works
|
||||
|
@ -84,7 +84,7 @@ public class TestNumericUtils extends LuceneTestCase {
|
|||
|
||||
// check sort order (prefixVals should be ascending)
|
||||
for (int i=1; i<prefixVals.length; i++) {
|
||||
assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
|
||||
assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
|
||||
}
|
||||
|
||||
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
|
||||
|
@ -124,7 +124,7 @@ public class TestNumericUtils extends LuceneTestCase {
|
|||
|
||||
// check sort order (prefixVals should be ascending)
|
||||
for (int i=1; i<prefixVals.length; i++) {
|
||||
assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
|
||||
assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
|
||||
}
|
||||
|
||||
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
|
||||
|
|
Loading…
Reference in New Issue