LUCENE-2426: change index term sort order to Unicode code point order (it was previously UTF-16 order)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@957545 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2010-06-24 13:35:40 +00:00
parent f289f22125
commit 705dfee68c
28 changed files with 888 additions and 272 deletions
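
For context, a minimal JDK-only sketch (an illustration, not part of this commit) of why the two orders disagree: String.compareTo() compares UTF-16 code units, so supplementary characters (encoded as surrogate pairs starting at 0xD800) sort below BMP characters in the 0xE000..0xFFFF range, while code point order puts them above.

public class SortOrderDemo {
  public static void main(String[] args) {
    String bmp = "\uE000";          // U+E000, one UTF-16 code unit
    String supp = "\uD800\uDC00";   // U+10000, a surrogate pair
    // UTF-16 code unit order: 0xD800 < 0xE000, so U+10000 sorts first:
    System.out.println(bmp.compareTo(supp) > 0);                   // true
    // Unicode code point order: 0x10000 > 0xE000, so U+10000 sorts last:
    System.out.println(bmp.codePointAt(0) < supp.codePointAt(0));  // true
  }
}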

View File

@@ -426,7 +426,7 @@ public class InstantiatedIndexReader extends IndexReader {
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUTF16Comparator();
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
};
}

View File

@@ -123,7 +123,7 @@ public class InstantiatedTermsEnum extends TermsEnum {
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUTF16Comparator();
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}

View File

@@ -808,7 +808,7 @@ public class MemoryIndex implements Serializable {
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUTF16Comparator();
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -903,7 +903,7 @@ public class MemoryIndex implements Serializable {
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUTF16Comparator();
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}

View File

@@ -32,7 +32,7 @@ import java.io.PrintStream;
import java.io.IOException;
import java.io.File;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
@@ -596,6 +596,10 @@ public class CheckIndex {
boolean hasOrd = true;
final long termCountStart = status.termCount;
BytesRef lastTerm = null;
Comparator<BytesRef> termComp = terms.getComparator();
while(true) {
final BytesRef term = terms.next();
@@ -603,6 +607,17 @@
break;
}
// make sure terms arrive in order according to
// the comp
if (lastTerm == null) {
lastTerm = new BytesRef(term);
} else {
if (termComp.compare(lastTerm, term) >= 0) {
throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
}
lastTerm.copy(term);
}
final int docFreq = terms.docFreq();
status.totFreq += docFreq;

View File

@@ -53,7 +53,7 @@ public final class FieldInfos {
private final HashMap<String,FieldInfo> byName = new HashMap<String,FieldInfo>();
private int format;
FieldInfos() { }
public FieldInfos() { }
/**
* Construct a FieldInfos object using the directory and the name of the file
@@ -62,7 +62,7 @@
* @param name The name of the file to open the IndexInput from in the Directory
* @throws IOException
*/
FieldInfos(Directory d, String name) throws IOException {
public FieldInfos(Directory d, String name) throws IOException {
IndexInput input = d.openInput(name);
try {
read(input, name);

View File

@@ -3964,7 +3964,7 @@ public class IndexWriter implements Closeable {
// commit merged deletes
SegmentReader reader = merge.readers[i] = readerPool.get(info, merge.mergeDocStores,
MERGE_READ_BUFFER_SIZE,
-1);
-config.getReaderTermsIndexDivisor());
// We clone the segment readers because other
// deletes may come in while we're merging so we

View File

@@ -27,6 +27,12 @@ public class SegmentReadState {
public final SegmentInfo segmentInfo;
public final FieldInfos fieldInfos;
public final int readBufferSize;
// NOTE: if this is < 0, that means "defer terms index
// load until needed". But if the codec must load the
// terms index on init (preflex is the only one currently
// that must do so), then it should negate this value to
// get the app's terms divisor:
public final int termsIndexDivisor;
public SegmentReadState(Directory dir,

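A short sketch of the divisor convention described in the comment above (hypothetical codec context; names from this commit): a codec that cannot defer loading its terms index negates the flag back into the application's real divisor, exactly as PreFlexFields does further down.

// Hypothetical eager-loading codec (sketch only):
static int effectiveDivisor(SegmentReadState state) {
  final int divisor = state.termsIndexDivisor;
  // < 0 means "defer terms-index loading"; an eager codec recovers
  // the app's actual divisor by negating:
  return divisor < 0 ? -divisor : divisor;
}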
View File

@@ -130,7 +130,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
// TODO: we may want to make this sort in same order
// as Codec's terms dict?
final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator());
final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
tvf.writeVInt(numPostings);
byte bits = 0x0;

View File

@@ -144,8 +144,7 @@ public abstract class TermsEnum {
@Override
public Comparator<BytesRef> getComparator() {
// return an unused dummy to prevent NPE
return BytesRef.getUTF8SortedAsUTF16Comparator();
return null;
}
@Override

View File

@@ -67,7 +67,7 @@ public class IntBlockCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -95,7 +95,7 @@ public class IntBlockCodec extends Codec {
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
BytesRef.getUTF8SortedAsUTF16Comparator());
BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -111,7 +111,7 @@ public class IntBlockCodec extends Codec {
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUTF16Comparator(),
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;

View File

@@ -39,11 +39,15 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.ArrayUtil;
/** Exposes flex API on a pre-flex index, as a codec.
* @lucene.experimental */
public class PreFlexFields extends FieldsProducer {
private static final boolean DEBUG_SURROGATES = false;
public TermInfosReader tis;
public final TermInfosReader tisNoIndex;
@@ -60,6 +64,15 @@ public class PreFlexFields extends FieldsProducer {
throws IOException {
si = info;
// NOTE: we must always load terms index, even for
// "sequential" scan during merging, because what is
// sequential to merger may not be to TermInfosReader
// since we do the surrogates dance:
if (indexDivisor < 0) {
indexDivisor = -indexDivisor;
}
TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);
if (indexDivisor == -1) {
tisNoIndex = r;
@@ -174,7 +187,6 @@
private class PreFlexFieldsEnum extends FieldsEnum {
final Iterator<FieldInfo> it;
private final PreTermsEnum termsEnum;
private int count;
FieldInfo current;
public PreFlexFieldsEnum() throws IOException {
@@ -185,7 +197,6 @@
@Override
public String next() {
if (it.hasNext()) {
count++;
current = it.next();
return current.name;
} else {
@@ -195,7 +206,7 @@
@Override
public TermsEnum terms() throws IOException {
termsEnum.reset(current, count == 1);
termsEnum.reset(current);
return termsEnum;
}
}
@@ -209,14 +220,15 @@
@Override
public TermsEnum iterator() throws IOException {
PreTermsEnum termsEnum = new PreTermsEnum();
termsEnum.reset(fieldInfo, false);
termsEnum.reset(fieldInfo);
return termsEnum;
}
@Override
public Comparator<BytesRef> getComparator() {
// Pre-flex indexes always sorted in UTF16 order
return BytesRef.getUTF8SortedAsUTF16Comparator();
// Pre-flex indexes always sorted in UTF16 order, but
// we remap on-the-fly to unicode order
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
@@ -227,37 +239,229 @@
private BytesRef current;
private final BytesRef scratchBytesRef = new BytesRef();
void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException {
private int[] surrogateSeekPending = new int[1];
private boolean[] surrogateDidSeekBack = new boolean[1];
private int surrogateSeekUpto;
private char[] pendingPrefix;
private SegmentTermEnum seekTermEnum;
private Term protoTerm;
private int newSuffixStart;
void reset(FieldInfo fieldInfo) throws IOException {
this.fieldInfo = fieldInfo;
protoTerm = new Term(fieldInfo.name);
if (termEnum == null) {
// First time reset is called
if (isFirstField) {
termEnum = getTermsDict().terms();
skipNext = false;
} else {
termEnum = getTermsDict().terms(new Term(fieldInfo.name, ""));
skipNext = true;
}
termEnum = getTermsDict().terms(protoTerm);
seekTermEnum = getTermsDict().terms(protoTerm);
} else {
final Term t = termEnum.term();
if (t != null && t.field() == fieldInfo.name) {
// No need to seek -- we have already advanced onto
// this field. We must be @ first term because
// flex API will not advance this enum further, on
// seeing a different field.
} else {
assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned
final TermInfosReader tis = getTermsDict();
tis.seekEnum(termEnum, new Term(fieldInfo.name, ""));
}
skipNext = true;
getTermsDict().seekEnum(termEnum, protoTerm);
}
skipNext = true;
surrogateSeekUpto = 0;
newSuffixStart = 0;
surrogatesDance();
}
private void surrogatesDance() throws IOException {
// Tricky: prior to 4.0, Lucene indexes sorted terms in
// UTF16 order, but as of 4.0 we sort by Unicode code
// point order. These orders differ because of the
// surrogates; so we have to fix up our enum, here, by
// carefully first seeking past the surrogates and
// then back again at the end. The process is
// recursive, since any given term could have multiple
// new occurrences of surrogate pairs, so we use a
// stack to record the pending seek-backs.
if (DEBUG_SURROGATES) {
System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
}
while(popPendingSeek());
while(pushNewSurrogate());
}
// only for debugging
private String getStack() {
if (surrogateSeekUpto == 0) {
return "null";
} else {
StringBuffer sb = new StringBuffer();
for(int i=0;i<surrogateSeekUpto;i++) {
if (i > 0) {
sb.append(' ');
}
sb.append(surrogateSeekPending[i]);
}
sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
return sb.toString();
}
}
private boolean popPendingSeek() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
}
// if a .next() has advanced beyond the
// after-surrogates range we had last seeked to, we
// must seek back to the start and resume .next from
// there. this pops the pending seek off the stack.
final Term t = termEnum.term();
if (surrogateSeekUpto > 0) {
final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
if (DEBUG_SURROGATES) {
System.out.println(" seekPrefix=" + seekPrefix);
}
if (newSuffixStart < seekPrefix) {
assert pendingPrefix != null;
assert pendingPrefix.length > seekPrefix;
pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
if (DEBUG_SURROGATES) {
System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
}
getTermsDict().seekEnum(termEnum, t2);
surrogateDidSeekBack[surrogateSeekUpto-1] = true;
// +2 because we don't want to re-check the
// surrogates we just seek'd back to
newSuffixStart = seekPrefix + 2;
return true;
} else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
assert pendingPrefix != null;
assert pendingPrefix.length > seekPrefix;
pendingPrefix[seekPrefix] = 0xffff;
Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
if (DEBUG_SURROGATES) {
System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
}
getTermsDict().seekEnum(termEnum, t2);
if (DEBUG_SURROGATES) {
System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
}
surrogateSeekUpto--;
if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
// force pop
newSuffixStart = -1;
} else {
newSuffixStart = termEnum.newSuffixStart;
}
return true;
}
}
return false;
}
private boolean pushNewSurrogate() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack());
}
final Term t = termEnum.term();
if (t == null || t.field() != fieldInfo.name) {
return false;
}
final String text = t.text();
final int textLen = text.length();
for(int i=Math.max(0,newSuffixStart);i<textLen;i++) {
final char ch = text.charAt(i);
if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
if (DEBUG_SURROGATES) {
System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
}
// the next() that we just did read in a new
// suffix, containing a surrogate pair
// seek forward to see if there are any terms with
// this same prefix, but with characters after the
// surrogate range; if so, we must first iterate
// them, then seek back to the surrogates
char[] testPrefix = new char[i+1];
for(int j=0;j<i;j++) {
testPrefix[j] = text.charAt(j);
}
testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new String(testPrefix)));
Term t2 = seekTermEnum.term();
boolean isPrefix;
if (t2 != null && t2.field() == fieldInfo.name) {
String seekText = t2.text();
isPrefix = true;
if (DEBUG_SURROGATES) {
System.out.println(" seek found " + UnicodeUtil.toHexString(seekText));
}
for(int j=0;j<i;j++) {
if (testPrefix[j] != seekText.charAt(j)) {
isPrefix = false;
break;
}
}
if (DEBUG_SURROGATES && !isPrefix) {
System.out.println(" no end terms");
}
} else {
if (DEBUG_SURROGATES) {
System.out.println(" no end terms");
}
isPrefix = false;
}
if (isPrefix) {
// we found a term, sharing the same prefix,
// with characters after the surrogates, so we
// must first enum those, and then return
// the surrogates afterwards. push that pending
// seek on the surrogates stack now:
pendingPrefix = testPrefix;
getTermsDict().seekEnum(termEnum, t2);
if (surrogateSeekUpto == surrogateSeekPending.length) {
surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
}
if (surrogateSeekUpto == surrogateDidSeekBack.length) {
surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
}
surrogateSeekPending[surrogateSeekUpto] = i;
surrogateDidSeekBack[surrogateSeekUpto] = false;
surrogateSeekUpto++;
if (DEBUG_SURROGATES) {
System.out.println(" do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
}
newSuffixStart = i+1;
return true;
} else {
// there are no terms after the surrogates, so
// we do nothing to the enum and just step
// through the surrogates like normal. but we
// must keep iterating through the term, in case
// another surrogate pair appears later
}
}
}
return false;
}
@Override
public Comparator<BytesRef> getComparator() {
// Pre-flex indexes always sorted in UTF16 order
return BytesRef.getUTF8SortedAsUTF16Comparator();
// Pre-flex indexes always sorted in UTF16 order, but
// we remap on-the-fly to unicode order
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -272,14 +476,24 @@
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
if (DEBUG_SURROGATES) {
System.out.println("TE.seek() term=" + term.utf8ToString());
}
skipNext = false;
final TermInfosReader tis = getTermsDict();
final Term t0 = new Term(fieldInfo.name, term.utf8ToString());
final Term t0 = protoTerm.createTerm(term.utf8ToString());
assert termEnum != null;
if (termEnum == null) {
termEnum = tis.terms(t0);
} else {
tis.seekEnum(termEnum, t0);
}
surrogateSeekUpto = 0;
surrogatesDance();
final Term t = termEnum.term();
final BytesRef tr;
@@ -304,6 +518,9 @@
@Override
public BytesRef next() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println("TE.next() skipNext=" + skipNext);
}
if (skipNext) {
skipNext = false;
if (termEnum.term() == null) {
@@ -313,19 +530,37 @@
return current = scratchBytesRef;
}
}
if (termEnum.next()) {
if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
newSuffixStart = termEnum.newSuffixStart;
if (DEBUG_SURROGATES) {
System.out.println(" set newSuffixStart=" + newSuffixStart);
}
surrogatesDance();
final Term t = termEnum.term();
if (t.field() == fieldInfo.name) {
if (t == null || t.field() != fieldInfo.name) {
assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
current = null;
} else {
scratchBytesRef.copy(t.text());
current = scratchBytesRef;
}
return current;
} else {
if (DEBUG_SURROGATES) {
System.out.println(" force pop");
}
// force pop
newSuffixStart = -1;
surrogatesDance();
final Term t = termEnum.term();
if (t == null || t.field() != fieldInfo.name) {
assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
return null;
} else {
scratchBytesRef.copy(t.text());
current = scratchBytesRef;
return current;
} else {
assert !t.field().equals(fieldInfo.name); // make sure field name is interned
// Crossed into new field
return null;
}
} else {
return null;
}
}
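
To make the reordering that the surrogates dance produces concrete, here is a self-contained JDK-only sketch (an illustration, not code from this commit; Arrays.compareUnsigned needs Java 9+): natural String order is the pre-4.0 UTF-16 order on disk, while unsigned comparison of the UTF-8 bytes is the code point order the enum must expose.

import java.nio.charset.StandardCharsets;
import java.util.*;

public class SurrogateOrderDemo {
  public static void main(String[] args) {
    List<String> terms = new ArrayList<>(Arrays.asList(
        "a\uD800\uDC00",   // a + U+10000 (surrogate pair d800 dc00)
        "a\uE000"));       // a + U+E000
    // Pre-4.0 (UTF-16 code unit) order: the surrogate pair sorts first:
    Collections.sort(terms);
    print(terms);  // [a d800 dc00, a e000]
    // New (code point) order == unsigned order of the UTF-8 bytes:
    terms.sort((x, y) -> Arrays.compareUnsigned(
        x.getBytes(StandardCharsets.UTF_8),
        y.getBytes(StandardCharsets.UTF_8)));
    print(terms);  // [a e000, a d800 dc00]
  }

  static void print(List<String> terms) {
    StringBuilder sb = new StringBuilder("[");
    for (String t : terms) {
      if (sb.length() > 1) sb.append(", ");
      for (int i = 0; i < t.length(); i++) {
        if (i > 0) sb.append(' ');
        char c = t.charAt(i);
        sb.append(c < 128 ? String.valueOf(c) : Integer.toHexString(c));
      }
    }
    System.out.println(sb.append(']'));
  }
}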

View File

@@ -53,6 +53,7 @@ public final class SegmentTermEnum implements Cloneable {
long indexPointer = 0;
int indexInterval;
int skipInterval;
int newSuffixStart;
int maxSkipLevels;
private int formatM1SkipInterval;
@@ -136,6 +137,7 @@
prevBuffer.set(termBuffer);
termBuffer.read(input, fieldInfos);
newSuffixStart = termBuffer.newSuffixStart;
termInfo.docFreq = input.readVInt(); // read doc freq
termInfo.freqPointer += input.readVLong(); // read freq pointer

View File

@@ -19,7 +19,6 @@ package org.apache.lucene.index.codecs.preflex;
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.Term;
@@ -34,6 +33,8 @@ final class TermBuffer implements Cloneable {
private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
private BytesRef bytes = new BytesRef(10);
int newSuffixStart;
public final int compareTo(TermBuffer other) {
if (field == other.field) // fields are interned
return compareChars(text.result, text.length, other.text.result, other.text.length);
@@ -60,23 +61,33 @@
int start = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
if (bytes.bytes.length < totalLength) {
bytes.grow(totalLength);
}
if (dirty) {
// Fully convert all bytes since bytes is dirty
UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
if (bytes.bytes.length < totalLength)
bytes.bytes = new byte[totalLength];
bytes.length = totalLength;
input.readBytes(bytes.bytes, start, length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
dirty = false;
} else {
// Incrementally convert only the UTF8 bytes that are new:
if (bytes.bytes.length < totalLength)
bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength);
bytes.length = totalLength;
input.readBytes(bytes.bytes, start, length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
}
while(true) {
newSuffixStart = text.offsets[start];
if (newSuffixStart != -1) {
break;
}
if (--start == 0) {
newSuffixStart = 0;
break;
}
}
this.field = fieldInfos.fieldName(input.readVInt());
}
@@ -124,10 +135,11 @@
try {
clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
clone.dirty = true;
clone.bytes = new BytesRef(10);
clone.text = new UnicodeUtil.UTF16Result();
clone.text.offsets = new int[text.offsets.length];
System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
clone.text.copyText(text);
return clone;
}
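
The incremental path above relies on UTF16Result.offsets mapping each UTF-8 byte position to the UTF-16 position it decodes to, with -1 marking continuation bytes; that is the mapping the newSuffixStart back-scan walks. A JDK-only sketch of that mapping (inferred from how read() uses it above, not taken from UnicodeUtil itself):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class Utf8OffsetsDemo {
  public static void main(String[] args) {
    String s = "a\u00e9\uD800\uDC00";   // 'a' (1 byte), U+00E9 (2), U+10000 (4)
    byte[] utf8 = s.getBytes(StandardCharsets.UTF_8);
    int[] offsets = new int[utf8.length];
    int charUpto = 0;
    for (int i = 0; i < utf8.length; ) {
      int lead = utf8[i] & 0xff;
      int len = lead < 0x80 ? 1 : lead < 0xe0 ? 2 : lead < 0xf0 ? 3 : 4;
      offsets[i] = charUpto;             // first byte of a sequence
      for (int j = 1; j < len; j++) {
        offsets[i + j] = -1;             // continuation bytes
      }
      charUpto += len == 4 ? 2 : 1;      // a 4-byte sequence is a surrogate pair
      i += len;
    }
    System.out.println(Arrays.toString(offsets));  // [0, 1, -1, 2, -1, -1, -1]
  }
}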

View File

@@ -80,7 +80,7 @@ public class PulsingCodec extends Codec {
// Terms dict
success = false;
try {
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -111,7 +111,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
BytesRef.getUTF8SortedAsUTF16Comparator());
BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -126,7 +126,7 @@
state.dir, state.fieldInfos, state.segmentInfo.name,
pulsingReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUTF16Comparator(),
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;

View File

@@ -63,7 +63,7 @@ public class SepCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -95,7 +95,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
BytesRef.getUTF8SortedAsUTF16Comparator());
BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -111,7 +111,7 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUTF16Comparator(),
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;

View File

@@ -104,7 +104,7 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
indexInterval = in.readInt();
this.indexDivisor = indexDivisor;
if (indexDivisor == -1) {
if (indexDivisor < 0) {
totalIndexInterval = indexInterval;
} else {
// In case terms index gets loaded, later, on demand
@@ -131,7 +131,7 @@
}
success = true;
} finally {
if (indexDivisor != -1) {
if (indexDivisor > 0) {
in.close();
this.in = null;
if (success) {
@@ -173,7 +173,7 @@
// We still create the indexReader when indexDivisor
// is -1, so that StandardTermsDictReader can call
// isIndexTerm for each field:
if (indexDivisor != -1) {
if (indexDivisor > 0) {
coreIndex = new CoreFieldIndex(indexStart,
termsStart,
packedIndexStart,
@@ -218,7 +218,8 @@
@Override
public void getIndexOffset(long ord, TermsIndexResult result) throws IOException {
// You must call loadTermsIndex if you had specified -1 for indexDivisor
// You must call loadTermsIndex if you had specified
// indexDivisor < 0 to ctor
if (coreIndex == null) {
throw new IllegalStateException("terms index was not loaded");
}

View File

@@ -58,7 +58,7 @@ public class StandardCodec extends Codec {
success = false;
try {
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator());
FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -85,7 +85,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
BytesRef.getUTF8SortedAsUTF16Comparator());
BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -101,7 +101,7 @@
state.segmentInfo.name,
postings,
state.readBufferSize,
BytesRef.getUTF8SortedAsUTF16Comparator(),
BytesRef.getUTF8SortedAsUnicodeComparator(),
TERMS_CACHE_SIZE);
success = true;
return ret;

View File

@@ -103,7 +103,7 @@ public class AutomatonTermsEnum extends FilteredTermsEnum {
// build a cache of sorted transitions for every state
allTransitions = new Transition[runAutomaton.getSize()][];
for (State state : this.automaton.getNumberedStates()) {
state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
state.sortTransitions(Transition.CompareByMinMaxThenDest);
state.trimTransitionsArray();
allTransitions[state.getNumber()] = state.transitionsArray;
}
@@ -158,11 +158,7 @@
// seek to the next possible string;
if (nextString()) {
// reposition
// FIXME: this is really bad to turn off
// but it cannot work correctly until terms are in utf8 order.
linear = false;
if (linear)
setLinear(infinitePosition);
return seekBytesRef;
@@ -188,15 +184,15 @@
}
for (int i = 0; i < allTransitions[state].length; i++) {
Transition t = allTransitions[state][i];
if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 &&
compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) &&
(seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
maxInterval = t.getMax();
break;
}
}
// 0xef terms don't get the optimization... not worth the trouble.
if (maxInterval != 0xef)
maxInterval = incrementUTF16(maxInterval);
// 0xff terms don't get the optimization... not worth the trouble.
if (maxInterval != 0xff)
maxInterval = incrementUTF8(maxInterval);
int length = position + 1; /* position + maxTransition */
if (linearUpperBound.bytes.length < length)
linearUpperBound.bytes = new byte[length];
@@ -281,7 +277,7 @@
// if the next character is U+FFFF and is not part of the useful portion,
// then by definition it puts us in a reject state, and therefore this
// path is dead. there cannot be any higher transitions. backtrack.
c = incrementUTF16(c);
c = incrementUTF8(c);
if (c == -1)
return false;
}
@@ -295,8 +291,8 @@
for (int i = 0; i < transitions.length; i++) {
Transition transition = transitions[i];
if (compareToUTF16(transition.getMax(), c) >= 0) {
int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
if (transition.getMax() >= c) {
int nextChar = Math.max(c, transition.getMin());
// append either the next sequential char, or the minimum transition
seekBytesRef.grow(seekBytesRef.length + 1);
seekBytesRef.length++;
@@ -342,9 +338,9 @@
private boolean backtrack(int position) {
while (position > 0) {
int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
// if a character is 0xef its a dead-end too,
// because there is no higher character in UTF-16 sort order.
nextChar = incrementUTF16(nextChar);
// if a character is 0xff it's a dead-end too,
// because there is no higher character in UTF-8 sort order.
nextChar = incrementUTF8(nextChar);
if (nextChar != -1) {
seekBytesRef.bytes[position - 1] = (byte) nextChar;
seekBytesRef.length = position;
@@ -355,34 +351,11 @@
return false; /* all solutions exhausted */
}
/* return the next utf8 byte in utf16 order, or -1 if exhausted */
private final int incrementUTF16(int utf8) {
/* return the next utf8 byte in utf8 order, or -1 if exhausted */
private final int incrementUTF8(int utf8) {
switch(utf8) {
case 0xed: return 0xf0;
case 0xfd: return 0xee;
case 0xee: return 0xef;
case 0xef: return -1;
case 0xff: return -1;
default: return utf8 + 1;
}
}
int compareToUTF16(int aByte, int bByte) {
if (aByte != bByte) {
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
// We know the terms are not equal, but, we may
// have to carefully fixup the bytes at the
// difference to match UTF16's sort order:
if (aByte >= 0xee && bByte >= 0xee) {
if ((aByte & 0xfe) == 0xee) {
aByte += 0x10;
}
if ((bByte&0xfe) == 0xee) {
bByte += 0x10;
}
}
return aByte - bByte;
}
return 0;
}
}
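
For reference, the byte fixup that these changes delete (the ICU trick cited in the removed comments) can be summarized in a few lines; with terms now in code point order, plain unsigned byte math suffices, which is why compareToUTF16 and the 0xee/0xef special cases disappear. A sketch of the removed remapping:

final class UTF16OrderFixup {
  // UTF-8 lead bytes 0xEE/0xEF start U+E000..U+FFFF, while 0xF0..0xF4
  // start supplementary code points. UTF-16 order sorts U+E000..U+FFFF
  // AFTER the supplementary range, so the old code bumped 0xEE/0xEF by
  // 0x10 (to 0xFE/0xFF) before comparing unsigned:
  static int fixup(int b) {
    return (b & 0xfe) == 0xee ? b + 0x10 : b;
  }
  static int compareUTF16Order(int aByte, int bByte) {
    return fixup(aByte) - fixup(bByte);
  }
}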

View File

@@ -327,6 +327,29 @@ public final class ArrayUtil {
return array;
}
public static boolean[] grow(boolean[] array, int minSize) {
if (array.length < minSize) {
boolean[] newArray = new boolean[oversize(minSize, 1)];
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
} else
return array;
}
public static boolean[] grow(boolean[] array) {
return grow(array, 1 + array.length);
}
public static boolean[] shrink(boolean[] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize, 1);
if (newSize != array.length) {
boolean[] newArray = new boolean[newSize];
System.arraycopy(array, 0, newArray, 0, newSize);
return newArray;
} else
return array;
}
public static char[] grow(char[] array, int minSize) {
if (array.length < minSize) {
char[] newArray = new char[oversize(minSize, RamUsageEstimator.NUM_BYTES_CHAR)];

View File

@@ -217,14 +217,7 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
bytes = ArrayUtil.grow(bytes, newLength);
}
private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() {
return utf8SortedAsUTF16SortOrder;
}
/** Unsigned byte order comparison */
/*
public int compareTo(BytesRef other) {
if (this == other) return 0;
@@ -245,52 +238,18 @@
// One is a prefix of the other, or, they are equal:
return this.length - other.length;
}
*/
/** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change
* in the future to unsigned byte comparison. */
public int compareTo(BytesRef other) {
if (this == other) return 0;
private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();
final byte[] aBytes = this.bytes;
int aUpto = this.offset;
final byte[] bBytes = other.bytes;
int bUpto = other.offset;
final int aStop = aUpto + Math.min(this.length, other.length);
while(aUpto < aStop) {
int aByte = aBytes[aUpto++] & 0xff;
int bByte = bBytes[bUpto++] & 0xff;
if (aByte != bByte) {
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
// We know the terms are not equal, but, we may
// have to carefully fixup the bytes at the
// difference to match UTF16's sort order:
if (aByte >= 0xee && bByte >= 0xee) {
if ((aByte & 0xfe) == 0xee) {
aByte += 0x10;
}
if ((bByte&0xfe) == 0xee) {
bByte += 0x10;
}
}
return aByte - bByte;
}
}
// One is a prefix of the other, or, they are equal:
return this.length - other.length;
public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() {
return utf8SortedAsUnicodeSortOrder;
}
private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> {
// Only singleton
private UTF8SortedAsUTF16Comparator() {};
private UTF8SortedAsUnicodeComparator() {};
public int compare(BytesRef a, BytesRef b) {
final byte[] aBytes = a.bytes;
int aUpto = a.offset;
final byte[] bBytes = b.bytes;
@@ -307,32 +266,15 @@
int aByte = aBytes[aUpto++] & 0xff;
int bByte = bBytes[bUpto++] & 0xff;
if (aByte != bByte) {
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
// We know the terms are not equal, but, we may
// have to carefully fixup the bytes at the
// difference to match UTF16's sort order:
if (aByte >= 0xee && bByte >= 0xee) {
if ((aByte & 0xfe) == 0xee) {
aByte += 0x10;
}
if ((bByte&0xfe) == 0xee) {
bByte += 0x10;
}
}
return aByte - bByte;
int diff = aByte - bByte;
if (diff != 0) {
return diff;
}
}
// One is a prefix of the other, or, they are equal:
return a.length - b.length;
}
public boolean equals(Object other) {
return this == other;
}
}
}
public void writeExternal(ObjectOutput out)

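A brief usage sketch of the new singleton (BytesRef(String) construction as used by this commit's tests); since UTF-8 byte order now equals code point order, the comparator reduces to a plain unsigned byte compare:

Comparator<BytesRef> cmp = BytesRef.getUTF8SortedAsUnicodeComparator();
BytesRef e000 = new BytesRef("\uE000");          // UTF-8: ee 80 80
BytesRef u10000 = new BytesRef("\uD800\uDC00");  // UTF-8: f0 90 80 80
assert cmp.compare(e000, u10000) < 0;  // code point order: U+E000 < U+10000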
View File

@@ -358,7 +358,6 @@ final public class UnicodeUtil {
out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
}
}
offsets[upto] = outUpto;
result.length = outUpto;
}
@@ -483,7 +482,7 @@
}
}
*/
public static final boolean validUTF16String(CharSequence s) {
public static boolean validUTF16String(CharSequence s) {
final int size = s.length();
for(int i=0;i<size;i++) {
char ch = s.charAt(i);
@@ -507,7 +506,7 @@
return true;
}
public static final boolean validUTF16String(char[] s, int size) {
public static boolean validUTF16String(char[] s, int size) {
for(int i=0;i<size;i++) {
char ch = s[i];
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
@@ -559,7 +558,7 @@
/** Returns the number of code points in this utf8
* sequence. Behavior is undefined if the utf8 sequence
* is invalid.*/
public static final int codePointCount(BytesRef utf8) {
public static int codePointCount(BytesRef utf8) {
int upto = utf8.offset;
final int limit = utf8.offset + utf8.length;
final byte[] bytes = utf8.bytes;
@@ -673,4 +672,33 @@
}
return new String(chars, 0, w);
}
// for debugging
public static String toHexString(String s) {
StringBuilder sb = new StringBuilder();
for(int i=0;i<s.length();i++) {
char ch = s.charAt(i);
if (i > 0) {
sb.append(' ');
}
if (ch < 128) {
sb.append(ch);
} else {
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
sb.append("H:");
} else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
sb.append("L:");
} else if (ch > UNI_SUR_LOW_END) {
if (ch == 0xffff) {
sb.append("F:");
} else {
sb.append("E:");
}
}
sb.append("0x" + Integer.toHexString(ch));
}
}
return sb.toString();
}
}
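
An example of the debug format the new toHexString helper produces, following the branches above (ASCII passes through; H:/L: mark high/low surrogates, E: marks U+E000..U+FFFE, F: marks U+FFFF):

System.out.println(UnicodeUtil.toHexString("a\uD800\uDC00\uE000\uFFFF"));
// prints: a H:0xd800 L:0xdc00 E:0xe000 F:0xffff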

View File

@@ -210,64 +210,4 @@ public class Transition implements Serializable, Cloneable {
}
public static final Comparator<Transition> CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle();
private static class UTF8InUTF16Order {
protected int compareCodePoint(int aByte, int bByte) {
if (aByte != bByte) {
// See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
// We know the terms are not equal, but, we may
// have to carefully fixup the bytes at the
// difference to match UTF16's sort order:
if (aByte >= 0xee && bByte >= 0xee) {
if ((aByte & 0xfe) == 0xee) {
aByte += 0x10;
}
if ((bByte&0xfe) == 0xee) {
bByte += 0x10;
}
}
return aByte - bByte;
}
return 0;
}
}
private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
public int compare(Transition t1, Transition t2) {
if (t1.to != t2.to) {
if (t1.to == null) return -1;
else if (t2.to == null) return 1;
else if (t1.to.number < t2.to.number) return -1;
else if (t1.to.number > t2.to.number) return 1;
}
int minComp = compareCodePoint(t1.min, t2.min);
if (minComp != 0) return minComp;
int maxComp = compareCodePoint(t1.max, t2.max);
if (maxComp != 0) return maxComp;
return 0;
}
}
public static final Comparator<Transition> CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle();
private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
public int compare(Transition t1, Transition t2) {
int minComp = compareCodePoint(t1.min, t2.min);
if (minComp != 0) return minComp;
int maxComp = compareCodePoint(t1.max, t2.max);
if (maxComp != 0) return maxComp;
if (t1.to != t2.to) {
if (t1.to == null) return -1;
else if (t2.to == null) return 1;
else if (t1.to.number < t2.to.number) return -1;
else if (t1.to.number > t2.to.number) return 1;
}
return 0;
}
}
public static final Comparator<Transition> CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle();
}

View File

@@ -179,7 +179,7 @@ public class TestExternalCodecs extends LuceneTestCase {
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUTF16Comparator();
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -263,7 +263,7 @@
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUTF16Comparator();
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override

View File

@@ -4621,38 +4621,22 @@ public class TestIndexWriter extends LuceneTestCase {
private void checkTermsOrder(IndexReader r, Set<String> allTerms, boolean isTop) throws IOException {
TermsEnum terms = MultiFields.getFields(r).terms("f").iterator();
char[] last = new char[2];
int lastLength = 0;
BytesRef last = new BytesRef();
Set<String> seenTerms = new HashSet<String>();
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
while(true) {
final BytesRef term = terms.next();
if (term == null) {
break;
}
UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
assertTrue(utf16.length <= 2);
// Make sure last term comes before current one, in
// UTF16 sort order
int i = 0;
for(i=0;i<lastLength && i<utf16.length;i++) {
assertTrue("UTF16 code unit " + termDesc(new String(utf16.result, 0, utf16.length)) + " incorrectly sorted after code unit " + termDesc(new String(last, 0, lastLength)), last[i] <= utf16.result[i]);
if (last[i] < utf16.result[i]) {
break;
}
}
// Terms should not have been identical
assertTrue(lastLength != utf16.length || i < lastLength);
assertTrue(last.compareTo(term) < 0);
last.copy(term);
final String s = new String(utf16.result, 0, utf16.length);
final String s = term.utf8ToString();
assertTrue("term " + termDesc(s) + " was not added to index (count=" + allTerms.size() + ")", allTerms.contains(s));
seenTerms.add(s);
System.arraycopy(utf16.result, 0, last, 0, utf16.length);
lastLength = utf16.length;
}
if (isTop) {

View File

@@ -1,5 +1,22 @@
package org.apache.lucene.index.codecs.intblock;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.*;
import org.apache.lucene.index.codecs.sep.*;
@@ -34,7 +51,7 @@ public class TestIntBlockCodec extends LuceneTestCase {
out.close();
IntIndexInput in = new SimpleIntBlockIndexInput(dir, "test", 128);
IntIndexInput.Reader r = in.reader();
in.reader();
// read no ints
in.close();
dir.close();

View File

@@ -0,0 +1,227 @@
package org.apache.lucene.index.codecs.preflex;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.store.*;
import org.apache.lucene.index.*;
import org.apache.lucene.util.*;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
Directory. A TermInfos can be written once, in order. */
final class TermInfosWriter {
/** The file format version, a negative number. */
public static final int FORMAT = -3;
// Changed strings to true utf8 with length-in-bytes not
// length-in-chars
public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
// NOTE: always change this if you switch to a new format!
public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
private FieldInfos fieldInfos;
private IndexOutput output;
private TermInfo lastTi = new TermInfo();
private long size;
// TODO: the default values for these two parameters should be settable from
// IndexWriter. However, once that's done, folks will start setting them to
// ridiculous values and complaining that things don't work well, as with
// mergeFactor. So, let's wait until a number of folks find that alternate
// values work better. Note that both of these values are stored in the
// segment, so that it's safe to change these w/o rebuilding all indexes.
/** Expert: The fraction of terms in the "dictionary" which should be stored
* in RAM. Smaller values use more memory, but make searching slightly
* faster, while larger values use less memory and make searching slightly
* slower. Searching is typically not dominated by dictionary lookup, so
* tweaking this is rarely useful.*/
int indexInterval = 128;
/** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
* used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
* smaller indexes, greater acceleration, but fewer accelerable cases, while
* smaller values result in bigger indexes, less acceleration and more
* accelerable cases. More detailed experiments would be useful here. */
int skipInterval = 16;
/** Expert: The maximum number of skip levels. Smaller values result in
* slightly smaller indexes, but slower skipping in big posting lists.
*/
int maxSkipLevels = 10;
private long lastIndexPointer;
private boolean isIndex;
private byte[] lastTermBytes = new byte[10];
private int lastTermBytesLength = 0;
private int lastFieldNumber = -1;
private TermInfosWriter other;
private BytesRef utf8Result = new BytesRef(10);
TermInfosWriter(Directory directory, String segment, FieldInfos fis,
int interval)
throws IOException {
initialize(directory, segment, fis, interval, false);
other = new TermInfosWriter(directory, segment, fis, interval, true);
other.other = this;
}
private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
int interval, boolean isIndex) throws IOException {
initialize(directory, segment, fis, interval, isIndex);
}
private void initialize(Directory directory, String segment, FieldInfos fis,
int interval, boolean isi) throws IOException {
indexInterval = interval;
fieldInfos = fis;
isIndex = isi;
output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
output.writeInt(FORMAT_CURRENT); // write format
output.writeLong(0); // leave space for size
output.writeInt(indexInterval); // write indexInterval
output.writeInt(skipInterval); // write skipInterval
output.writeInt(maxSkipLevels); // write maxSkipLevels
assert initUTF16Results();
}
void add(Term term, TermInfo ti) throws IOException {
UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result);
add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti);
}
// Currently used only by assert statements
UnicodeUtil.UTF16Result utf16Result1;
UnicodeUtil.UTF16Result utf16Result2;
// Currently used only by assert statements
private boolean initUTF16Results() {
utf16Result1 = new UnicodeUtil.UTF16Result();
utf16Result2 = new UnicodeUtil.UTF16Result();
return true;
}
// Currently used only by assert statement
private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
if (lastFieldNumber != fieldNumber) {
final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
// If there is a field named "" (empty string) then we
// will get 0 on this comparison, yet, it's "OK". But
// it's not OK if two different field numbers map to
// the same name.
if (cmp != 0 || lastFieldNumber != -1)
return cmp;
}
UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
final int len;
if (utf16Result1.length < utf16Result2.length)
len = utf16Result1.length;
else
len = utf16Result2.length;
for(int i=0;i<len;i++) {
final char ch1 = utf16Result1.result[i];
final char ch2 = utf16Result2.result[i];
if (ch1 != ch2)
return ch1-ch2;
}
return utf16Result1.length - utf16Result2.length;
}
/** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
Term must be lexicographically greater than all previous Terms added.
TermInfo pointers must be positive and greater than all previous.*/
void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
throws IOException {
assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
(isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
"Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
" lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
" text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
if (!isIndex && size % indexInterval == 0)
other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
writeTerm(fieldNumber, termBytes, termBytesLength); // write term
output.writeVInt(ti.docFreq); // write doc freq
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
output.writeVLong(ti.proxPointer - lastTi.proxPointer);
if (ti.docFreq >= skipInterval) {
output.writeVInt(ti.skipOffset);
}
if (isIndex) {
output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
lastIndexPointer = other.output.getFilePointer(); // write pointer
}
lastFieldNumber = fieldNumber;
lastTi.set(ti);
size++;
}
private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
throws IOException {
// TODO: UTF16toUTF8 could tell us this prefix
// Compute prefix in common with last term:
int start = 0;
final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
while(start < limit) {
if (termBytes[start] != lastTermBytes[start])
break;
start++;
}
final int length = termBytesLength - start;
output.writeVInt(start); // write shared prefix length
output.writeVInt(length); // write delta length
output.writeBytes(termBytes, start, length); // write delta bytes
output.writeVInt(fieldNumber); // write field num
if (lastTermBytes.length < termBytesLength) {
lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
}
System.arraycopy(termBytes, start, lastTermBytes, start, length);
lastTermBytesLength = termBytesLength;
}
/** Called to complete TermInfos creation. */
void close() throws IOException {
output.seek(4); // write size after format
output.writeLong(size);
output.close();
if (!isIndex)
other.close();
}
}
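
writeTerm() above delta-codes each term against the previous one; a self-contained JDK-only sketch of that encoding (illustrative values, not from the commit):

import java.nio.charset.StandardCharsets;

public class PrefixDeltaDemo {
  public static void main(String[] args) {
    byte[] last = "foobar".getBytes(StandardCharsets.UTF_8);
    byte[] term = "foobaz".getBytes(StandardCharsets.UTF_8);
    // Compute the prefix shared with the last term, like writeTerm():
    int start = 0;
    final int limit = Math.min(term.length, last.length);
    while (start < limit && term[start] == last[start]) {
      start++;
    }
    final int length = term.length - start;
    // On disk this becomes: VInt(start), VInt(length), delta bytes, VInt(field)
    System.out.println("shared prefix=" + start + " delta="
        + new String(term, start, length, StandardCharsets.UTF_8));
    // prints: shared prefix=5 delta=z
  }
}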

View File

@@ -0,0 +1,212 @@
package org.apache.lucene.index.codecs.preflex;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.store.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.codecs.*;
import org.apache.lucene.util.*;
import java.util.*;
import java.io.IOException;
public class TestSurrogates extends LuceneTestCase {
private static final boolean DEBUG = false;
// like Term, but uses BytesRef for text
private static class FieldAndText implements Comparable<FieldAndText> {
String field;
BytesRef text;
public FieldAndText(Term t) {
field = t.field();
text = new BytesRef(t.text());
}
public int compareTo(FieldAndText other) {
if (other.field == field) {
return text.compareTo(other.text);
} else {
return field.compareTo(other.field);
}
}
}
// chooses from a very limited alphabet to exacerbate the
// surrogate seeking required
private static String makeDifficultRandomUnicodeString(Random r) {
final int end = r.nextInt(20);
if (end == 0) {
// allow 0 length
return "";
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
int t = r.nextInt(5);
if (0 == t && i < end - 1) {
// hi
buffer[i++] = (char) 0xd800;
// lo
buffer[i] = (char) 0xdc00;
} else if (t <= 3) {
buffer[i] = 'a';
} else if (4 == t) {
buffer[i] = 0xe000;
}
}
return new String(buffer, 0, end);
}
private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<FieldAndText> fieldTerms) throws IOException {
final int numField = _TestUtil.nextInt(r, 2, 5);
List<Term> terms = new ArrayList<Term>();
int tc = 0;
for(int f=0;f<numField;f++) {
String field = "f" + f;
Term protoTerm = new Term(field);
fieldInfos.add(field, true, false, false, false, false, false, false);
final int numTerms = 10000*_TestUtil.getRandomMultiplier();
for(int i=0;i<numTerms;i++) {
String s;
if (r.nextInt(3) == 1) {
s = makeDifficultRandomUnicodeString(r);
} else {
s = _TestUtil.randomUnicodeString(r);
// The surrogate dance uses 0xffff to seek-to-end
// of blocks. Also, pre-4.0 indices are already
// guaranteed to not contain the char 0xffff since
// it's mapped during indexing:
s = s.replace((char) 0xffff, (char) 0xfffe);
}
terms.add(protoTerm.createTerm(s + "_" + (tc++)));
}
}
fieldInfos.write(dir, segName);
// sorts in UTF16 order, just like preflex:
Collections.sort(terms);
TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128);
TermInfo ti = new TermInfo();
BytesRef utf8 = new BytesRef(10);
String lastText = null;
int uniqueTermCount = 0;
if (DEBUG) {
System.out.println("TEST: utf16 order:");
}
for(Term t : terms) {
FieldInfo fi = fieldInfos.fieldInfo(t.field());
String text = t.text();
if (lastText != null && lastText.equals(text)) {
continue;
}
fieldTerms.add(new FieldAndText(t));
uniqueTermCount++;
lastText = text;
UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8);
if (DEBUG) {
System.out.println(" " + toHexString(t));
}
w.add(fi.number, utf8.bytes, utf8.length, ti);
}
w.close();
Collections.sort(fieldTerms);
if (DEBUG) {
System.out.println("\nTEST: codepoint order");
for(FieldAndText t: fieldTerms) {
System.out.println(" " + t.field + ":" + UnicodeUtil.toHexString(t.text.utf8ToString()));
}
}
dir.createOutput(segName + ".prx").close();
dir.createOutput(segName + ".frq").close();
// !!hack alert!! stuffing uniqueTermCount in as docCount
return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec);
}
private String toHexString(Term t) {
return t.field() + ":" + UnicodeUtil.toHexString(t.text());
}
public void testSurrogatesOrder() throws Exception {
Directory dir = new MockRAMDirectory();
Codec codec = new PreFlexCodec();
Random r = newRandom();
FieldInfos fieldInfos = new FieldInfos();
List<FieldAndText> fieldTerms = new ArrayList<FieldAndText>();
SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
// hack alert!!
int uniqueTermCount = si.docCount;
FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
assertNotNull(fields);
if (DEBUG) {
System.out.println("\nTEST: now enum");
}
FieldsEnum fieldsEnum = fields.iterator();
String field;
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
int termCount = 0;
while((field = fieldsEnum.next()) != null) {
TermsEnum termsEnum = fieldsEnum.terms();
BytesRef text;
BytesRef lastText = null;
while((text = termsEnum.next()) != null) {
UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16);
if (DEBUG) {
System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length)));
System.out.println();
}
if (lastText == null) {
lastText = new BytesRef(text);
} else {
assertTrue(lastText.compareTo(text) < 0);
lastText.copy(text);
}
assertEquals(fieldTerms.get(termCount).field, field);
assertEquals(fieldTerms.get(termCount).text, text);
termCount++;
}
if (DEBUG) {
System.out.println(" no more terms for field=" + field);
}
}
assertEquals(uniqueTermCount, termCount);
fields.close();
}
}

View File

@@ -30,7 +30,7 @@ public class TestNumericUtils extends LuceneTestCase {
NumericUtils.longToPrefixCoded(l, 0, act);
if (last!=null) {
// test if smaller
assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
}
// test is back and forward conversion works
@@ -48,7 +48,7 @@
NumericUtils.intToPrefixCoded(i, 0, act);
if (last!=null) {
// test if smaller
assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
}
// test is back and forward conversion works
@@ -84,7 +84,7 @@
// check sort order (prefixVals should be ascending)
for (int i=1; i<prefixVals.length; i++) {
assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
}
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
@@ -124,7 +124,7 @@
// check sort order (prefixVals should be ascending)
for (int i=1; i<prefixVals.length; i++) {
assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
}
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits