commit my current state

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/preflexfixes@967130 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-07-23 15:26:25 +00:00
parent 7216e5b7d2
commit af4a9deb16
30 changed files with 1386 additions and 515 deletions

View File

@@ -56,6 +56,7 @@
<property name="args" value=""/>
<property name="threadsPerProcessor" value="1" />
<property name="random.multiplier" value="1" />
+<property name="tests.codec" value="Standard" />
<property name="javac.deprecation" value="off"/>
<property name="javac.debug" value="on"/>
@@ -434,6 +435,8 @@
<!-- allow tests to control debug prints -->
<sysproperty key="tests.verbose" value="${tests.verbose}"/>
+<!-- set the codec tests should run with -->
+<sysproperty key="tests.codec" value="${tests.codec}"/>
<!-- TODO: create propertyset for test properties, so each project can have its own set -->
<sysproperty key="random.multiplier" value="${random.multiplier}"/>

View File

@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
@@ -107,8 +108,8 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
RAMDirectory ramdir = new RAMDirectory();
Analyzer analyzer = randomAnalyzer();
-IndexWriter writer = new IndexWriter(ramdir, analyzer,
-    IndexWriter.MaxFieldLength.UNLIMITED);
+IndexWriter writer = new IndexWriter(ramdir,
+    new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
Field field1 = new Field("foo", fooField.toString(), Field.Store.NO, Field.Index.ANALYZED);
Field field2 = new Field("term", termField.toString(), Field.Store.NO, Field.Index.ANALYZED);

View File

@@ -1095,7 +1095,7 @@ final class DocumentsWriter {
continue;
}
assert checkDeleteTerm(term);
if (termsEnum.seek(term.bytes(), false) == TermsEnum.SeekStatus.FOUND) {
DocsEnum docsEnum = termsEnum.docs(reader.getDeletedDocs(), docs);

View File

@@ -116,7 +116,7 @@ public final class MultiTermsEnum extends TermsEnum {
// different TermComps
final Comparator<BytesRef> subTermComp = termsEnumIndex.termsEnum.getComparator();
if (subTermComp != null && !subTermComp.equals(termComp)) {
-  throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge");
+  throw new IllegalStateException("sub-readers have different BytesRef.Comparators: " + subTermComp + " vs " + termComp + "; cannot merge");
}
}

View File

@@ -47,14 +47,14 @@ public abstract class CodecProvider {
private static String defaultCodec = "Standard";
-public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock"};
+public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock", "PreFlex"};
public void register(Codec codec) {
  if (codec.name == null) {
    throw new IllegalArgumentException("code.name is null");
  }
+  // nocommit
-  if (!codecs.containsKey(codec.name)) {
+  if (!codecs.containsKey(codec.name) || codec.name.equals("PreFlex")) {
    codecs.put(codec.name, codec);
    codec.getExtensions(knownExtensions);
  } else if (codecs.get(codec.name) != codec) {
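
The "PreFlex" escape hatch above lets a codec be re-registered under a name that is already taken. The apparent intent (an editor's reading, not stated in the patch): the test-only read/write PreFlexRW codec, registered by RandomIndexWriter later in this commit, replaces the read-only core PreFlex codec under the same name. A minimal sketch, assuming PreFlexRWCodec reports name = "PreFlex":

    // Core provider starts with the read-only impl...
    provider.register(new PreFlexCodec());
    // ...and a test provider may later swap in the writable one;
    // this is allowed only because of the PreFlex special case above:
    provider.register(new PreFlexRWCodec());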

View File

@@ -22,6 +22,7 @@ import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import java.io.IOException;
+import java.io.Closeable;
/** Abstract API that consumes terms, doc, freq, prox and
 * payloads postings. Concrete implementations of this
@@ -30,7 +31,7 @@ import java.io.IOException;
 *
 * @lucene.experimental
 */
-public abstract class FieldsConsumer {
+public abstract class FieldsConsumer implements Closeable {
/** Add a new field */
public abstract TermsConsumer addField(FieldInfo field) throws IOException;
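
Making FieldsConsumer Closeable pairs with the close() calls PreFlexFields now makes on its freq/prox streams. A hedged usage sketch (the write-side call site is not part of this diff):

    FieldsConsumer consumer = codec.fieldsConsumer(state);
    try {
      TermsConsumer terms = consumer.addField(fieldInfo);
      // ... write postings ...
    } finally {
      consumer.close(); // now enforced by the Closeable contract
    }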

View File

@@ -40,16 +40,16 @@ import org.apache.lucene.index.codecs.FieldsProducer;
public class PreFlexCodec extends Codec {
/** Extension of terms file */
-static final String TERMS_EXTENSION = "tis";
+public static final String TERMS_EXTENSION = "tis";
/** Extension of terms index file */
-static final String TERMS_INDEX_EXTENSION = "tii";
+public static final String TERMS_INDEX_EXTENSION = "tii";
/** Extension of freq postings file */
-static final String FREQ_EXTENSION = "frq";
+public static final String FREQ_EXTENSION = "frq";
/** Extension of prox postings file */
-static final String PROX_EXTENSION = "prx";
+public static final String PROX_EXTENSION = "prx";
public PreFlexCodec() {
name = "PreFlex";
@@ -62,7 +62,7 @@ public class PreFlexCodec extends Codec {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-  return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor);
+  return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, true);
}
@Override

View File

@@ -40,12 +40,11 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.ArrayUtil;
/** Exposes flex API on a pre-flex index, as a codec.
 * @lucene.experimental */
public class PreFlexFields extends FieldsProducer {
private static final boolean DEBUG_SURROGATES = false;
public TermInfosReader tis;
@@ -59,11 +58,19 @@ public class PreFlexFields extends FieldsProducer {
private final Directory dir;
private final int readBufferSize;
private Directory cfsReader;
+private final boolean unicodeSortOrder;
-PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
+// If unicodeSortOrder is true, we do the surrogates dance
+// so that the terms are sorted by unicode sort order.
+// This should be true when segments are used for "normal"
+// searching; it's only false during testing, to create a
+// pre-flex index, using the preflexrw codec under
+// src/test.
+public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor, boolean unicodeSortOrder)
  throws IOException {
si = info;
+this.unicodeSortOrder = unicodeSortOrder;
// NOTE: we must always load terms index, even for
// "sequential" scan during merging, because what is
@@ -182,6 +189,12 @@ public class PreFlexFields extends FieldsProducer {
if (cfsReader != null) {
  cfsReader.close();
}
+if (freqStream != null) {
+  freqStream.close();
+}
+if (proxStream != null) {
+  proxStream.close();
+}
}
private class PreFlexFieldsEnum extends FieldsEnum {
@@ -228,7 +241,11 @@ public class PreFlexFields extends FieldsProducer {
public Comparator<BytesRef> getComparator() {
// Pre-flex indexes always sorted in UTF16 order, but
// we remap on-the-fly to unicode order
-return BytesRef.getUTF8SortedAsUnicodeComparator();
+if (unicodeSortOrder) {
+  return BytesRef.getUTF8SortedAsUnicodeComparator();
+} else {
+  return BytesRef.getUTF8SortedAsUTF16Comparator();
+}
}
}
@@ -238,237 +255,473 @@ public class PreFlexFields extends FieldsProducer {
private boolean skipNext;
private BytesRef current;
-private int[] surrogateSeekPending = new int[1];
-private boolean[] surrogateDidSeekBack = new boolean[1];
-private int surrogateSeekUpto;
-private char[] pendingPrefix;
private SegmentTermEnum seekTermEnum;
private Term protoTerm;
private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0;
private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee;
// Returns true if the unicode char is "after" the
// surrogates in UTF16, ie >= U+E000 and <= U+FFFF:
private final boolean isHighBMPChar(byte[] b, int idx) {
return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
}
// Returns true if the unicode char in the UTF8 byte
// sequence starting at idx encodes a char outside of
// BMP (ie what would be a surrogate pair in UTF16):
private final boolean isNonBMPChar(byte[] b, int idx) {
return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
}
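// Editor's illustration (not part of this commit): concrete UTF-8 lead bytes
// for the two categories tested above, assuming well-formed UTF-8:
//   U+E000  -> EE 80 80     isHighBMPChar == true  (category E)
//   U+FFFD  -> EF BF BD     isHighBMPChar == true  (0xEF also matches the 0xEE mask)
//   U+10400 -> F0 90 90 80  isNonBMPChar  == true  (category S: a surrogate pair in UTF-16)
//   'a'     -> 61           neither                (category A)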
private final byte[] scratch = new byte[4];
private final BytesRef prevTerm = new BytesRef();
private final BytesRef scratchTerm = new BytesRef();
private int newSuffixStart;
// Swap in S, in place of E:
private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException {
final int savLength = term.length;
assert term.offset == 0;
// The 3 bytes starting at downTo make up 1
// unicode character:
assert isHighBMPChar(term.bytes, pos);
// nocommit -- why does this trip?
// assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3);
// Save the bytes && length, since we need to
// restore this if seek "back" finds no matching
// terms
if (term.bytes.length < 4+pos) {
term.grow(4+pos);
}
scratch[0] = term.bytes[pos];
scratch[1] = term.bytes[pos+1];
scratch[2] = term.bytes[pos+2];
term.bytes[pos] = (byte) 0xf0;
term.bytes[pos+1] = (byte) 0x90;
term.bytes[pos+2] = (byte) 0x80;
term.bytes[pos+3] = (byte) 0x80;
term.length = 4+pos;
if (DEBUG_SURROGATES) {
System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString()));
}
// Seek "back":
getTermsDict().seekEnum(te, protoTerm.createTerm(term));
// Test if the term we seek'd to in fact found a
// surrogate pair at the same position as the E:
Term t2 = te.term();
// Cannot be null (or move to next field) because at
// "worst" it'd seek to the same term we are on now,
// unless we are being called from seek
if (t2 == null || t2.field() != fieldInfo.name) {
return false;
}
if (DEBUG_SURROGATES) {
System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()));
}
// Now test if prefix is identical and we found
// a non-BMP char at the same position:
BytesRef b2 = t2.bytes();
assert b2.offset == 0;
boolean matches;
if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) {
matches = true;
for(int i=0;i<pos;i++) {
if (term.bytes[i] != b2.bytes[i]) {
matches = false;
break;
}
}
} else {
matches = false;
}
// Restore term:
term.length = savLength;
term.bytes[pos] = scratch[0];
term.bytes[pos+1] = scratch[1];
term.bytes[pos+2] = scratch[2];
return matches;
}
// Seek type 2 "continue" (back to the start of the
// surrogates): scan the stripped suffix from the
// prior term, backwards. If there was an E in that
// part, then we try to seek back to S. If that
// seek finds a matching term, we go there.
private boolean doContinue() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" try cont");
}
int downTo = prevTerm.length-1;
boolean didSeek = false;
final int limit = Math.min(newSuffixStart, scratchTerm.length-1);
while(downTo > limit) {
if (isHighBMPChar(prevTerm.bytes, downTo)) {
if (DEBUG_SURROGATES) {
System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length);
}
if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
// TODO: more efficient seek?
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
//newSuffixStart = downTo+4;
newSuffixStart = downTo;
scratchTerm.copy(termEnum.term().bytes());
didSeek = true;
if (DEBUG_SURROGATES) {
System.out.println(" seek!");
}
break;
} else {
if (DEBUG_SURROGATES) {
System.out.println(" no seek");
}
}
}
// Shorten prevTerm in place so that we don't redo
// this loop if we come back here:
if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) {
prevTerm.length = downTo;
}
downTo--;
}
return didSeek;
}
// Look for seek type 3 ("pop"): if the delta from
// prev -> current was replacing an S with an E,
// we must now seek to beyond that E. This seek
// "finishes" the dance at this character
// position.
private boolean doPop() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" try pop");
}
assert newSuffixStart <= prevTerm.length;
assert newSuffixStart < scratchTerm.length || newSuffixStart == 0;
if (prevTerm.length > newSuffixStart &&
isNonBMPChar(prevTerm.bytes, newSuffixStart) &&
isHighBMPChar(scratchTerm.bytes, newSuffixStart)) {
// Seek type 2 -- put U+FFFF at this position:
// nocommit -- can we somehow use 0xff???
scratchTerm.bytes[newSuffixStart] = (byte) 0xff;
//scratchTerm.bytes[newSuffixStart] = (byte) 0xef;
scratchTerm.bytes[newSuffixStart+1] = (byte) 0xbf;
scratchTerm.bytes[newSuffixStart+2] = (byte) 0xbf;
scratchTerm.length = newSuffixStart+3;
if (DEBUG_SURROGATES) {
System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString());
}
// TODO: more efficient seek? can we simply swap
// the enums?
getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));
final Term t2 = termEnum.term();
// We could hit EOF or different field since this
// was a seek "forward":
if (t2 != null && t2.field() == fieldInfo.name) {
if (DEBUG_SURROGATES) {
System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes());
}
final BytesRef b2 = t2.bytes();
assert b2.offset == 0;
// Set newSuffixStart -- we can't use
// termEnum's since the above seek may have
// done no scanning (eg, term was precisely
// and index term, or, was in the term seek
// cache):
scratchTerm.copy(b2);
setNewSuffixStart(prevTerm, scratchTerm);
return true;
} else if (newSuffixStart != 0 || scratchTerm.length != 0) {
if (DEBUG_SURROGATES) {
System.out.println(" got term=null (or next field)");
}
newSuffixStart = 0;
scratchTerm.length = 0;
return true;
}
}
return false;
}
// Pre-flex indices store terms in UTF16 sort order, but
// certain queries require Unicode codepoint order; this
// method carefully seeks around surrogates to handle
// this impedance mismatch
private void surrogateDance() throws IOException {
if (!unicodeSortOrder) {
return;
}
// We are invoked after TIS.next() (by UTF16 order) to
// possibly seek to a different "next" (by unicode
// order) term.
// We scan only the "delta" from the last term to the
// current term, in UTF8 bytes. We look at 1) the bytes
// stripped from the prior term, and then 2) the bytes
// appended to that prior term's prefix.
// We don't care about specific UTF8 sequences, just
// the "category" of the UTF16 character. Category S
// is a high/low surrogate pair (it non-BMP).
// Category E is any BMP char > UNI_SUR_LOW_END (and <
// U+FFFF). Category A is the rest (any unicode char
// <= UNI_SUR_HIGH_START).
// The core issue is that pre-flex indices sort the
// characters as ASE, while flex must sort as AES. So
// when scanning, when we hit S, we must 1) seek
// forward to E and enum the terms there, then 2) seek
// back to S and enum all terms there, then 3) seek to
// after E. Three different seek points (1, 2, 3).
// We can easily detect S in UTF8: if a byte has
// prefix 11110 (0xf0), then that byte and the
// following 3 bytes encode a single unicode codepoint
// in S. Similary,we can detect E: if a byte has
// prefix 1110111 (0xee), then that byte and the
// following 2 bytes encode a single unicode codepoint
// in E.
// Note that this is really a recursive process --
// maybe the char at pos 2 needs to dance, but any
// point in its dance, suddenly pos 4 needs to dance
// so you must finish pos 4 before returning to pos
// 2. But then during pos 4's dance maybe pos 7 needs
// to dance, etc. However, despite being recursive,
// we don't need to hold any state because the state
// can always be derived by looking at prior term &
// current term.
// TODO: can we avoid this copy?
if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
scratchTerm.length = 0;
} else {
scratchTerm.copy(termEnum.term().bytes());
}
if (DEBUG_SURROGATES) {
System.out.println(" dance");
System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString()));
System.out.println(" " + prevTerm.toString());
System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()));
System.out.println(" " + scratchTerm.toString());
}
// This code assumes TermInfosReader/SegmentTermEnum
// always use BytesRef.offset == 0
assert prevTerm.offset == 0;
assert scratchTerm.offset == 0;
// Need to loop here because we may need to do multiple
// pops, and possibly a continue in the end, ie:
//
// cont
// pop, cont
// pop, pop, cont
// <nothing>
//
while(true) {
if (doContinue()) {
break;
} else {
if (!doPop()) {
break;
}
}
}
if (DEBUG_SURROGATES) {
System.out.println(" finish bmp ends");
}
doPushes();
}
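// Editor's worked example of the ordering mismatch (not in the patch):
// take the terms "x" + U+E000 (E) and "x" + U+10000 (S, = D800 DC00 in UTF-16).
//   UTF16 order:     "x\uD800\uDC00" < "x\uE000"  (lead surrogate 0xD800 < 0xE000)
//   codepoint order: "x\uE000" < "x\uD800\uDC00"  (U+E000 < U+10000)
// So after next() returns the S term in UTF16 order, the dance seeks ahead to
// enumerate the E terms first, then comes back to S, then continues past E.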
// Look for seek type 1 ("push"): if the newly added
// suffix contains any S, we must try to seek to the
// corresponding E. If we find a match, we go there;
// else we keep looking for additional S's in the new
// suffix. This "starts" the dance, at this character
// position:
private void doPushes() throws IOException {
int upTo = newSuffixStart;
if (DEBUG_SURROGATES) {
System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length);
}
while(upTo < scratchTerm.length) {
if (isNonBMPChar(scratchTerm.bytes, upTo) &&
(upTo > newSuffixStart ||
(upTo >= prevTerm.length ||
(!isNonBMPChar(prevTerm.bytes, upTo) &&
!isHighBMPChar(prevTerm.bytes, upTo))))) { // nocommit -- we can't cmp to prevTerm if we'd done a seek 3 or seek 2 before?
// A non-BMP char (4 bytes UTF8) starts here:
assert scratchTerm.length >= upTo + 4;
final int savLength = scratchTerm.length;
scratch[0] = scratchTerm.bytes[upTo];
scratch[1] = scratchTerm.bytes[upTo+1];
scratch[2] = scratchTerm.bytes[upTo+2];
scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD;
scratchTerm.bytes[upTo+1] = (byte) 0x80;
scratchTerm.bytes[upTo+2] = (byte) 0x80;
scratchTerm.length = upTo+3;
if (DEBUG_SURROGATES) {
System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length);
}
// Seek "forward":
// TODO: more efficient seek?
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));
scratchTerm.bytes[upTo] = scratch[0];
scratchTerm.bytes[upTo+1] = scratch[1];
scratchTerm.bytes[upTo+2] = scratch[2];
scratchTerm.length = savLength;
// Did we find a match?
final Term t2 = seekTermEnum.term();
if (DEBUG_SURROGATES) {
if (t2 == null) {
System.out.println(" hit term=null");
} else {
System.out.println(" hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes()));
}
}
// Since this was a seek "forward", we could hit
// EOF or a different field:
boolean matches;
if (t2 != null && t2.field() == fieldInfo.name) {
final BytesRef b2 = t2.bytes();
assert b2.offset == 0;
if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) {
matches = true;
for(int i=0;i<upTo;i++) {
if (scratchTerm.bytes[i] != b2.bytes[i]) {
matches = false;
break;
}
}
} else {
matches = false;
}
} else {
matches = false;
}
if (matches) {
if (DEBUG_SURROGATES) {
System.out.println(" matches!");
}
// OK seek "back"
// TODO: more efficient seek?
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
scratchTerm.copy(seekTermEnum.term().bytes());
// +3 because we don't need to check the char
// at upTo: we know it's > BMP
upTo += 3;
// NOTE: we keep iterating, now, since this
// can easily "recurse". Ie, after seeking
// forward at a certain char position, we may
// find another surrogate in our [new] suffix
// and must then do another seek (recurse)
} else {
upTo++;
}
} else {
upTo++;
}
}
}
void reset(FieldInfo fieldInfo) throws IOException {
+//System.out.println("pff.reset te=" + termEnum);
this.fieldInfo = fieldInfo;
protoTerm = new Term(fieldInfo.name);
if (termEnum == null) {
  termEnum = getTermsDict().terms(protoTerm);
  seekTermEnum = getTermsDict().terms(protoTerm);
+  //System.out.println(" term=" + termEnum.term());
} else {
  getTermsDict().seekEnum(termEnum, protoTerm);
}
skipNext = true;
-surrogateSeekUpto = 0;
-newSuffixStart = 0;
-surrogatesDance();
-}
+final Term t = termEnum.term();
+if (t != null && t.field() == fieldInfo.name) {
+  newSuffixStart = 0;
+  prevTerm.length = 0;
+  surrogateDance();
+}
+}
-private void surrogatesDance() throws IOException {
-  // Tricky: prior to 4.0, Lucene index sorted terms in
-  // UTF16 order, but as of 4.0 we sort by Unicode code
-  // point order. These orders differ because of the
-  // surrrogates; so we have to fixup our enum, here, by
-  // carefully first seeking past the surrogates and
-  // then back again at the end. The process is
-  // recursive, since any given term could have multiple
-  // new occurrences of surrogate pairs, so we use a
-  // stack to record the pending seek-backs.
-  if (DEBUG_SURROGATES) {
-    System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
-  }
-  while(popPendingSeek());
-  while(pushNewSurrogate());
-}
-// only for debugging
-private String getStack() {
-  if (surrogateSeekUpto == 0) {
-    return "null";
-  } else {
-    StringBuffer sb = new StringBuffer();
-    for(int i=0;i<surrogateSeekUpto;i++) {
-      if (i > 0) {
-        sb.append(' ');
-      }
-      sb.append(surrogateSeekPending[i]);
-    }
-    sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
-    return sb.toString();
-  }
-}
-private boolean popPendingSeek() throws IOException {
-  if (DEBUG_SURROGATES) {
-    System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
-  }
-  // if a .next() has advanced beyond the
-  // after-surrogates range we had last seeked to, we
-  // must seek back to the start and resume .next from
-  // there. this pops the pending seek off the stack.
-  final Term t = termEnum.term();
-  if (surrogateSeekUpto > 0) {
-    final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
-    if (DEBUG_SURROGATES) {
-      System.out.println(" seekPrefix=" + seekPrefix);
-    }
-    if (newSuffixStart < seekPrefix) {
-      assert pendingPrefix != null;
-      assert pendingPrefix.length > seekPrefix;
-      pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
-      pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START;
-      Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix));
-      if (DEBUG_SURROGATES) {
-        System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
-      }
-      getTermsDict().seekEnum(termEnum, t2);
-      surrogateDidSeekBack[surrogateSeekUpto-1] = true;
-      // +2 because we don't want to re-check the
-      // surrogates we just seek'd back to
-      newSuffixStart = seekPrefix + 2;
-      return true;
-    } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
-      assert pendingPrefix != null;
-      assert pendingPrefix.length > seekPrefix;
-      pendingPrefix[seekPrefix] = 0xffff;
-      Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix));
-      if (DEBUG_SURROGATES) {
-        System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
-      }
-      getTermsDict().seekEnum(termEnum, t2);
-      if (DEBUG_SURROGATES) {
-        System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
-      }
-      surrogateSeekUpto--;
-      if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
-        // force pop
-        newSuffixStart = -1;
-      } else {
-        newSuffixStart = termEnum.newSuffixStart;
-      }
-      return true;
-    }
-  }
-  return false;
-}
-private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result();
-private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result();
-private boolean pushNewSurrogate() throws IOException {
-  if (DEBUG_SURROGATES) {
-    System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack());
-  }
-  final Term t = termEnum.term();
-  if (t == null || t.field() != fieldInfo.name) {
-    return false;
-  }
-  final BytesRef bytes = t.bytes();
-  UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer);
-  for(int i=Math.max(0,newSuffixStart);i<termBuffer.length;i++) {
-    final char ch = termBuffer.result[i];
-    if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
-      if (DEBUG_SURROGATES) {
-        System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
-      }
-      // the next() that we just did read in a new
-      // suffix, containing a surrogate pair
-      // seek forward to see if there are any terms with
-      // this same prefix, but with characters after the
-      // surrogate range; if so, we must first iterate
-      // them, then seek back to the surrogates
-      char[] testPrefix = new char[i+2];
-      for(int j=0;j<i;j++) {
-        testPrefix[j] = termBuffer.result[j];
-      }
-      testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
-      getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new BytesRef(testPrefix, 0, i+1)));
-      Term t2 = seekTermEnum.term();
-      boolean isPrefix;
-      if (t2 != null && t2.field() == fieldInfo.name) {
-        final BytesRef seekBytes = t2.bytes();
-        UnicodeUtil.UTF8toUTF16(seekBytes.bytes, seekBytes.offset, seekBytes.length, seekBuffer);
-        isPrefix = true;
-        if (DEBUG_SURROGATES) {
-          System.out.println(" seek found " + UnicodeUtil.toHexString(t2.text()));
-        }
-        for(int j=0;j<i;j++) {
-          if (testPrefix[j] != seekBuffer.result[j]) {
-            isPrefix = false;
-            break;
-          }
-        }
-        if (DEBUG_SURROGATES && !isPrefix) {
-          System.out.println(" no end terms");
-        }
-      } else {
-        if (DEBUG_SURROGATES) {
-          System.out.println(" no end terms");
-        }
-        isPrefix = false;
-      }
-      if (isPrefix) {
-        // we found a term, sharing the same prefix,
-        // with characters after the surrogates, so we
-        // must first enum those, and then return the
-        // the surrogates afterwards. push that pending
-        // seek on the surrogates stack now:
-        pendingPrefix = testPrefix;
-        getTermsDict().seekEnum(termEnum, t2);
-        if (surrogateSeekUpto == surrogateSeekPending.length) {
-          surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
-        }
-        if (surrogateSeekUpto == surrogateDidSeekBack.length) {
-          surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
-        }
-        surrogateSeekPending[surrogateSeekUpto] = i;
-        surrogateDidSeekBack[surrogateSeekUpto] = false;
-        surrogateSeekUpto++;
-        if (DEBUG_SURROGATES) {
-          System.out.println(" do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
-        }
-        newSuffixStart = i+1;
-        return true;
-      } else {
-        // there are no terms after the surrogates, so
-        // we do nothing to the enum and just step
-        // through the surrogates like normal. but we
-        // must keep iterating through the term, in case
-        // another surrogate pair appears later
-      }
-    }
-  }
-  return false;
-}
@Override
public Comparator<BytesRef> getComparator() {
// Pre-flex indexes always sorted in UTF16 order, but
// we remap on-the-fly to unicode order
-return BytesRef.getUTF8SortedAsUnicodeComparator();
+if (unicodeSortOrder) {
+  return BytesRef.getUTF8SortedAsUnicodeComparator();
+} else {
+  return BytesRef.getUTF8SortedAsUTF16Comparator();
+}
}
@Override
@@ -484,7 +737,7 @@ public class PreFlexFields extends FieldsProducer {
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
if (DEBUG_SURROGATES) {
-  System.out.println("TE.seek() term=" + term.utf8ToString());
+  System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString()));
}
skipNext = false;
final TermInfosReader tis = getTermsDict();
@@ -492,50 +745,142 @@ public class PreFlexFields extends FieldsProducer {
assert termEnum != null;
-if (termEnum == null) {
-  termEnum = tis.terms(t0);
-} else {
-  tis.seekEnum(termEnum, t0);
-}
-surrogateSeekUpto = 0;
-surrogatesDance();
+tis.seekEnum(termEnum, t0);
final Term t = termEnum.term();
-final BytesRef tr = t == null ? null : t.bytes();
-if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) {
-  current = tr;
+if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) {
+  // If we found an exact match, no need to do the
+  // surrogate dance
+  if (DEBUG_SURROGATES) {
+    System.out.println(" seek exact match");
+  }
+  current = t.bytes();
  return SeekStatus.FOUND;
} else if (t == null || t.field() != fieldInfo.name) {
+  // nocommit -- why can't we handle this like the
+  // next() into null? set term as prevTerm then dance?
+  if (DEBUG_SURROGATES) {
+    System.out.println(" seek hit EOF");
+  }
+  // We hit EOF; try end-case surrogate dance: if we
+  // find an E, try swapping in S, backwards:
+  scratchTerm.copy(term);
+  assert scratchTerm.offset == 0;
+  for(int i=scratchTerm.length-1;i>=0;i--) {
+    if (isHighBMPChar(scratchTerm.bytes, i)) {
+      if (DEBUG_SURROGATES) {
+        System.out.println(" found E pos=" + i + "; try seek");
+      }
+      if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {
+        scratchTerm.copy(seekTermEnum.term().bytes());
+        getTermsDict().seekEnum(termEnum, seekTermEnum.term());
+        newSuffixStart = 1+i;
+        doPushes();
+        // Found a match
+        // TODO: faster seek?
+        current = termEnum.term().bytes();
+        return SeekStatus.NOT_FOUND;
+      }
+    }
+  }
+  if (DEBUG_SURROGATES) {
+    System.out.println(" seek END");
+  }
  current = null;
  return SeekStatus.END;
} else {
-  current = tr;
-  return SeekStatus.NOT_FOUND;
+  // We found a non-exact but non-null term; this one
+  // is fun -- just treat it like next, by pretending
+  // requested term was prev:
+  prevTerm.copy(term);
+  if (DEBUG_SURROGATES) {
+    System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text()));
+  }
+  final BytesRef br = t.bytes();
+  assert br.offset == 0;
+  setNewSuffixStart(term, br);
+  surrogateDance();
+  final Term t2 = termEnum.term();
+  if (t2 == null || t2.field() != fieldInfo.name) {
+    assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned
+    current = null;
+    return SeekStatus.END;
+  } else {
+    current = t2.bytes();
+    assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString());
+    return SeekStatus.NOT_FOUND;
+  }
}
}
private void setNewSuffixStart(BytesRef br1, BytesRef br2) {
final int limit = Math.min(br1.length, br2.length);
int lastStart = 0;
for(int i=0;i<limit;i++) {
if ((br1.bytes[br1.offset+i] & 0xc0) == 0xc0 || (br1.bytes[br1.offset+i] & 0x80) == 0) {
lastStart = i;
}
if (br1.bytes[br1.offset+i] != br2.bytes[br2.offset+i]) {
newSuffixStart = lastStart;
if (DEBUG_SURROGATES) {
System.out.println(" set newSuffixStart=" + newSuffixStart);
}
return;
}
}
newSuffixStart = limit;
if (DEBUG_SURROGATES) {
System.out.println(" set newSuffixStart=" + newSuffixStart);
} }
}
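// Editor's note (not in the patch): the 0xc0/0x80 tests above detect UTF-8
// lead bytes, so newSuffixStart always lands on a codepoint boundary:
//   0x61 ('a'): (b & 0x80) == 0     -> lead byte (single-byte char)
//   0xE2:       (b & 0xc0) == 0xc0  -> lead byte (starts a multi-byte char)
//   0x82:       continuation byte   -> not a boundary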
@Override
public BytesRef next() throws IOException {
if (DEBUG_SURROGATES) {
-  System.out.println("TE.next() skipNext=" + skipNext);
+  System.out.println("TE.next()");
}
if (skipNext) {
+  if (DEBUG_SURROGATES) {
+    System.out.println(" skipNext=true");
+  }
  skipNext = false;
  if (termEnum.term() == null) {
    return null;
+  } else if (termEnum.term().field() != fieldInfo.name) {
+    return null;
  } else {
    return current = termEnum.term().bytes();
  }
}
+// TODO: can we use STE's prevBuffer here?
+prevTerm.copy(termEnum.term().bytes());
if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
  newSuffixStart = termEnum.newSuffixStart;
  if (DEBUG_SURROGATES) {
-    System.out.println(" set newSuffixStart=" + newSuffixStart);
+    System.out.println(" newSuffixStart=" + newSuffixStart);
  }
-  surrogatesDance();
+  surrogateDance();
  final Term t = termEnum.term();
  if (t == null || t.field() != fieldInfo.name) {
    assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
@@ -545,12 +890,15 @@ public class PreFlexFields extends FieldsProducer {
  }
  return current;
} else {
+  // This field is exhausted, but we have to give
+  // surrogateDance a chance to seek back:
  if (DEBUG_SURROGATES) {
-    System.out.println(" force pop");
+    System.out.println(" force cont");
  }
-  // force pop
-  newSuffixStart = -1;
-  surrogatesDance();
+  //newSuffixStart = prevTerm.length;
+  newSuffixStart = 0;
+  surrogateDance();
  final Term t = termEnum.term();
  if (t == null || t.field() != fieldInfo.name) {
    assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
@@ -574,20 +922,32 @@ public class PreFlexFields extends FieldsProducer {
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
-  if (reuse != null) {
-    return ((PreDocsEnum) reuse).reset(termEnum, skipDocs);
-  } else {
-    return (new PreDocsEnum()).reset(termEnum, skipDocs);
-  }
+  PreDocsEnum docsEnum;
+  if (reuse == null || !(reuse instanceof PreDocsEnum)) {
+    docsEnum = new PreDocsEnum();
+  } else {
+    docsEnum = (PreDocsEnum) reuse;
+    if (docsEnum.getFreqStream() != freqStream) {
+      docsEnum = new PreDocsEnum();
+    }
+  }
+  return docsEnum.reset(termEnum, skipDocs);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
-  if (reuse != null) {
-    return ((PreDocsAndPositionsEnum) reuse).reset(termEnum, skipDocs);
-  } else {
-    return (new PreDocsAndPositionsEnum()).reset(termEnum, skipDocs);
-  }
+  PreDocsAndPositionsEnum docsPosEnum;
+  if (fieldInfo.omitTermFreqAndPositions) {
+    return null;
+  } else if (reuse == null || !(reuse instanceof PreDocsAndPositionsEnum)) {
+    docsPosEnum = new PreDocsAndPositionsEnum();
+  } else {
+    docsPosEnum = (PreDocsAndPositionsEnum) reuse;
+    if (docsPosEnum.getFreqStream() != freqStream) {
+      docsPosEnum = new PreDocsAndPositionsEnum();
+    }
+  }
+  return docsPosEnum.reset(termEnum, skipDocs);
}
}
@@ -598,6 +958,10 @@ public class PreFlexFields extends FieldsProducer {
docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
}
+IndexInput getFreqStream() {
+  return freqStream;
+}
public PreDocsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
docs.setSkipDocs(skipDocs);
docs.seek(termEnum);
@@ -650,6 +1014,10 @@ public class PreFlexFields extends FieldsProducer {
pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
}
+IndexInput getFreqStream() {
+  return freqStream;
+}
public DocsAndPositionsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
pos.setSkipDocs(skipDocs);
pos.seek(termEnum);

View File

@@ -132,18 +132,21 @@ public final class SegmentTermEnum implements Cloneable {
position = p;
termBuffer.set(t);
prevBuffer.reset();
+//System.out.println(" ste doSeek prev=" + prevBuffer.toTerm() + " this=" + this);
termInfo.set(ti);
}
/** Increments the enumeration to the next element. True if one exists.*/
public final boolean next() throws IOException {
-prevBuffer.set(termBuffer);
+//System.out.println(" ste setPrev=" + prev() + " this=" + this);
if (position++ >= size - 1) {
+  prevBuffer.set(termBuffer);
  termBuffer.reset();
+  //System.out.println(" EOF");
  return false;
}
+prevBuffer.set(termBuffer);
termBuffer.read(input, fieldInfos);
newSuffixStart = termBuffer.newSuffixStart;
@@ -168,6 +171,7 @@ public final class SegmentTermEnum implements Cloneable {
if (isIndex)
  indexPointer += input.readVLong(); // read index pointer
+//System.out.println(" ste ret term=" + term());
return true;
}

View File

@@ -18,9 +18,10 @@ package org.apache.lucene.index.codecs.preflex;
 */
import java.io.IOException;
+import java.util.Comparator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.FieldInfos;
@@ -28,102 +29,65 @@ final class TermBuffer implements Cloneable {
private String field;
private Term term; // cached
-private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes)
-private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
private BytesRef bytes = new BytesRef(10);
-int newSuffixStart;
+private static final Comparator<BytesRef> utf8AsUTF16Comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
+int newSuffixStart; // only valid right after .read is called
-public final int compareTo(TermBuffer other) {
+public int compareTo(TermBuffer other) {
if (field == other.field) // fields are interned
-  return compareChars(text.result, text.length, other.text.result, other.text.length);
+  return utf8AsUTF16Comparator.compare(bytes, other.bytes);
else
  return field.compareTo(other.field);
}
-private static int compareChars(char[] chars1, int len1,
-                                char[] chars2, int len2) {
-  final int end = len1 < len2 ? len1:len2;
-  for (int k = 0; k < end; k++) {
-    char c1 = chars1[k];
-    char c2 = chars2[k];
-    if (c1 != c2) {
-      return c1 - c2;
-    }
-  }
-  return len1 - len2;
-}
-public final void read(IndexInput input, FieldInfos fieldInfos)
+public void read(IndexInput input, FieldInfos fieldInfos)
  throws IOException {
this.term = null; // invalidate cache
-int start = input.readVInt();
+newSuffixStart = input.readVInt();
int length = input.readVInt();
-int totalLength = start + length;
+int totalLength = newSuffixStart + length;
if (bytes.bytes.length < totalLength) {
  bytes.grow(totalLength);
}
-if (dirty) {
-  // Fully convert all bytes since bytes is dirty
-  UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
-  bytes.length = totalLength;
-  input.readBytes(bytes.bytes, start, length);
-  UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
-  dirty = false;
-} else {
-  // Incrementally convert only the UTF8 bytes that are new:
-  bytes.length = totalLength;
-  input.readBytes(bytes.bytes, start, length);
-  UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
-}
-while(true) {
-  newSuffixStart = text.offsets[start];
-  if (newSuffixStart != -1) {
-    break;
-  }
-  if (--start == 0) {
-    newSuffixStart = 0;
-    break;
-  }
-}
+bytes.length = totalLength;
+input.readBytes(bytes.bytes, newSuffixStart, length);
this.field = fieldInfos.fieldName(input.readVInt());
}
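// Editor's worked example of the prefix-delta encoding read above (not in
// the patch): if the previous term's bytes were "food" and the file stores
// prefix=3, suffix length=2, suffix bytes "ls", the buffer becomes
// "foo" + "ls" = "fools", and newSuffixStart == 3.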
-public final void set(Term term) {
+public void set(Term term) {
if (term == null) {
  reset();
  return;
}
+bytes.copy(term.bytes());
-final BytesRef termBytes = term.bytes();
-UnicodeUtil.UTF8toUTF16(termBytes.bytes, termBytes.offset, termBytes.length, text);
-dirty = true;
field = term.field();
this.term = term;
}
-public final void set(TermBuffer other) {
+public void set(TermBuffer other) {
-text.copyText(other.text);
-dirty = true;
field = other.field;
-term = other.term;
+// nocommit -- right?
+//term = other.term;
+term = null;
+bytes.copy(other.bytes);
}
public void reset() {
field = null;
-text.setLength(0);
term = null;
-dirty = true;
}
public Term toTerm() {
if (field == null) // unset
  return null;
-if (term == null)
-  term = new Term(field, new BytesRef(text.result, 0, text.length), false);
+if (term == null) {
+  term = new Term(field, new BytesRef(bytes), false);
+  //term = new Term(field, bytes, false);
+}
return term;
}
@@ -134,12 +98,7 @@ final class TermBuffer implements Cloneable {
try {
  clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
-clone.dirty = true;
-clone.bytes = new BytesRef(10);
-clone.text = new UnicodeUtil.UTF16Result();
-clone.text.offsets = new int[text.offsets.length];
-System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
-clone.text.copyText(text);
+clone.bytes = new BytesRef(bytes);
return clone;
}
}

View File

@@ -23,30 +23,30 @@ package org.apache.lucene.index.codecs.preflex;
 * indexing. */
@Deprecated
-class TermInfo {
+public class TermInfo {
/** The number of documents which contain the term. */
-int docFreq = 0;
+public int docFreq = 0;
-long freqPointer = 0;
+public long freqPointer = 0;
-long proxPointer = 0;
+public long proxPointer = 0;
-int skipOffset;
+public int skipOffset;
-TermInfo() {}
+public TermInfo() {}
-TermInfo(int df, long fp, long pp) {
+public TermInfo(int df, long fp, long pp) {
  docFreq = df;
  freqPointer = fp;
  proxPointer = pp;
}
-TermInfo(TermInfo ti) {
+public TermInfo(TermInfo ti) {
  docFreq = ti.docFreq;
  freqPointer = ti.freqPointer;
  proxPointer = ti.proxPointer;
  skipOffset = ti.skipOffset;
}
-final void set(int docFreq,
+public final void set(int docFreq,
    long freqPointer, long proxPointer, int skipOffset) {
  this.docFreq = docFreq;
  this.freqPointer = freqPointer;
@@ -54,7 +54,7 @@ class TermInfo {
  this.skipOffset = skipOffset;
}
-final void set(TermInfo ti) {
+public final void set(TermInfo ti) {
  docFreq = ti.docFreq;
  freqPointer = ti.freqPointer;
  proxPointer = ti.proxPointer;

View File

@@ -119,9 +119,12 @@ public final class TermInfosReader {
indexTerms = new Term[indexSize];
indexInfos = new TermInfo[indexSize];
indexPointers = new long[indexSize];
-for (int i = 0; indexEnum.next(); i++) {
+for (int i=0;indexEnum.next(); i++) {
indexTerms[i] = indexEnum.term();
+assert indexTerms[i] != null;
+assert indexTerms[i].text() != null;
+assert indexTerms[i].field() != null;
indexInfos[i] = indexEnum.termInfo();
indexPointers[i] = indexEnum.indexPointer;
@@ -160,14 +163,14 @@ public final class TermInfosReader {
return origEnum.maxSkipLevels;
}
-final void close() throws IOException {
+void close() throws IOException {
if (origEnum != null)
  origEnum.close();
threadResources.close();
}
/** Returns the number of term/value pairs in the set. */
-final long size() {
+long size() {
return size;
}
@@ -183,12 +186,13 @@ public final class TermInfosReader {
/** Returns the offset of the greatest index entry which is less than or equal to term.*/
-private final int getIndexOffset(Term term) {
+private int getIndexOffset(Term term) {
int lo = 0; // binary search indexTerms[]
int hi = indexTerms.length - 1;
while (hi >= lo) {
int mid = (lo + hi) >>> 1;
+assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
int delta = term.compareToUTF16(indexTerms[mid]);
if (delta < 0)
  hi = mid - 1;
@@ -200,7 +204,7 @@ public final class TermInfosReader {
return hi;
}
-private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
+private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
enumerator.seek(indexPointers[indexOffset],
  ((long) indexOffset * totalIndexInterval) - 1,
  indexTerms[indexOffset], indexInfos[indexOffset]);
@@ -231,6 +235,9 @@ public final class TermInfosReader {
}
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException {
+if (size == 0) {
+  return null;
+}
// optimize sequential access: first try scanning cached enum w/o seeking
if (enumerator.term() != null // term is at or past current
@@ -242,7 +249,6 @@ public final class TermInfosReader {
// no need to seek
final TermInfo ti;
int numScans = enumerator.scanTo(term);
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
ti = enumerator.termInfo();
@@ -279,6 +285,7 @@ public final class TermInfosReader {
seekEnum(enumerator, indexPos);
enumerator.scanTo(term);
final TermInfo ti;
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
ti = enumerator.termInfo();
if (tiOrd == null) {
@@ -294,7 +301,7 @@ public final class TermInfosReader {
}
// called only from asserts
-private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
+private boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
if (ti1.docFreq != ti2.docFreq) {
  return false;
}
@@ -319,7 +326,7 @@ public final class TermInfosReader {
}
/** Returns the position of a Term in the set or -1. */
-final long getPosition(Term term) throws IOException {
+long getPosition(Term term) throws IOException {
if (size == 0) return -1;
ensureIndexIsRead();

View File

@@ -792,6 +792,7 @@ class FieldCacheImpl implements FieldCache {
throws IOException {
String field = StringHelper.intern(entryKey.field);
Terms terms = MultiFields.getTerms(reader, field);
final boolean fasterButMoreRAM = ((Boolean) entryKey.custom).booleanValue();

View File

@@ -333,10 +333,12 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
// difference to match UTF16's sort order:
if (aByte >= 0xee && bByte >= 0xee) {
  if ((aByte & 0xfe) == 0xee) {
-    aByte += 0x10;
+    // nocommit
+    aByte += 0xe;
  }
  if ((bByte&0xfe) == 0xee) {
-    bByte += 0x10;
+    // nocommit
+    bByte += 0xe;
  }
}
return aByte - bByte;
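// Editor's note on the remap (not in the patch): lead bytes 0xEE/0xEF start
// the UTF-8 encodings of U+E000..U+FFFF; bumping them by 0xe yields 0xFC/0xFD,
// which sort above the non-BMP lead bytes 0xF0..0xF4. That reproduces UTF-16
// order, where surrogate pairs (code units 0xD800..0xDFFF) sort below
// U+E000..U+FFFF.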
@@ -346,10 +348,6 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
// One is a prefix of the other, or, they are equal:
return a.length - b.length;
}
-public boolean equals(Object other) {
-  return this == other;
-}
}
public void writeExternal(ObjectOutput out) public void writeExternal(ObjectOutput out)

View File

@@ -28,6 +28,7 @@ import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.intblock.IntBlockCodec;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
+import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.apache.lucene.index.codecs.pulsing.PulsingCodec;
import org.apache.lucene.index.codecs.sep.SepCodec;
import org.apache.lucene.index.codecs.standard.StandardCodec;
@@ -94,6 +95,16 @@ public class RandomIndexWriter implements Closeable {
}
public IndexReader getReader() throws IOException {
+// nocommit: hack!
+if (w.codecs.getWriter(null).name.equals("PreFlex")) {
+  w.commit();
+  return IndexReader.open(w.getDirectory(),
+                          null,
+                          false,
+                          _TestUtil.nextInt(r, 1, 10),
+                          _TestUtil.alwaysCodec(new PreFlexCodec()));
+}
if (r.nextBoolean()) {
  return w.getReader();
} else {
@@ -119,10 +130,15 @@ public class RandomIndexWriter implements Closeable {
RandomCodecProvider(Random random) {
register(new StandardCodec());
register(new IntBlockCodec());
-register(new PreFlexCodec());
+// nocommit
+//register(new PreFlexCodec());
+register(new PreFlexRWCodec());
register(new PulsingCodec());
register(new SepCodec());
-codec = CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)];
+// nocommit
+//codec =
+//CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)];
+codec = "PreFlex";
}
@Override

View File

@@ -19,7 +19,6 @@ package org.apache.lucene.index;
import java.io.IOException;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -139,7 +138,6 @@ public class TestAddIndexes extends LuceneTestCase {
setUpDirs(dir, aux);
IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
writer.addIndexes(new Directory[] {aux});
// Adds 10 docs, then replaces them with another 10

View File

@@ -493,14 +493,21 @@ public class TestCodecs extends MultiCodecTestCase {
// Test random seek by ord:
final int idx = TestCodecs.this.nextInt(field.terms.length);
term = field.terms[idx];
-status = termsEnum.seek(idx);
-assertEquals(status, TermsEnum.SeekStatus.FOUND);
-assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
-assertEquals(term.docs.length, termsEnum.docFreq());
-if (field.omitTF) {
-  this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false);
-} else {
-  this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
-}
+try {
+  status = termsEnum.seek(idx);
+} catch (UnsupportedOperationException uoe) {
+  // ok -- skip it
+  status = null;
+}
+if (status != null) {
+  assertEquals(status, TermsEnum.SeekStatus.FOUND);
+  assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
+  assertEquals(term.docs.length, termsEnum.docFreq());
+  if (field.omitTF) {
+    this.verifyDocs(term.docs, term.positions, termsEnum.docs(null, null), false);
+  } else {
+    this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
+  }
+}
// Test seek to non-existent terms:
@ -520,9 +527,12 @@ public class TestCodecs extends MultiCodecTestCase {
// Seek to each term by ord, backwards // Seek to each term by ord, backwards
for(int i=field.terms.length-1;i>=0;i--) { for(int i=field.terms.length-1;i>=0;i--) {
assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i)); try {
assertEquals(field.terms[i].docs.length, termsEnum.docFreq()); assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i));
assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2))); assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2)));
} catch (UnsupportedOperationException uoe) {
}
} }
// Seek to non-existent empty-string term // Seek to non-existent empty-string term

View File

@ -20,6 +20,8 @@ package org.apache.lucene.index;
import java.io.*; import java.io.*;
import java.util.*; import java.util.*;
import org.apache.lucene.store.*; import org.apache.lucene.store.*;
import org.apache.lucene.index.codecs.*;
import org.apache.lucene.index.codecs.standard.*;
import org.apache.lucene.search.*; import org.apache.lucene.search.*;
import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.*;
import org.apache.lucene.document.*; import org.apache.lucene.document.*;
@ -64,7 +66,8 @@ public class TestFlex extends LuceneTestCase {
public void testTermOrd() throws Exception { public void testTermOrd() throws Exception {
Directory d = new MockRAMDirectory(); Directory d = new MockRAMDirectory();
IndexWriter w = new IndexWriter(d, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); IndexWriter w = new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document(); Document doc = new Document();
doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED));
w.addDocument(doc); w.addDocument(doc);

View File

@ -1675,7 +1675,7 @@ public class TestIndexReader extends LuceneTestCase
// LUCENE-1586: getUniqueTermCount // LUCENE-1586: getUniqueTermCount
public void testUniqueTermCount() throws Exception { public void testUniqueTermCount() throws Exception {
Directory dir = new MockRAMDirectory(); Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document(); Document doc = new Document();
doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED));
@ -1708,7 +1708,7 @@ public class TestIndexReader extends LuceneTestCase
// LUCENE-1609: don't load terms index // LUCENE-1609: don't load terms index
public void testNoTermsIndex() throws Throwable { public void testNoTermsIndex() throws Throwable {
Directory dir = new MockRAMDirectory(); Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document(); Document doc = new Document();
doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED)); doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED));
@ -1725,7 +1725,7 @@ public class TestIndexReader extends LuceneTestCase
} }
assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor()); assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor());
writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
writer.addDocument(doc); writer.addDocument(doc);
writer.close(); writer.close();

View File

@ -4559,7 +4559,7 @@ public class TestIndexWriter extends LuceneTestCase {
dir.close(); dir.close();
} }
// LUCENE-2095: make sure with multiple threads commit // LUCENE-2095: make sure with multiple threads commit
// doesn't return until all changes are in fact in the // doesn't return until all changes are in fact in the
// index // index
public void testCommitThreadSafety() throws Throwable { public void testCommitThreadSafety() throws Throwable {
@ -4673,7 +4673,9 @@ public class TestIndexWriter extends LuceneTestCase {
// sort in UTF16 sort order by default // sort in UTF16 sort order by default
public void testTermUTF16SortOrder() throws Throwable { public void testTermUTF16SortOrder() throws Throwable {
Directory dir = new MockRAMDirectory(); Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED); // nocommit -- allow preflexrw but must force preflex
// for reading
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document d = new Document(); Document d = new Document();
// Single segment // Single segment
Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED); Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
@ -4682,7 +4684,7 @@ public class TestIndexWriter extends LuceneTestCase {
Random rnd = newRandom(); Random rnd = newRandom();
final Set<String> allTerms = new HashSet<String>(); final Set<String> allTerms = new HashSet<String>();
for(int i=0;i<200*_TestUtil.getRandomMultiplier();i++) { for(int i=0;i<10*_TestUtil.getRandomMultiplier();i++) {
final String s; final String s;
if (rnd.nextBoolean()) { if (rnd.nextBoolean()) {
@ -4705,14 +4707,13 @@ public class TestIndexWriter extends LuceneTestCase {
allTerms.add(s); allTerms.add(s);
f.setValue(s); f.setValue(s);
//System.out.println("add " + termDesc(s));
writer.addDocument(d); writer.addDocument(d);
if ((1+i) % 42 == 0) { if ((1+i) % 42 == 0) {
writer.commit(); writer.commit();
} }
} }
IndexReader r = writer.getReader(); IndexReader r = writer.getReader();
// Test each sub-segment // Test each sub-segment

View File

@ -394,18 +394,18 @@ public class TestIndexWriterDelete extends LuceneTestCase {
} }
public void testDeletesOnDiskFull() throws IOException { public void testDeletesOnDiskFull() throws IOException {
testOperationsOnDiskFull(false); doTestOperationsOnDiskFull(false);
} }
public void testUpdatesOnDiskFull() throws IOException { public void testUpdatesOnDiskFull() throws IOException {
testOperationsOnDiskFull(true); doTestOperationsOnDiskFull(true);
} }
/** /**
* Make sure if modifier tries to commit but hits disk full that modifier * Make sure if modifier tries to commit but hits disk full that modifier
* remains consistent and usable. Similar to TestIndexReader.testDiskFull(). * remains consistent and usable. Similar to TestIndexReader.testDiskFull().
*/ */
private void testOperationsOnDiskFull(boolean updates) throws IOException { private void doTestOperationsOnDiskFull(boolean updates) throws IOException {
Term searchTerm = new Term("content", "aaa"); Term searchTerm = new Term("content", "aaa");
int START_COUNT = 157; int START_COUNT = 157;
@ -700,6 +700,7 @@ public class TestIndexWriterDelete extends LuceneTestCase {
try { try {
modifier.commit(); modifier.commit();
} catch (IOException ioe) { } catch (IOException ioe) {
// expected
failed = true; failed = true;
} }

View File

@ -27,11 +27,12 @@ public class TestMultiFields extends LuceneTestCase {
public void testRandom() throws Exception { public void testRandom() throws Exception {
Random r = newRandom();
for(int iter=0;iter<2*_TestUtil.getRandomMultiplier();iter++) { for(int iter=0;iter<2*_TestUtil.getRandomMultiplier();iter++) {
Directory dir = new MockRAMDirectory(); Directory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES));
Random r = new Random(); IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES));
Map<BytesRef,List<Integer>> docs = new HashMap<BytesRef,List<Integer>>(); Map<BytesRef,List<Integer>> docs = new HashMap<BytesRef,List<Integer>>();
Set<Integer> deleted = new HashSet<Integer>(); Set<Integer> deleted = new HashSet<Integer>();
@ -45,7 +46,7 @@ public class TestMultiFields extends LuceneTestCase {
doc.add(id); doc.add(id);
boolean onlyUniqueTerms = r.nextBoolean(); boolean onlyUniqueTerms = r.nextBoolean();
Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
for(int i=0;i<numDocs;i++) { for(int i=0;i<numDocs;i++) {
if (!onlyUniqueTerms && r.nextBoolean() && terms.size() > 0) { if (!onlyUniqueTerms && r.nextBoolean() && terms.size() > 0) {
@ -61,6 +62,7 @@ public class TestMultiFields extends LuceneTestCase {
} }
docs.get(term).add(i); docs.get(term).add(i);
terms.add(term); terms.add(term);
uniqueTerms.add(term);
f.setValue(s); f.setValue(s);
} }
id.setValue(""+i); id.setValue(""+i);
@ -75,8 +77,19 @@ public class TestMultiFields extends LuceneTestCase {
} }
} }
if (VERBOSE) {
List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
System.out.println("UTF16 order:");
for(BytesRef b : termsList) {
System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()));
}
}
// nocommit
IndexReader reader = w.getReader(); IndexReader reader = w.getReader();
w.close(); w.close();
//System.out.println("TEST reader=" + reader);
Bits delDocs = MultiFields.getDeletedDocs(reader); Bits delDocs = MultiFields.getDeletedDocs(reader);
for(int delDoc : deleted) { for(int delDoc : deleted) {

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
@ -64,7 +65,7 @@ public class TestSegmentTermEnum extends LuceneTestCase {
public void testPrevTermAtEnd() throws IOException public void testPrevTermAtEnd() throws IOException
{ {
Directory dir = new MockRAMDirectory(); Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
addDoc(writer, "aaa bbb"); addDoc(writer, "aaa bbb");
writer.close(); writer.close();
SegmentReader reader = SegmentReader.getOnlySegmentReader(dir); SegmentReader reader = SegmentReader.getOnlySegmentReader(dir);

View File

@ -18,8 +18,11 @@ package org.apache.lucene.index.codecs.preflex;
*/ */
import org.apache.lucene.store.*; import org.apache.lucene.store.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*; import org.apache.lucene.index.*;
import org.apache.lucene.index.codecs.*; import org.apache.lucene.index.codecs.*;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.apache.lucene.util.*; import org.apache.lucene.util.*;
import java.util.*; import java.util.*;
@ -30,8 +33,6 @@ import org.junit.Test;
public class TestSurrogates extends LuceneTestCaseJ4 { public class TestSurrogates extends LuceneTestCaseJ4 {
// chooses from a very limited alphabet to exacerbate the
// surrogate seeking required
private static String makeDifficultRandomUnicodeString(Random r) { private static String makeDifficultRandomUnicodeString(Random r) {
final int end = r.nextInt(20); final int end = r.nextInt(20);
if (end == 0) { if (end == 0) {
@ -44,154 +45,295 @@ public class TestSurrogates extends LuceneTestCaseJ4 {
if (0 == t && i < end - 1) { if (0 == t && i < end - 1) {
// hi // hi
buffer[i++] = (char) 0xd800; buffer[i++] = (char) (0xd800 + r.nextInt(2));
// lo // lo
buffer[i] = (char) 0xdc00; buffer[i] = (char) (0xdc00 + r.nextInt(2));
} else if (t <= 3) { } else if (t <= 3) {
buffer[i] = 'a'; buffer[i] = (char) ('a' + r.nextInt(2));
} else if (4 == t) { } else if (4 == t) {
buffer[i] = 0xe000; buffer[i] = (char) (0xe000 + r.nextInt(2));
} }
} }
return new String(buffer, 0, end); return new String(buffer, 0, end);
} }
private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<Term> fieldTerms) throws IOException { private String toHexString(Term t) {
return t.field() + ":" + UnicodeUtil.toHexString(t.text());
}
private String getRandomString(Random r) {
String s;
if (r.nextInt(3) == 1) {
s = makeDifficultRandomUnicodeString(r);
} else {
s = _TestUtil.randomUnicodeString(r);
}
return s;
}
private static class SortTermAsUTF16Comparator implements Comparator<Term> {
public int compare(Term o1, Term o2) {
return o1.compareToUTF16(o2);
}
}
private static final SortTermAsUTF16Comparator termAsUTF16Comparator = new SortTermAsUTF16Comparator();
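  // A quick standalone illustration (sample code points assumed) of why the
  // UTF-16 and code point orders disagree: U+E000 is a single UTF-16 code
  // unit (0xE000), while U+10000 is the surrogate pair 0xD800 0xDC00. In
  // UTF-16 code-unit order the lead surrogate 0xD800 sorts before 0xE000,
  // but in UTF-8/code point order U+10000 (bytes F0 90 80 80) sorts after
  // U+E000 (bytes EE 80 80):
  //
  //   BytesRef a = new BytesRef("\ue000");        // U+E000
  //   BytesRef b = new BytesRef("\ud800\udc00");  // U+10000
  //   assert a.compareTo(b) < 0;     // code point (UTF-8) order: a before b
  //   assert BytesRef.getUTF8SortedAsUTF16Comparator().compare(b, a) < 0;
  //                                  // UTF-16 order: b before a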
// single straight enum
private void doTestStraightEnum(List<Term> fieldTerms, IndexReader reader, int uniqueTermCount) throws IOException {
if (VERBOSE) {
System.out.println("\nTEST: top now enum reader=" + reader);
}
FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator();
{
// Test straight enum:
String field;
int termCount = 0;
while((field = fieldsEnum.next()) != null) {
TermsEnum termsEnum = fieldsEnum.terms();
BytesRef text;
BytesRef lastText = null;
while((text = termsEnum.next()) != null) {
Term exp = fieldTerms.get(termCount);
if (VERBOSE) {
System.out.println(" got term=" + field + ":" + UnicodeUtil.toHexString(text.utf8ToString()));
System.out.println(" exp=" + exp.field() + ":" + UnicodeUtil.toHexString(exp.text().toString()));
System.out.println();
}
if (lastText == null) {
lastText = new BytesRef(text);
} else {
assertTrue(lastText.compareTo(text) < 0);
lastText.copy(text);
}
assertEquals(exp.field(), field);
assertEquals(exp.bytes(), text);
termCount++;
}
if (VERBOSE) {
System.out.println(" no more terms for field=" + field);
}
}
assertEquals(uniqueTermCount, termCount);
}
}
// randomly seeks to term that we know exists, then next's
// from there
private void doTestSeekExists(Random r, List<Term> fieldTerms, IndexReader reader) throws IOException {
final Map<String,TermsEnum> tes = new HashMap<String,TermsEnum>();
// Test random seek to existing term, then enum:
if (VERBOSE) {
System.out.println("\nTEST: top now seek");
}
for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) {
// pick random field+term
int spot = r.nextInt(fieldTerms.size());
Term term = fieldTerms.get(spot);
String field = term.field();
if (VERBOSE) {
System.out.println("TEST: exist seek field=" + field + " term=" + UnicodeUtil.toHexString(term.text()));
}
// seek to it
TermsEnum te = tes.get(field);
if (te == null) {
te = MultiFields.getTerms(reader, field).iterator();
tes.put(field, te);
}
if (VERBOSE) {
System.out.println(" done get enum");
}
// seek should find the term
assertEquals(TermsEnum.SeekStatus.FOUND,
te.seek(term.bytes()));
// now .next() this many times:
int ct = _TestUtil.nextInt(r, 5, 100);
for(int i=0;i<ct;i++) {
if (VERBOSE) {
System.out.println("TEST: now next()");
}
if (1+spot+i >= fieldTerms.size()) {
break;
}
term = fieldTerms.get(1+spot+i);
if (term.field() != field) {
assertNull(te.next());
break;
} else {
BytesRef t = te.next();
if (VERBOSE) {
System.out.println(" got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString())));
System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString()));
}
assertEquals(term.bytes(), t);
}
}
}
}
private void doTestSeekDoesNotExist(Random r, int numField, List<Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader) throws IOException {
final Map<String,TermsEnum> tes = new HashMap<String,TermsEnum>();
if (VERBOSE) {
System.out.println("TEST: top random seeks");
}
{
for(int iter=0;iter<100*_TestUtil.getRandomMultiplier();iter++) {
// seek to random spot
String field = ("f" + r.nextInt(numField)).intern();
Term tx = new Term(field, getRandomString(r));
int spot = Arrays.binarySearch(fieldTermsArray, tx);
if (spot < 0) {
if (VERBOSE) {
System.out.println("TEST: non-exist seek to " + field + ":" + UnicodeUtil.toHexString(tx.text()));
}
// term does not exist:
TermsEnum te = tes.get(field);
if (te == null) {
te = MultiFields.getTerms(reader, field).iterator();
tes.put(field, te);
}
if (VERBOSE) {
System.out.println(" got enum");
}
spot = -spot - 1;
if (spot == fieldTerms.size() || fieldTerms.get(spot).field() != field) {
assertEquals(TermsEnum.SeekStatus.END, te.seek(tx.bytes()));
} else {
assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(tx.bytes()));
if (VERBOSE) {
System.out.println(" got term=" + UnicodeUtil.toHexString(te.term().utf8ToString()));
System.out.println(" exp term=" + UnicodeUtil.toHexString(fieldTerms.get(spot).text()));
}
assertEquals(fieldTerms.get(spot).bytes(),
te.term());
// now .next() this many times:
int ct = _TestUtil.nextInt(r, 5, 100);
for(int i=0;i<ct;i++) {
if (VERBOSE) {
System.out.println("TEST: now next()");
}
if (1+spot+i >= fieldTerms.size()) {
break;
}
Term term = fieldTerms.get(1+spot+i);
if (term.field() != field) {
assertNull(te.next());
break;
} else {
BytesRef t = te.next();
if (VERBOSE) {
System.out.println(" got term=" + (t == null ? null : UnicodeUtil.toHexString(t.utf8ToString())));
System.out.println(" exp=" + UnicodeUtil.toHexString(term.text().toString()));
}
assertEquals(term.bytes(), t);
}
}
}
}
}
}
}
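  // Note: Arrays.binarySearch returns (-(insertion point) - 1) when the
  // probe term is absent, which is why the method above recovers the
  // would-be position with "spot = -spot - 1". A tiny standalone example
  // (sample data assumed):
  //
  //   String[] sorted = { "b", "d", "f" };
  //   int spot = Arrays.binarySearch(sorted, "c");  // returns -2
  //   int insertionPoint = -spot - 1;               // == 1: "c" sorts before "d"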
@Test
public void testSurrogatesOrder() throws Exception {
Random r = newRandom();
Directory dir = new MockRAMDirectory();
RandomIndexWriter w = new RandomIndexWriter(r,
dir,
new IndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec())));
final int numField = _TestUtil.nextInt(r, 2, 5); final int numField = _TestUtil.nextInt(r, 2, 5);
List<Term> terms = new ArrayList<Term>(); int uniqueTermCount = 0;
int tc = 0; int tc = 0;
List<Term> fieldTerms = new ArrayList<Term>();
for(int f=0;f<numField;f++) { for(int f=0;f<numField;f++) {
String field = "f" + f; String field = "f" + f;
Term protoTerm = new Term(field); Term protoTerm = new Term(field);
fieldInfos.add(field, true, false, false, false, false, false, false);
final int numTerms = 10000*_TestUtil.getRandomMultiplier(); final int numTerms = 10000*_TestUtil.getRandomMultiplier();
final Set<String> uniqueTerms = new HashSet<String>();
for(int i=0;i<numTerms;i++) { for(int i=0;i<numTerms;i++) {
String s; String term = getRandomString(r) + "_ " + (tc++);
if (r.nextInt(3) == 1) { uniqueTerms.add(term);
s = makeDifficultRandomUnicodeString(r); fieldTerms.add(new Term(field, term));
} else { Document doc = new Document();
s = _TestUtil.randomUnicodeString(r); doc.add(new Field(field, term, Field.Store.NO, Field.Index.NOT_ANALYZED));
w.addDocument(doc);
// The surrogate dance uses 0xffff to seek-to-end
// of blocks. Also, pre-4.0 indices are already
// guaranteed to not contain the char 0xffff since
// it's mapped during indexing:
s = s.replace((char) 0xffff, (char) 0xfffe);
}
terms.add(protoTerm.createTerm(s + "_" + (tc++)));
} }
uniqueTermCount += uniqueTerms.size();
} }
fieldInfos.write(dir, segName); IndexReader reader = w.getReader();
// sorts in UTF16 order, just like preflex:
Collections.sort(terms, new Comparator<Term>() {
public int compare(Term o1, Term o2) {
return o1.compareToUTF16(o2);
}
});
TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128);
TermInfo ti = new TermInfo();
String lastText = null;
int uniqueTermCount = 0;
if (VERBOSE) { if (VERBOSE) {
System.out.println("TEST: utf16 order:"); Collections.sort(fieldTerms, termAsUTF16Comparator);
}
for(Term t : terms) {
FieldInfo fi = fieldInfos.fieldInfo(t.field());
String text = t.text(); System.out.println("\nTEST: UTF16 order");
if (lastText != null && lastText.equals(text)) { for(Term t: fieldTerms) {
continue;
}
fieldTerms.add(t);
uniqueTermCount++;
lastText = text;
if (VERBOSE) {
System.out.println(" " + toHexString(t)); System.out.println(" " + toHexString(t));
} }
w.add(fi.number, t.bytes().bytes, t.bytes().length, ti);
} }
w.close();
// sorts in code point order:
Collections.sort(fieldTerms); Collections.sort(fieldTerms);
if (VERBOSE) { if (VERBOSE) {
System.out.println("\nTEST: codepoint order"); System.out.println("\nTEST: codepoint order");
for(Term t: fieldTerms) { for(Term t: fieldTerms) {
System.out.println(" " + t.field() + ":" + toHexString(t)); System.out.println(" " + toHexString(t));
} }
} }
dir.createOutput(segName + ".prx").close(); Term[] fieldTermsArray = fieldTerms.toArray(new Term[fieldTerms.size()]);
dir.createOutput(segName + ".frq").close();
// !!hack alert!! stuffing uniqueTermCount in as docCount //SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec);
}
private String toHexString(Term t) { //FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
return t.field() + ":" + UnicodeUtil.toHexString(t.text()); //assertNotNull(fields);
}
@Test
public void testSurrogatesOrder() throws Exception {
Directory dir = new MockRAMDirectory();
Codec codec = new PreFlexCodec(); doTestStraightEnum(fieldTerms, reader, uniqueTermCount);
doTestSeekExists(r, fieldTerms, reader);
doTestSeekDoesNotExist(r, numField, fieldTerms, fieldTermsArray, reader);
Random r = newRandom(); reader.close();
FieldInfos fieldInfos = new FieldInfos();
List<Term> fieldTerms = new ArrayList<Term>();
SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
// hack alert!!
int uniqueTermCount = si.docCount;
FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
assertNotNull(fields);
if (VERBOSE) {
System.out.println("\nTEST: now enum");
}
FieldsEnum fieldsEnum = fields.iterator();
String field;
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
int termCount = 0;
while((field = fieldsEnum.next()) != null) {
TermsEnum termsEnum = fieldsEnum.terms();
BytesRef text;
BytesRef lastText = null;
while((text = termsEnum.next()) != null) {
if (VERBOSE) {
UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16);
System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length)));
System.out.println();
}
if (lastText == null) {
lastText = new BytesRef(text);
} else {
assertTrue(lastText.compareTo(text) < 0);
lastText.copy(text);
}
assertEquals(fieldTerms.get(termCount).field(), field);
assertEquals(fieldTerms.get(termCount).bytes(), text);
termCount++;
}
if (VERBOSE) {
System.out.println(" no more terms for field=" + field);
}
}
assertEquals(uniqueTermCount, termCount);
fields.close();
} }
} }

View File

@ -0,0 +1,212 @@
package org.apache.lucene.index.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.preflex.TermInfo;
import org.apache.lucene.store.IndexOutput;
import java.io.IOException;
import java.util.Comparator;
class PreFlexFieldsWriter extends FieldsConsumer {
private final TermInfosWriter termsOut;
private final IndexOutput freqOut;
private final IndexOutput proxOut;
private final DefaultSkipListWriter skipListWriter;
private final int totalNumDocs;
public PreFlexFieldsWriter(SegmentWriteState state) throws IOException {
termsOut = new TermInfosWriter(state.directory,
state.segmentName,
state.fieldInfos,
state.termIndexInterval);
state.flushedFiles.add(IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.TERMS_EXTENSION));
state.flushedFiles.add(IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.TERMS_INDEX_EXTENSION));
final String freqFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.FREQ_EXTENSION);
freqOut = state.directory.createOutput(freqFile);
state.flushedFiles.add(freqFile);
totalNumDocs = state.numDocs;
if (state.fieldInfos.hasProx()) {
final String proxFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.PROX_EXTENSION);
proxOut = state.directory.createOutput(proxFile);
state.flushedFiles.add(proxFile);
} else {
proxOut = null;
}
skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
termsOut.maxSkipLevels,
totalNumDocs,
freqOut,
proxOut);
//System.out.println("\nw start seg=" + segment);
}
@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
assert field.number != -1;
//System.out.println("w field=" + field.name + " storePayload=" + field.storePayloads + " number=" + field.number);
return new PreFlexTermsWriter(field);
}
@Override
public void close() throws IOException {
termsOut.close();
freqOut.close();
if (proxOut != null) {
proxOut.close();
}
}
private class PreFlexTermsWriter extends TermsConsumer {
private final FieldInfo fieldInfo;
private final boolean omitTF;
private final boolean storePayloads;
private final TermInfo termInfo = new TermInfo();
private final PostingsWriter postingsWriter = new PostingsWriter();
public PreFlexTermsWriter(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
omitTF = fieldInfo.omitTermFreqAndPositions;
storePayloads = fieldInfo.storePayloads;
}
private class PostingsWriter extends PostingsConsumer {
private int lastDocID;
private int lastPayloadLength = -1;
private int lastPosition;
private int df;
public PostingsWriter reset() {
df = 0;
lastDocID = 0;
lastPayloadLength = -1;
return this;
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
//System.out.println(" w doc=" + docID);
final int delta = docID - lastDocID;
if (docID < 0 || (df > 0 && delta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
}
if ((++df % termsOut.skipInterval) == 0) {
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.bufferSkip(df);
}
lastDocID = docID;
assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs;
if (omitTF) {
freqOut.writeVInt(delta);
} else {
final int code = delta << 1;
if (termDocFreq == 1) {
freqOut.writeVInt(code|1);
} else {
freqOut.writeVInt(code);
freqOut.writeVInt(termDocFreq);
}
}
lastPosition = 0;
}
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
assert proxOut != null;
//System.out.println(" w pos=" + position + " payl=" + payload);
final int delta = position - lastPosition;
lastPosition = position;
if (storePayloads) {
final int payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
//System.out.println(" write payload len=" + payloadLength);
lastPayloadLength = payloadLength;
proxOut.writeVInt((delta<<1)|1);
proxOut.writeVInt(payloadLength);
} else {
proxOut.writeVInt(delta << 1);
}
if (payloadLength > 0) {
proxOut.writeBytes(payload.bytes, payload.offset, payload.length);
}
} else {
proxOut.writeVInt(delta);
}
}
@Override
public void finishDoc() throws IOException {
}
}
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
//System.out.println(" w term=" + text.utf8ToString());
skipListWriter.resetSkip();
termInfo.freqPointer = freqOut.getFilePointer();
if (proxOut != null) {
termInfo.proxPointer = proxOut.getFilePointer();
}
return postingsWriter.reset();
}
@Override
public void finishTerm(BytesRef text, int numDocs) throws IOException {
if (numDocs > 0) {
long skipPointer = skipListWriter.writeSkip(freqOut);
termInfo.docFreq = numDocs;
termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
//System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
termsOut.add(fieldInfo.number,
text,
termInfo);
}
}
@Override
public void finish() throws IOException {
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
}
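For reference, a worked example of the freq-file encoding that PostingsWriter.startDoc above produces when term freqs are not omitted (the doc IDs and frequencies are assumed sample values, not taken from a real index):

    int lastDocID = 0;
    for (int[] posting : new int[][] { {3, 1}, {7, 4} }) {  // {docID, termDocFreq}
      int delta = posting[0] - lastDocID;
      lastDocID = posting[0];
      if (posting[1] == 1) {
        // low bit set folds freq==1 into the delta: (3<<1)|1 = 7, one vInt
        System.out.println("vInt " + ((delta << 1) | 1));
      } else {
        // low bit clear: vInt 4<<1 = 8, then vInt 4 for the frequency
        System.out.println("vInt " + (delta << 1) + ", vInt " + posting[1]);
      }
    }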

View File

@ -0,0 +1,52 @@
package org.apache.lucene.index.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.codecs.preflex.PreFlexFields;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
/** Codec, only for testing, that can write and read the
* pre-flex index format.
*
* @lucene.experimental
*/
public class PreFlexRWCodec extends PreFlexCodec {
public PreFlexRWCodec() {
// NOTE: we use the same name as the core PreFlex codec so that
// it can read the segments we write!
super();
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new PreFlexFieldsWriter(state);
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
//System.out.println("preflexrw");
return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, false);
}
}
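A minimal sketch of wiring this codec into a test writer (this mirrors the TestSurrogates change above; the dir/random/analyzer setup is assumed boilerplate):

    Directory dir = new MockRAMDirectory();
    RandomIndexWriter w = new RandomIndexWriter(r, dir,
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
            .setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec())));

Because the codec keeps the "PreFlex" name, segments written this way can be read back by the read-only core PreFlexCodec.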

View File

@ -1,4 +1,4 @@
package org.apache.lucene.index.codecs.preflex; package org.apache.lucene.index.codecs.preflexrw;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -19,9 +19,12 @@ package org.apache.lucene.index.codecs.preflex;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.store.*; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.index.*; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.*; import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.preflex.TermInfo;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
@ -71,8 +74,7 @@ final class TermInfosWriter {
private long lastIndexPointer; private long lastIndexPointer;
private boolean isIndex; private boolean isIndex;
private byte[] lastTermBytes = new byte[10]; private final BytesRef lastTerm = new BytesRef();
private int lastTermBytesLength = 0;
private int lastFieldNumber = -1; private int lastFieldNumber = -1;
private TermInfosWriter other; private TermInfosWriter other;
@ -104,13 +106,10 @@ final class TermInfosWriter {
assert initUTF16Results(); assert initUTF16Results();
} }
void add(Term term, TermInfo ti) throws IOException {
add(fieldInfos.fieldNumber(term.field()), term.bytes().bytes, term.bytes().length, ti);
}
// Currently used only by assert statements // Currently used only by assert statements
UnicodeUtil.UTF16Result utf16Result1; UnicodeUtil.UTF16Result utf16Result1;
UnicodeUtil.UTF16Result utf16Result2; UnicodeUtil.UTF16Result utf16Result2;
private final BytesRef scratchBytes = new BytesRef();
// Currently used only by assert statements // Currently used only by assert statements
private boolean initUTF16Results() { private boolean initUTF16Results() {
@ -120,7 +119,7 @@ final class TermInfosWriter {
} }
// Currently used only by assert statement // Currently used only by assert statement
private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { private int compareToLastTerm(int fieldNumber, BytesRef term) {
if (lastFieldNumber != fieldNumber) { if (lastFieldNumber != fieldNumber) {
final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
@ -132,8 +131,13 @@ final class TermInfosWriter {
return cmp; return cmp;
} }
UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); scratchBytes.copy(term);
UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); assert lastTerm.offset == 0;
UnicodeUtil.UTF8toUTF16(lastTerm.bytes, 0, lastTerm.length, utf16Result1);
assert scratchBytes.offset == 0;
UnicodeUtil.UTF8toUTF16(scratchBytes.bytes, 0, scratchBytes.length, utf16Result2);
final int len; final int len;
if (utf16Result1.length < utf16Result2.length) if (utf16Result1.length < utf16Result2.length)
len = utf16Result1.length; len = utf16Result1.length;
@ -152,22 +156,22 @@ final class TermInfosWriter {
/** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set. /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
Term must be lexicographically greater than all previous Terms added. Term must be lexicographically greater than all previous Terms added.
TermInfo pointers must be positive and greater than all previous.*/ TermInfo pointers must be positive and greater than all previous.*/
void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) public void add(int fieldNumber, BytesRef term, TermInfo ti)
throws IOException { throws IOException {
assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || assert compareToLastTerm(fieldNumber, term) < 0 ||
(isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : (isIndex && term.length == 0 && lastTerm.length == 0) :
"Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
" lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
" text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); " text=" + term.utf8ToString() + " lastText=" + lastTerm.utf8ToString();
assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
if (!isIndex && size % indexInterval == 0) if (!isIndex && size % indexInterval == 0)
other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term other.add(lastFieldNumber, lastTerm, lastTi); // add an index term
writeTerm(fieldNumber, termBytes, termBytesLength); // write term writeTerm(fieldNumber, term); // write term
output.writeVInt(ti.docFreq); // write doc freq output.writeVInt(ti.docFreq); // write doc freq
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
@ -187,29 +191,27 @@ final class TermInfosWriter {
size++; size++;
} }
private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) private void writeTerm(int fieldNumber, BytesRef term)
throws IOException { throws IOException {
//System.out.println(" tiw.write field=" + fieldNumber + " term=" + term.utf8ToString());
// TODO: UTF16toUTF8 could tell us this prefix // TODO: UTF16toUTF8 could tell us this prefix
// Compute prefix in common with last term: // Compute prefix in common with last term:
int start = 0; int start = 0;
final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; final int limit = term.length < lastTerm.length ? term.length : lastTerm.length;
while(start < limit) { while(start < limit) {
if (termBytes[start] != lastTermBytes[start]) if (term.bytes[start+term.offset] != lastTerm.bytes[start+lastTerm.offset])
break; break;
start++; start++;
} }
final int length = termBytesLength - start; final int length = term.length - start;
output.writeVInt(start); // write shared prefix length output.writeVInt(start); // write shared prefix length
output.writeVInt(length); // write delta length output.writeVInt(length); // write delta length
output.writeBytes(termBytes, start, length); // write delta bytes output.writeBytes(term.bytes, start+term.offset, length); // write delta bytes
output.writeVInt(fieldNumber); // write field num output.writeVInt(fieldNumber); // write field num
if (lastTermBytes.length < termBytesLength) { lastTerm.copy(term);
lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
}
System.arraycopy(termBytes, start, lastTermBytes, start, length);
lastTermBytesLength = termBytesLength;
} }
/** Called to complete TermInfos creation. */ /** Called to complete TermInfos creation. */
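A worked example of the prefix-compressed entry writeTerm produces (the two terms are assumed for illustration): with lastTerm "aardvark" and term "aardwolf", the shared prefix is 4 bytes ("aard") and the suffix is 4 bytes ("wolf"), so the entry written is:

    // vInt(4)            shared prefix length ("aard")
    // vInt(4)            delta length ("wolf")
    // bytes("wolf")      delta bytes
    // vInt(fieldNumber)  field num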

View File

@ -34,6 +34,8 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.CacheEntry; import org.apache.lucene.search.FieldCache.CacheEntry;
import org.apache.lucene.util.FieldCacheSanityChecker.Insanity; import org.apache.lucene.util.FieldCacheSanityChecker.Insanity;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
/** /**
* Base class for all Lucene unit tests. * Base class for all Lucene unit tests.
@ -72,6 +74,8 @@ public abstract class LuceneTestCase extends TestCase {
private volatile Thread.UncaughtExceptionHandler savedUncaughtExceptionHandler = null; private volatile Thread.UncaughtExceptionHandler savedUncaughtExceptionHandler = null;
private String savedDefaultCodec;
/** Used to track if setUp and tearDown are called correctly from subclasses */ /** Used to track if setUp and tearDown are called correctly from subclasses */
private boolean setup; private boolean setup;
@ -110,6 +114,8 @@ public abstract class LuceneTestCase extends TestCase {
ConcurrentMergeScheduler.setTestMode(); ConcurrentMergeScheduler.setTestMode();
savedBoolMaxClauseCount = BooleanQuery.getMaxClauseCount(); savedBoolMaxClauseCount = BooleanQuery.getMaxClauseCount();
savedDefaultCodec = CodecProvider.getDefaultCodec();
CodecProvider.setDefaultCodec(_TestUtil.getTestCodec());
} }
/** /**
@ -135,7 +141,8 @@ public abstract class LuceneTestCase extends TestCase {
assertTrue("ensure your setUp() calls super.setUp()!!!", setup); assertTrue("ensure your setUp() calls super.setUp()!!!", setup);
setup = false; setup = false;
BooleanQuery.setMaxClauseCount(savedBoolMaxClauseCount); BooleanQuery.setMaxClauseCount(savedBoolMaxClauseCount);
CodecProvider.setDefaultCodec(savedDefaultCodec);
try { try {
Thread.setDefaultUncaughtExceptionHandler(savedUncaughtExceptionHandler); Thread.setDefaultUncaughtExceptionHandler(savedUncaughtExceptionHandler);
if (!uncaughtExceptions.isEmpty()) { if (!uncaughtExceptions.isEmpty()) {
@ -298,4 +305,10 @@ public abstract class LuceneTestCase extends TestCase {
// static members // static members
private static final Random seedRnd = new Random(); private static final Random seedRnd = new Random();
// register preflex-rw statically.
static {
CodecProvider.getDefault().register(new PreFlexRWCodec());
}
} }

View File

@ -22,9 +22,14 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.CacheEntry; import org.apache.lucene.search.FieldCache.CacheEntry;
import org.apache.lucene.util.FieldCacheSanityChecker.Insanity; import org.apache.lucene.util.FieldCacheSanityChecker.Insanity;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.junit.After; import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert; import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule; import org.junit.Rule;
import org.junit.Test; import org.junit.Test;
import org.junit.rules.TestWatchman; import org.junit.rules.TestWatchman;
@ -127,6 +132,20 @@ public class LuceneTestCaseJ4 {
private static final Map<Class<? extends LuceneTestCaseJ4>,Object> checkedClasses = private static final Map<Class<? extends LuceneTestCaseJ4>,Object> checkedClasses =
Collections.synchronizedMap(new WeakHashMap<Class<? extends LuceneTestCaseJ4>,Object>()); Collections.synchronizedMap(new WeakHashMap<Class<? extends LuceneTestCaseJ4>,Object>());
// saves the default codec: we do this statically since many tests build indexes in @BeforeClass
private static String savedDefaultCodec;
@BeforeClass
public static void beforeClassLuceneTestCaseJ4() {
savedDefaultCodec = CodecProvider.getDefaultCodec();
CodecProvider.setDefaultCodec(_TestUtil.getTestCodec());
}
@AfterClass
public static void afterClassLuceneTestCaseJ4() {
CodecProvider.setDefaultCodec(savedDefaultCodec);
}
// This is how we get control when errors occur. // This is how we get control when errors occur.
// Think of this as start/end/success/failed // Think of this as start/end/success/failed
// events. // events.
@ -405,4 +424,8 @@ public class LuceneTestCaseJ4 {
private String name = "<unknown>"; private String name = "<unknown>";
// register PreFlexRWCodec statically
static {
CodecProvider.getDefault().register(new PreFlexRWCodec());
}
} }

View File

@ -23,6 +23,9 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.CheckIndex; import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.PrintStream; import java.io.PrintStream;
@ -129,8 +132,24 @@ public class _TestUtil {
} }
final char[] buffer = new char[end]; final char[] buffer = new char[end];
for (int i = 0; i < end; i++) { for (int i = 0; i < end; i++) {
int t = r.nextInt(5); int t = r.nextInt(5);
//buffer[i] = (char) (97 + r.nextInt(26)); //buffer[i] = (char) (97 + r.nextInt(26));
/*
if (0 == t && i < end - 1) {
// hi
buffer[i++] = (char) 0xd800;
// lo
buffer[i] = (char) 0xdc00;
} else if (t <= 3) {
buffer[i] = 'a';
} else if (4 == t) {
buffer[i] = 0xe000;
}
*/
if (0 == t && i < end - 1) { if (0 == t && i < end - 1) {
// Make a surrogate pair // Make a surrogate pair
// High surrogate // High surrogate
@ -218,4 +237,27 @@ public class _TestUtil {
public static int getRandomMultiplier() { public static int getRandomMultiplier() {
return Integer.parseInt(System.getProperty("random.multiplier", "1")); return Integer.parseInt(System.getProperty("random.multiplier", "1"));
} }
/** gets the codec to run tests with */
public static String getTestCodec() {
return System.getProperty("tests.codec", "Standard");
}
public static CodecProvider alwaysCodec(final Codec c) {
return new CodecProvider() {
@Override
public Codec getWriter(SegmentWriteState state) {
return c;
}
@Override
public Codec lookup(String name) {
return c;
}
};
}
public static CodecProvider alwaysCodec(final String codec) {
return alwaysCodec(CodecProvider.getDefault().lookup(codec));
}
} }
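Usage, as in the test changes above: a test that must not run under the randomized or forced codec (for example because it relies on features PreFlex lacks, such as seek-by-ord) pins the codec for both writing and lookup:

    IndexWriter writer = new IndexWriter(dir,
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer())
            .setCodecProvider(_TestUtil.alwaysCodec("Standard")));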