commit my current state

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/preflexfixes@967130 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-07-23 15:26:25 +00:00
parent 7216e5b7d2
commit af4a9deb16
30 changed files with 1386 additions and 515 deletions

View File

@ -56,6 +56,7 @@
<property name="args" value=""/>
<property name="threadsPerProcessor" value="1" />
<property name="random.multiplier" value="1" />
<property name="tests.codec" value="Standard" />
<property name="javac.deprecation" value="off"/>
<property name="javac.debug" value="on"/>
@ -434,6 +435,8 @@
<!-- allow tests to control debug prints -->
<sysproperty key="tests.verbose" value="${tests.verbose}"/>
<!-- set the codec tests should run with -->
<sysproperty key="tests.codec" value="${tests.codec}"/>
<!-- TODO: create propertyset for test properties, so each project can have its own set -->
<sysproperty key="random.multiplier" value="${random.multiplier}"/>

View File

@ -33,6 +33,7 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
@ -107,8 +108,8 @@ public class MemoryIndexTest extends BaseTokenStreamTestCase {
RAMDirectory ramdir = new RAMDirectory();
Analyzer analyzer = randomAnalyzer();
IndexWriter writer = new IndexWriter(ramdir, analyzer,
IndexWriter.MaxFieldLength.UNLIMITED);
IndexWriter writer = new IndexWriter(ramdir,
new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
Field field1 = new Field("foo", fooField.toString(), Field.Store.NO, Field.Index.ANALYZED);
Field field2 = new Field("term", termField.toString(), Field.Store.NO, Field.Index.ANALYZED);

View File

@ -116,7 +116,7 @@ public final class MultiTermsEnum extends TermsEnum {
// different TermComps
final Comparator<BytesRef> subTermComp = termsEnumIndex.termsEnum.getComparator();
if (subTermComp != null && !subTermComp.equals(termComp)) {
throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge");
throw new IllegalStateException("sub-readers have different BytesRef.Comparators: " + subTermComp + " vs " + termComp + "; cannot merge");
}
}

View File

@ -47,14 +47,14 @@ public abstract class CodecProvider {
private static String defaultCodec = "Standard";
public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock"};
public final static String[] CORE_CODECS = new String[] {"Standard", "Sep", "Pulsing", "IntBlock", "PreFlex"};
public void register(Codec codec) {
if (codec.name == null) {
throw new IllegalArgumentException("code.name is null");
}
if (!codecs.containsKey(codec.name)) {
// nocommit
if (!codecs.containsKey(codec.name) || codec.name.equals("PreFlex")) {
codecs.put(codec.name, codec);
codec.getExtensions(knownExtensions);
} else if (codecs.get(codec.name) != codec) {

View File

@ -22,6 +22,7 @@ import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import java.io.IOException;
import java.io.Closeable;
/** Abstract API that consumes terms, doc, freq, prox and
* payloads postings. Concrete implementations of this
@ -30,7 +31,7 @@ import java.io.IOException;
*
* @lucene.experimental
*/
public abstract class FieldsConsumer {
public abstract class FieldsConsumer implements Closeable {
/** Add a new field */
public abstract TermsConsumer addField(FieldInfo field) throws IOException;

View File

@ -40,16 +40,16 @@ import org.apache.lucene.index.codecs.FieldsProducer;
public class PreFlexCodec extends Codec {
/** Extension of terms file */
static final String TERMS_EXTENSION = "tis";
public static final String TERMS_EXTENSION = "tis";
/** Extension of terms index file */
static final String TERMS_INDEX_EXTENSION = "tii";
public static final String TERMS_INDEX_EXTENSION = "tii";
/** Extension of freq postings file */
static final String FREQ_EXTENSION = "frq";
public static final String FREQ_EXTENSION = "frq";
/** Extension of prox postings file */
static final String PROX_EXTENSION = "prx";
public static final String PROX_EXTENSION = "prx";
public PreFlexCodec() {
name = "PreFlex";
@ -62,7 +62,7 @@ public class PreFlexCodec extends Codec {
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor);
return new PreFlexFields(state.dir, state.fieldInfos, state.segmentInfo, state.readBufferSize, state.termsIndexDivisor, true);
}
@Override

View File

@ -40,7 +40,6 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.ArrayUtil;
/** Exposes flex API on a pre-flex index, as a codec.
* @lucene.experimental */
@ -59,11 +58,19 @@ public class PreFlexFields extends FieldsProducer {
private final Directory dir;
private final int readBufferSize;
private Directory cfsReader;
private final boolean unicodeSortOrder;
PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
// If unicodeSortOrder is true, we do the surrogates dance
// so that the terms are sorted by unicode sort order.
// This should be true when segments are used for "normal"
// searching; it's only false during testing, to create a
// pre-flex index, using the preflexrw codec under
// src/test.
public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor, boolean unicodeSortOrder)
throws IOException {
si = info;
this.unicodeSortOrder = unicodeSortOrder;
// NOTE: we must always load terms index, even for
// "sequential" scan during merging, because what is
@ -182,6 +189,12 @@ public class PreFlexFields extends FieldsProducer {
if (cfsReader != null) {
cfsReader.close();
}
if (freqStream != null) {
freqStream.close();
}
if (proxStream != null) {
proxStream.close();
}
}
private class PreFlexFieldsEnum extends FieldsEnum {
@ -228,7 +241,11 @@ public class PreFlexFields extends FieldsProducer {
public Comparator<BytesRef> getComparator() {
  // Pre-flex indexes are always stored in UTF16 order. When
  // unicodeSortOrder is off the terms are exposed in that raw
  // order; otherwise they are remapped on-the-fly to unicode
  // order, so the unicode comparator is reported instead.
  if (!unicodeSortOrder) {
    return BytesRef.getUTF8SortedAsUTF16Comparator();
  }
  return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
@ -238,237 +255,473 @@ public class PreFlexFields extends FieldsProducer {
private boolean skipNext;
private BytesRef current;
private int[] surrogateSeekPending = new int[1];
private boolean[] surrogateDidSeekBack = new boolean[1];
private int surrogateSeekUpto;
private char[] pendingPrefix;
private SegmentTermEnum seekTermEnum;
private Term protoTerm;
private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0;
private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee;
// Returns true if the byte at idx has every bit of
// UTF8_HIGH_BMP_LEAD set, ie it can be the lead byte of a
// unicode char that sorts "after" the surrogates in UTF16
// (>= U+E000 and <= U+FFFF):
private final boolean isHighBMPChar(byte[] b, int idx) {
  final int maskedLead = b[idx] & UTF8_HIGH_BMP_LEAD;
  return maskedLead == UTF8_HIGH_BMP_LEAD;
}
// Returns true if the byte at idx has every bit of
// UTF8_NON_BMP_LEAD set, ie the UTF8 byte sequence starting
// at idx encodes a char outside of BMP (what would be a
// surrogate pair in UTF16):
private final boolean isNonBMPChar(byte[] b, int idx) {
  final int maskedLead = b[idx] & UTF8_NON_BMP_LEAD;
  return maskedLead == UTF8_NON_BMP_LEAD;
}
private final byte[] scratch = new byte[4];
private final BytesRef prevTerm = new BytesRef();
private final BytesRef scratchTerm = new BytesRef();
private int newSuffixStart;
// Swap in S, in place of E: temporarily replaces the E char
// (3 UTF8 bytes) starting at pos in term with the 4 bytes
// F0 90 80 80, truncates term right after them, and seeks te
// to that probe.
//
// Returns true only if te landed on a term of the current
// field that is at least as long as the probe, has a non-BMP
// lead byte at pos, and shares the first pos bytes with term.
//
// term's length and all four overwritten bytes are restored
// on every exit path (including the "no term / other field"
// case and exceptions), so the caller always gets back the
// bytes it passed in.  term.bytes may have been re-allocated
// by grow().
private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException {
  final int savLength = term.length;

  assert term.offset == 0;

  // The 3 bytes starting at pos make up 1
  // unicode character:
  assert isHighBMPChar(term.bytes, pos);

  // nocommit -- why does this trip?
  // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3);

  // The probe needs 4 bytes at pos; make room first so the
  // save below reads from the array we are about to modify:
  if (term.bytes.length < 4+pos) {
    term.grow(4+pos);
  }

  // Save every byte the probe overwrites (4, not 3: the
  // probe is one byte longer than the E char it replaces):
  scratch[0] = term.bytes[pos];
  scratch[1] = term.bytes[pos+1];
  scratch[2] = term.bytes[pos+2];
  scratch[3] = term.bytes[pos+3];

  term.bytes[pos] = (byte) 0xf0;
  term.bytes[pos+1] = (byte) 0x90;
  term.bytes[pos+2] = (byte) 0x80;
  term.bytes[pos+3] = (byte) 0x80;
  term.length = 4+pos;

  if (DEBUG_SURROGATES) {
    System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString()));
  }

  boolean matches = false;
  try {
    // Seek "back":
    getTermsDict().seekEnum(te, protoTerm.createTerm(term));

    // Test if the term we seek'd to in fact found a
    // surrogate pair at the same position as the E:
    final Term t2 = te.term();

    // Cannot be null (or move to next field) because at
    // "worst" it'd seek to the same term we are on now,
    // unless we are being called from seek
    if (t2 != null && t2.field() == fieldInfo.name) {
      if (DEBUG_SURROGATES) {
        System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()));
      }

      // Now test if prefix is identical and we found
      // a non-BMP char at the same position.  NOTE:
      // term.length is still the probe length here.
      final BytesRef b2 = t2.bytes();
      assert b2.offset == 0;

      if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) {
        matches = true;
        for(int i=0;i<pos;i++) {
          if (term.bytes[i] != b2.bytes[i]) {
            matches = false;
            break;
          }
        }
      }
    }
  } finally {
    // Restore term:
    term.length = savLength;
    term.bytes[pos] = scratch[0];
    term.bytes[pos+1] = scratch[1];
    term.bytes[pos+2] = scratch[2];
    term.bytes[pos+3] = scratch[3];
  }

  return matches;
}
// Seek type 2 "continue" (back to the start of the
// surrogates): scan the stripped suffix from the
// prior term, backwards. If there was an E in that
// part, then we try to seek back to S. If that
// seek finds a matching term, we go there.
//
// Returns true if termEnum was re-positioned; in that
// case newSuffixStart and scratchTerm have been updated
// to describe the new current term.  prevTerm is
// shortened in place as the scan proceeds.
private boolean doContinue() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" try cont");
}
int downTo = prevTerm.length-1;
boolean didSeek = false;
// Only byte positions strictly above limit are examined
// by the loop below:
final int limit = Math.min(newSuffixStart, scratchTerm.length-1);
while(downTo > limit) {
if (isHighBMPChar(prevTerm.bytes, downTo)) {
if (DEBUG_SURROGATES) {
System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length);
}
if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
// seekTermEnum found a matching S term; move the
// main enum onto that same term:
// TODO: more efficient seek?
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
//newSuffixStart = downTo+4;
newSuffixStart = downTo;
scratchTerm.copy(termEnum.term().bytes());
didSeek = true;
if (DEBUG_SURROGATES) {
System.out.println(" seek!");
}
break;
} else {
if (DEBUG_SURROGATES) {
System.out.println(" no seek");
}
}
}
// Shorten prevTerm in place so that we don't redo
// this loop if we come back here.  The test matches a
// byte whose top bits are 11 or whose top bit is 0, ie
// anything but a 10xxxxxx byte:
if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) {
prevTerm.length = downTo;
}
downTo--;
}
return didSeek;
}
// Look for seek type 3 ("pop"): if the delta from
// prev -> current was replacing an S with an E,
// we must now seek to beyond that E. This seek
// "finishes" the dance at this character
// position.
//
// Returns true if a seek was done (termEnum moved, and
// scratchTerm / newSuffixStart were updated to match);
// returns false if no pop applies at newSuffixStart.
private boolean doPop() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" try pop");
}
assert newSuffixStart <= prevTerm.length;
assert newSuffixStart < scratchTerm.length || newSuffixStart == 0;
// Pop only when, at newSuffixStart, the prior term had an S
// lead byte and the current term has an E lead byte:
if (prevTerm.length > newSuffixStart &&
isNonBMPChar(prevTerm.bytes, newSuffixStart) &&
isHighBMPChar(scratchTerm.bytes, newSuffixStart)) {
// Seek type 3 -- put U+FFFF at this position.  NOTE: the
// lead byte actually written is 0xff (FF BF BF), not the
// 0xef of the commented-out line below:
// nocommit -- can we somehow use 0xff???
scratchTerm.bytes[newSuffixStart] = (byte) 0xff;
//scratchTerm.bytes[newSuffixStart] = (byte) 0xef;
scratchTerm.bytes[newSuffixStart+1] = (byte) 0xbf;
scratchTerm.bytes[newSuffixStart+2] = (byte) 0xbf;
scratchTerm.length = newSuffixStart+3;
if (DEBUG_SURROGATES) {
System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString());
}
// TODO: more efficient seek? can we simply swap
// the enums?
getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));
final Term t2 = termEnum.term();
// We could hit EOF or different field since this
// was a seek "forward":
if (t2 != null && t2.field() == fieldInfo.name) {
if (DEBUG_SURROGATES) {
System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes());
}
final BytesRef b2 = t2.bytes();
assert b2.offset == 0;
// Set newSuffixStart -- we can't use
// termEnum's since the above seek may have
// done no scanning (eg, term was precisely
// an index term, or, was in the term seek
// cache):
scratchTerm.copy(b2);
setNewSuffixStart(prevTerm, scratchTerm);
return true;
} else if (newSuffixStart != 0 || scratchTerm.length != 0) {
if (DEBUG_SURROGATES) {
System.out.println(" got term=null (or next field)");
}
// Nothing left after the E in this field: clear the
// current-term state and report that we moved
newSuffixStart = 0;
scratchTerm.length = 0;
return true;
}
}
return false;
}
// Pre-flex indices store terms in UTF16 sort order, but
// certain queries require Unicode codepoint order; this
// method carefully seeks around surrogates to handle
// this impedance mismatch.
//
// Reads prevTerm, newSuffixStart and termEnum's current
// term; may re-position termEnum and updates scratchTerm /
// newSuffixStart while doing so.  Does nothing when
// unicodeSortOrder is false.
private void surrogateDance() throws IOException {
if (!unicodeSortOrder) {
return;
}
// We are invoked after TIS.next() (by UTF16 order) to
// possibly seek to a different "next" (by unicode
// order) term.
// We scan only the "delta" from the last term to the
// current term, in UTF8 bytes. We look at 1) the bytes
// stripped from the prior term, and then 2) the bytes
// appended to that prior term's prefix.
// We don't care about specific UTF8 sequences, just
// the "category" of the UTF16 character. Category S
// is a high/low surrogate pair (ie non-BMP).
// Category E is any BMP char > UNI_SUR_LOW_END (and <=
// U+FFFF). Category A is the rest (any unicode char
// <= UNI_SUR_HIGH_START).
// The core issue is that pre-flex indices sort the
// characters as ASE, while flex must sort as AES. So
// when scanning, when we hit S, we must 1) seek
// forward to E and enum the terms there, then 2) seek
// back to S and enum all terms there, then 3) seek to
// after E. Three different seek points (1, 2, 3).
// We can easily detect S in UTF8: if a byte has
// prefix 11110 (0xf0), then that byte and the
// following 3 bytes encode a single unicode codepoint
// in S. Similarly, we can detect E: if a byte has
// prefix 1110111 (0xee), then that byte and the
// following 2 bytes encode a single unicode codepoint
// in E.
// Note that this is really a recursive process --
// maybe the char at pos 2 needs to dance, but any
// point in its dance, suddenly pos 4 needs to dance
// so you must finish pos 4 before returning to pos
// 2. But then during pos 4's dance maybe pos 7 needs
// to dance, etc. However, despite being recursive,
// we don't need to hold any state because the state
// can always be derived by looking at prior term &
// current term.
// Snapshot the current term into scratchTerm (empty when
// the enum is exhausted or has moved to another field):
// TODO: can we avoid this copy?
if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
scratchTerm.length = 0;
} else {
scratchTerm.copy(termEnum.term().bytes());
}
if (DEBUG_SURROGATES) {
System.out.println(" dance");
System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString()));
System.out.println(" " + prevTerm.toString());
System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()));
System.out.println(" " + scratchTerm.toString());
}
// This code assumes TermInfosReader/SegmentTermEnum
// always use BytesRef.offset == 0
assert prevTerm.offset == 0;
assert scratchTerm.offset == 0;
// Need to loop here because we may need to do multiple
// pops, and possibly a continue in the end, ie:
//
// cont
// pop, cont
// pop, pop, cont
// <nothing>
//
// The loop ends as soon as a continue succeeds, or when
// neither a continue nor a pop applies.
while(true) {
if (doContinue()) {
break;
} else {
if (!doPop()) {
break;
}
}
}
if (DEBUG_SURROGATES) {
System.out.println(" finish bmp ends");
}
// Finally look for new S chars in the (possibly new)
// current term's suffix:
doPushes();
}
// Look for seek type 1 ("push"): if the newly added
// suffix contains any S, we must try to seek to the
// corresponding E. If we find a match, we go there;
// else we keep looking for additional S's in the new
// suffix. This "starts" the dance, at this character
// position.
//
// Scans scratchTerm from newSuffixStart to its end; each
// successful push re-positions termEnum and replaces
// scratchTerm with the term that was found.
private void doPushes() throws IOException {
int upTo = newSuffixStart;
if (DEBUG_SURROGATES) {
System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length);
}
while(upTo < scratchTerm.length) {
// An S lead byte counts here if it lies past newSuffixStart,
// or past the end of prevTerm, or if prevTerm had neither an
// S nor an E lead byte at this same position:
if (isNonBMPChar(scratchTerm.bytes, upTo) &&
(upTo > newSuffixStart ||
(upTo >= prevTerm.length ||
(!isNonBMPChar(prevTerm.bytes, upTo) &&
!isHighBMPChar(prevTerm.bytes, upTo))))) { // nocommit -- we can't cmp to prevTerm if we'd done a seek 3 or seek 2 before?
// A non-BMP char (4 bytes UTF8) starts here:
assert scratchTerm.length >= upTo + 4;
// Temporarily turn scratchTerm into the probe
// "prefix + EE 80 80" (the first E char); the 3
// overwritten bytes and the length are put back right
// after the seek:
final int savLength = scratchTerm.length;
scratch[0] = scratchTerm.bytes[upTo];
scratch[1] = scratchTerm.bytes[upTo+1];
scratch[2] = scratchTerm.bytes[upTo+2];
scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD;
scratchTerm.bytes[upTo+1] = (byte) 0x80;
scratchTerm.bytes[upTo+2] = (byte) 0x80;
scratchTerm.length = upTo+3;
if (DEBUG_SURROGATES) {
System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length);
}
// Seek "forward":
// TODO: more efficient seek?
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));
scratchTerm.bytes[upTo] = scratch[0];
scratchTerm.bytes[upTo+1] = scratch[1];
scratchTerm.bytes[upTo+2] = scratch[2];
scratchTerm.length = savLength;
// Did we find a match?
final Term t2 = seekTermEnum.term();
if (DEBUG_SURROGATES) {
if (t2 == null) {
System.out.println(" hit term=null");
} else {
System.out.println(" hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes()));
}
}
// Since this was a seek "forward", we could hit
// EOF or a different field:
boolean matches;
if (t2 != null && t2.field() == fieldInfo.name) {
final BytesRef b2 = t2.bytes();
assert b2.offset == 0;
// A match must have an E lead byte at upTo and share
// the first upTo bytes with the current term:
if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) {
matches = true;
for(int i=0;i<upTo;i++) {
if (scratchTerm.bytes[i] != b2.bytes[i]) {
matches = false;
break;
}
}
} else {
matches = false;
}
} else {
matches = false;
}
if (matches) {
if (DEBUG_SURROGATES) {
System.out.println(" matches!");
}
// OK seek "back"
// TODO: more efficient seek?
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
scratchTerm.copy(seekTermEnum.term().bytes());
// +3 because we don't need to check the char
// at upTo: we know it's > BMP
upTo += 3;
// NOTE: we keep iterating, now, since this
// can easily "recurse". Ie, after seeking
// forward at a certain char position, we may
// find another surrogate in our [new] suffix
// and must then do another seek (recurse)
} else {
upTo++;
}
} else {
upTo++;
}
}
}
void reset(FieldInfo fieldInfo) throws IOException {
//System.out.println("pff.reset te=" + termEnum);
this.fieldInfo = fieldInfo;
protoTerm = new Term(fieldInfo.name);
if (termEnum == null) {
termEnum = getTermsDict().terms(protoTerm);
seekTermEnum = getTermsDict().terms(protoTerm);
//System.out.println(" term=" + termEnum.term());
} else {
getTermsDict().seekEnum(termEnum, protoTerm);
}
skipNext = true;
surrogateSeekUpto = 0;
final Term t = termEnum.term();
if (t != null && t.field() == fieldInfo.name) {
newSuffixStart = 0;
surrogatesDance();
prevTerm.length = 0;
surrogateDance();
}
private void surrogatesDance() throws IOException {
  // Tricky: prior to 4.0, Lucene index sorted terms in
  // UTF16 order, but as of 4.0 we sort by Unicode code
  // point order. These orders differ because of the
  // surrogates; so we have to fixup our enum, here, by
  // carefully first seeking past the surrogates and
  // then back again at the end. The process is
  // recursive, since any given term could have multiple
  // new occurrences of surrogate pairs, so we use a
  // stack to record the pending seek-backs.
  if (DEBUG_SURROGATES) {
    final Term cur = termEnum.term();
    System.out.println(" dance start term=" + (cur == null ? null : UnicodeUtil.toHexString(cur.text())));
  }

  boolean moved;

  // First unwind every pending seek-back that is now due...
  do {
    moved = popPendingSeek();
  } while (moved);

  // ...then keep pushing for as long as new surrogates show up.
  do {
    moved = pushNewSurrogate();
  } while (moved);
}
// only for debugging: renders the pending-seek stack as the
// space-separated positions held in surrogateSeekPending,
// followed by the pendingPrefix chars up to the top-most
// position.  Returns "null" when the stack is empty.
private String getStack() {
  if (surrogateSeekUpto == 0) {
    return "null";
  }
  // The buffer never escapes this method, so the unsynchronized
  // StringBuilder is the right tool (StringBuffer only adds
  // locking overhead here).
  final StringBuilder sb = new StringBuilder();
  for(int i=0;i<surrogateSeekUpto;i++) {
    if (i > 0) {
      sb.append(' ');
    }
    sb.append(surrogateSeekPending[i]);
  }
  sb.append(" pendingSeekText=")
    .append(new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
  return sb.toString();
}
// Handles the top entry of the pending-seek stack
// (surrogateSeekPending / surrogateDidSeekBack).  Returns true
// if termEnum was re-positioned, in which case the caller
// should call again; returns false when nothing is due.
private boolean popPendingSeek() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
}
// if a .next() has advanced beyond the
// after-surrogates range we had last seeked to, we
// must seek back to the start and resume .next from
// there. this pops the pending seek off the stack.
final Term t = termEnum.term();
if (surrogateSeekUpto > 0) {
final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
if (DEBUG_SURROGATES) {
System.out.println(" seekPrefix=" + seekPrefix);
}
if (newSuffixStart < seekPrefix) {
// Step 1: the enum has moved off the pushed prefix; seek
// back to "prefix + first surrogate pair" and remember we
// did so.  The stack entry stays in place.
assert pendingPrefix != null;
assert pendingPrefix.length > seekPrefix;
pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
pendingPrefix[1+seekPrefix] = UnicodeUtil.UNI_SUR_LOW_START;
Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 2+seekPrefix));
if (DEBUG_SURROGATES) {
System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
}
getTermsDict().seekEnum(termEnum, t2);
surrogateDidSeekBack[surrogateSeekUpto-1] = true;
// +2 because we don't want to re-check the
// surrogates we just seek'd back to
newSuffixStart = seekPrefix + 2;
return true;
} else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
// Step 2: after the seek-back, the enum has walked through
// the surrogates and reached a char above them at this
// position; seek forward to "prefix + 0xffff" and drop the
// stack entry.
assert pendingPrefix != null;
assert pendingPrefix.length > seekPrefix;
pendingPrefix[seekPrefix] = 0xffff;
Term t2 = protoTerm.createTerm(new BytesRef(pendingPrefix, 0, 1+seekPrefix));
if (DEBUG_SURROGATES) {
System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
}
getTermsDict().seekEnum(termEnum, t2);
if (DEBUG_SURROGATES) {
System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
}
surrogateSeekUpto--;
if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
// force pop
newSuffixStart = -1;
} else {
newSuffixStart = termEnum.newSuffixStart;
}
return true;
}
}
return false;
}
private UnicodeUtil.UTF16Result termBuffer = new UnicodeUtil.UTF16Result();
private UnicodeUtil.UTF16Result seekBuffer = new UnicodeUtil.UTF16Result();
// Scans the UTF16 form of the current term, from
// newSuffixStart on, for a high surrogate.  For each one found
// it probes (with seekTermEnum) for a term that shares the
// same prefix; if there is one, termEnum is moved to it and
// the surrogate's position is pushed on the pending-seek
// stack.  Returns true if a push happened (the caller should
// call again), false otherwise.
private boolean pushNewSurrogate() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack());
}
final Term t = termEnum.term();
if (t == null || t.field() != fieldInfo.name) {
return false;
}
final BytesRef bytes = t.bytes();
UnicodeUtil.UTF8toUTF16(bytes.bytes, bytes.offset, bytes.length, termBuffer);
// Math.max because newSuffixStart may be -1 ("force pop"):
for(int i=Math.max(0,newSuffixStart);i<termBuffer.length;i++) {
final char ch = termBuffer.result[i];
// Only consider high surrogates past the position that is
// already on top of the stack:
if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
if (DEBUG_SURROGATES) {
System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
}
// the next() that we just did read in a new
// suffix, containing a surrogate pair
// seek forward to see if there are any terms with
// this same prefix, but with characters after the
// surrogate range; if so, we must first iterate
// them, then seek back to the surrogates
// (i+2 chars: popPendingSeek later writes a surrogate
// pair at positions i and i+1 of this array)
char[] testPrefix = new char[i+2];
for(int j=0;j<i;j++) {
testPrefix[j] = termBuffer.result[j];
}
testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new BytesRef(testPrefix, 0, i+1)));
Term t2 = seekTermEnum.term();
boolean isPrefix;
if (t2 != null && t2.field() == fieldInfo.name) {
final BytesRef seekBytes = t2.bytes();
UnicodeUtil.UTF8toUTF16(seekBytes.bytes, seekBytes.offset, seekBytes.length, seekBuffer);
isPrefix = true;
if (DEBUG_SURROGATES) {
System.out.println(" seek found " + UnicodeUtil.toHexString(t2.text()));
}
// The term found must share the first i chars:
for(int j=0;j<i;j++) {
if (testPrefix[j] != seekBuffer.result[j]) {
isPrefix = false;
break;
}
}
if (DEBUG_SURROGATES && !isPrefix) {
System.out.println(" no end terms");
}
} else {
if (DEBUG_SURROGATES) {
System.out.println(" no end terms");
}
isPrefix = false;
}
if (isPrefix) {
// we found a term, sharing the same prefix,
// with characters after the surrogates, so we
// must first enum those, and then return the
// the surrogates afterwards. push that pending
// seek on the surrogates stack now:
pendingPrefix = testPrefix;
getTermsDict().seekEnum(termEnum, t2);
// Grow both parallel stack arrays if they are full:
if (surrogateSeekUpto == surrogateSeekPending.length) {
surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
}
if (surrogateSeekUpto == surrogateDidSeekBack.length) {
surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
}
surrogateSeekPending[surrogateSeekUpto] = i;
surrogateDidSeekBack[surrogateSeekUpto] = false;
surrogateSeekUpto++;
if (DEBUG_SURROGATES) {
System.out.println(" do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
}
newSuffixStart = i+1;
return true;
} else {
// there are no terms after the surrogates, so
// we do nothing to the enum and just step
// through the surrogates like normal. but we
// must keep iterating through the term, in case
// another surrogate pair appears later
}
}
}
return false;
}
@Override
public Comparator<BytesRef> getComparator() {
  // Pre-flex indexes are always sorted in UTF16 order; we
  // only remap on-the-fly to unicode order (and so only
  // report the unicode comparator) when unicodeSortOrder
  // is set.
  return unicodeSortOrder
      ? BytesRef.getUTF8SortedAsUnicodeComparator()
      : BytesRef.getUTF8SortedAsUTF16Comparator();
}
@Override
@ -484,7 +737,7 @@ public class PreFlexFields extends FieldsProducer {
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
if (DEBUG_SURROGATES) {
System.out.println("TE.seek() term=" + term.utf8ToString());
System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString()));
}
skipNext = false;
final TermInfosReader tis = getTermsDict();
@ -492,50 +745,142 @@ public class PreFlexFields extends FieldsProducer {
assert termEnum != null;
if (termEnum == null) {
termEnum = tis.terms(t0);
} else {
tis.seekEnum(termEnum, t0);
}
surrogateSeekUpto = 0;
surrogatesDance();
final Term t = termEnum.term();
final BytesRef tr = t == null ? null : t.bytes();
if (t != null && t.field() == fieldInfo.name && term.bytesEquals(tr)) {
current = tr;
if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) {
// If we found an exact match, no need to do the
// surrogate dance
if (DEBUG_SURROGATES) {
System.out.println(" seek exact match");
}
current = t.bytes();
return SeekStatus.FOUND;
} else if (t == null || t.field() != fieldInfo.name) {
// nocommit -- why can't we handle this like the
// next() into null? set term as prevTerm then dance?
if (DEBUG_SURROGATES) {
System.out.println(" seek hit EOF");
}
// We hit EOF; try end-case surrogate dance: if we
// find an E, try swapping in S, backwards:
scratchTerm.copy(term);
assert scratchTerm.offset == 0;
for(int i=scratchTerm.length-1;i>=0;i--) {
if (isHighBMPChar(scratchTerm.bytes, i)) {
if (DEBUG_SURROGATES) {
System.out.println(" found E pos=" + i + "; try seek");
}
if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {
scratchTerm.copy(seekTermEnum.term().bytes());
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
newSuffixStart = 1+i;
doPushes();
// Found a match
// TODO: faster seek?
current = termEnum.term().bytes();
return SeekStatus.NOT_FOUND;
}
}
}
if (DEBUG_SURROGATES) {
System.out.println(" seek END");
}
current = null;
return SeekStatus.END;
} else {
current = tr;
// We found a non-exact but non-null term; this one
// is fun -- just treat it like next, by pretending
// requested term was prev:
prevTerm.copy(term);
if (DEBUG_SURROGATES) {
System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text()));
}
final BytesRef br = t.bytes();
assert br.offset == 0;
setNewSuffixStart(term, br);
surrogateDance();
final Term t2 = termEnum.term();
if (t2 == null || t2.field() != fieldInfo.name) {
assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned
current = null;
return SeekStatus.END;
} else {
current = t2.bytes();
assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString());
return SeekStatus.NOT_FOUND;
}
}
}
// Sets newSuffixStart from the first byte at which br1 and br2
// differ: it becomes the index of the last byte of br1, at or
// before that point, that is not a 10xxxxxx byte.  If one term
// is a prefix of the other, newSuffixStart becomes the shorter
// length.
private void setNewSuffixStart(BytesRef br1, BytesRef br2) {
  final int common = Math.min(br1.length, br2.length);
  int suffixStart = common;
  int charStart = 0;
  for (int pos = 0; pos < common; pos++) {
    final byte b1 = br1.bytes[br1.offset + pos];
    // Anything whose top two bits are not "10" begins a char
    if ((b1 & 0xc0) != 0x80) {
      charStart = pos;
    }
    if (b1 != br2.bytes[br2.offset + pos]) {
      suffixStart = charStart;
      break;
    }
  }
  newSuffixStart = suffixStart;
  if (DEBUG_SURROGATES) {
    System.out.println(" set newSuffixStart=" + newSuffixStart);
  }
}
@Override
public BytesRef next() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println("TE.next() skipNext=" + skipNext);
System.out.println("TE.next()");
}
if (skipNext) {
if (DEBUG_SURROGATES) {
System.out.println(" skipNext=true");
}
skipNext = false;
if (termEnum.term() == null) {
return null;
} else if (termEnum.term().field() != fieldInfo.name) {
return null;
} else {
return current = termEnum.term().bytes();
}
}
// TODO: can we use STE's prevBuffer here?
prevTerm.copy(termEnum.term().bytes());
if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
newSuffixStart = termEnum.newSuffixStart;
if (DEBUG_SURROGATES) {
System.out.println(" set newSuffixStart=" + newSuffixStart);
System.out.println(" newSuffixStart=" + newSuffixStart);
}
surrogatesDance();
surrogateDance();
final Term t = termEnum.term();
if (t == null || t.field() != fieldInfo.name) {
assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
@ -545,12 +890,15 @@ public class PreFlexFields extends FieldsProducer {
}
return current;
} else {
// This field is exhausted, but we have to give
// surrogateDance a chance to seek back:
if (DEBUG_SURROGATES) {
System.out.println(" force pop");
System.out.println(" force cont");
}
// force pop
newSuffixStart = -1;
surrogatesDance();
//newSuffixStart = prevTerm.length;
newSuffixStart = 0;
surrogateDance();
final Term t = termEnum.term();
if (t == null || t.field() != fieldInfo.name) {
assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
@ -574,21 +922,33 @@ public class PreFlexFields extends FieldsProducer {
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
if (reuse != null) {
return ((PreDocsEnum) reuse).reset(termEnum, skipDocs);
PreDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof PreDocsEnum)) {
docsEnum = new PreDocsEnum();
} else {
return (new PreDocsEnum()).reset(termEnum, skipDocs);
docsEnum = (PreDocsEnum) reuse;
if (docsEnum.getFreqStream() != freqStream) {
docsEnum = new PreDocsEnum();
}
}
return docsEnum.reset(termEnum, skipDocs);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
if (reuse != null) {
return ((PreDocsAndPositionsEnum) reuse).reset(termEnum, skipDocs);
PreDocsAndPositionsEnum docsPosEnum;
if (fieldInfo.omitTermFreqAndPositions) {
return null;
} else if (reuse == null || !(reuse instanceof PreDocsAndPositionsEnum)) {
docsPosEnum = new PreDocsAndPositionsEnum();
} else {
return (new PreDocsAndPositionsEnum()).reset(termEnum, skipDocs);
docsPosEnum = (PreDocsAndPositionsEnum) reuse;
if (docsPosEnum.getFreqStream() != freqStream) {
docsPosEnum = new PreDocsAndPositionsEnum();
}
}
return docsPosEnum.reset(termEnum, skipDocs);
}
}
private final class PreDocsEnum extends DocsEnum {
@ -598,6 +958,10 @@ public class PreFlexFields extends FieldsProducer {
docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
}
// Returns the freqStream reference this docs enum sees. The docs() reuse path
// compares this against its own freqStream and allocates a fresh enum when the
// two differ, instead of resetting the reuse candidate.
IndexInput getFreqStream() {
return freqStream;
}
public PreDocsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
docs.setSkipDocs(skipDocs);
docs.seek(termEnum);
@ -650,6 +1014,10 @@ public class PreFlexFields extends FieldsProducer {
pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
}
// Returns the freqStream reference this positions enum sees. The
// docsAndPositions() reuse path compares this against its own freqStream and
// allocates a fresh enum when the two differ, instead of resetting the reuse
// candidate.
IndexInput getFreqStream() {
return freqStream;
}
public DocsAndPositionsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
pos.setSkipDocs(skipDocs);
pos.seek(termEnum);

View File

@ -132,18 +132,21 @@ public final class SegmentTermEnum implements Cloneable {
position = p;
termBuffer.set(t);
prevBuffer.reset();
//System.out.println(" ste doSeek prev=" + prevBuffer.toTerm() + " this=" + this);
termInfo.set(ti);
}
/** Increments the enumeration to the next element. True if one exists.*/
public final boolean next() throws IOException {
if (position++ >= size - 1) {
prevBuffer.set(termBuffer);
//System.out.println(" ste setPrev=" + prev() + " this=" + this);
if (position++ >= size - 1) {
termBuffer.reset();
//System.out.println(" EOF");
return false;
}
prevBuffer.set(termBuffer);
termBuffer.read(input, fieldInfos);
newSuffixStart = termBuffer.newSuffixStart;
@ -168,6 +171,7 @@ public final class SegmentTermEnum implements Cloneable {
if (isIndex)
indexPointer += input.readVLong(); // read index pointer
//System.out.println(" ste ret term=" + term());
return true;
}

View File

@ -18,9 +18,10 @@ package org.apache.lucene.index.codecs.preflex;
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.FieldInfos;
@ -28,102 +29,65 @@ final class TermBuffer implements Cloneable {
private String field;
private Term term; // cached
private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes)
private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
private BytesRef bytes = new BytesRef(10);
int newSuffixStart;
private static final Comparator<BytesRef> utf8AsUTF16Comparator = BytesRef.getUTF8SortedAsUTF16Comparator();
public final int compareTo(TermBuffer other) {
int newSuffixStart; // only valid right after .read is called
public int compareTo(TermBuffer other) {
if (field == other.field) // fields are interned
return compareChars(text.result, text.length, other.text.result, other.text.length);
return utf8AsUTF16Comparator.compare(bytes, other.bytes);
else
return field.compareTo(other.field);
}
private static int compareChars(char[] chars1, int len1,
char[] chars2, int len2) {
final int end = len1 < len2 ? len1:len2;
for (int k = 0; k < end; k++) {
char c1 = chars1[k];
char c2 = chars2[k];
if (c1 != c2) {
return c1 - c2;
}
}
return len1 - len2;
}
public final void read(IndexInput input, FieldInfos fieldInfos)
public void read(IndexInput input, FieldInfos fieldInfos)
throws IOException {
this.term = null; // invalidate cache
int start = input.readVInt();
newSuffixStart = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
int totalLength = newSuffixStart + length;
if (bytes.bytes.length < totalLength) {
bytes.grow(totalLength);
}
if (dirty) {
// Fully convert all bytes since bytes is dirty
UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
bytes.length = totalLength;
input.readBytes(bytes.bytes, start, length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
dirty = false;
} else {
// Incrementally convert only the UTF8 bytes that are new:
bytes.length = totalLength;
input.readBytes(bytes.bytes, start, length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
}
while(true) {
newSuffixStart = text.offsets[start];
if (newSuffixStart != -1) {
break;
}
if (--start == 0) {
newSuffixStart = 0;
break;
}
}
input.readBytes(bytes.bytes, newSuffixStart, length);
this.field = fieldInfos.fieldName(input.readVInt());
}
public final void set(Term term) {
public void set(Term term) {
if (term == null) {
reset();
return;
}
final BytesRef termBytes = term.bytes();
UnicodeUtil.UTF8toUTF16(termBytes.bytes, termBytes.offset, termBytes.length, text);
dirty = true;
bytes.copy(term.bytes());
field = term.field();
this.term = term;
}
public final void set(TermBuffer other) {
text.copyText(other.text);
dirty = true;
public void set(TermBuffer other) {
field = other.field;
term = other.term;
// nocommit -- right?
//term = other.term;
term = null;
bytes.copy(other.bytes);
}
public void reset() {
field = null;
text.setLength(0);
term = null;
dirty = true;
}
public Term toTerm() {
if (field == null) // unset
return null;
if (term == null)
term = new Term(field, new BytesRef(text.result, 0, text.length), false);
if (term == null) {
term = new Term(field, new BytesRef(bytes), false);
//term = new Term(field, bytes, false);
}
return term;
}
@ -134,12 +98,7 @@ final class TermBuffer implements Cloneable {
try {
clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
clone.dirty = true;
clone.bytes = new BytesRef(10);
clone.text = new UnicodeUtil.UTF16Result();
clone.text.offsets = new int[text.offsets.length];
System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
clone.text.copyText(text);
clone.bytes = new BytesRef(bytes);
return clone;
}
}

View File

@ -23,30 +23,30 @@ package org.apache.lucene.index.codecs.preflex;
* indexing. */
@Deprecated
class TermInfo {
public class TermInfo {
/** The number of documents which contain the term. */
int docFreq = 0;
public int docFreq = 0;
long freqPointer = 0;
long proxPointer = 0;
int skipOffset;
public long freqPointer = 0;
public long proxPointer = 0;
public int skipOffset;
TermInfo() {}
public TermInfo() {}
TermInfo(int df, long fp, long pp) {
public TermInfo(int df, long fp, long pp) {
docFreq = df;
freqPointer = fp;
proxPointer = pp;
}
TermInfo(TermInfo ti) {
public TermInfo(TermInfo ti) {
docFreq = ti.docFreq;
freqPointer = ti.freqPointer;
proxPointer = ti.proxPointer;
skipOffset = ti.skipOffset;
}
final void set(int docFreq,
public final void set(int docFreq,
long freqPointer, long proxPointer, int skipOffset) {
this.docFreq = docFreq;
this.freqPointer = freqPointer;
@ -54,7 +54,7 @@ class TermInfo {
this.skipOffset = skipOffset;
}
final void set(TermInfo ti) {
public final void set(TermInfo ti) {
docFreq = ti.docFreq;
freqPointer = ti.freqPointer;
proxPointer = ti.proxPointer;

View File

@ -122,6 +122,9 @@ public final class TermInfosReader {
for (int i=0;indexEnum.next(); i++) {
indexTerms[i] = indexEnum.term();
assert indexTerms[i] != null;
assert indexTerms[i].text() != null;
assert indexTerms[i].field() != null;
indexInfos[i] = indexEnum.termInfo();
indexPointers[i] = indexEnum.indexPointer;
@ -160,14 +163,14 @@ public final class TermInfosReader {
return origEnum.maxSkipLevels;
}
final void close() throws IOException {
void close() throws IOException {
if (origEnum != null)
origEnum.close();
threadResources.close();
}
/** Returns the number of term/value pairs in the set. */
final long size() {
long size() {
return size;
}
@ -183,12 +186,13 @@ public final class TermInfosReader {
/** Returns the offset of the greatest index entry which is less than or equal to term.*/
private final int getIndexOffset(Term term) {
private int getIndexOffset(Term term) {
int lo = 0; // binary search indexTerms[]
int hi = indexTerms.length - 1;
while (hi >= lo) {
int mid = (lo + hi) >>> 1;
assert indexTerms[mid] != null : "indexTerms = " + indexTerms.length + " mid=" + mid;
int delta = term.compareToUTF16(indexTerms[mid]);
if (delta < 0)
hi = mid - 1;
@ -200,7 +204,7 @@ public final class TermInfosReader {
return hi;
}
private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
private void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
enumerator.seek(indexPointers[indexOffset],
((long) indexOffset * totalIndexInterval) - 1,
indexTerms[indexOffset], indexInfos[indexOffset]);
@ -231,6 +235,9 @@ public final class TermInfosReader {
}
TermInfo seekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd) throws IOException {
if (size == 0) {
return null;
}
// optimize sequential access: first try scanning cached enum w/o seeking
if (enumerator.term() != null // term is at or past current
@ -242,7 +249,6 @@ public final class TermInfosReader {
// no need to seek
final TermInfo ti;
int numScans = enumerator.scanTo(term);
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
ti = enumerator.termInfo();
@ -279,6 +285,7 @@ public final class TermInfosReader {
seekEnum(enumerator, indexPos);
enumerator.scanTo(term);
final TermInfo ti;
if (enumerator.term() != null && term.compareToUTF16(enumerator.term()) == 0) {
ti = enumerator.termInfo();
if (tiOrd == null) {
@ -294,7 +301,7 @@ public final class TermInfosReader {
}
// called only from asserts
private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
private boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
if (ti1.docFreq != ti2.docFreq) {
return false;
}
@ -319,7 +326,7 @@ public final class TermInfosReader {
}
/** Returns the position of a Term in the set or -1. */
final long getPosition(Term term) throws IOException {
long getPosition(Term term) throws IOException {
if (size == 0) return -1;
ensureIndexIsRead();

View File

@ -792,6 +792,7 @@ class FieldCacheImpl implements FieldCache {
throws IOException {
String field = StringHelper.intern(entryKey.field);
Terms terms = MultiFields.getTerms(reader, field);
final boolean fasterButMoreRAM = ((Boolean) entryKey.custom).booleanValue();

View File

@ -333,10 +333,12 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
// difference to match UTF16's sort order:
if (aByte >= 0xee && bByte >= 0xee) {
if ((aByte & 0xfe) == 0xee) {
aByte += 0x10;
// nocommit
aByte += 0xe;
}
if ((bByte&0xfe) == 0xee) {
bByte += 0x10;
// nocommit
bByte += 0xe;
}
}
return aByte - bByte;
@ -346,10 +348,6 @@ public final class BytesRef implements Comparable<BytesRef>, Externalizable {
// One is a prefix of the other, or, they are equal:
return a.length - b.length;
}
public boolean equals(Object other) {
return this == other;
}
}
public void writeExternal(ObjectOutput out)

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.intblock.IntBlockCodec;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.apache.lucene.index.codecs.pulsing.PulsingCodec;
import org.apache.lucene.index.codecs.sep.SepCodec;
import org.apache.lucene.index.codecs.standard.StandardCodec;
@ -94,6 +95,16 @@ public class RandomIndexWriter implements Closeable {
}
public IndexReader getReader() throws IOException {
// nocommit: hack!
if (w.codecs.getWriter(null).name.equals("PreFlex")) {
w.commit();
return IndexReader.open(w.getDirectory(),
null,
false,
_TestUtil.nextInt(r, 1, 10),
_TestUtil.alwaysCodec(new PreFlexCodec()));
}
if (r.nextBoolean()) {
return w.getReader();
} else {
@ -119,10 +130,15 @@ public class RandomIndexWriter implements Closeable {
RandomCodecProvider(Random random) {
register(new StandardCodec());
register(new IntBlockCodec());
register(new PreFlexCodec());
// nocommit
//register(new PreFlexCodec());
register(new PreFlexRWCodec());
register(new PulsingCodec());
register(new SepCodec());
codec = CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)];
// nocommit
//codec =
//CodecProvider.CORE_CODECS[random.nextInt(CodecProvider.CORE_CODECS.length)];
codec = "PreFlex";
}
@Override

View File

@ -19,7 +19,6 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@ -139,7 +138,6 @@ public class TestAddIndexes extends LuceneTestCase {
setUpDirs(dir, aux);
IndexWriter writer = newWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setOpenMode(OpenMode.APPEND));
writer.addIndexes(new Directory[] {aux});
// Adds 10 docs, then replaces them with another 10

View File

@ -493,7 +493,13 @@ public class TestCodecs extends MultiCodecTestCase {
// Test random seek by ord:
final int idx = TestCodecs.this.nextInt(field.terms.length);
term = field.terms[idx];
try {
status = termsEnum.seek(idx);
} catch (UnsupportedOperationException uoe) {
// ok -- skip it
status = null;
}
if (status != null) {
assertEquals(status, TermsEnum.SeekStatus.FOUND);
assertTrue(termsEnum.term().bytesEquals(new BytesRef(term.text2)));
assertEquals(term.docs.length, termsEnum.docFreq());
@ -502,6 +508,7 @@ public class TestCodecs extends MultiCodecTestCase {
} else {
this.verifyDocs(term.docs, term.positions, termsEnum.docsAndPositions(null, null), true);
}
}
// Test seek to non-existent terms:
for(int i=0;i<100;i++) {
@ -520,9 +527,12 @@ public class TestCodecs extends MultiCodecTestCase {
// Seek to each term by ord, backwards
for(int i=field.terms.length-1;i>=0;i--) {
try {
assertEquals(Thread.currentThread().getName() + ": field=" + field.fieldInfo.name + " term=" + field.terms[i].text2, TermsEnum.SeekStatus.FOUND, termsEnum.seek(i));
assertEquals(field.terms[i].docs.length, termsEnum.docFreq());
assertTrue(termsEnum.term().bytesEquals(new BytesRef(field.terms[i].text2)));
} catch (UnsupportedOperationException uoe) {
}
}
// Seek to non-existent empty-string term

View File

@ -20,6 +20,8 @@ package org.apache.lucene.index;
import java.io.*;
import java.util.*;
import org.apache.lucene.store.*;
import org.apache.lucene.index.codecs.*;
import org.apache.lucene.index.codecs.standard.*;
import org.apache.lucene.search.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.document.*;
@ -64,7 +66,8 @@ public class TestFlex extends LuceneTestCase {
public void testTermOrd() throws Exception {
Directory d = new MockRAMDirectory();
IndexWriter w = new IndexWriter(d, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
IndexWriter w = new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT,
new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
doc.add(new Field("f", "a b c", Field.Store.NO, Field.Index.ANALYZED));
w.addDocument(doc);

View File

@ -1675,7 +1675,7 @@ public class TestIndexReader extends LuceneTestCase
// LUCENE-1586: getUniqueTermCount
public void testUniqueTermCount() throws Exception {
Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED));
@ -1708,7 +1708,7 @@ public class TestIndexReader extends LuceneTestCase
// LUCENE-1609: don't load terms index
public void testNoTermsIndex() throws Throwable {
Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document doc = new Document();
doc.add(new Field("field", "a b c d e f g h i j k l m n o p q r s t u v w x y z", Field.Store.NO, Field.Index.ANALYZED));
doc.add(new Field("number", "0 1 2 3 4 5 6 7 8 9", Field.Store.NO, Field.Index.ANALYZED));
@ -1725,7 +1725,7 @@ public class TestIndexReader extends LuceneTestCase
}
assertEquals(-1, ((SegmentReader) r.getSequentialSubReaders()[0]).getTermInfosIndexDivisor());
writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
writer.addDocument(doc);
writer.close();

View File

@ -4673,7 +4673,9 @@ public class TestIndexWriter extends LuceneTestCase {
// sort in UTF16 sort order by default
public void testTermUTF16SortOrder() throws Throwable {
Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new MockAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
// nocommit -- allow preflexrw but must force preflex
// for reading
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
Document d = new Document();
// Single segment
Field f = new Field("f", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
@ -4682,7 +4684,7 @@ public class TestIndexWriter extends LuceneTestCase {
Random rnd = newRandom();
final Set<String> allTerms = new HashSet<String>();
for(int i=0;i<200*_TestUtil.getRandomMultiplier();i++) {
for(int i=0;i<10*_TestUtil.getRandomMultiplier();i++) {
final String s;
if (rnd.nextBoolean()) {
@ -4705,7 +4707,6 @@ public class TestIndexWriter extends LuceneTestCase {
allTerms.add(s);
f.setValue(s);
//System.out.println("add " + termDesc(s));
writer.addDocument(d);
if ((1+i) % 42 == 0) {

View File

@ -394,18 +394,18 @@ public class TestIndexWriterDelete extends LuceneTestCase {
}
public void testDeletesOnDiskFull() throws IOException {
testOperationsOnDiskFull(false);
doTestOperationsOnDiskFull(false);
}
public void testUpdatesOnDiskFull() throws IOException {
testOperationsOnDiskFull(true);
doTestOperationsOnDiskFull(true);
}
/**
* Make sure if modifier tries to commit but hits disk full that modifier
* remains consistent and usable. Similar to TestIndexReader.testDiskFull().
*/
private void testOperationsOnDiskFull(boolean updates) throws IOException {
private void doTestOperationsOnDiskFull(boolean updates) throws IOException {
Term searchTerm = new Term("content", "aaa");
int START_COUNT = 157;
@ -700,6 +700,7 @@ public class TestIndexWriterDelete extends LuceneTestCase {
try {
modifier.commit();
} catch (IOException ioe) {
// expected
failed = true;
}

View File

@ -27,11 +27,12 @@ public class TestMultiFields extends LuceneTestCase {
public void testRandom() throws Exception {
Random r = newRandom();
for(int iter=0;iter<2*_TestUtil.getRandomMultiplier();iter++) {
Directory dir = new MockRAMDirectory();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES));
Random r = new Random();
IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setMergePolicy(NoMergePolicy.COMPOUND_FILES));
Map<BytesRef,List<Integer>> docs = new HashMap<BytesRef,List<Integer>>();
Set<Integer> deleted = new HashSet<Integer>();
@ -45,7 +46,7 @@ public class TestMultiFields extends LuceneTestCase {
doc.add(id);
boolean onlyUniqueTerms = r.nextBoolean();
Set<BytesRef> uniqueTerms = new HashSet<BytesRef>();
for(int i=0;i<numDocs;i++) {
if (!onlyUniqueTerms && r.nextBoolean() && terms.size() > 0) {
@ -61,6 +62,7 @@ public class TestMultiFields extends LuceneTestCase {
}
docs.get(term).add(i);
terms.add(term);
uniqueTerms.add(term);
f.setValue(s);
}
id.setValue(""+i);
@ -75,8 +77,19 @@ public class TestMultiFields extends LuceneTestCase {
}
}
if (VERBOSE) {
List<BytesRef> termsList = new ArrayList<BytesRef>(uniqueTerms);
Collections.sort(termsList, BytesRef.getUTF8SortedAsUTF16Comparator());
System.out.println("UTF16 order:");
for(BytesRef b : termsList) {
System.out.println(" " + UnicodeUtil.toHexString(b.utf8ToString()));
}
}
// nocommit
IndexReader reader = w.getReader();
w.close();
//System.out.println("TEST reader=" + reader);
Bits delDocs = MultiFields.getDeletedDocs(reader);
for(int delDoc : deleted) {

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@ -64,7 +65,7 @@ public class TestSegmentTermEnum extends LuceneTestCase {
public void testPrevTermAtEnd() throws IOException
{
Directory dir = new MockRAMDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()));
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec("Standard")));
addDoc(writer, "aaa bbb");
writer.close();
SegmentReader reader = SegmentReader.getOnlySegmentReader(dir);

View File

@ -18,8 +18,11 @@ package org.apache.lucene.index.codecs.preflex;
*/
import org.apache.lucene.store.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.codecs.*;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.apache.lucene.util.*;
import java.util.*;
@ -30,8 +33,6 @@ import org.junit.Test;
public class TestSurrogates extends LuceneTestCaseJ4 {
// chooses from a very limited alphabet to exacerbate the
// surrogate seeking required
private static String makeDifficultRandomUnicodeString(Random r) {
final int end = r.nextInt(20);
if (end == 0) {
@ -44,136 +45,62 @@ public class TestSurrogates extends LuceneTestCaseJ4 {
if (0 == t && i < end - 1) {
// hi
buffer[i++] = (char) 0xd800;
buffer[i++] = (char) (0xd800 + r.nextInt(2));
// lo
buffer[i] = (char) 0xdc00;
buffer[i] = (char) (0xdc00 + r.nextInt(2));
} else if (t <= 3) {
buffer[i] = 'a';
buffer[i] = (char) ('a' + r.nextInt(2));
} else if (4 == t) {
buffer[i] = 0xe000;
buffer[i] = (char) (0xe000 + r.nextInt(2));
}
}
return new String(buffer, 0, end);
}
private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<Term> fieldTerms) throws IOException {
// Formats a term for verbose test output as "<field>:<hex>", where <hex> is
// whatever UnicodeUtil.toHexString produces for the term's text.
private String toHexString(Term t) {
return t.field() + ":" + UnicodeUtil.toHexString(t.text());
}
final int numField = _TestUtil.nextInt(r, 2, 5);
List<Term> terms = new ArrayList<Term>();
int tc = 0;
for(int f=0;f<numField;f++) {
String field = "f" + f;
Term protoTerm = new Term(field);
fieldInfos.add(field, true, false, false, false, false, false, false);
final int numTerms = 10000*_TestUtil.getRandomMultiplier();
for(int i=0;i<numTerms;i++) {
private String getRandomString(Random r) {
String s;
if (r.nextInt(3) == 1) {
s = makeDifficultRandomUnicodeString(r);
} else {
s = _TestUtil.randomUnicodeString(r);
// The surrogate dance uses 0xffff to seek-to-end
// of blocks. Also, pre-4.0 indices are already
// guaranteed to not contain the char 0xffff since
// it's mapped during indexing:
s = s.replace((char) 0xffff, (char) 0xfffe);
}
terms.add(protoTerm.createTerm(s + "_" + (tc++)));
}
return s;
}
fieldInfos.write(dir, segName);
// sorts in UTF16 order, just like preflex:
Collections.sort(terms, new Comparator<Term>() {
private static class SortTermAsUTF16Comparator implements Comparator<Term> {
public int compare(Term o1, Term o2) {
return o1.compareToUTF16(o2);
}
});
TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128);
TermInfo ti = new TermInfo();
String lastText = null;
int uniqueTermCount = 0;
if (VERBOSE) {
System.out.println("TEST: utf16 order:");
}
for(Term t : terms) {
FieldInfo fi = fieldInfos.fieldInfo(t.field());
String text = t.text();
if (lastText != null && lastText.equals(text)) {
continue;
}
fieldTerms.add(t);
uniqueTermCount++;
lastText = text;
private static final SortTermAsUTF16Comparator termAsUTF16Comparator = new SortTermAsUTF16Comparator();
// single straight enum
private void doTestStraightEnum(List<Term> fieldTerms, IndexReader reader, int uniqueTermCount) throws IOException {
if (VERBOSE) {
System.out.println(" " + toHexString(t));
System.out.println("\nTEST: top now enum reader=" + reader);
}
w.add(fi.number, t.bytes().bytes, t.bytes().length, ti);
}
w.close();
FieldsEnum fieldsEnum = MultiFields.getFields(reader).iterator();
Collections.sort(fieldTerms);
if (VERBOSE) {
System.out.println("\nTEST: codepoint order");
for(Term t: fieldTerms) {
System.out.println(" " + t.field() + ":" + toHexString(t));
}
}
dir.createOutput(segName + ".prx").close();
dir.createOutput(segName + ".frq").close();
// !!hack alert!! stuffing uniqueTermCount in as docCount
return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec);
}
private String toHexString(Term t) {
return t.field() + ":" + UnicodeUtil.toHexString(t.text());
}
@Test
public void testSurrogatesOrder() throws Exception {
Directory dir = new MockRAMDirectory();
Codec codec = new PreFlexCodec();
Random r = newRandom();
FieldInfos fieldInfos = new FieldInfos();
List<Term> fieldTerms = new ArrayList<Term>();
SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
// hack alert!!
int uniqueTermCount = si.docCount;
FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
assertNotNull(fields);
if (VERBOSE) {
System.out.println("\nTEST: now enum");
}
FieldsEnum fieldsEnum = fields.iterator();
{
// Test straight enum:
String field;
UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
int termCount = 0;
while((field = fieldsEnum.next()) != null) {
TermsEnum termsEnum = fieldsEnum.terms();
BytesRef text;
BytesRef lastText = null;
while((text = termsEnum.next()) != null) {
Term exp = fieldTerms.get(termCount);
if (VERBOSE) {
UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16);
System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length)));
System.out.println(" got term=" + field + ":" + UnicodeUtil.toHexString(text.utf8ToString()));
System.out.println(" exp=" + exp.field() + ":" + UnicodeUtil.toHexString(exp.text().toString()));
System.out.println();
}
if (lastText == null) {
@ -182,8 +109,8 @@ public class TestSurrogates extends LuceneTestCaseJ4 {
assertTrue(lastText.compareTo(text) < 0);
lastText.copy(text);
}
assertEquals(fieldTerms.get(termCount).field(), field);
assertEquals(fieldTerms.get(termCount).bytes(), text);
assertEquals(exp.field(), field);
assertEquals(exp.bytes(), text);
termCount++;
}
if (VERBOSE) {
@ -191,7 +118,222 @@ public class TestSurrogates extends LuceneTestCaseJ4 {
}
}
assertEquals(uniqueTermCount, termCount);
}
}
fields.close();
/**
 * Repeatedly picks a random term that is known to be in the index, seeks a
 * per-field TermsEnum to it (expecting FOUND), and then steps forward with
 * next() a random number of times, checking each returned term against the
 * following entries of {@code fieldTerms}.
 *
 * @param r          source of randomness for term selection and step counts
 * @param fieldTerms all indexed terms, in the order the enum is expected to
 *                   return them
 * @param reader     reader over the index under test
 */
private void doTestSeekExists(Random r, List<Term> fieldTerms, IndexReader reader) throws IOException {

  // One TermsEnum per field, created on first use and reused afterwards.
  final Map<String,TermsEnum> enumsByField = new HashMap<String,TermsEnum>();

  if (VERBOSE) {
    System.out.println("\nTEST: top now seek");
  }

  for (int iter = 0; iter < 100*_TestUtil.getRandomMultiplier(); iter++) {

    // Choose the seek target.
    final int spot = r.nextInt(fieldTerms.size());
    final Term target = fieldTerms.get(spot);
    final String field = target.field();
    if (VERBOSE) {
      System.out.println("TEST: exist seek field=" + field + " term=" + UnicodeUtil.toHexString(target.text()));
    }

    // Fetch (or lazily create) the enum for this field.
    TermsEnum te = enumsByField.get(field);
    if (te == null) {
      te = MultiFields.getTerms(reader, field).iterator();
      enumsByField.put(field, te);
    }
    if (VERBOSE) {
      System.out.println(" done get enum");
    }

    // The target was indexed, so the seek must land exactly on it.
    assertEquals(TermsEnum.SeekStatus.FOUND, te.seek(target.bytes()));

    // Walk forward a random number of steps, verifying every term.
    final int stepCount = _TestUtil.nextInt(r, 5, 100);
    for (int step = 0; step < stepCount; step++) {
      if (VERBOSE) {
        System.out.println("TEST: now next()");
      }

      final int expectedIndex = 1 + spot + step;
      if (expectedIndex >= fieldTerms.size()) {
        break;
      }

      final Term expected = fieldTerms.get(expectedIndex);
      if (expected.field() != field) {
        // Reference comparison on purpose (presumably field names are
        // interned): the next expected term belongs to another field, so
        // this field's enum must be exhausted.
        assertNull(te.next());
        break;
      }

      final BytesRef actual = te.next();
      if (VERBOSE) {
        System.out.println(" got term=" + (actual == null ? null : UnicodeUtil.toHexString(actual.utf8ToString())));
        System.out.println(" exp=" + UnicodeUtil.toHexString(expected.text().toString()));
      }
      assertEquals(expected.bytes(), actual);
    }
  }
}
/**
 * Repeatedly builds a random term in a random field and, when that term is
 * not in the index, seeks a per-field TermsEnum to it. The seek must report
 * END when no later term exists in the same field, otherwise NOT_FOUND with
 * the enum positioned on the next larger term; in the latter case the enum is
 * then stepped forward with next() and each result is verified.
 *
 * @param r               source of randomness
 * @param numField        number of fields ("f0".."f(numField-1)") in the index
 * @param fieldTerms      all indexed terms in expected enumeration order
 * @param fieldTermsArray the same terms as a sorted array, used for the
 *                        binary search
 * @param reader          reader over the index under test
 */
private void doTestSeekDoesNotExist(Random r, int numField, List<Term> fieldTerms, Term[] fieldTermsArray, IndexReader reader) throws IOException {

  // One TermsEnum per field, created on first use and reused afterwards.
  final Map<String,TermsEnum> enumsByField = new HashMap<String,TermsEnum>();

  if (VERBOSE) {
    System.out.println("TEST: top random seeks");
  }

  for (int iter = 0; iter < 100*_TestUtil.getRandomMultiplier(); iter++) {

    // Build a random probe term; intern the field name so the reference
    // comparisons below are meaningful.
    final String field = ("f" + r.nextInt(numField)).intern();
    final Term probe = new Term(field, getRandomString(r));

    final int found = Arrays.binarySearch(fieldTermsArray, probe);
    if (found >= 0) {
      // The probe happens to exist; this test only covers missing terms.
      continue;
    }

    if (VERBOSE) {
      System.out.println("TEST: non-exist seek to " + field + ":" + UnicodeUtil.toHexString(probe.text()));
    }

    // Fetch (or lazily create) the enum for this field.
    TermsEnum te = enumsByField.get(field);
    if (te == null) {
      te = MultiFields.getTerms(reader, field).iterator();
      enumsByField.put(field, te);
    }
    if (VERBOSE) {
      System.out.println(" got enum");
    }

    // binarySearch encodes a miss as -(insertionPoint) - 1.
    final int spot = -found - 1;

    final boolean pastEndOfField =
        spot == fieldTerms.size() || fieldTerms.get(spot).field() != field;
    if (pastEndOfField) {
      assertEquals(TermsEnum.SeekStatus.END, te.seek(probe.bytes()));
      continue;
    }

    // A larger term exists in this field: the enum must land on it.
    assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seek(probe.bytes()));
    if (VERBOSE) {
      System.out.println(" got term=" + UnicodeUtil.toHexString(te.term().utf8ToString()));
      System.out.println(" exp term=" + UnicodeUtil.toHexString(fieldTerms.get(spot).text()));
    }
    assertEquals(fieldTerms.get(spot).bytes(), te.term());

    // Walk forward a random number of steps, verifying every term.
    final int stepCount = _TestUtil.nextInt(r, 5, 100);
    for (int step = 0; step < stepCount; step++) {
      if (VERBOSE) {
        System.out.println("TEST: now next()");
      }

      final int expectedIndex = 1 + spot + step;
      if (expectedIndex >= fieldTerms.size()) {
        break;
      }

      final Term expected = fieldTerms.get(expectedIndex);
      if (expected.field() != field) {
        // The next expected term is in another field, so this field's
        // enum must be exhausted.
        assertNull(te.next());
        break;
      }

      final BytesRef actual = te.next();
      if (VERBOSE) {
        System.out.println(" got term=" + (actual == null ? null : UnicodeUtil.toHexString(actual.utf8ToString())));
        System.out.println(" exp=" + UnicodeUtil.toHexString(expected.text().toString()));
      }
      assertEquals(expected.bytes(), actual);
    }
  }
}
/**
 * Indexes many random terms (including ones heavy in surrogate pairs) into
 * several fields using the PreFlexRW codec, then verifies through the reader
 * that straight enumeration, seeks to existing terms and seeks to missing
 * terms all observe the terms in code point order.
 */
@Test
public void testSurrogatesOrder() throws Exception {
  Random r = newRandom();
  Directory dir = new MockRAMDirectory();
  RandomIndexWriter w = new RandomIndexWriter(r,
                                              dir,
                                              new IndexWriterConfig(TEST_VERSION_CURRENT,
                                                                    new MockAnalyzer()).setCodecProvider(_TestUtil.alwaysCodec(new PreFlexRWCodec())));

  final int numField = _TestUtil.nextInt(r, 2, 5);

  int uniqueTermCount = 0;

  // Running counter appended to every term text.
  int tc = 0;

  List<Term> fieldTerms = new ArrayList<Term>();

  for(int f=0;f<numField;f++) {
    String field = "f" + f;
    final int numTerms = 10000*_TestUtil.getRandomMultiplier();

    final Set<String> uniqueTerms = new HashSet<String>();

    for(int i=0;i<numTerms;i++) {
      String term = getRandomString(r) + "_ " + (tc++);
      uniqueTerms.add(term);
      fieldTerms.add(new Term(field, term));
      // One document per term, indexed verbatim (not analyzed).
      Document doc = new Document();
      doc.add(new Field(field, term, Field.Store.NO, Field.Index.NOT_ANALYZED));
      w.addDocument(doc);
    }
    uniqueTermCount += uniqueTerms.size();
  }

  IndexReader reader = w.getReader();

  if (VERBOSE) {
    Collections.sort(fieldTerms, termAsUTF16Comparator);

    System.out.println("\nTEST: UTF16 order");
    for(Term t: fieldTerms) {
      System.out.println(" " + toHexString(t));
    }
  }

  // sorts in code point order:
  Collections.sort(fieldTerms);

  if (VERBOSE) {
    System.out.println("\nTEST: codepoint order");
    for(Term t: fieldTerms) {
      System.out.println(" " + toHexString(t));
    }
  }

  Term[] fieldTermsArray = fieldTerms.toArray(new Term[fieldTerms.size()]);

  doTestStraightEnum(fieldTerms, reader, uniqueTermCount);
  doTestSeekExists(r, fieldTerms, reader);
  doTestSeekDoesNotExist(r, numField, fieldTerms, fieldTermsArray, reader);

  // Release everything the test opened: previously only the reader was
  // closed, leaving the writer and the directory open.
  reader.close();
  w.close();
  dir.close();
}
}

View File

@ -0,0 +1,212 @@
package org.apache.lucene.index.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.TermsConsumer;
import org.apache.lucene.index.codecs.PostingsConsumer;
import org.apache.lucene.index.codecs.standard.DefaultSkipListWriter;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.codecs.preflex.TermInfo;
import org.apache.lucene.store.IndexOutput;
import java.io.IOException;
import java.util.Comparator;
class PreFlexFieldsWriter extends FieldsConsumer {
private final TermInfosWriter termsOut;
private final IndexOutput freqOut;
private final IndexOutput proxOut;
private final DefaultSkipListWriter skipListWriter;
private final int totalNumDocs;
public PreFlexFieldsWriter(SegmentWriteState state) throws IOException {
termsOut = new TermInfosWriter(state.directory,
state.segmentName,
state.fieldInfos,
state.termIndexInterval);
state.flushedFiles.add(IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.TERMS_EXTENSION));
state.flushedFiles.add(IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.TERMS_INDEX_EXTENSION));
final String freqFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.FREQ_EXTENSION);
freqOut = state.directory.createOutput(freqFile);
state.flushedFiles.add(freqFile);
totalNumDocs = state.numDocs;
if (state.fieldInfos.hasProx()) {
final String proxFile = IndexFileNames.segmentFileName(state.segmentName, "", PreFlexCodec.PROX_EXTENSION);
proxOut = state.directory.createOutput(proxFile);
state.flushedFiles.add(proxFile);
} else {
proxOut = null;
}
skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval,
termsOut.maxSkipLevels,
totalNumDocs,
freqOut,
proxOut);
//System.out.println("\nw start seg=" + segment);
}
@Override
public TermsConsumer addField(FieldInfo field) throws IOException {
assert field.number != -1;
//System.out.println("w field=" + field.name + " storePayload=" + field.storePayloads + " number=" + field.number);
return new PreFlexTermsWriter(field);
}
@Override
public void close() throws IOException {
termsOut.close();
freqOut.close();
if (proxOut != null) {
proxOut.close();
}
}
private class PreFlexTermsWriter extends TermsConsumer {
private final FieldInfo fieldInfo;
private final boolean omitTF;
private final boolean storePayloads;
private final TermInfo termInfo = new TermInfo();
private final PostingsWriter postingsWriter = new PostingsWriter();
public PreFlexTermsWriter(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
omitTF = fieldInfo.omitTermFreqAndPositions;
storePayloads = fieldInfo.storePayloads;
}
private class PostingsWriter extends PostingsConsumer {
private int lastDocID;
private int lastPayloadLength = -1;
private int lastPosition;
private int df;
public PostingsWriter reset() {
df = 0;
lastDocID = 0;
lastPayloadLength = -1;
return this;
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
//System.out.println(" w doc=" + docID);
final int delta = docID - lastDocID;
if (docID < 0 || (df > 0 && delta <= 0)) {
throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )");
}
if ((++df % termsOut.skipInterval) == 0) {
skipListWriter.setSkipData(lastDocID, storePayloads, lastPayloadLength);
skipListWriter.bufferSkip(df);
}
lastDocID = docID;
assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs;
if (omitTF) {
freqOut.writeVInt(delta);
} else {
final int code = delta << 1;
if (termDocFreq == 1) {
freqOut.writeVInt(code|1);
} else {
freqOut.writeVInt(code);
freqOut.writeVInt(termDocFreq);
}
}
lastPosition = 0;
}
@Override
public void addPosition(int position, BytesRef payload) throws IOException {
assert proxOut != null;
//System.out.println(" w pos=" + position + " payl=" + payload);
final int delta = position - lastPosition;
lastPosition = position;
if (storePayloads) {
final int payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
//System.out.println(" write payload len=" + payloadLength);
lastPayloadLength = payloadLength;
proxOut.writeVInt((delta<<1)|1);
proxOut.writeVInt(payloadLength);
} else {
proxOut.writeVInt(delta << 1);
}
if (payloadLength > 0) {
proxOut.writeBytes(payload.bytes, payload.offset, payload.length);
}
} else {
proxOut.writeVInt(delta);
}
}
@Override
public void finishDoc() throws IOException {
}
}
@Override
public PostingsConsumer startTerm(BytesRef text) throws IOException {
//System.out.println(" w term=" + text.utf8ToString());
skipListWriter.resetSkip();
termInfo.freqPointer = freqOut.getFilePointer();
if (proxOut != null) {
termInfo.proxPointer = proxOut.getFilePointer();
}
return postingsWriter.reset();
}
@Override
public void finishTerm(BytesRef text, int numDocs) throws IOException {
if (numDocs > 0) {
long skipPointer = skipListWriter.writeSkip(freqOut);
termInfo.docFreq = numDocs;
termInfo.skipOffset = (int) (skipPointer - termInfo.freqPointer);
//System.out.println(" w finish term=" + text.utf8ToString() + " fnum=" + fieldInfo.number);
termsOut.add(fieldInfo.number,
text,
termInfo);
}
}
@Override
public void finish() throws IOException {
}
@Override
public Comparator<BytesRef> getComparator() throws IOException {
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
}

View File

@ -0,0 +1,52 @@
package org.apache.lucene.index.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.codecs.preflex.PreFlexCodec;
import org.apache.lucene.index.codecs.preflex.PreFlexFields;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
/**
 * Test-only codec able to both write and read the pre-flex index
 * format.
 *
 * @lucene.experimental
 */
public class PreFlexRWCodec extends PreFlexCodec {

  public PreFlexRWCodec() {
    // NOTE: the name is left identical to the core PreFlex codec's so
    // that the core codec can read the segments written here!
    super();
  }

  /** Hands back a {@link PreFlexFieldsWriter} created for {@code state}. */
  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    final FieldsConsumer writer = new PreFlexFieldsWriter(state);
    return writer;
  }

  /** Hands back a {@link PreFlexFields} reader built from {@code state}. */
  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    //System.out.println("preflexrw");
    final FieldsProducer reader = new PreFlexFields(state.dir,
                                                    state.fieldInfos,
                                                    state.segmentInfo,
                                                    state.readBufferSize,
                                                    state.termsIndexDivisor,
                                                    false);
    return reader;
  }
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.index.codecs.preflex;
package org.apache.lucene.index.codecs.preflexrw;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -19,9 +19,12 @@ package org.apache.lucene.index.codecs.preflex;
import java.io.IOException;
import org.apache.lucene.store.*;
import org.apache.lucene.index.*;
import org.apache.lucene.util.*;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.codecs.preflex.TermInfo;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
@ -71,8 +74,7 @@ final class TermInfosWriter {
private long lastIndexPointer;
private boolean isIndex;
private byte[] lastTermBytes = new byte[10];
private int lastTermBytesLength = 0;
private final BytesRef lastTerm = new BytesRef();
private int lastFieldNumber = -1;
private TermInfosWriter other;
@ -104,13 +106,10 @@ final class TermInfosWriter {
assert initUTF16Results();
}
void add(Term term, TermInfo ti) throws IOException {
add(fieldInfos.fieldNumber(term.field()), term.bytes().bytes, term.bytes().length, ti);
}
// Currently used only by assert statements
UnicodeUtil.UTF16Result utf16Result1;
UnicodeUtil.UTF16Result utf16Result2;
private final BytesRef scratchBytes = new BytesRef();
// Currently used only by assert statements
private boolean initUTF16Results() {
@ -120,7 +119,7 @@ final class TermInfosWriter {
}
// Currently used only by assert statement
private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
private int compareToLastTerm(int fieldNumber, BytesRef term) {
if (lastFieldNumber != fieldNumber) {
final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
@ -132,8 +131,13 @@ final class TermInfosWriter {
return cmp;
}
UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
scratchBytes.copy(term);
assert lastTerm.offset == 0;
UnicodeUtil.UTF8toUTF16(lastTerm.bytes, 0, lastTerm.length, utf16Result1);
assert scratchBytes.offset == 0;
UnicodeUtil.UTF8toUTF16(scratchBytes.bytes, 0, scratchBytes.length, utf16Result2);
final int len;
if (utf16Result1.length < utf16Result2.length)
len = utf16Result1.length;
@ -152,22 +156,22 @@ final class TermInfosWriter {
/** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
Term must be lexicographically greater than all previous Terms added.
TermInfo pointers must be positive and greater than all previous.*/
void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
public void add(int fieldNumber, BytesRef term, TermInfo ti)
throws IOException {
assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
(isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
assert compareToLastTerm(fieldNumber, term) < 0 ||
(isIndex && term.length == 0 && lastTerm.length == 0) :
"Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
" lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
" text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
" text=" + term.utf8ToString() + " lastText=" + lastTerm.utf8ToString();
assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
if (!isIndex && size % indexInterval == 0)
other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
other.add(lastFieldNumber, lastTerm, lastTi); // add an index term
writeTerm(fieldNumber, termBytes, termBytesLength); // write term
writeTerm(fieldNumber, term); // write term
output.writeVInt(ti.docFreq); // write doc freq
output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
@ -187,29 +191,27 @@ final class TermInfosWriter {
size++;
}
private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
private void writeTerm(int fieldNumber, BytesRef term)
throws IOException {
//System.out.println(" tiw.write field=" + fieldNumber + " term=" + term.utf8ToString());
// TODO: UTF16toUTF8 could tell us this prefix
// Compute prefix in common with last term:
int start = 0;
final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
final int limit = term.length < lastTerm.length ? term.length : lastTerm.length;
while(start < limit) {
if (termBytes[start] != lastTermBytes[start])
if (term.bytes[start+term.offset] != lastTerm.bytes[start+lastTerm.offset])
break;
start++;
}
final int length = termBytesLength - start;
final int length = term.length - start;
output.writeVInt(start); // write shared prefix length
output.writeVInt(length); // write delta length
output.writeBytes(termBytes, start, length); // write delta bytes
output.writeBytes(term.bytes, start+term.offset, length); // write delta bytes
output.writeVInt(fieldNumber); // write field num
if (lastTermBytes.length < termBytesLength) {
lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
}
System.arraycopy(termBytes, start, lastTermBytes, start, length);
lastTermBytesLength = termBytesLength;
lastTerm.copy(term);
}
/** Called to complete TermInfos creation. */

View File

@ -34,6 +34,8 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.CacheEntry;
import org.apache.lucene.util.FieldCacheSanityChecker.Insanity;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
/**
* Base class for all Lucene unit tests.
@ -72,6 +74,8 @@ public abstract class LuceneTestCase extends TestCase {
private volatile Thread.UncaughtExceptionHandler savedUncaughtExceptionHandler = null;
private String savedDefaultCodec;
/** Used to track if setUp and tearDown are called correctly from subclasses */
private boolean setup;
@ -110,6 +114,8 @@ public abstract class LuceneTestCase extends TestCase {
ConcurrentMergeScheduler.setTestMode();
savedBoolMaxClauseCount = BooleanQuery.getMaxClauseCount();
savedDefaultCodec = CodecProvider.getDefaultCodec();
CodecProvider.setDefaultCodec(_TestUtil.getTestCodec());
}
/**
@ -135,6 +141,7 @@ public abstract class LuceneTestCase extends TestCase {
assertTrue("ensure your setUp() calls super.setUp()!!!", setup);
setup = false;
BooleanQuery.setMaxClauseCount(savedBoolMaxClauseCount);
CodecProvider.setDefaultCodec(savedDefaultCodec);
try {
Thread.setDefaultUncaughtExceptionHandler(savedUncaughtExceptionHandler);
@ -298,4 +305,10 @@ public abstract class LuceneTestCase extends TestCase {
// static members
private static final Random seedRnd = new Random();
// register preflex-rw statically.
static {
CodecProvider.getDefault().register(new PreFlexRWCodec());
}
}

View File

@ -22,9 +22,14 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.CacheEntry;
import org.apache.lucene.util.FieldCacheSanityChecker.Insanity;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.preflexrw.PreFlexRWCodec;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestWatchman;
@ -127,6 +132,20 @@ public class LuceneTestCaseJ4 {
private static final Map<Class<? extends LuceneTestCaseJ4>,Object> checkedClasses =
Collections.synchronizedMap(new WeakHashMap<Class<? extends LuceneTestCaseJ4>,Object>());
// saves default codec: we do this statically as many build indexes in @beforeClass
private static String savedDefaultCodec;
/**
 * Remembers the current default codec, then switches the default to
 * the codec named by {@code _TestUtil.getTestCodec()}.
 */
@BeforeClass
public static void beforeClassLuceneTestCaseJ4() {
  final String previousDefault = CodecProvider.getDefaultCodec();
  savedDefaultCodec = previousDefault;

  final String codecForTests = _TestUtil.getTestCodec();
  CodecProvider.setDefaultCodec(codecForTests);
}
/** Puts back the default codec held in {@code savedDefaultCodec}. */
@AfterClass
public static void afterClassLuceneTestCaseJ4() {
  final String codecToRestore = savedDefaultCodec;
  CodecProvider.setDefaultCodec(codecToRestore);
}
// This is how we get control when errors occur.
// Think of this as start/end/success/failed
// events.
@ -405,4 +424,8 @@ public class LuceneTestCaseJ4 {
private String name = "<unknown>";
// register PreFlexRWCodec statically
static {
CodecProvider.getDefault().register(new PreFlexRWCodec());
}
}

View File

@ -23,6 +23,9 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.Directory;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
@ -129,8 +132,24 @@ public class _TestUtil {
}
final char[] buffer = new char[end];
for (int i = 0; i < end; i++) {
int t = r.nextInt(5);
//buffer[i] = (char) (97 + r.nextInt(26));
/*
if (0 == t && i < end - 1) {
// hi
buffer[i++] = (char) 0xd800;
// lo
buffer[i] = (char) 0xdc00;
} else if (t <= 3) {
buffer[i] = 'a';
} else if (4 == t) {
buffer[i] = 0xe000;
}
*/
if (0 == t && i < end - 1) {
// Make a surrogate pair
// High surrogate
@ -218,4 +237,27 @@ public class _TestUtil {
/**
 * Reads the {@code random.multiplier} system property as an int,
 * using 1 when the property is not set.
 */
public static int getRandomMultiplier() {
  final String configured = System.getProperty("random.multiplier", "1");
  return Integer.parseInt(configured);
}
/**
 * Returns the value of the {@code tests.codec} system property, or
 * "Standard" when that property is not set.
 */
public static String getTestCodec() {
  final String requested = System.getProperty("tests.codec");
  if (requested == null) {
    return "Standard";
  }
  return requested;
}
/**
 * Builds a CodecProvider whose {@code getWriter} and {@code lookup}
 * both return {@code c}, whatever argument they are given.
 */
public static CodecProvider alwaysCodec(final Codec c) {
  final CodecProvider fixedProvider = new CodecProvider() {
    @Override
    public Codec lookup(String name) {
      return c;
    }

    @Override
    public Codec getWriter(SegmentWriteState state) {
      return c;
    }
  };
  return fixedProvider;
}
/**
 * Looks {@code codec} up by name in the default CodecProvider and
 * wraps the result with {@link #alwaysCodec(Codec)}.
 */
public static CodecProvider alwaysCodec(final String codec) {
  final Codec resolved = CodecProvider.getDefault().lookup(codec);
  return alwaysCodec(resolved);
}
}