mirror of https://github.com/apache/lucene.git
cleanup codes, downgrade nocommits
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1519242 13f79535-47bb-0310-9956-ffa450edef68
parent 1b4c3f688c
commit e1f85e73ed
TempFSTOrdTermsReader.java
@@ -408,7 +408,7 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
         return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
       }
 
-      // nocommit: this can be achieved by making use of Util.getByOutput()
+      // TODO: this can be achieved by making use of Util.getByOutput()
       // and should have related tests
       @Override
       public void seekExact(long ord) throws IOException {
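Note: Util.getByOutput(), named in the downgraded TODO above, does a reverse lookup on an FST whose outputs are term ordinals. Below is a minimal sketch of that idea, assuming the Lucene 4.x FST API (Builder, PositiveIntOutputs, Util); the terms, ords, and class name are made up for illustration.

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

public class OrdSeekSketch {
  public static void main(String[] args) throws IOException {
    // Build a tiny term index: sorted terms mapped to their ordinals.
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
    String[] terms = {"apple", "banana", "cherry"}; // must be added in sorted order
    IntsRef scratch = new IntsRef();
    for (int ord = 0; ord < terms.length; ord++) {
      builder.add(Util.toIntsRef(new BytesRef(terms[ord]), scratch), (long) ord);
    }
    FST<Long> fst = builder.finish();

    // seekExact(ord) could then walk the FST by accumulated output:
    // getByOutput returns the input (the term) whose output sums to 2.
    IntsRef input = Util.getByOutput(fst, 2);
    System.out.println(Util.toBytesRef(input, new BytesRef()).utf8ToString()); // cherry
  }
}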
@@ -541,14 +541,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
         this.fstReader = fst.getBytesReader();
         this.fstOutputs = index.outputs;
         this.fsa = compiled.runAutomaton;
-        /*
-        PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
-        Util.toDot(dict,pw1, false, false);
-        pw1.close();
-        PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
-        pw2.write(compiled.toDot());
-        pw2.close();
-        */
         this.level = -1;
         this.stack = new Frame[16];
         for (int i = 0 ; i < stack.length; i++) {
@@ -679,8 +671,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
         return frame;
       }
 
-      // nocommit: expected to use readFirstTargetArc here?
-
       /** Load frame for target arc(node) on fst */
       Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
         if (!canGrow(top)) {
@@ -743,7 +733,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
         return !frame.arc.isLast();
       }
 
-      // nocommit: need to load ord lazily?
       void pushFrame(Frame frame) {
         final FST.Arc<Long> arc = frame.arc;
         arc.output = fstOutputs.add(topFrame().arc.output, arc.output);
TempFSTOrdTermsWriter.java
@@ -55,13 +55,12 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
   public static final int TERMS_VERSION_START = 0;
   public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START;
   public static final int SKIP_INTERVAL = 8;
-  //static final boolean TEST = false;
 
   final PostingsWriterBase postingsWriter;
   final FieldInfos fieldInfos;
   final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
   IndexOutput blockOut = null;
-  IndexOutput indexOut = null; // nocommit: hmm, do we really need two streams?
+  IndexOutput indexOut = null;
 
   public TempFSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
     final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
@@ -134,8 +133,6 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
       out.writeLong(dirStart);
     }
 
-    // nocommit: nuke this? we don't need to buffer so much data,
-    // since close() can do this naturally
     private static class FieldMetaData {
       public FieldInfo fieldInfo;
       public long numTerms;
@@ -145,12 +142,16 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
     public int longsSize;
     public FST<Long> dict;
 
-    // nocommit: block encode each part
-    // (so that we'll have metaLongsOut[])
-    public RAMOutputStream skipOut; // vint encode next skip point (all values start from 0, fully decoded when reading)
-    public RAMOutputStream statsOut; // vint encode df, (ttf-df)
-    public RAMOutputStream metaLongsOut; // vint encode monotonic long[] and length for corresponding byte[]
-    public RAMOutputStream metaBytesOut; // put all bytes blob here
+    // TODO: block encode each part
+
+    // vint encode next skip point (fully decoded when reading)
+    public RAMOutputStream skipOut;
+    // vint encode df, (ttf-df)
+    public RAMOutputStream statsOut;
+    // vint encode monotonic long[] and length for corresponding byte[]
+    public RAMOutputStream metaLongsOut;
+    // generic byte[]
+    public RAMOutputStream metaBytesOut;
   }
 
   final class TermsWriter extends TermsConsumer {
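Note: the reworked comments above describe four per-field metadata streams, each vint-encoded. A minimal stand-alone sketch of the stats stream layout (df followed by ttf-df), assuming nothing beyond plain Java; the values and helper name are hypothetical. Writing the difference rather than raw ttf keeps the numbers small, since ttf >= df always.

import java.io.ByteArrayOutputStream;

public class StatsVIntSketch {
  // Standard vint: low 7 bits per byte, high bit set when more bytes follow.
  static void writeVInt(ByteArrayOutputStream out, long v) {
    while ((v & ~0x7FL) != 0) {
      out.write((int) ((v & 0x7FL) | 0x80L));
      v >>>= 7;
    }
    out.write((int) v);
  }

  public static void main(String[] args) {
    ByteArrayOutputStream statsOut = new ByteArrayOutputStream();
    long df = 42, ttf = 130;        // hypothetical term stats
    writeVInt(statsOut, df);        // 42: one byte
    writeVInt(statsOut, ttf - df);  // 88: one byte, where raw ttf=130 would need two
    System.out.println("encoded " + statsOut.size() + " bytes"); // encoded 2 bytes
  }
}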
TempFSTTermsReader.java
@@ -63,7 +63,7 @@ public class TempFSTTermsReader extends FieldsProducer {
   final TreeMap<String, TermsReader> fields = new TreeMap<String, TermsReader>();
   final PostingsReaderBase postingsReader;
   final IndexInput in;
-  //static boolean DEBUG = false;
+  //static boolean TEST = false;
 
   public TempFSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException {
     final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTTermsWriter.TERMS_EXTENSION);
@@ -283,12 +283,6 @@ public class TempFSTTermsReader extends FieldsProducer {
         return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
       }
 
-      // nocommit: do we need this? for SegmentTermsEnum, we can maintain
-      // a stack to record how current term is constructed on FST, (and ord on each alphabet)
-      // so that during seek we don't have to start from the first arc.
-      // however, we'll be implementing a new fstEnum instead of wrapping current one.
-      //
-      // nocommit: this can also be achieved by making use of Util.getByOutput()
       @Override
       public void seekExact(long ord) throws IOException {
         throw new UnsupportedOperationException();
@@ -428,19 +422,11 @@ public class TempFSTTermsReader extends FieldsProducer {
 
       IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
         super();
-        //if (DEBUG) System.out.println("Enum init, startTerm=" + startTerm);
+        //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
         this.fst = dict;
         this.fstReader = fst.getBytesReader();
         this.fstOutputs = dict.outputs;
         this.fsa = compiled.runAutomaton;
-        /*
-        PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
-        Util.toDot(dict,pw1, false, false);
-        pw1.close();
-        PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
-        pw2.write(compiled.toDot());
-        pw2.close();
-        */
         this.level = -1;
         this.stack = new Frame[16];
         for (int i = 0 ; i < stack.length; i++) {
@@ -511,7 +497,7 @@ public class TempFSTTermsReader extends FieldsProducer {
 
       @Override
       public BytesRef next() throws IOException {
-        //if (DEBUG) System.out.println("Enum next()");
+        //if (TEST) System.out.println("Enum next()");
         if (pending) {
           pending = false;
           loadMetaData();
@@ -546,7 +532,7 @@ public class TempFSTTermsReader extends FieldsProducer {
       }
 
       private BytesRef doSeekCeil(BytesRef target) throws IOException {
-        //if (DEBUG) System.out.println("Enum doSeekCeil()");
+        //if (TEST) System.out.println("Enum doSeekCeil()");
         Frame frame= null;
         int label, upto = 0, limit = target.length;
         while (upto < limit) { // to target prefix, or ceil label (rewind prefix)
@@ -580,12 +566,6 @@ public class TempFSTTermsReader extends FieldsProducer {
         return null;
       }
 
-      // nocommit: might be great if we can set flag BIT_LAST_ARC
-      // nocommit: actually we can use first arc as candidate...
-      // it always has NO_OUTPUT as output, and BIT_LAST_ARC set.
-      // but we'll have problem if later FST supports output sharing
-      // on first arc!
-
       /** Virtual frame, never pop */
       Frame loadVirtualFrame(Frame frame) throws IOException {
        frame.fstArc.output = fstOutputs.getNoOutput();
@@ -601,8 +581,6 @@ public class TempFSTTermsReader extends FieldsProducer {
         return frame;
       }
 
-      // nocommit: expected to use readFirstTargetArc here?
-
       /** Load frame for target arc(node) on fst */
       Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
         if (!canGrow(top)) {
@@ -610,18 +588,13 @@ public class TempFSTTermsReader extends FieldsProducer {
         }
         frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
         frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
-        //if (DEBUG) System.out.println(" loadExpand frame="+frame);
+        //if (TEST) System.out.println(" loadExpand frame="+frame);
         if (frame.fsaState == -1) {
           return loadNextFrame(top, frame);
         }
         return frame;
       }
 
-      // nocommit: actually, here we're looking for a valid state for fsa,
-      // so if numArcs is large in fst, we should try a reverse lookup?
-      // but we don have methods like advance(label) in fst, even
-      // binary search hurts.
-
       /** Load frame for sibling arc(node) on fst */
       Frame loadNextFrame(Frame top, Frame frame) throws IOException {
         if (!canRewind(frame)) {
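Note: the removed nocommit above concerns loadNextFrame(), which scans sibling arcs for a label the intersecting automaton accepts; without an advance(label) primitive on the FST side, only a linear scan is available. A toy stand-alone model of that loop follows (hypothetical names and types, not the real Frame/FST classes).

public class SiblingScanSketch {
  interface Dfa { int step(int state, int label); } // -1 means rejected

  // Walk the node's sibling arc labels in sorted order and stop at the
  // first label that keeps the automaton in a live state.
  static int firstAccepted(int[] sortedArcLabels, Dfa fsa, int state) {
    for (int label : sortedArcLabels) {
      if (fsa.step(state, label) != -1) {
        return label;
      }
    }
    return -1; // no sibling arc leads to a live automaton state
  }

  public static void main(String[] args) {
    Dfa vowelOnly = (s, l) -> "aeiou".indexOf(l) >= 0 ? s : -1;
    System.out.println((char) firstAccepted(new int[]{'b', 'c', 'e', 'f'}, vowelOnly, 0)); // e
  }
}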
@@ -634,7 +607,7 @@ public class TempFSTTermsReader extends FieldsProducer {
             break;
           }
         }
-        //if (DEBUG) System.out.println(" loadNext frame="+frame);
+        //if (TEST) System.out.println(" loadNext frame="+frame);
         if (frame.fsaState == -1) {
           return null;
         }
@@ -650,7 +623,7 @@ public class TempFSTTermsReader extends FieldsProducer {
           return null;
         }
         frame.fsaState = fsa.step(top.fsaState, arc.label);
-        //if (DEBUG) System.out.println(" loadCeil frame="+frame);
+        //if (TEST) System.out.println(" loadCeil frame="+frame);
         if (frame.fsaState == -1) {
           return loadNextFrame(top, frame);
         }
@@ -673,14 +646,14 @@ public class TempFSTTermsReader extends FieldsProducer {
       void pushFrame(Frame frame) {
         term = grow(frame.fstArc.label);
         level++;
-        //if (DEBUG) System.out.println(" term=" + term + " level=" + level);
+        //if (TEST) System.out.println(" term=" + term + " level=" + level);
       }
 
       Frame popFrame() {
         term = shrink();
         level--;
         metaUpto = metaUpto > level ? level : metaUpto;
-        //if (DEBUG) System.out.println(" term=" + term + " level=" + level);
+        //if (TEST) System.out.println(" term=" + term + " level=" + level);
         return stack[level+1];
       }
 
TempTermOutputs.java
@@ -22,8 +22,6 @@ import java.util.Arrays;
 
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.fst.Outputs;
@@ -128,16 +126,18 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
   @Override
   //
   // The return value will be the smaller one, when these two are
-  // 'comparable', i.e. every value in long[] fits the same ordering.
+  // 'comparable', i.e.
+  // 1. every value in t1 is not larger than in t2, or
+  // 2. every value in t1 is not smaller than t2.
   //
   // NOTE:
   // Only long[] part is 'shared' and pushed towards root.
-  // byte[] and term stats will be on deeper arcs.
+  // byte[] and term stats will be kept on deeper arcs.
   //
   public TempMetaData common(TempMetaData t1, TempMetaData t2) {
-    if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
+    //if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
     if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) {
-      if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
+      //if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
       return NO_OUTPUT;
     }
     assert t1.longs.length == t2.longs.length;
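Note: a stand-alone model of the output algebra that common()/subtract()/add() implement over the long[] part may help here (hypothetical code, ignoring the byte[] and term stats that, per the comment above, stay on deeper arcs). In the 'comparable' case common() is the pointwise minimum pushed toward the root, subtract() keeps the remainder on the arc, and add() must reconstruct the original on the read path.

import java.util.Arrays;

public class LongsOutputAlgebra {
  static long[] common(long[] a, long[] b) {
    long[] min = new long[a.length];
    for (int i = 0; i < a.length; i++) {
      min[i] = Math.min(a[i], b[i]); // shared part, pushed toward the root
    }
    return min;
  }

  static long[] subtract(long[] a, long[] b) { // assumes a[i] >= b[i]
    long[] diff = new long[a.length];
    for (int i = 0; i < a.length; i++) {
      diff[i] = a[i] - b[i]; // remainder, kept on the deeper arc
    }
    return diff;
  }

  static long[] add(long[] a, long[] b) {
    long[] sum = new long[a.length];
    for (int i = 0; i < a.length; i++) {
      sum[i] = a[i] + b[i]; // read path: prefix output + arc output
    }
    return sum;
  }

  public static void main(String[] args) {
    long[] t1 = {10, 7}, t2 = {12, 9};  // 'comparable': t1 <= t2 pointwise
    long[] shared = common(t1, t2);     // {10, 7}
    long[] rest = subtract(t2, shared); // {2, 2}
    System.out.println(Arrays.toString(add(shared, rest))); // [12, 9] == t2
  }
}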
@@ -172,15 +172,15 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
         ret = new TempMetaData(min, null, 0, -1);
       }
     }
-    if (DEBUG) System.out.println("ret:"+ret);
+    //if (DEBUG) System.out.println("ret:"+ret);
     return ret;
   }
 
   @Override
   public TempMetaData subtract(TempMetaData t1, TempMetaData t2) {
-    if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
+    //if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
     if (t2 == NO_OUTPUT) {
-      if (DEBUG) System.out.println("ret:"+t1);
+      //if (DEBUG) System.out.println("ret:"+t1);
       return t1;
     }
     assert t1.longs.length == t2.longs.length;
@@ -201,20 +201,21 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
     } else {
       ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
     }
-    if (DEBUG) System.out.println("ret:"+ret);
+    //if (DEBUG) System.out.println("ret:"+ret);
     return ret;
   }
 
-  // nocommit: we might refactor out an 'addSelf' later,
-  // which improves 5~7% for fuzzy queries
+  // TODO: if we refactor a 'addSelf(TempMetaDat other)',
+  // we can gain about 5~7% for fuzzy queries, however on the other hand
+  // we seem to put much stress on FST Outputs decoding?
   @Override
   public TempMetaData add(TempMetaData t1, TempMetaData t2) {
-    if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
+    //if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
     if (t1 == NO_OUTPUT) {
-      if (DEBUG) System.out.println("ret:"+t2);
+      //if (DEBUG) System.out.println("ret:"+t2);
       return t2;
     } else if (t2 == NO_OUTPUT) {
-      if (DEBUG) System.out.println("ret:"+t1);
+      //if (DEBUG) System.out.println("ret:"+t1);
       return t1;
     }
     assert t1.longs.length == t2.longs.length;
@@ -233,7 +234,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
     } else {
       ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
    }
-    if (DEBUG) System.out.println("ret:"+ret);
+    //if (DEBUG) System.out.println("ret:"+ret);
     return ret;
   }
 
PostingsWriterBase.java
@@ -55,7 +55,7 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Closeable {
   public abstract BlockTermState newTermState() throws IOException;
 
   /** Start a new term. Note that a matching call to {@link
-   * #finishTerm(long[], DataOutput, TermStats)} is done, only if the term has at least one
+   * #finishTerm(BlockTermState)} is done, only if the term has at least one
    * document. */
   public abstract void startTerm() throws IOException;
 
@@ -67,12 +67,18 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Closeable {
   /**
    * Encode metadata as long[] and byte[]. {@code absolute} controls
    * whether current term is delta encoded according to latest term.
+   *
+   * NOTE: sometimes long[] might contain values that doesn't make sense,
+   * e.g. for Lucene41PostingsFormat, when singletonDocID != -1, docStartFP is not defined.
+   * Here postings side should always use the last docStartFP, to keep each element in
+   * metadata long[] monotonic.
    */
   public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
 
   /**
-   * Return the fixed length of longs,
+   * Return the fixed length of long[] metadata (which is fixed per field),
    * called when the writing switches to another field. */
+  // TODO: better name?
   public abstract int setField(FieldInfo fieldInfo);
 
   @Override
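Note: a minimal stand-alone sketch of the absolute/delta contract documented in the javadoc above (a hypothetical class, not Lucene's DataOutput-based API): when absolute is false, each metadata slot is written as a delta against the previous term's value, which stays non-negative only because the writer keeps every long[] slot monotonic — hence the advice to reuse the last docStartFP for "don't care" values.

import java.io.ByteArrayOutputStream;

public class EncodeTermSketch {
  private long[] lastLongs; // per-field base values from the previous term

  void encodeTerm(long[] longs, ByteArrayOutputStream out, boolean absolute) {
    if (absolute || lastLongs == null) {
      lastLongs = new long[longs.length]; // reset base: deltas start from 0
    }
    for (int i = 0; i < longs.length; i++) {
      writeVLong(out, longs[i] - lastLongs[i]); // monotonicity keeps this >= 0
      lastLongs[i] = longs[i];
    }
  }

  static void writeVLong(ByteArrayOutputStream out, long v) {
    while ((v & ~0x7FL) != 0) {
      out.write((int) ((v & 0x7FL) | 0x80L));
      v >>>= 7;
    }
    out.write((int) v);
  }
}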
Lucene41PostingsWriter.java
@@ -215,8 +215,6 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
     termsOut.writeVInt(BLOCK_SIZE);
   }
 
-  // nocommit better name?
-
   @Override
   public int setField(FieldInfo fieldInfo) {
     IndexOptions indexOptions = fieldInfo.getIndexOptions();
@@ -540,16 +538,12 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
     docCount = 0;
   }
 
-  // nocommit explain about the "don't care" values
-
   @Override
   public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
     IntBlockTermState state = (IntBlockTermState)_state;
     if (absolute) {
       lastState = newTermState();
     }
-    //System.out.println("PW: state=" + state);
-    //System.out.println(" last=" + lastState);
     if (VERSION_CURRENT < VERSION_META_ARRAY) { // impersonation
       _encodeTerm(out, fieldInfo, state);
       return;