clean up code, downgrade nocommits

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1519242 13f79535-47bb-0310-9956-ffa450edef68
Han Jiang 2013-09-01 04:32:01 +00:00
parent 1b4c3f688c
commit e1f85e73ed
6 changed files with 46 additions and 82 deletions

TempFSTOrdTermsReader.java

@@ -408,7 +408,7 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
}
// nocommit: this can be achieved by making use of Util.getByOutput()
// TODO: this can be achieved by making use of Util.getByOutput()
// and should have related tests
@Override
public void seekExact(long ord) throws IOException {
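An editorial sketch of the TODO above: Util.getByOutput() walks the FST arcs whose outputs sum to a target value and returns the matching input, which is exactly a term lookup by ord. A minimal, hypothetical implementation (assuming `index` is the FST<Long> mapping terms to ords; the names are not from this patch) might look like:

// Sketch only, not part of this patch.
@Override
public void seekExact(long ord) throws IOException {
  IntsRef input = Util.getByOutput(index, ord);  // follow arcs whose outputs sum to `ord`
  if (input == null) {
    throw new IOException("ord " + ord + " is out of range");
  }
  term = Util.toBytesRef(input, new BytesRef()); // turn int labels back into term bytes
  // ...then load this term's metadata, as seekExact(BytesRef) does
}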
@@ -541,14 +541,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
this.fstReader = fst.getBytesReader();
this.fstOutputs = index.outputs;
this.fsa = compiled.runAutomaton;
/*
PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
Util.toDot(dict,pw1, false, false);
pw1.close();
PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
pw2.write(compiled.toDot());
pw2.close();
*/
this.level = -1;
this.stack = new Frame[16];
for (int i = 0; i < stack.length; i++) {
@@ -679,8 +671,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
return frame;
}
// nocommit: expected to use readFirstTargetArc here?
/** Load frame for target arc(node) on fst */
Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
if (!canGrow(top)) {
@@ -743,7 +733,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
return !frame.arc.isLast();
}
// nocommit: need to load ord lazily?
void pushFrame(Frame frame) {
final FST.Arc<Long> arc = frame.arc;
arc.output = fstOutputs.add(topFrame().arc.output, arc.output);

TempFSTOrdTermsWriter.java

@@ -55,13 +55,12 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
public static final int TERMS_VERSION_START = 0;
public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START;
public static final int SKIP_INTERVAL = 8;
//static final boolean TEST = false;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
IndexOutput blockOut = null;
IndexOutput indexOut = null; // nocommit: hmm, do we really need two streams?
IndexOutput indexOut = null;
public TempFSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
@@ -134,8 +133,6 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
out.writeLong(dirStart);
}
// nocommit: nuke this? we don't need to buffer so much data,
// since close() can do this naturally
private static class FieldMetaData {
public FieldInfo fieldInfo;
public long numTerms;
@@ -145,12 +142,16 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
public int longsSize;
public FST<Long> dict;
// nocommit: block encode each part
// (so that we'll have metaLongsOut[])
public RAMOutputStream skipOut; // vint encode next skip point (all values start from 0, fully decoded when reading)
public RAMOutputStream statsOut; // vint encode df, (ttf-df)
public RAMOutputStream metaLongsOut; // vint encode monotonic long[] and length for corresponding byte[]
public RAMOutputStream metaBytesOut; // put all bytes blob here
// TODO: block encode each part
// vint encode next skip point (fully decoded when reading)
public RAMOutputStream skipOut;
// vint encode df, (ttf-df)
public RAMOutputStream statsOut;
// vint encode monotonic long[] and length for corresponding byte[]
public RAMOutputStream metaLongsOut;
// generic byte[]
public RAMOutputStream metaBytesOut;
}
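To make the four buffers above concrete, here is a hedged sketch of how one term's entry could be appended (the helper and its parameters are illustrative, not part of this patch; it assumes the metadata longs are already delta-encoded and monotonic):

// Sketch only, not part of this patch.
void bufferTerm(FieldMetaData meta, int docFreq, long totalTermFreq,
                long[] deltaLongs, BytesRef metaBytes) throws IOException {
  meta.statsOut.writeVInt(docFreq);                  // df
  meta.statsOut.writeVLong(totalTermFreq - docFreq); // (ttf - df), never negative
  for (long delta : deltaLongs) {
    meta.metaLongsOut.writeVLong(delta);             // monotonic values, so deltas stay small
  }
  meta.metaLongsOut.writeVLong(metaBytes.length);    // length of this term's blob in metaBytesOut
  meta.metaBytesOut.writeBytes(metaBytes.bytes, metaBytes.offset, metaBytes.length);
}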
final class TermsWriter extends TermsConsumer {

TempFSTTermsReader.java

@@ -63,7 +63,7 @@ public class TempFSTTermsReader extends FieldsProducer {
final TreeMap<String, TermsReader> fields = new TreeMap<String, TermsReader>();
final PostingsReaderBase postingsReader;
final IndexInput in;
//static boolean DEBUG = false;
//static boolean TEST = false;
public TempFSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException {
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTTermsWriter.TERMS_EXTENSION);
@@ -283,12 +283,6 @@ public class TempFSTTermsReader extends FieldsProducer {
return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
}
// nocommit: do we need this? for SegmentTermsEnum, we could maintain
// a stack recording how the current term is constructed on the FST
// (and the ord accumulated at each label), so that during a seek we
// don't have to start from the first arc. however, that would mean
// implementing a new fstEnum instead of wrapping the current one.
//
// nocommit: this can also be achieved by making use of Util.getByOutput()
@Override
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException();
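A rough sketch of the stack idea in the comment above (purely hypothetical, nothing like it exists in this patch): each frame would remember the arc taken at one depth together with the output accumulated so far, so a later seek could rewind to the longest shared prefix instead of restarting from the root:

// Hypothetical helper, not part of this patch.
static class SeekFrame {
  FST.Arc<Long> arc;  // arc taken at this depth of the current term
  long ordPrefix;     // outputs summed from the root through this arc
}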
@@ -428,19 +422,11 @@ public class TempFSTTermsReader extends FieldsProducer {
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
super();
//if (DEBUG) System.out.println("Enum init, startTerm=" + startTerm);
//if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
this.fst = dict;
this.fstReader = fst.getBytesReader();
this.fstOutputs = dict.outputs;
this.fsa = compiled.runAutomaton;
/*
PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
Util.toDot(dict,pw1, false, false);
pw1.close();
PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
pw2.write(compiled.toDot());
pw2.close();
*/
this.level = -1;
this.stack = new Frame[16];
for (int i = 0; i < stack.length; i++) {
@@ -511,7 +497,7 @@ public class TempFSTTermsReader extends FieldsProducer {
@Override
public BytesRef next() throws IOException {
//if (DEBUG) System.out.println("Enum next()");
//if (TEST) System.out.println("Enum next()");
if (pending) {
pending = false;
loadMetaData();
@@ -546,7 +532,7 @@ public class TempFSTTermsReader extends FieldsProducer {
}
private BytesRef doSeekCeil(BytesRef target) throws IOException {
//if (DEBUG) System.out.println("Enum doSeekCeil()");
//if (TEST) System.out.println("Enum doSeekCeil()");
Frame frame = null;
int label, upto = 0, limit = target.length;
while (upto < limit) { // to target prefix, or ceil label (rewind prefix)
@@ -580,12 +566,6 @@ public class TempFSTTermsReader extends FieldsProducer {
return null;
}
// nocommit: might be great if we can set the BIT_LAST_ARC flag
// nocommit: actually we can use the first arc as a candidate...
// it always has NO_OUTPUT as its output, and BIT_LAST_ARC set.
// but we'll have a problem if the FST later supports output sharing
// on the first arc!
/** Virtual frame, never pop */
Frame loadVirtualFrame(Frame frame) throws IOException {
frame.fstArc.output = fstOutputs.getNoOutput();
@@ -601,8 +581,6 @@ public class TempFSTTermsReader extends FieldsProducer {
return frame;
}
// nocommit: expected to use readFirstTargetArc here?
/** Load frame for target arc(node) on fst */
Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
if (!canGrow(top)) {
@@ -610,18 +588,13 @@ public class TempFSTTermsReader extends FieldsProducer {
}
frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
//if (DEBUG) System.out.println(" loadExpand frame="+frame);
//if (TEST) System.out.println(" loadExpand frame="+frame);
if (frame.fsaState == -1) {
return loadNextFrame(top, frame);
}
return frame;
}
// nocommit: actually, here we're looking for a valid fsa state,
// so if numArcs is large in the fst, should we try a reverse lookup?
// but we don't have methods like advance(label) on the fst, and even
// binary search hurts.
/** Load frame for sibling arc(node) on fst */
Frame loadNextFrame(Frame top, Frame frame) throws IOException {
if (!canRewind(frame)) {
@@ -634,7 +607,7 @@ public class TempFSTTermsReader extends FieldsProducer {
break;
}
}
//if (DEBUG) System.out.println(" loadNext frame="+frame);
//if (TEST) System.out.println(" loadNext frame="+frame);
if (frame.fsaState == -1) {
return null;
}
@@ -650,7 +623,7 @@ public class TempFSTTermsReader extends FieldsProducer {
return null;
}
frame.fsaState = fsa.step(top.fsaState, arc.label);
//if (DEBUG) System.out.println(" loadCeil frame="+frame);
//if (TEST) System.out.println(" loadCeil frame="+frame);
if (frame.fsaState == -1) {
return loadNextFrame(top, frame);
}
@@ -673,14 +646,14 @@ public class TempFSTTermsReader extends FieldsProducer {
void pushFrame(Frame frame) {
term = grow(frame.fstArc.label);
level++;
//if (DEBUG) System.out.println(" term=" + term + " level=" + level);
//if (TEST) System.out.println(" term=" + term + " level=" + level);
}
Frame popFrame() {
term = shrink();
level--;
metaUpto = metaUpto > level ? level : metaUpto;
//if (DEBUG) System.out.println(" term=" + term + " level=" + level);
//if (TEST) System.out.println(" term=" + term + " level=" + level);
return stack[level+1];
}

TempTermOutputs.java

@@ -22,8 +22,6 @@ import java.util.Arrays;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.Outputs;
@@ -128,16 +126,18 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
@Override
//
// The return value will be the smaller one, when these two are
// 'comparable', i.e. every value in long[] fits the same ordering.
// 'comparable', i.e.
// 1. every value in t1 is not larger than in t2, or
// 2. every value in t1 is not smaller than in t2.
//
// NOTE:
// Only the long[] part is 'shared' and pushed towards the root.
// byte[] and term stats will be on deeper arcs.
// byte[] and term stats will be kept on deeper arcs.
//
public TempMetaData common(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
//if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
//if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
return NO_OUTPUT;
}
assert t1.longs.length == t2.longs.length;
@@ -172,15 +172,15 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
ret = new TempMetaData(min, null, 0, -1);
}
}
if (DEBUG) System.out.println("ret:"+ret);
//if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
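To make the sharing rule concrete, here is a hedged sketch of the long[] part of common() (byte[] and stats handling omitted; the method name is illustrative): when one side lower-bounds the other everywhere, the smaller side is shared whole, otherwise only the pointwise minimum is pushed toward the root.

// Sketch only, not part of this patch.
static long[] commonLongs(long[] a, long[] b) {
  assert a.length == b.length;
  long[] min = new long[a.length];
  boolean aFits = true, bFits = true;  // does one side lower-bound the other everywhere?
  for (int i = 0; i < a.length; i++) {
    min[i] = Math.min(a[i], b[i]);
    aFits &= a[i] <= b[i];
    bFits &= b[i] <= a[i];
  }
  return aFits ? a : (bFits ? b : min);
}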
@Override
public TempMetaData subtract(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
//if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
if (t2 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t1);
//if (DEBUG) System.out.println("ret:"+t1);
return t1;
}
assert t1.longs.length == t2.longs.length;
@@ -201,20 +201,21 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
} else {
ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
//if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
// nocommit: we might refactor out an 'addSelf' later,
// which improves 5~7% for fuzzy queries
// TODO: if we refactor out an 'addSelf(TempMetaData other)',
// we can gain about 5~7% for fuzzy queries; on the other hand,
// it seems to put more stress on FST Outputs decoding?
@Override
public TempMetaData add(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
//if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
if (t1 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t2);
//if (DEBUG) System.out.println("ret:"+t2);
return t2;
} else if (t2 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t1);
//if (DEBUG) System.out.println("ret:"+t1);
return t1;
}
assert t1.longs.length == t2.longs.length;
@@ -233,7 +234,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
} else {
ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
//if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
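Regarding the addSelf TODO above: the gain presumably comes from accumulating into an existing output instead of allocating a fresh TempMetaData on every add. A minimal sketch of the long[] part (hypothetical, not part of this patch):

// Hypothetical in-place variant of add(); not part of this patch.
void addSelf(TempMetaData t1, TempMetaData t2) {
  assert t1.longs.length == t2.longs.length;
  for (int i = 0; i < t1.longs.length; i++) {
    t1.longs[i] += t2.longs[i];  // outputs along an FST path accumulate by addition
  }
}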

PostingsWriterBase.java

@@ -55,7 +55,7 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
public abstract BlockTermState newTermState() throws IOException;
/** Start a new term. Note that a matching call to {@link
* #finishTerm(long[], DataOutput, TermStats)} is done, only if the term has at least one
* #finishTerm(BlockTermState)} is done only if the term has at least one
* document. */
public abstract void startTerm() throws IOException;
@@ -67,12 +67,18 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
/**
* Encode metadata as long[] and byte[]. {@code absolute} controls
* whether the current term is delta-encoded against the previous term.
*
* NOTE: sometimes the long[] might contain values that don't make sense,
* e.g. for Lucene41PostingsFormat, when singletonDocID != -1, docStartFP is not defined.
* Here the postings side should always reuse the last docStartFP, to keep each element
* of the metadata long[] monotonic.
*/
public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
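A hedged sketch of the NOTE above, showing how an implementation can keep longs[0] monotonic when a term has no real docStartFP (names are illustrative; the actual logic lives in Lucene41PostingsWriter.encodeTerm):

// Sketch only, not part of this patch.
long lastDocStartFP = 0;

void encodeDocStartFP(long[] longs, long docStartFP, int singletonDocID, boolean absolute) {
  if (absolute) {
    lastDocStartFP = 0;                      // absolute encoding restarts the delta chain
  }
  if (singletonDocID != -1) {
    docStartFP = lastDocStartFP;             // undefined FP: reuse the previous one
  }
  longs[0] = docStartFP - lastDocStartFP;    // monotonic, so the delta is >= 0
  lastDocStartFP = docStartFP;
}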
/**
* Return the fixed length of longs,
* Return the length of the long[] metadata (which is fixed per field),
* called when the writer switches to another field. */
// TODO: better name?
public abstract int setField(FieldInfo fieldInfo);
@Override

Lucene41PostingsWriter.java

@@ -215,8 +215,6 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
termsOut.writeVInt(BLOCK_SIZE);
}
// nocommit better name?
@Override
public int setField(FieldInfo fieldInfo) {
IndexOptions indexOptions = fieldInfo.getIndexOptions();
@@ -540,16 +538,12 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
docCount = 0;
}
// nocommit explain about the "don't care" values
@Override
public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
IntBlockTermState state = (IntBlockTermState)_state;
if (absolute) {
lastState = newTermState();
}
//System.out.println("PW: state=" + state);
//System.out.println(" last=" + lastState);
if (VERSION_CURRENT < VERSION_META_ARRAY) { // impersonation
_encodeTerm(out, fieldInfo, state);
return;