clean up code, downgrade nocommits

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1519242 13f79535-47bb-0310-9956-ffa450edef68
Han Jiang 2013-09-01 04:32:01 +00:00
parent 1b4c3f688c
commit e1f85e73ed
6 changed files with 46 additions and 82 deletions

TempFSTOrdTermsReader.java

@@ -408,7 +408,7 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
}
// nocommit: this can be achieved by making use of Util.getByOutput()
// TODO: this can be achieved by making use of Util.getByOutput()
// and should have related tests
@Override
public void seekExact(long ord) throws IOException {
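An editorial sketch of the TODO above: Util.getByOutput() walks the FST arcs whose outputs sum to a target value and returns the matching input, which is exactly a term lookup by ord. A minimal, hypothetical implementation (assuming `index` is the FST<Long> mapping terms to ords; the names are not from this patch) might look like:

// Sketch only, not part of this patch.
@Override
public void seekExact(long ord) throws IOException {
  IntsRef input = Util.getByOutput(index, ord);  // follow arcs whose outputs sum to `ord`
  if (input == null) {
    throw new IOException("ord " + ord + " is out of range");
  }
  term = Util.toBytesRef(input, new BytesRef()); // turn int labels back into term bytes
  // ...then load this term's metadata, as seekExact(BytesRef) does
}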
@@ -541,14 +541,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
this.fstReader = fst.getBytesReader();
this.fstOutputs = index.outputs;
this.fsa = compiled.runAutomaton;
/*
PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
Util.toDot(dict,pw1, false, false);
pw1.close();
PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
pw2.write(compiled.toDot());
pw2.close();
*/
this.level = -1;
this.stack = new Frame[16];
for (int i = 0; i < stack.length; i++) {
@@ -679,8 +671,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
return frame;
}
// nocommit: expected to use readFirstTargetArc here?
/** Load frame for target arc(node) on fst */
Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
if (!canGrow(top)) {
@@ -743,7 +733,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
return !frame.arc.isLast();
}
// nocommit: need to load ord lazily?
void pushFrame(Frame frame) {
final FST.Arc<Long> arc = frame.arc;
arc.output = fstOutputs.add(topFrame().arc.output, arc.output);

TempFSTOrdTermsWriter.java

@@ -55,13 +55,12 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
public static final int TERMS_VERSION_START = 0;
public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START;
public static final int SKIP_INTERVAL = 8;
//static final boolean TEST = false;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
IndexOutput blockOut = null;
IndexOutput indexOut = null; // nocommit: hmm, do we really need two streams?
IndexOutput indexOut = null;
public TempFSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
@@ -134,8 +133,6 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
out.writeLong(dirStart);
}
// nocommit: nuke this? we don't need to buffer so much data,
// since close() can do this naturally
private static class FieldMetaData {
public FieldInfo fieldInfo;
public long numTerms;
@@ -145,12 +142,16 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
public int longsSize;
public FST<Long> dict;
// nocommit: block encode each part
// (so that we'll have metaLongsOut[])
public RAMOutputStream skipOut; // vint encode next skip point (all values start from 0, fully decoded when reading)
public RAMOutputStream statsOut; // vint encode df, (ttf-df)
public RAMOutputStream metaLongsOut; // vint encode monotonic long[] and length for corresponding byte[]
public RAMOutputStream metaBytesOut; // put all bytes blob here
// TODO: block encode each part
// vint encode next skip point (fully decoded when reading)
public RAMOutputStream skipOut;
// vint encode df, (ttf-df)
public RAMOutputStream statsOut;
// vint encode monotonic long[] and length for corresponding byte[]
public RAMOutputStream metaLongsOut;
// generic byte[]
public RAMOutputStream metaBytesOut;
}
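To make the four buffers above concrete, here is a hedged sketch of how one term's entry could be appended (the helper and its parameters are illustrative, not part of this patch; it assumes the metadata longs are already delta-encoded and monotonic):

// Sketch only, not part of this patch.
void bufferTerm(FieldMetaData meta, int docFreq, long totalTermFreq,
                long[] deltaLongs, BytesRef metaBytes) throws IOException {
  meta.statsOut.writeVInt(docFreq);                  // df
  meta.statsOut.writeVLong(totalTermFreq - docFreq); // (ttf - df), never negative
  for (long delta : deltaLongs) {
    meta.metaLongsOut.writeVLong(delta);             // monotonic values, so deltas stay small
  }
  meta.metaLongsOut.writeVLong(metaBytes.length);    // length of this term's blob in metaBytesOut
  meta.metaBytesOut.writeBytes(metaBytes.bytes, metaBytes.offset, metaBytes.length);
}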
final class TermsWriter extends TermsConsumer {

TempFSTTermsReader.java

@@ -63,7 +63,7 @@ public class TempFSTTermsReader extends FieldsProducer {
final TreeMap<String, TermsReader> fields = new TreeMap<String, TermsReader>();
final PostingsReaderBase postingsReader;
final IndexInput in;
//static boolean DEBUG = false;
//static boolean TEST = false;
public TempFSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException {
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTTermsWriter.TERMS_EXTENSION);
@@ -283,12 +283,6 @@ public class TempFSTTermsReader extends FieldsProducer {
return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
}
// nocommit: do we need this? for SegmentTermsEnum, we could maintain
// a stack recording how the current term is constructed on the FST
// (and the ord accumulated at each label), so that during a seek we
// don't have to start from the first arc. however, that would mean
// implementing a new fstEnum instead of wrapping the current one.
//
// nocommit: this can also be achieved by making use of Util.getByOutput()
@Override
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException();
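A rough sketch of the stack idea in the comment above (purely hypothetical, nothing like it exists in this patch): each frame would remember the arc taken at one depth together with the output accumulated so far, so a later seek could rewind to the longest shared prefix instead of restarting from the root:

// Hypothetical helper, not part of this patch.
static class SeekFrame {
  FST.Arc<Long> arc;  // arc taken at this depth of the current term
  long ordPrefix;     // outputs summed from the root through this arc
}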
@@ -428,19 +422,11 @@ public class TempFSTTermsReader extends FieldsProducer {
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
super();
//if (DEBUG) System.out.println("Enum init, startTerm=" + startTerm);
//if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
this.fst = dict;
this.fstReader = fst.getBytesReader();
this.fstOutputs = dict.outputs;
this.fsa = compiled.runAutomaton;
/*
PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
Util.toDot(dict,pw1, false, false);
pw1.close();
PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
pw2.write(compiled.toDot());
pw2.close();
*/
this.level = -1;
this.stack = new Frame[16];
for (int i = 0; i < stack.length; i++) {
@@ -511,7 +497,7 @@ public class TempFSTTermsReader extends FieldsProducer {
@Override
public BytesRef next() throws IOException {
//if (DEBUG) System.out.println("Enum next()");
//if (TEST) System.out.println("Enum next()");
if (pending) {
pending = false;
loadMetaData();
@@ -546,7 +532,7 @@ public class TempFSTTermsReader extends FieldsProducer {
}
private BytesRef doSeekCeil(BytesRef target) throws IOException {
//if (DEBUG) System.out.println("Enum doSeekCeil()");
//if (TEST) System.out.println("Enum doSeekCeil()");
Frame frame = null;
int label, upto = 0, limit = target.length;
while (upto < limit) { // to target prefix, or ceil label (rewind prefix)
@@ -580,12 +566,6 @@ public class TempFSTTermsReader extends FieldsProducer {
return null;
}
// nocommit: might be great if we can set the BIT_LAST_ARC flag
// nocommit: actually we can use the first arc as a candidate...
// it always has NO_OUTPUT as its output, and BIT_LAST_ARC set.
// but we'll have a problem if the FST later supports output sharing
// on the first arc!
/** Virtual frame, never pop */
Frame loadVirtualFrame(Frame frame) throws IOException {
frame.fstArc.output = fstOutputs.getNoOutput();
@@ -601,8 +581,6 @@ public class TempFSTTermsReader extends FieldsProducer {
return frame;
}
// nocommit: expected to use readFirstTargetArc here?
/** Load frame for target arc(node) on fst */
Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
if (!canGrow(top)) {
@@ -610,18 +588,13 @@ public class TempFSTTermsReader extends FieldsProducer {
}
frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
//if (DEBUG) System.out.println(" loadExpand frame="+frame);
//if (TEST) System.out.println(" loadExpand frame="+frame);
if (frame.fsaState == -1) {
return loadNextFrame(top, frame);
}
return frame;
}
// nocommit: actually, here we're looking for a valid fsa state,
// so if numArcs is large in the fst, should we try a reverse lookup?
// but we don't have methods like advance(label) on the fst, and even
// binary search hurts.
/** Load frame for sibling arc(node) on fst */
Frame loadNextFrame(Frame top, Frame frame) throws IOException {
if (!canRewind(frame)) {
@@ -634,7 +607,7 @@ public class TempFSTTermsReader extends FieldsProducer {
break;
}
}
//if (DEBUG) System.out.println(" loadNext frame="+frame);
//if (TEST) System.out.println(" loadNext frame="+frame);
if (frame.fsaState == -1) {
return null;
}
@@ -650,7 +623,7 @@ public class TempFSTTermsReader extends FieldsProducer {
return null;
}
frame.fsaState = fsa.step(top.fsaState, arc.label);
//if (DEBUG) System.out.println(" loadCeil frame="+frame);
//if (TEST) System.out.println(" loadCeil frame="+frame);
if (frame.fsaState == -1) {
return loadNextFrame(top, frame);
}
@@ -673,14 +646,14 @@ public class TempFSTTermsReader extends FieldsProducer {
void pushFrame(Frame frame) {
term = grow(frame.fstArc.label);
level++;
//if (DEBUG) System.out.println(" term=" + term + " level=" + level);
//if (TEST) System.out.println(" term=" + term + " level=" + level);
}
Frame popFrame() {
term = shrink();
level--;
metaUpto = metaUpto > level ? level : metaUpto;
//if (DEBUG) System.out.println(" term=" + term + " level=" + level);
//if (TEST) System.out.println(" term=" + term + " level=" + level);
return stack[level+1];
}

TempTermOutputs.java

@@ -22,8 +22,6 @@ import java.util.Arrays;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.Outputs;
@@ -128,16 +126,18 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
@Override
//
// The return value will be the smaller one, when these two are
// 'comparable', i.e. every value in long[] fits the same ordering.
// 'comparable', i.e.
// 1. every value in t1 is not larger than in t2, or
// 2. every value in t1 is not smaller than in t2.
//
// NOTE:
// Only the long[] part is 'shared' and pushed towards the root.
// byte[] and term stats will be on deeper arcs.
// byte[] and term stats will be kept on deeper arcs.
//
public TempMetaData common(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
//if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
//if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
return NO_OUTPUT;
}
assert t1.longs.length == t2.longs.length;
@@ -172,15 +172,15 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
ret = new TempMetaData(min, null, 0, -1);
}
}
if (DEBUG) System.out.println("ret:"+ret);
//if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
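To make the sharing rule concrete, here is a hedged sketch of the long[] part of common() (byte[] and stats handling omitted; the method name is illustrative): when one side lower-bounds the other everywhere, the smaller side is shared whole, otherwise only the pointwise minimum is pushed toward the root.

// Sketch only, not part of this patch.
static long[] commonLongs(long[] a, long[] b) {
  assert a.length == b.length;
  long[] min = new long[a.length];
  boolean aFits = true, bFits = true;  // does one side lower-bound the other everywhere?
  for (int i = 0; i < a.length; i++) {
    min[i] = Math.min(a[i], b[i]);
    aFits &= a[i] <= b[i];
    bFits &= b[i] <= a[i];
  }
  return aFits ? a : (bFits ? b : min);
}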
@Override
public TempMetaData subtract(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
//if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
if (t2 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t1);
//if (DEBUG) System.out.println("ret:"+t1);
return t1;
}
assert t1.longs.length == t2.longs.length;
@@ -201,20 +201,21 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
} else {
ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
//if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
// nocommit: we might refactor out an 'addSelf' later,
// which improves 5~7% for fuzzy queries
// TODO: if we refactor out an 'addSelf(TempMetaData other)',
// we can gain about 5~7% for fuzzy queries; on the other hand,
// it seems to put more stress on FST Outputs decoding?
@Override
public TempMetaData add(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
//if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
if (t1 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t2);
//if (DEBUG) System.out.println("ret:"+t2);
return t2;
} else if (t2 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t1);
//if (DEBUG) System.out.println("ret:"+t1);
return t1;
}
assert t1.longs.length == t2.longs.length;
@@ -233,7 +234,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
} else {
ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
//if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
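Regarding the addSelf TODO above: the gain presumably comes from accumulating into an existing output instead of allocating a fresh TempMetaData on every add. A minimal sketch of the long[] part (hypothetical, not part of this patch):

// Hypothetical in-place variant of add(); not part of this patch.
void addSelf(TempMetaData t1, TempMetaData t2) {
  assert t1.longs.length == t2.longs.length;
  for (int i = 0; i < t1.longs.length; i++) {
    t1.longs[i] += t2.longs[i];  // outputs along an FST path accumulate by addition
  }
}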

PostingsWriterBase.java

@@ -55,7 +55,7 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
public abstract BlockTermState newTermState() throws IOException;
/** Start a new term. Note that a matching call to {@link
* #finishTerm(long[], DataOutput, TermStats)} is done, only if the term has at least one
* #finishTerm(BlockTermState)} is done only if the term has at least one
* document. */
public abstract void startTerm() throws IOException;
@@ -67,12 +67,18 @@ public abstract class PostingsWriterBase extends PostingsConsumer implements Clo
/**
* Encode metadata as long[] and byte[]. {@code absolute} controls
* whether the current term is delta-encoded against the previous term.
*
* NOTE: sometimes the long[] might contain values that don't make sense,
* e.g. for Lucene41PostingsFormat, when singletonDocID != -1, docStartFP is not defined.
* Here the postings side should always reuse the last docStartFP, to keep each element
* of the metadata long[] monotonic.
*/
public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
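A hedged sketch of the NOTE above, showing how an implementation can keep longs[0] monotonic when a term has no real docStartFP (names are illustrative; the actual logic lives in Lucene41PostingsWriter.encodeTerm):

// Sketch only, not part of this patch.
long lastDocStartFP = 0;

void encodeDocStartFP(long[] longs, long docStartFP, int singletonDocID, boolean absolute) {
  if (absolute) {
    lastDocStartFP = 0;                      // absolute encoding restarts the delta chain
  }
  if (singletonDocID != -1) {
    docStartFP = lastDocStartFP;             // undefined FP: reuse the previous one
  }
  longs[0] = docStartFP - lastDocStartFP;    // monotonic, so the delta is >= 0
  lastDocStartFP = docStartFP;
}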
/**
* Return the fixed length of longs,
* Return the length of the long[] metadata (which is fixed per field),
* called when the writer switches to another field. */
// TODO: better name?
public abstract int setField(FieldInfo fieldInfo);
@Override

Lucene41PostingsWriter.java

@@ -215,8 +215,6 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
termsOut.writeVInt(BLOCK_SIZE);
}
// nocommit better name?
@Override
public int setField(FieldInfo fieldInfo) {
IndexOptions indexOptions = fieldInfo.getIndexOptions();
@@ -540,16 +538,12 @@ public final class Lucene41PostingsWriter extends PostingsWriterBase {
docCount = 0;
}
// nocommit explain about the "don't care" values
@Override
public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
IntBlockTermState state = (IntBlockTermState)_state;
if (absolute) {
lastState = newTermState();
}
//System.out.println("PW: state=" + state);
//System.out.println(" last=" + lastState);
if (VERSION_CURRENT < VERSION_META_ARRAY) { // impersonation
_encodeTerm(out, fieldInfo, state);
return;