mirror of https://github.com/apache/lucene.git

LUCENE-3069: javadocs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1519909 13f79535-47bb-0310-9956-ffa450edef68

parent ec07c84339
commit 054a95e182
@@ -31,6 +31,9 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.codecs.CodecUtil; // javadocs
+import org.apache.lucene.store.DataOutput; // javadocs
+import org.apache.lucene.util.fst.FST; // javadocs
 
 /**
  * FST-based term dict, using ord as FST output.
@@ -44,7 +47,89 @@ import org.apache.lucene.util.IOUtils;
  * 3. generic byte[], e.g. other information customized by postings base.
  * 4. single-level skip list to speed up metadata decoding by ord.
  *
- * <!-- TODO: explain about the data format -->
+ * <p>
+ * Files:
+ * <ul>
+ *  <li><tt>.tix</tt>: <a href="#Termindex">Term Index</a></li>
+ *  <li><tt>.tbk</tt>: <a href="#Termblock">Term Block</a></li>
+ * </ul>
+ * </p>
+ *
+ * <a name="Termindex" id="Termindex"></a>
+ * <h3>Term Index</h3>
+ * <p>
+ *  The .tix contains a list of FSTs, one for each field.
+ *  The FST maps a term to its corresponding ordinal (ord) in the current field.
+ * </p>
+ *
+ * <ul>
+ *  <li>TermIndex(.tix) --> Header, TermFST<sup>NumFields</sup></li>
+ *  <li>TermFST --> {@link FST FST<long>}</li>
+ *  <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ * </ul>
+ *
+ * <p>Notes:</p>
+ * <ul>
+ *  <li>
+ *   Since terms are already sorted before being written to the <a href="#Termblock">Term Block</a>,
+ *   their ords can be used directly to seek term metadata from the term block.
+ *  </li>
+ * </ul>
+ *
+ * <a name="Termblock" id="Termblock"></a>
+ * <h3>Term Block</h3>
+ * <p>
+ *  The .tbk contains all the statistics and metadata for terms, along with a field summary (e.g.
+ *  per-field data like the number of documents in the current field). For each field, there are four blocks:
+ * <ul>
+ *  <li>statistics bytes block: contains term statistics;</li>
+ *  <li>metadata longs block: delta-encodes the monotonic part of the metadata;</li>
+ *  <li>metadata bytes block: encodes the other parts of the metadata;</li>
+ *  <li>skip block: contains skip data, to speed up metadata seeking and decoding</li>
+ * </ul>
+ * </p>
+ *
+ * <p>File Format:</p>
+ * <ul>
+ *  <li>TermBlock(.tbk) --> Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
+ *  <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
+ *                         DocCount, LongsSize, DataBlock><sup>NumFields</sup></li>
+ *
+ *  <li>DataBlock --> StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
+ *                      SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock</li>
+ *  <li>SkipBlock --> <StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
+ *                      MetaLongsSkipDelta<sup>LongsSize</sup>><sup>NumTerms</sup>
+ *  <li>StatsBlock --> <DocFreq[Same?], (TotalTermFreq-DocFreq)?><sup>NumTerms</sup>
+ *  <li>MetaLongsBlock --> <LongDelta<sup>LongsSize</sup>, BytesSize><sup>NumTerms</sup>
+ *  <li>MetaBytesBlock --> Byte<sup>MetaBytesBlockLength</sup>
+ *  <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ *  <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
+ *  <li>NumFields, FieldNumber, DocCount, DocFreq, LongsSize --> {@link DataOutput#writeVInt VInt}</li>
+ *  <li>NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
+ *        StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta, MetaLongsSkipStart, TotalTermFreq,
+ *        LongDelta --> {@link DataOutput#writeVLong VLong}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *  <li>
+ *   The formats of PostingsHeader and MetaBytes are customized by the specific postings implementation:
+ *   they contain arbitrary per-file data (such as parameters or versioning information) and per-term data
+ *   (non-monotonic values like pulsed postings data).
+ *  </li>
+ *  <li>
+ *   During initialization the reader loads all the blocks into memory. SkipBlock is decoded, so that during a seek
+ *   the term dict can look up file pointers directly. StatsFPDelta, MetaLongsSkipFPDelta, etc. are file offsets
+ *   for every SkipInterval'th term. MetaLongsSkipDelta is the difference from the previous value, and indicates
+ *   the value of the preceding metadata longs for every SkipInterval'th term.
+ *  </li>
+ *  <li>
+ *   DocFreq is the count of documents containing the term. TotalTermFreq is the total number of occurrences of the term.
+ *   These two values are usually the same for long-tail terms, so one bit is stolen from DocFreq to flag this case,
+ *   and the encoding of TotalTermFreq may then be omitted.
+ *  </li>
+ * </ul>
+ *
  * @lucene.experimental
  */
 
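
The Term Index section above describes the .tix file as one FST per field mapping each term to its ord. As a rough illustration of that mapping (a minimal sketch against the 4.x FST API of this branch, not code from this commit; Builder/Outputs signatures shifted slightly across 4.x releases):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.IntsRef;
    import org.apache.lucene.util.fst.Builder;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.PositiveIntOutputs;
    import org.apache.lucene.util.fst.Util;

    public class TermOrdFstSketch {
      public static void main(String[] args) throws Exception {
        String[] sortedTerms = {"apple", "banana", "cherry"}; // terms must be pre-sorted
        PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
        Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
        IntsRef scratch = new IntsRef();
        for (int ord = 0; ord < sortedTerms.length; ord++) {
          // the output of each term is simply its ordinal in sorted order
          builder.add(Util.toIntsRef(new BytesRef(sortedTerms[ord]), scratch), (long) ord);
        }
        FST<Long> fst = builder.finish();
        // term -> ord; the ord then addresses the per-term slots in the .tbk blocks
        Long ord = Util.get(fst, new BytesRef("banana"));
        System.out.println(ord); // 1
      }
    }
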
@@ -89,7 +89,6 @@ public class TempFSTOrdTermsReader extends FieldsProducer {
       readHeader(indexIn);
       readHeader(blockIn);
       this.postingsReader.init(blockIn);
-      seekDir(indexIn);
       seekDir(blockIn);
 
       final FieldInfos fieldInfos = state.fieldInfos;
@@ -120,7 +120,6 @@ public class TempFSTOrdTermsWriter extends FieldsConsumer {
         field.metaBytesOut.writeTo(blockOut);
         field.dict.save(indexOut);
       }
-      writeTrailer(indexOut, indexDirStart);
       writeTrailer(blockOut, blockDirStart);
     } catch (IOException ioe2) {
       ioe = ioe2;
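
The StatsBlock note above says one bit is stolen from DocFreq so that TotalTermFreq can be omitted when the two are equal. A hedged sketch of that trick (the shift-based layout is an assumption; only the <DocFreq[Same?], (TotalTermFreq-DocFreq)?> production comes from the javadoc):

    import java.io.IOException;
    import org.apache.lucene.store.DataInput;
    import org.apache.lucene.store.DataOutput;

    class StatsCodecSketch {
      // the low bit of the VInt flags totalTermFreq == docFreq (the common long-tail case)
      static void write(DataOutput out, int docFreq, long totalTermFreq) throws IOException {
        if (totalTermFreq == docFreq) {
          out.writeVInt((docFreq << 1) | 1);        // Same? bit set, nothing else written
        } else {
          out.writeVInt(docFreq << 1);              // Same? bit clear...
          out.writeVLong(totalTermFreq - docFreq);  // ...delta follows
        }
      }

      static long[] read(DataInput in) throws IOException {
        int code = in.readVInt();
        int docFreq = code >>> 1;
        long totalTermFreq = (code & 1) != 0 ? docFreq : docFreq + in.readVLong();
        return new long[] {docFreq, totalTermFreq};
      }
    }
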
@@ -31,6 +31,9 @@ import org.apache.lucene.index.FieldInfo.IndexOptions;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.codecs.CodecUtil; // javadocs
+import org.apache.lucene.store.DataOutput; // javadocs
+import org.apache.lucene.util.fst.FST; // javadocs
 
 /**
  * FST-based term dict, using metadata as FST output.
@@ -42,8 +45,69 @@ import org.apache.lucene.util.IOUtils;
  * 2. monotonic long[], e.g. the pointer to the postings list for that term;
  * 3. generic byte[], e.g. other information needed by the postings reader.
  *
+ * <p>
+ * File:
+ * <ul>
+ *  <li><tt>.tst</tt>: <a href="#Termdictionary">Term Dictionary</a></li>
+ * </ul>
+ * </p>
+ *
+ * <a name="Termdictionary" id="Termdictionary"></a>
+ * <h3>Term Dictionary</h3>
+ * <p>
+ *  The .tst contains a list of FSTs, one for each field.
+ *  The FST maps a term to its corresponding statistics (e.g. docfreq)
+ *  and metadata (e.g. information for the postings list reader, like the
+ *  file pointer to the postings list).
+ * </p>
+ * <p>
+ *  Typically the metadata is separated into two parts:
+ * <ul>
+ *  <li>
+ *   Monotonic long array: some metadata is always ascending in order
+ *   with the corresponding term. This part is used by the FST to share outputs between arcs.
+ *  </li>
+ *  <li>
+ *   Generic byte array: used to store non-monotonic metadata.
+ *  </li>
+ * </ul>
+ * </p>
+ *
+ * File format:
+ * <ul>
+ *  <li>TermsDict(.tst) --> Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
+ *  <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?,
+ *                         SumDocFreq, DocCount, LongsSize, TermFST><sup>NumFields</sup></li>
+ *  <li>TermFST --> {@link FST FST<TermData>}</li>
+ *  <li>TermData --> Flag, BytesSize?, LongDelta<sup>LongsSize</sup>?, Byte<sup>BytesSize</sup>?,
+ *                     <DocFreq[Same?], (TotalTermFreq-DocFreq)>?</li>
+ *  <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
+ *  <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
+ *  <li>DocFreq, LongsSize, BytesSize, NumFields,
+ *        FieldNumber, DocCount --> {@link DataOutput#writeVInt VInt}</li>
+ *  <li>TotalTermFreq, NumTerms, SumTotalTermFreq, SumDocFreq, LongDelta -->
+ *        {@link DataOutput#writeVLong VLong}</li>
+ * </ul>
+ * <p>Notes:</p>
+ * <ul>
+ *  <li>
+ *   The formats of PostingsHeader and the generic meta bytes are customized by the specific postings implementation:
+ *   they contain arbitrary per-file data (such as parameters or versioning information) and per-term data
+ *   (non-monotonic values like pulsed postings data).
+ *  </li>
+ *  <li>
+ *   The format of TermData is determined by the FST: typically, monotonic metadata is dense around shallow arcs,
+ *   while deeper arcs carry only generic bytes and term statistics.
+ *  </li>
+ *  <li>
+ *   The byte Flag indicates which parts of the metadata exist on the current arc. In particular, the monotonic
+ *   part is omitted when it is an array of 0s.
+ *  </li>
+ *  <li>
+ *   Since LongsSize is fixed per field, it is only written once in the field summary.
+ *  </li>
+ * </ul>
  *
- * <!-- TODO: explain about the data format -->
  * @lucene.experimental
  */
 
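
The Flag byte called out in the notes above maps one bit to each optional part of the per-arc output. A minimal sketch of that layout, mirroring the bit assignments in TempTermOutputs.write() later in this commit (the helper class itself is hypothetical):

    import java.io.IOException;
    import org.apache.lucene.store.DataOutput;

    class FlagByteSketch {
      // bit 0: monotonic long[] part present (omitted when all zeros)
      // bit 1: generic byte[] part present
      // bit 2: term stats (docFreq/totalTermFreq) present
      static void writeFlag(DataOutput out, long[] longs, byte[] bytes, int docFreq) throws IOException {
        int bit0 = allZero(longs) ? 0 : 1;
        int bit1 = ((bytes == null || bytes.length == 0) ? 0 : 1) << 1;
        int bit2 = ((docFreq == 0) ? 0 : 1) << 2;
        out.writeByte((byte) (bit0 | bit1 | bit2));
        // only the parts whose bits are set follow the flag on the arc
      }

      static boolean allZero(long[] l) {
        for (long v : l) {
          if (v != 0) return false;
        }
        return true;
      }
    }
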
@@ -167,7 +167,7 @@ public class TempFSTTermsReader extends FieldsProducer {
     final long sumDocFreq;
     final int docCount;
     final int longsSize;
-    final FST<TempTermOutputs.TempMetaData> dict;
+    final FST<TempTermOutputs.TempTermData> dict;
 
     TermsReader(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException {
       this.fieldInfo = fieldInfo;
@@ -176,7 +176,7 @@ public class TempFSTTermsReader extends FieldsProducer {
       this.sumDocFreq = sumDocFreq;
       this.docCount = docCount;
       this.longsSize = longsSize;
-      this.dict = new FST<TempTermOutputs.TempMetaData>(in, new TempTermOutputs(fieldInfo, longsSize));
+      this.dict = new FST<TempTermOutputs.TempTermData>(in, new TempTermOutputs(fieldInfo, longsSize));
     }
 
     @Override
@@ -238,7 +238,7 @@ public class TempFSTTermsReader extends FieldsProducer {
       final BlockTermState state;
 
       /* Current term stats + undecoded metadata (long[] & byte[]) */
-      TempTermOutputs.TempMetaData meta;
+      TempTermOutputs.TempTermData meta;
       ByteArrayDataInput bytesReader;
 
       /** Decodes metadata into customized term state */
@@ -306,7 +306,7 @@ public class TempFSTTermsReader extends FieldsProducer {
 
     // Iterates through all terms in this field
     private final class SegmentTermsEnum extends BaseTermsEnum {
-      final BytesRefFSTEnum<TempTermOutputs.TempMetaData> fstEnum;
+      final BytesRefFSTEnum<TempTermOutputs.TempTermData> fstEnum;
 
       /* True when current term's metadata is decoded */
       boolean decoded;
@@ -316,7 +316,7 @@ public class TempFSTTermsReader extends FieldsProducer {
 
       SegmentTermsEnum() throws IOException {
         super();
-        this.fstEnum = new BytesRefFSTEnum<TempTermOutputs.TempMetaData>(dict);
+        this.fstEnum = new BytesRefFSTEnum<TempTermOutputs.TempTermData>(dict);
         this.decoded = false;
         this.seekPending = false;
         this.meta = null;
@@ -335,7 +335,7 @@ public class TempFSTTermsReader extends FieldsProducer {
       }
 
       // Update current enum according to FSTEnum
-      void updateEnum(final InputOutput<TempTermOutputs.TempMetaData> pair) {
+      void updateEnum(final InputOutput<TempTermOutputs.TempTermData> pair) {
         if (pair == null) {
           term = null;
         } else {
@@ -405,22 +405,22 @@ public class TempFSTTermsReader extends FieldsProducer {
       int metaUpto;
 
       /* term dict fst */
-      final FST<TempTermOutputs.TempMetaData> fst;
+      final FST<TempTermOutputs.TempTermData> fst;
       final FST.BytesReader fstReader;
-      final Outputs<TempTermOutputs.TempMetaData> fstOutputs;
+      final Outputs<TempTermOutputs.TempTermData> fstOutputs;
 
       /* query automaton to intersect with */
       final ByteRunAutomaton fsa;
 
       private final class Frame {
         /* fst stats */
-        FST.Arc<TempTermOutputs.TempMetaData> fstArc;
+        FST.Arc<TempTermOutputs.TempTermData> fstArc;
 
         /* automaton stats */
         int fsaState;
 
         Frame() {
-          this.fstArc = new FST.Arc<TempTermOutputs.TempMetaData>();
+          this.fstArc = new FST.Arc<TempTermOutputs.TempTermData>();
           this.fsaState = -1;
         }
 
@@ -475,7 +475,7 @@ public class TempFSTTermsReader extends FieldsProducer {
 
       /** Lazily accumulate meta data, when we get an accepted term */
       void loadMetaData() throws IOException {
-        FST.Arc<TempTermOutputs.TempMetaData> last, next;
+        FST.Arc<TempTermOutputs.TempTermData> last, next;
         last = stack[metaUpto].fstArc;
         while (metaUpto != level) {
           metaUpto++;
@@ -626,7 +626,7 @@ public class TempFSTTermsReader extends FieldsProducer {
       /** Load frame for target arc(node) on fst, so that
        *  arc.label >= label and !fsa.reject(arc.label) */
       Frame loadCeilFrame(int label, Frame top, Frame frame) throws IOException {
-        FST.Arc<TempTermOutputs.TempMetaData> arc = frame.fstArc;
+        FST.Arc<TempTermOutputs.TempTermData> arc = frame.fstArc;
         arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
         if (arc == null) {
           return null;
@@ -125,9 +125,9 @@ public class TempFSTTermsWriter extends FieldsConsumer {
     public final long sumDocFreq;
     public final int docCount;
     public final int longsSize;
-    public final FST<TempTermOutputs.TempMetaData> dict;
+    public final FST<TempTermOutputs.TempTermData> dict;
 
-    public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST<TempTermOutputs.TempMetaData> fst) {
+    public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST<TempTermOutputs.TempTermData> fst) {
       this.fieldInfo = fieldInfo;
       this.numTerms = numTerms;
       this.sumTotalTermFreq = sumTotalTermFreq;
@@ -139,7 +139,7 @@ public class TempFSTTermsWriter extends FieldsConsumer {
   }
 
   final class TermsWriter extends TermsConsumer {
-    private final Builder<TempTermOutputs.TempMetaData> builder;
+    private final Builder<TempTermOutputs.TempTermData> builder;
     private final TempTermOutputs outputs;
     private final FieldInfo fieldInfo;
     private final int longsSize;
@@ -154,7 +154,7 @@ public class TempFSTTermsWriter extends FieldsConsumer {
       this.fieldInfo = fieldInfo;
       this.longsSize = postingsWriter.setField(fieldInfo);
       this.outputs = new TempTermOutputs(fieldInfo, longsSize);
-      this.builder = new Builder<TempTermOutputs.TempMetaData>(FST.INPUT_TYPE.BYTE1, outputs);
+      this.builder = new Builder<TempTermOutputs.TempTermData>(FST.INPUT_TYPE.BYTE1, outputs);
     }
 
     @Override
@@ -172,7 +172,7 @@ public class TempFSTTermsWriter extends FieldsConsumer {
     public void finishTerm(BytesRef text, TermStats stats) throws IOException {
       // write term meta data into fst
       final BlockTermState state = postingsWriter.newTermState();
-      final TempTermOutputs.TempMetaData meta = new TempTermOutputs.TempMetaData();
+      final TempTermOutputs.TempTermData meta = new TempTermOutputs.TempTermData();
       meta.longs = new long[longsSize];
       meta.bytes = null;
       meta.docFreq = state.docFreq = stats.docFreq;
@@ -193,7 +193,7 @@ public class TempFSTTermsWriter extends FieldsConsumer {
     public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
       // save FST dict
       if (numTerms > 0) {
-        final FST<TempTermOutputs.TempMetaData> fst = builder.finish();
+        final FST<TempTermOutputs.TempTermData> fst = builder.finish();
         fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
       }
     }
@@ -36,8 +36,8 @@ import org.apache.lucene.util.LongsRef;
 
 // NOTE: outputs should be per-field, since
 // longsSize is fixed for each field
-public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
-  private final static TempMetaData NO_OUTPUT = new TempMetaData();
+public class TempTermOutputs extends Outputs<TempTermOutputs.TempTermData> {
+  private final static TempTermData NO_OUTPUT = new TempTermData();
   //private static boolean TEST = false;
   private final boolean hasPos;
   private final int longsSize;
@@ -47,18 +47,18 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
    * On an FST, only long[] part is 'shared' and pushed towards root.
    * byte[] and term stats will be kept on deeper arcs.
    */
-  public static class TempMetaData {
+  public static class TempTermData {
     long[] longs;
     byte[] bytes;
     int docFreq;
     long totalTermFreq;
-    TempMetaData() {
+    TempTermData() {
       this.longs = null;
       this.bytes = null;
       this.docFreq = 0;
       this.totalTermFreq = -1;
     }
-    TempMetaData(long[] longs, byte[] bytes, int docFreq, long totalTermFreq) {
+    TempTermData(long[] longs, byte[] bytes, int docFreq, long totalTermFreq) {
       this.longs = longs;
       this.bytes = bytes;
       this.docFreq = docFreq;
@@ -92,10 +92,10 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
     public boolean equals(Object other_) {
       if (other_ == this) {
         return true;
-      } else if (!(other_ instanceof TempTermOutputs.TempMetaData)) {
+      } else if (!(other_ instanceof TempTermOutputs.TempTermData)) {
         return false;
       }
-      TempMetaData other = (TempMetaData) other_;
+      TempTermData other = (TempTermData) other_;
       return statsEqual(this, other) &&
              longsEqual(this, other) &&
              bytesEqual(this, other);
@@ -115,7 +115,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
   // 1. every value in t1 is not larger than in t2, or
   // 2. every value in t1 is not smaller than t2.
   //
-  public TempMetaData common(TempMetaData t1, TempMetaData t2) {
+  public TempTermData common(TempTermData t1, TempTermData t2) {
     //if (TEST) System.out.print("common("+t1+", "+t2+") = ");
     if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) {
       //if (TEST) System.out.println("ret:"+NO_OUTPUT);
@@ -125,7 +125,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
 
     long[] min = t1.longs, max = t2.longs;
     int pos = 0;
-    TempMetaData ret;
+    TempTermData ret;
 
     while (pos < longsSize && min[pos] == max[pos]) {
       pos++;
@@ -142,7 +142,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
       if (pos < longsSize || allZero(min)) { // not comparable or all-zero
         ret = NO_OUTPUT;
       } else {
-        ret = new TempMetaData(min, null, 0, -1);
+        ret = new TempTermData(min, null, 0, -1);
       }
     } else { // equal long[]
       if (statsEqual(t1, t2) && bytesEqual(t1, t2)) {
@@ -150,7 +150,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
       } else if (allZero(min)) {
         ret = NO_OUTPUT;
       } else {
-        ret = new TempMetaData(min, null, 0, -1);
+        ret = new TempTermData(min, null, 0, -1);
       }
     }
     //if (TEST) System.out.println("ret:"+ret);
@@ -158,7 +158,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
   }
 
   @Override
-  public TempMetaData subtract(TempMetaData t1, TempMetaData t2) {
+  public TempTermData subtract(TempTermData t1, TempTermData t2) {
     //if (TEST) System.out.print("subtract("+t1+", "+t2+") = ");
     if (t2 == NO_OUTPUT) {
       //if (TEST) System.out.println("ret:"+t1);
@@ -176,11 +176,11 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
       pos++;
     }
 
-    TempMetaData ret;
+    TempTermData ret;
     if (diff == 0 && statsEqual(t1, t2) && bytesEqual(t1, t2)) {
       ret = NO_OUTPUT;
     } else {
-      ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
+      ret = new TempTermData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
     }
     //if (TEST) System.out.println("ret:"+ret);
     return ret;
@@ -190,7 +190,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
   // we can gain about 5~7% for fuzzy queries, however this also
   // means we are putting too much stress on FST Outputs decoding?
   @Override
-  public TempMetaData add(TempMetaData t1, TempMetaData t2) {
+  public TempTermData add(TempTermData t1, TempTermData t2) {
     //if (TEST) System.out.print("add("+t1+", "+t2+") = ");
     if (t1 == NO_OUTPUT) {
       //if (TEST) System.out.println("ret:"+t2);
@@ -209,18 +209,18 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
       pos++;
     }
 
-    TempMetaData ret;
+    TempTermData ret;
     if (t2.bytes != null || t2.docFreq > 0) {
-      ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
+      ret = new TempTermData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
     } else {
-      ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
+      ret = new TempTermData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
     }
     //if (TEST) System.out.println("ret:"+ret);
     return ret;
   }
 
   @Override
-  public void write(TempMetaData data, DataOutput out) throws IOException {
+  public void write(TempTermData data, DataOutput out) throws IOException {
     int bit0 = allZero(data.longs) ? 0 : 1;
     int bit1 = ((data.bytes == null || data.bytes.length == 0) ? 0 : 1) << 1;
     int bit2 = ((data.docFreq == 0) ? 0 : 1) << 2;
@@ -259,7 +259,7 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
   }
 
   @Override
-  public TempMetaData read(DataInput in) throws IOException {
+  public TempTermData read(DataInput in) throws IOException {
     long[] longs = new long[longsSize];
     byte[] bytes = null;
     int docFreq = 0;
@@ -292,29 +292,29 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
         docFreq = code;
       }
     }
-    return new TempMetaData(longs, bytes, docFreq, totalTermFreq);
+    return new TempTermData(longs, bytes, docFreq, totalTermFreq);
   }
 
   @Override
-  public TempMetaData getNoOutput() {
+  public TempTermData getNoOutput() {
     return NO_OUTPUT;
   }
 
   @Override
-  public String outputToString(TempMetaData data) {
+  public String outputToString(TempTermData data) {
     return data.toString();
   }
 
-  static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
+  static boolean statsEqual(final TempTermData t1, final TempTermData t2) {
     return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
   }
-  static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
+  static boolean bytesEqual(final TempTermData t1, final TempTermData t2) {
    if (t1.bytes == null && t2.bytes == null) {
      return true;
    }
    return t1.bytes != null && t2.bytes != null && Arrays.equals(t1.bytes, t2.bytes);
   }
-  static boolean longsEqual(final TempMetaData t1, final TempMetaData t2) {
+  static boolean longsEqual(final TempTermData t1, final TempTermData t2) {
     if (t1.longs == null && t2.longs == null) {
       return true;
     }
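
Taken together, common()/subtract()/add() above implement the output-sharing algebra the class comment describes: when one metadata long[] dominates the other element-wise, the smaller vector is shared toward the FST root and each arc keeps only the remainder. A worked example of that behavior (values are made up; semantics inferred from the comments on common()):

    public class OutputSharingDemo {
      public static void main(String[] args) {
        // Two terms' metadata longs, where t2 dominates t1 element-wise:
        long[] t1 = {10, 35};
        long[] t2 = {12, 40};
        // common(t1, t2) keeps the smaller vector and shares it on the parent arc...
        long[] shared = t1;
        // ...and subtract(t2, shared) leaves only the remainder on t2's arc:
        long[] remainder = {t2[0] - shared[0], t2[1] - shared[1]}; // {2, 5}
        // add(shared, remainder) reconstructs t2 while traversing at read time:
        long[] restored = {shared[0] + remainder[0], shared[1] + remainder[1]};
        System.out.println(java.util.Arrays.toString(restored)); // [12, 40]
        // For incomparable vectors such as {10, 35} vs {12, 30}, common()
        // returns NO_OUTPUT and nothing is shared between the two arcs.
      }
    }
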
@@ -104,14 +104,14 @@ import org.apache.lucene.util.packed.PackedInts;
  * and decoding the Postings Metadata and Term Metadata sections.</p>
  *
  * <ul>
- *    <li>TermsDict (.tim) --> Header, <i>Postings Header</i>, NodeBlock<sup>NumBlocks</sup>,
+ *    <li>TermsDict (.tim) --> Header, <i>PostingsHeader</i>, NodeBlock<sup>NumBlocks</sup>,
  *                               FieldSummary, DirOffset</li>
  *    <li>NodeBlock --> (OuterNode | InnerNode)</li>
- *    <li>OuterNode --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ><sup>EntryCount</sup>, MetaLength, <<i>Term Metadata</i>><sup>EntryCount</sup></li>
- *    <li>InnerNode --> EntryCount, SuffixLength[,Sub?], Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ? ><sup>EntryCount</sup>, MetaLength, <<i>Term Metadata ? </i>><sup>EntryCount</sup></li>
+ *    <li>OuterNode --> EntryCount, SuffixLength, Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ><sup>EntryCount</sup>, MetaLength, <<i>TermMetadata</i>><sup>EntryCount</sup></li>
+ *    <li>InnerNode --> EntryCount, SuffixLength[,Sub?], Byte<sup>SuffixLength</sup>, StatsLength, < TermStats ? ><sup>EntryCount</sup>, MetaLength, <<i>TermMetadata ? </i>><sup>EntryCount</sup></li>
 *    <li>TermStats --> DocFreq, TotalTermFreq </li>
 *    <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, RootCodeLength, Byte<sup>RootCodeLength</sup>,
- *        SumDocFreq, DocCount><sup>NumFields</sup></li>
+ *        SumTotalTermFreq?, SumDocFreq, DocCount><sup>NumFields</sup></li>
 *    <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *    <li>DirOffset --> {@link DataOutput#writeLong Uint64}</li>
 *    <li>EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength,NumFields,
@@ -133,7 +133,7 @@ import org.apache.lucene.util.packed.PackedInts;
 *    <li>SumDocFreq is the total number of postings, the number of term-document pairs across
 *        the entire field.</li>
 *    <li>DocCount is the number of documents that have at least one posting for this field.</li>
- *    <li>PostingsMetadata and TermMetadata are plugged into by the specific postings implementation:
+ *    <li>PostingsHeader and TermMetadata are plugged into by the specific postings implementation:
 *        these contain arbitrary per-file data (such as parameters or versioning information)
 *        and per-term data (such as pointers to inverted files).</li>
 *    <li>For inner nodes of the tree, every entry will steal one bit to mark whether it points
@@ -123,11 +123,11 @@ import org.apache.lucene.util.packed.PackedInts;
  *
  * <p>NOTE: The term dictionary can plug into different postings implementations:
  * the postings writer/reader are actually responsible for encoding
- * and decoding the Postings Metadata and Term Metadata sections described here:</p>
+ * and decoding the PostingsHeader and TermMetadata sections described here:</p>
  *
  * <ul>
- *   <li>Postings Metadata --> Header, PackedBlockSize</li>
- *   <li>Term Metadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
+ *   <li>PostingsHeader --> Header, PackedBlockSize</li>
+ *   <li>TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, PayFPDelta?,
 *       SkipFPDelta?</li>
 *   <li>Header, --> {@link CodecUtil#writeHeader CodecHeader}</li>
 *   <li>PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}</li>
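
Most of the TermMetadata fields above are file-pointer deltas stored as VLongs, so small gaps between consecutive terms cost only a few bytes. A quick sketch of that round trip (buffer size and pointer values are arbitrary):

    import org.apache.lucene.store.ByteArrayDataInput;
    import org.apache.lucene.store.ByteArrayDataOutput;

    public class VLongDeltaDemo {
      public static void main(String[] args) throws Exception {
        byte[] buffer = new byte[16];
        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        long prevFP = 0;
        long docFP = 1234567;            // absolute file pointer into the postings file
        out.writeVLong(docFP - prevFP);  // DocFPDelta: 3 bytes instead of a fixed 8
        ByteArrayDataInput in = new ByteArrayDataInput(buffer);
        long restored = prevFP + in.readVLong();
        System.out.println(restored);    // 1234567
      }
    }
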
@@ -324,11 +324,7 @@ public abstract class LuceneTestCase extends Assert {
     "MockFixedIntBlock",
     "MockVariableIntBlock",
     "MockSep",
-    "MockRandom",
-    "TempSep",
-    "TempFixedIntBlock",
-    "TempVariableIntBlock",
-    "TempRandom"
+    "MockRandom"
   ));
 
   // -----------------------------------------------------------------