reader part, support basic enums

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1500391 13f79535-47bb-0310-9956-ffa450edef68
Han Jiang 2013-07-07 09:14:17 +00:00
parent d6e2f4b663
commit 9f6db24cee
9 changed files with 288 additions and 105 deletions

View File

@ -56,7 +56,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* <li>
* <b>Block structure</b>:
* <p>When the postings are long enough, TempPostingsFormat will try to encode most integer data
* <p>When the postings are long enough, TempBlockPostingsFormat will try to encode most integer data
* as a packed block.</p>
* <p>Take a term with 259 documents as an example, the first 256 document ids are encoded as two packed
* blocks, while the remaining 3 are encoded as one VInt block. </p>
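A rough sketch of the split described above (illustrative only, not the writer's actual code; BLOCK_SIZE is the 128 declared later in this diff):

    // A term with 259 documents: full blocks are bit-packed, the tail is VInt-encoded.
    int docFreq = 259;
    int blockSize = 128;                     // TempBlockPostingsFormat.BLOCK_SIZE
    int packedBlocks = docFreq / blockSize;  // 2 packed blocks covering the first 256 doc ids
    int vIntDocs = docFreq % blockSize;      // the remaining 3 doc ids go into one VInt block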
@ -159,7 +159,7 @@ import org.apache.lucene.util.packed.PackedInts;
* <li>SkipFPDelta determines the position of this term's SkipData within the .doc
* file. In particular, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
* (i.e. 8 in TempPostingsFormat).</li>
* (i.e. 8 in TempBlockPostingsFormat).</li>
* <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
* of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the
* single document ID is written to the term dictionary.</li>
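A minimal sketch of the layout decisions those two items describe (a hypothetical helper, hedged; the real writer interleaves this with other per-term metadata):

    // Illustrative only: which pieces of a term's entry reach the terms dictionary.
    void writeTermEntry(DataOutput out, int docFreq, int singletonDocID,
                        long docFPDelta, long skipFPDelta) throws IOException {
      if (docFreq == 1) {
        out.writeVInt(singletonDocID);       // SingletonDocID: the lone doc id is inlined
      } else {
        out.writeVLong(docFPDelta);          // pointer to this term's blocks in the .doc file
      }
      if (docFreq >= 8) {                    // SkipMinimum in TempBlockPostingsFormat
        out.writeVLong(skipFPDelta);         // length of the TermFreq data; locates SkipData
      }
    }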
@ -239,7 +239,7 @@ import org.apache.lucene.util.packed.PackedInts;
* We use this trick since the definition of a skip entry is a little different from the base interface.
* In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for
* skipInterval<sup>th</sup>, 2*skipInterval<sup>th</sup> ... posting in the list. However,
* in TempPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
* in TempBlockPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
* 2*skipInterval+1<sup>th</sup> ... posting (skipInterval==PackedBlockSize in this case).
* When DocFreq is a multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one
* more skip data than TempSkipWriter. </li>
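A small worked example of the off-by-one being described, assuming skipInterval == PackedBlockSize == 128:

    // DocFreq = 256: MultiLevelSkipListWriter expects entries for postings 128 and 256,
    // while TempBlockPostingsFormat only writes one, for posting 129 (posting 257 does not exist).
    int docFreq = 256, skipInterval = 128;
    int expectedByBase = docFreq / skipInterval;        // 2
    int writtenByTemp = (docFreq - 1) / skipInterval;   // 1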
@ -352,7 +352,7 @@ import org.apache.lucene.util.packed.PackedInts;
* @lucene.experimental
*/
public final class TempPostingsFormat extends PostingsFormat {
public final class TempBlockPostingsFormat extends PostingsFormat {
/**
* Filename extension for document number, frequencies, and skip data.
* See chapter: <a href="#Frequencies">Frequencies and Skip Data</a>
@ -381,20 +381,17 @@ public final class TempPostingsFormat extends PostingsFormat {
// NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
public final static int BLOCK_SIZE = 128;
/** Creates {@code TempPostingsFormat} with default
/** Creates {@code TempBlockPostingsFormat} with default
* settings. */
public TempPostingsFormat() {
super("TempFST");
minTermBlockSize = 0;
maxTermBlockSize = 0;
//this(TempBlockTermsWriter.DEFAULT_MIN_BLOCK_SIZE, TempBlockTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
public TempBlockPostingsFormat() {
this(TempBlockTermsWriter.DEFAULT_MIN_BLOCK_SIZE, TempBlockTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/** Creates {@code TempPostingsFormat} with custom
/** Creates {@code TempBlockPostingsFormat} with custom
* values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
* @see TempBlockTermsWriter#TempBlockTermsWriter(SegmentWriteState,TempPostingsWriterBase,int,int) */
public TempPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
public TempBlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("TempBlock");
this.minTermBlockSize = minTermBlockSize;
assert minTermBlockSize > 1;
@ -413,11 +410,10 @@ public final class TempPostingsFormat extends PostingsFormat {
boolean success = false;
try {
//FieldsConsumer ret = new TempBlockTermsWriter(state,
// postingsWriter,
// minTermBlockSize,
// maxTermBlockSize);
FieldsConsumer ret = new TempFSTTermsWriter(state, postingsWriter);
FieldsConsumer ret = new TempBlockTermsWriter(state,
postingsWriter,
minTermBlockSize,
maxTermBlockSize);
success = true;
return ret;
} finally {
@ -436,14 +432,13 @@ public final class TempPostingsFormat extends PostingsFormat {
state.segmentSuffix);
boolean success = false;
try {
//FieldsProducer ret = new TempBlockTermsReader(state.directory,
// state.fieldInfos,
// state.segmentInfo,
// postingsReader,
// state.context,
// state.segmentSuffix,
// state.termsIndexDivisor);
FieldsProducer ret = new TempFSTTermsReader(state, postingsReader);
FieldsProducer ret = new TempBlockTermsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
postingsReader,
state.context,
state.segmentSuffix,
state.termsIndexDivisor);
success = true;
return ret;
} finally {

View File

@ -0,0 +1,77 @@
package org.apache.lucene.codecs.temp;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TempPostingsReaderBase;
import org.apache.lucene.codecs.TempPostingsWriterBase;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
public final class TempFSTPostingsFormat extends PostingsFormat {
public TempFSTPostingsFormat() {
super("TempFST");
}
@Override
public String toString() {
return getName();
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
TempPostingsWriterBase postingsWriter = new TempPostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret = new TempFSTTermsWriter(state, postingsWriter);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
TempPostingsReaderBase postingsReader = new TempPostingsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix);
boolean success = false;
try {
FieldsProducer ret = new TempFSTTermsReader(state, postingsReader);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}
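The new format registers under the SPI name "TempFST" (see the services file change near the end of this commit). A hedged sketch of wiring it in per field might look like the following; Lucene42Codec, WhitespaceAnalyzer, Version.LUCENE_43 and an existing Directory dir are assumptions, not part of this commit:

    // Illustrative only: route every field to the FST-based terms dictionary.
    Codec codec = new Lucene42Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return PostingsFormat.forName("TempFST");
      }
    };
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43,
                                                  new WhitespaceAnalyzer(Version.LUCENE_43));
    iwc.setCodec(codec);
    IndexWriter writer = new IndexWriter(dir, iwc);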

View File

@ -18,8 +18,6 @@ package org.apache.lucene.codecs.temp;
*/
import java.io.IOException;
import java.io.PrintWriter;
import java.io.File;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
@ -43,23 +41,19 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.TempPostingsReaderBase;
import org.apache.lucene.codecs.CodecUtil;
public class TempFSTTermsReader extends FieldsProducer {
final TempPostingsReaderBase postingsReader;
final IndexInput in;
final TreeMap<String, FieldReader> fields = new TreeMap<String, FieldReader>();
final TreeMap<String, TermsReader> fields = new TreeMap<String, TermsReader>();
boolean DEBUG = false;
public TempFSTTermsReader(SegmentReadState state, TempPostingsReaderBase postingsReader) throws IOException {
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTTermsWriter.TERMS_EXTENSION);
@ -83,8 +77,8 @@ public class TempFSTTermsReader extends FieldsProducer {
long sumDocFreq = in.readVLong();
int docCount = in.readVInt();
int longsSize = in.readVInt();
FieldReader current = new FieldReader(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
FieldReader previous = fields.put(fieldInfo.name, current);
TermsReader current = new TermsReader(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
TermsReader previous = fields.put(fieldInfo.name, current);
checkFieldSummary(state.segmentInfo, current, previous);
}
success = true;
@ -96,7 +90,8 @@ public class TempFSTTermsReader extends FieldsProducer {
}
private int readHeader(IndexInput in) throws IOException {
return CodecUtil.checkHeader(in, TempFSTTermsWriter.TERMS_CODEC_NAME,
return CodecUtil.checkHeader(in,
TempFSTTermsWriter.TERMS_CODEC_NAME,
TempFSTTermsWriter.TERMS_VERSION_START,
TempFSTTermsWriter.TERMS_VERSION_CURRENT);
}
@ -104,7 +99,7 @@ public class TempFSTTermsReader extends FieldsProducer {
in.seek(in.length() - 8);
in.seek(in.readLong());
}
private void checkFieldSummary(SegmentInfo info, FieldReader field, FieldReader previous) throws IOException {
private void checkFieldSummary(SegmentInfo info, TermsReader field, TermsReader previous) throws IOException {
// #docs with field must be <= #docs
if (field.docCount < 0 || field.docCount > info.getDocCount()) {
throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
@ -147,7 +142,7 @@ public class TempFSTTermsReader extends FieldsProducer {
}
}
final class FieldReader extends Terms {
final class TermsReader extends Terms {
final FieldInfo fieldInfo;
final long numTerms;
final long sumTotalTermFreq;
@ -156,16 +151,14 @@ public class TempFSTTermsReader extends FieldsProducer {
final int longsSize;
final FST<TempTermOutputs.TempMetaData> dict;
FieldReader(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException {
TermsReader(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException {
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.longsSize = longsSize;
this.dict = new FST<TempTermOutputs.TempMetaData>(in, new TempTermOutputs(longsSize));
//PrintWriter pw = new PrintWriter(new File("../temp/xxx.txt"));
//Util.toDot(dict, pw, false, false);
this.dict = new FST<TempTermOutputs.TempMetaData>(in, new TempTermOutputs(fieldInfo, longsSize));
}
// nocommit: implement intersect
@ -216,8 +209,32 @@ public class TempFSTTermsReader extends FieldsProducer {
}
// Iterates through terms in this field
private final class SegmentTermsEnum extends TermsEnum {
SegmentTermsEnum() {
final class SegmentTermsEnum extends TermsEnum {
final BytesRefFSTEnum<TempTermOutputs.TempMetaData> fstEnum;
/* Current term, null when enum ends or unpositioned */
BytesRef term;
/* Current term stats + decoded metadata (customized by PBF) */
final TempTermState state;
/* Current term stats + undecoded metadata (long[] & byte[]) */
TempTermOutputs.TempMetaData meta;
ByteArrayDataInput bytesReader;
/* True when current term's metadata is decoded */
boolean decoded;
/* True when current enum is 'positioned' by seekExact(TermState) */
boolean seekPending;
SegmentTermsEnum() throws IOException {
this.fstEnum = new BytesRefFSTEnum<TempTermOutputs.TempMetaData>(dict);
this.state = postingsReader.newTermState();
this.bytesReader = new ByteArrayDataInput();
this.term = null;
this.decoded = false;
this.seekPending = false;
}
@Override
@ -226,56 +243,115 @@ public class TempFSTTermsReader extends FieldsProducer {
}
@Override
public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
return null;
}
@Override
public BytesRef next() throws IOException {
return null;
public TermState termState() throws IOException {
decodeMetaData();
return state.clone();
}
@Override
public BytesRef term() {
return null;
return term;
}
@Override
public int docFreq() throws IOException {
return 0;
return state.docFreq;
}
@Override
public long totalTermFreq() throws IOException {
return 0;
return state.totalTermFreq;
}
// Let the PBF decode metadata from long[] and byte[]
private void decodeMetaData() throws IOException {
if (!decoded && !seekPending) {
if (meta.bytes != null) {
bytesReader.reset(meta.bytes, 0, meta.bytes.length);
}
postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state);
decoded = true;
}
}
// Update current enum according to FSTEnum
private void updateEnum(final InputOutput<TempTermOutputs.TempMetaData> pair) {
if (pair == null) {
term = null;
} else {
term = pair.input;
meta = pair.output;
state.docFreq = meta.docFreq;
state.totalTermFreq = meta.totalTermFreq;
}
decoded = false;
seekPending = false;
}
// nocommit: reuse?
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
return null;
decodeMetaData();
return postingsReader.docs(fieldInfo, state, liveDocs, reuse, flags);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
return null;
if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
return null;
}
decodeMetaData();
return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
}
@Override
public BytesRef next() throws IOException {
if (seekPending) { // previously positioned, but termOutputs not fetched
seekPending = false;
if (seekCeil(term, false) != SeekStatus.FOUND) {
return term;
}
}
updateEnum(fstEnum.next());
return term;
}
@Override
public boolean seekExact(final BytesRef target, final boolean useCache) throws IOException {
updateEnum(fstEnum.seekExact(target));
return term != null;
}
// nocommit: when will we useCache?
@Override
public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
updateEnum(fstEnum.seekCeil(target));
if (term == null) {
return SeekStatus.END;
} else {
return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
}
}
// nocommit: this method doesn't act as 'seekExact' right?
@Override
public void seekExact(BytesRef target, TermState otherState) {
if (term == null || target.compareTo(term) != 0) {
state.copyFrom(otherState);
term = BytesRef.deepCopyOf(target);
seekPending = true;
}
}
@Override
public TermState termState() throws IOException {
return null;
}
// nocommit: do we need this?
@Override
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long ord() {
return 0;
throw new UnsupportedOperationException();
}
}
}
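With next(), the seek methods and docs()/docsAndPositions() now wired to the FST enum and the postings reader, the class can be driven through the ordinary 4.x TermsEnum API. A hedged consumer-side sketch (the reader and field name are assumptions):

    // Illustrative only: walk the FST-backed terms dict and pull postings for each term.
    Terms terms = MultiFields.getTerms(indexReader, "body");
    TermsEnum te = terms.iterator(null);
    BytesRef term;
    while ((term = te.next()) != null) {
      DocsEnum docs = te.docs(null, null, DocsEnum.FLAG_FREQS);
      int doc;
      while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        // term occurs docs.freq() times in document 'doc'
      }
    }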

View File

@ -46,6 +46,7 @@ import org.apache.lucene.codecs.CodecUtil;
/** FST based term dict, all the metadata held
* as output of FST */
// nocommit: where is 'TermStats' ???
public class TempFSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tmp";
static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";
@ -135,7 +136,7 @@ public class TempFSTTermsWriter extends FieldsConsumer {
}
}
class TermsWriter extends TermsConsumer {
final class TermsWriter extends TermsConsumer {
private final Builder<TempTermOutputs.TempMetaData> builder;
private final TempTermOutputs outputs;
private final FieldInfo fieldInfo;
@ -143,13 +144,14 @@ public class TempFSTTermsWriter extends FieldsConsumer {
private long numTerms;
private final IntsRef scratchTerm = new IntsRef();
private final RAMOutputStream statsWriter = new RAMOutputStream();
private final RAMOutputStream metaWriter = new RAMOutputStream();
TermsWriter(FieldInfo fieldInfo) {
this.numTerms = 0;
this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = new TempTermOutputs(longsSize);
this.outputs = new TempTermOutputs(fieldInfo, longsSize);
this.builder = new Builder<TempTermOutputs.TempMetaData>(FST.INPUT_TYPE.BYTE1, outputs);
}
@ -166,16 +168,14 @@ public class TempFSTTermsWriter extends FieldsConsumer {
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
// write term meta data into fst
final TempTermOutputs.TempMetaData meta = new TempTermOutputs.TempMetaData();
meta.longs = new long[longsSize];
meta.bytes = null;
meta.docFreq = stats.docFreq;
meta.totalTermFreq = stats.totalTermFreq;
postingsWriter.finishTerm(meta.longs, metaWriter, stats);
/*
meta.bytes = new byte[(int)metaWriter.getFilePointer()];
metaWriter.writeTo(meta.bytes, 0);
metaWriter.reset();
*/
int bytesSize = (int)metaWriter.getFilePointer();
final int bytesSize = (int)metaWriter.getFilePointer();
if (bytesSize > 0) {
meta.bytes = new byte[bytesSize];
metaWriter.writeTo(meta.bytes, 0);
@ -191,6 +191,7 @@ public class TempFSTTermsWriter extends FieldsConsumer {
// save FST dict
if (numTerms > 0) {
final FST<TempTermOutputs.TempMetaData> fst = builder.finish();
//fst.dump();
fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
}
}

View File

@ -71,7 +71,7 @@ public final class TempPostingsReader extends TempPostingsReaderBase {
IndexInput posIn = null;
IndexInput payIn = null;
try {
docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.DOC_EXTENSION),
docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.DOC_EXTENSION),
ioContext);
CodecUtil.checkHeader(docIn,
TempPostingsWriter.DOC_CODEC,
@ -80,7 +80,7 @@ public final class TempPostingsReader extends TempPostingsReaderBase {
forUtil = new ForUtil(docIn);
if (fieldInfos.hasProx()) {
posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.POS_EXTENSION),
posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.POS_EXTENSION),
ioContext);
CodecUtil.checkHeader(posIn,
TempPostingsWriter.POS_CODEC,
@ -88,7 +88,7 @@ public final class TempPostingsReader extends TempPostingsReaderBase {
TempPostingsWriter.VERSION_CURRENT);
if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) {
payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.PAY_EXTENSION),
payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.PAY_EXTENSION),
ioContext);
CodecUtil.checkHeader(payIn,
TempPostingsWriter.PAY_CODEC,

View File

@ -119,7 +119,7 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
public TempPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
super();
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.DOC_EXTENSION),
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.DOC_EXTENSION),
state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
@ -129,7 +129,7 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[MAX_DATA_SIZE];
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.POS_EXTENSION),
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
@ -150,7 +150,7 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.PAY_EXTENSION),
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.PAY_EXTENSION),
state.context);
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
}

View File

@ -20,6 +20,8 @@ package org.apache.lucene.codecs.temp;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.Outputs;
@ -31,18 +33,25 @@ import org.apache.lucene.util.LongsRef;
public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
private final static TempMetaData NO_OUTPUT = new TempMetaData();
private static boolean DEBUG = false;
private FieldInfo fieldInfo;
private int longsSize;
public static class TempMetaData {
public long[] longs;
public byte[] bytes;
int docFreq;
long totalTermFreq;
TempMetaData() {
this.longs = null;
this.bytes = null;
this.docFreq = 0;
this.totalTermFreq = -1;
}
TempMetaData(long[] longs, byte[] bytes) {
TempMetaData(long[] longs, byte[] bytes, int docFreq, long totalTermFreq) {
this.longs = longs;
this.bytes = bytes;
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
@Override
public int hashCode() {
@ -79,12 +88,14 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
if (bytes != null) {
sb.append(" [ ");
for (int i = 0; i < bytes.length; i++) {
sb.append(bytes[i]+" ");
sb.append(Integer.toHexString((int)bytes[i] & 0xff)+" ");
}
sb.append("]");
} else {
sb.append(" null");
}
sb.append(" "+docFreq);
sb.append(" "+totalTermFreq);
return sb.toString();
}
}
@ -92,7 +103,8 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
private TempTermOutputs() {
}
protected TempTermOutputs(int longsSize) {
protected TempTermOutputs(FieldInfo fieldInfo, int longsSize) {
this.fieldInfo = fieldInfo;
this.longsSize = longsSize;
}
@ -102,13 +114,13 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
// i.e. when every value in long[] fits the same ordering, the smaller one
// will be the result.
//
// NOTE: only long[] is 'shared', i.e. after sharing common value,
// the output of smaller one will be a all-zero long[] with original byte[] blob.
// NOTE: only long[] is 'shared'; i.e. if there are two byte[] on successive
// arcs, only the last byte[] is valid. (Sharing byte[] as well would save some nodes,
// but might affect compression, since we would have to load the metadata block for
// other terms too; currently we don't support this.)
//
// nocommit: Builder.add() doesn't immediatelly consumes the output data,
// which means, the longs after one add() should all be deeply copied
// instead of being reused? quite hairly to detect it here, so the caller
// must be careful about this.
// nocommit: get the byte[] from smaller one as well, so that
// byte[] is actually inherited
//
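// Illustrative example of the rule above (made-up values, longsSize == 2):
//   common({longs=[2,7], bytes=A, df=5}, {longs=[2,9], bytes=B, df=3})
//     -> {longs=[2,7], bytes=null, df=0}   // element-wise smaller long[] survives,
//                                          // byte[] and stats are never shared
//   common(t, t) -> t when longs, bytes and stats are all equal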
public TempMetaData common(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
@ -148,18 +160,18 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
if (pos < longsSize || accum == 0) {
ret = NO_OUTPUT;
} else if (order) {
ret = new TempMetaData(longs2, null);
ret = new TempMetaData(longs2, null, 0, -1);
} else {
ret = new TempMetaData(longs1, null);
ret = new TempMetaData(longs1, null, 0, -1);
}
} else {
// equal
if (t1.bytes!= null && Arrays.equals(t1.bytes, t2.bytes)) { // all fields are equal
if (t1.bytes!= null && bytesEqual(t1, t2) && statsEqual(t1, t2)) { // all fields are equal
ret = t1;
} else if (accum == 0) { // all zero case
ret = NO_OUTPUT;
} else {
ret = new TempMetaData(longs1, null);
ret = new TempMetaData(longs1, null, 0, -1);
}
}
if (DEBUG) System.out.println("ret:"+ret);
@ -189,21 +201,27 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
}
TempMetaData ret;
if (diff == 0 && (t1.bytes == null || t1.bytes.length == 0)) {
if (diff == 0 && bytesEqual(t1, t2) && statsEqual(t1, t2)) {
ret = NO_OUTPUT;
} else {
ret = new TempMetaData(share, t1.bytes);
ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
}
static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
return Arrays.equals(t1.bytes, t2.bytes);
}
@Override
// nocommit: need to check all-zero case?
// so we can reuse one long[]
public TempMetaData add(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
// nocommit: necessary?
if (t1 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t2);
return t2;
@ -215,17 +233,17 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
assert t2.longs != null;
int pos = 0;
long[] accum = new long[longsSize]; // nocommit: reuse
long[] accum = new long[longsSize]; // nocommit: reuse?
while (pos < longsSize) {
accum[pos] = t1.longs[pos] + t2.longs[pos];
assert(accum[pos] >= 0);
pos++;
}
TempMetaData ret;
if (t2.bytes != null) {
ret = new TempMetaData(accum, t2.bytes);
if (t2.bytes != null || t2.docFreq > 0) {
ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
} else {
ret = new TempMetaData(accum, t1.bytes);
ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
return ret;
@ -236,14 +254,20 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
for (int pos = 0; pos < longsSize; pos++) {
out.writeVLong(data.longs[pos]);
}
int code = data.docFreq == 0 ? 0 : 1;
if (data.bytes != null) {
out.writeVInt(data.bytes.length);
out.writeVInt((data.bytes.length << 1) | code);
out.writeBytes(data.bytes, 0, data.bytes.length);
} else {
out.writeVInt(0);
out.writeVInt(code);
}
if (data.docFreq > 0) {
out.writeVInt(data.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
out.writeVLong(data.totalTermFreq - data.docFreq);
}
}
}
// nocommit: can this non-null byte case be used in Final Output?
@Override
public TempMetaData read(DataInput in) throws IOException {
@ -251,13 +275,22 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
for (int pos = 0; pos < longsSize; pos++) {
longs[pos] = in.readVLong();
}
int bytesSize = in.readVInt();
int code = in.readVInt();
int bytesSize = code >>> 1;
int docFreq = 0;
long totalTermFreq = -1;
byte[] bytes = null;
if (bytesSize > 0) {
bytes = new byte[bytesSize];
in.readBytes(bytes, 0, bytes.length);
}
TempMetaData meta = new TempMetaData(longs, bytes);
if ((code & 1) == 1) {
docFreq = in.readVInt();
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
totalTermFreq = docFreq + in.readVLong();
}
}
TempMetaData meta = new TempMetaData(longs, bytes, docFreq, totalTermFreq);
return meta;
}
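A quick worked example of the shared header VInt that write() and read() above agree on (illustrative numbers):

    int bytesLength = 5;                       // hypothetical metadata blob size
    int code = 1;                              // docFreq > 0, so stats follow
    int header = (bytesLength << 1) | code;    // written as one VInt: 11
    // reader side: bytesSize = header >>> 1 == 5, stats flag = (header & 1) == 1
    // a term with docFreq == 0 and no bytes writes just the VInt 0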

View File

@ -15,4 +15,5 @@
org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat
org.apache.lucene.codecs.temp.TempPostingsFormat
org.apache.lucene.codecs.temp.TempBlockPostingsFormat
org.apache.lucene.codecs.temp.TempFSTPostingsFormat

View File

@ -1015,7 +1015,7 @@ public class TestIndexWriterReader extends LuceneTestCase {
// Don't proceed if picked Codec is in the list of illegal ones.
final String format = _TestUtil.getPostingsFormat("f");
assumeFalse("Format: " + format + " does not support ReaderTermsIndexDivisor!",
(format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct")));
(format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct") || format.equals("TempFST")));
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, conf);