mirror of https://github.com/apache/lucene.git
reader part, support basic enums
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3069@1500391 13f79535-47bb-0310-9956-ffa450edef68
parent d6e2f4b663
commit 9f6db24cee
@@ -56,7 +56,7 @@ import org.apache.lucene.util.packed.PackedInts;
*
* <li>
* <b>Block structure</b>:
* <p>When the postings are long enough, TempPostingsFormat will try to encode most integer data
* <p>When the postings are long enough, TempBlockPostingsFormat will try to encode most integer data
* as a packed block.</p>
* <p>Take a term with 259 documents as an example, the first 256 document ids are encoded as two packed
* blocks, while the remaining 3 are encoded as one VInt block. </p>
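For orientation, the 259-document example above works out as follows with BLOCK_SIZE = 128 (a sketch, not code from this commit):

int packedBlocks = 259 / 128;  // 2 full packed blocks covering the first 256 docIDs
int vIntTail     = 259 % 128;  // 3 remaining docIDs are written as one VInt block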
@@ -159,7 +159,7 @@ import org.apache.lucene.util.packed.PackedInts;
* <li>SkipFPDelta determines the position of this term's SkipData within the .doc
* file. In particular, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum
* (i.e. 8 in TempPostingsFormat).</li>
* (i.e. 8 in TempBlockPostingsFormat).</li>
* <li>SingletonDocID is an optimization when a term only appears in one document. In this case, instead
* of writing a file pointer to the .doc file (DocFPDelta), and then a VIntBlock at that location, the
* single document ID is written to the term dictionary.</li>
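The SingletonDocID shortcut described above can be pictured like this (sketch only; the variable names here are hypothetical, not the writer's actual members):

if (stats.docFreq == 1) {
  singletonDocID = docID;                            // inline the only docID in the term dictionary
} else {
  docFPDelta = docOut.getFilePointer() - lastDocFP;  // otherwise point into the .doc file where the postings live
}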
@@ -239,7 +239,7 @@ import org.apache.lucene.util.packed.PackedInts;
* We use this trick since the definition of skip entry is a little different from base interface.
* In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for
* skipInterval<sup>th</sup>, 2*skipInterval<sup>th</sup> ... posting in the list. However,
* in TempPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
* in TempBlockPostingsFormat, the skip data is saved for skipInterval+1<sup>th</sup>,
* 2*skipInterval+1<sup>th</sup> ... posting (skipInterval==PackedBlockSize in this case).
* When DocFreq is multiple of PackedBlockSize, MultiLevelSkipListWriter will expect one
* more skip data than TempSkipWriter. </li>
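The off-by-one above can be checked with a quick count, e.g. for DocFreq = 256 and skipInterval = PackedBlockSize = 128 (illustration only):

int expectedByBase = 256 / 128;        // MultiLevelSkipListWriter: entries at postings 128, 256 -> 2 entries expected
int writtenByTemp  = (256 - 1) / 128;  // TempBlockPostingsFormat: entries at postings 129, 257 -> only 129 fits -> 1 entry written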
@@ -352,7 +352,7 @@ import org.apache.lucene.util.packed.PackedInts;
* @lucene.experimental
*/
public final class TempPostingsFormat extends PostingsFormat {
public final class TempBlockPostingsFormat extends PostingsFormat {
/**
* Filename extension for document number, frequencies, and skip data.
* See chapter: <a href="#Frequencies">Frequencies and Skip Data</a>
@@ -381,20 +381,17 @@ public final class TempPostingsFormat extends PostingsFormat {
// NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
public final static int BLOCK_SIZE = 128;
/** Creates {@code TempPostingsFormat} with default
/** Creates {@code TempBlockPostingsFormat} with default
* settings. */
public TempPostingsFormat() {
super("TempFST");
minTermBlockSize = 0;
maxTermBlockSize = 0;
//this(TempBlockTermsWriter.DEFAULT_MIN_BLOCK_SIZE, TempBlockTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
public TempBlockPostingsFormat() {
this(TempBlockTermsWriter.DEFAULT_MIN_BLOCK_SIZE, TempBlockTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
/** Creates {@code TempPostingsFormat} with custom
/** Creates {@code TempBlockPostingsFormat} with custom
* values for {@code minBlockSize} and {@code
* maxBlockSize} passed to block terms dictionary.
* @see TempBlockTermsWriter#TempBlockTermsWriter(SegmentWriteState,TempPostingsWriterBase,int,int) */
public TempPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
public TempBlockPostingsFormat(int minTermBlockSize, int maxTermBlockSize) {
super("TempBlock");
this.minTermBlockSize = minTermBlockSize;
assert minTermBlockSize > 1;
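For orientation, constructing the renamed format looks like this (sketch only; 32/64 are arbitrary example values, not the defaults):

PostingsFormat defaults = new TempBlockPostingsFormat();
PostingsFormat custom   = new TempBlockPostingsFormat(32, 64);  // minTermBlockSize must be > 1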
@@ -413,11 +410,10 @@ public final class TempPostingsFormat extends PostingsFormat {
boolean success = false;
try {
//FieldsConsumer ret = new TempBlockTermsWriter(state,
// postingsWriter,
// minTermBlockSize,
// maxTermBlockSize);
FieldsConsumer ret = new TempFSTTermsWriter(state, postingsWriter);
FieldsConsumer ret = new TempBlockTermsWriter(state,
postingsWriter,
minTermBlockSize,
maxTermBlockSize);
success = true;
return ret;
} finally {
@@ -436,14 +432,13 @@ public final class TempPostingsFormat extends PostingsFormat {
state.segmentSuffix);
boolean success = false;
try {
//FieldsProducer ret = new TempBlockTermsReader(state.directory,
// state.fieldInfos,
// state.segmentInfo,
// postingsReader,
// state.context,
// state.segmentSuffix,
// state.termsIndexDivisor);
FieldsProducer ret = new TempFSTTermsReader(state, postingsReader);
FieldsProducer ret = new TempBlockTermsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
postingsReader,
state.context,
state.segmentSuffix,
state.termsIndexDivisor);
success = true;
return ret;
} finally {
@@ -0,0 +1,77 @@
package org.apache.lucene.codecs.temp;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TempPostingsReaderBase;
import org.apache.lucene.codecs.TempPostingsWriterBase;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
public final class TempFSTPostingsFormat extends PostingsFormat {
public TempFSTPostingsFormat() {
super("TempFST");
}
@Override
public String toString() {
return getName();
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
TempPostingsWriterBase postingsWriter = new TempPostingsWriter(state);
boolean success = false;
try {
FieldsConsumer ret = new TempFSTTermsWriter(state, postingsWriter);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
TempPostingsReaderBase postingsReader = new TempPostingsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
state.context,
state.segmentSuffix);
boolean success = false;
try {
FieldsProducer ret = new TempFSTTermsReader(state, postingsReader);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}
@@ -18,8 +18,6 @@ package org.apache.lucene.codecs.temp;
*/
import java.io.IOException;
import java.io.PrintWriter;
import java.io.File;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
@@ -43,23 +41,19 @@ import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.TempPostingsReaderBase;
import org.apache.lucene.codecs.CodecUtil;
public class TempFSTTermsReader extends FieldsProducer {
final TempPostingsReaderBase postingsReader;
final IndexInput in;
final TreeMap<String, FieldReader> fields = new TreeMap<String, FieldReader>();
final TreeMap<String, TermsReader> fields = new TreeMap<String, TermsReader>();
boolean DEBUG = false;
public TempFSTTermsReader(SegmentReadState state, TempPostingsReaderBase postingsReader) throws IOException {
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTTermsWriter.TERMS_EXTENSION);
@@ -83,8 +77,8 @@ public class TempFSTTermsReader extends FieldsProducer {
long sumDocFreq = in.readVLong();
int docCount = in.readVInt();
int longsSize = in.readVInt();
FieldReader current = new FieldReader(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
FieldReader previous = fields.put(fieldInfo.name, current);
TermsReader current = new TermsReader(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
TermsReader previous = fields.put(fieldInfo.name, current);
checkFieldSummary(state.segmentInfo, current, previous);
}
success = true;
@@ -96,7 +90,8 @@ public class TempFSTTermsReader extends FieldsProducer {
}
private int readHeader(IndexInput in) throws IOException {
return CodecUtil.checkHeader(in, TempFSTTermsWriter.TERMS_CODEC_NAME,
return CodecUtil.checkHeader(in,
TempFSTTermsWriter.TERMS_CODEC_NAME,
TempFSTTermsWriter.TERMS_VERSION_START,
TempFSTTermsWriter.TERMS_VERSION_CURRENT);
}
@@ -104,7 +99,7 @@ public class TempFSTTermsReader extends FieldsProducer {
in.seek(in.length() - 8);
in.seek(in.readLong());
}
private void checkFieldSummary(SegmentInfo info, FieldReader field, FieldReader previous) throws IOException {
private void checkFieldSummary(SegmentInfo info, TermsReader field, TermsReader previous) throws IOException {
// #docs with field must be <= #docs
if (field.docCount < 0 || field.docCount > info.getDocCount()) {
throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
@@ -147,7 +142,7 @@ public class TempFSTTermsReader extends FieldsProducer {
}
}
final class FieldReader extends Terms {
final class TermsReader extends Terms {
final FieldInfo fieldInfo;
final long numTerms;
final long sumTotalTermFreq;
@@ -156,16 +151,14 @@ public class TempFSTTermsReader extends FieldsProducer {
final int longsSize;
final FST<TempTermOutputs.TempMetaData> dict;
FieldReader(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException {
TermsReader(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize) throws IOException {
this.fieldInfo = fieldInfo;
this.numTerms = numTerms;
this.sumTotalTermFreq = sumTotalTermFreq;
this.sumDocFreq = sumDocFreq;
this.docCount = docCount;
this.longsSize = longsSize;
this.dict = new FST<TempTermOutputs.TempMetaData>(in, new TempTermOutputs(longsSize));
//PrintWriter pw = new PrintWriter(new File("../temp/xxx.txt"));
//Util.toDot(dict, pw, false, false);
this.dict = new FST<TempTermOutputs.TempMetaData>(in, new TempTermOutputs(fieldInfo, longsSize));
}
// nocommit: implement intersect
@@ -216,8 +209,32 @@ public class TempFSTTermsReader extends FieldsProducer {
}
// Iterates through terms in this field
private final class SegmentTermsEnum extends TermsEnum {
SegmentTermsEnum() {
final class SegmentTermsEnum extends TermsEnum {
final BytesRefFSTEnum<TempTermOutputs.TempMetaData> fstEnum;
/* Current term, null when enum ends or unpositioned */
BytesRef term;
/* Current term stats + decoded metadata (customized by PBF) */
final TempTermState state;
/* Current term stats + undecoded metadata (long[] & byte[]) */
TempTermOutputs.TempMetaData meta;
ByteArrayDataInput bytesReader;
/* True when current term's metadata is decoded */
boolean decoded;
/* True when current enum is 'positioned' by seekExact(TermState) */
boolean seekPending;
SegmentTermsEnum() throws IOException {
this.fstEnum = new BytesRefFSTEnum<TempTermOutputs.TempMetaData>(dict);
this.state = postingsReader.newTermState();
this.bytesReader = new ByteArrayDataInput();
this.term = null;
this.decoded = false;
this.seekPending = false;
}
@Override
@@ -226,56 +243,115 @@ public class TempFSTTermsReader extends FieldsProducer {
}
@Override
public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
return null;
}
@Override
public BytesRef next() throws IOException {
return null;
public TermState termState() throws IOException {
decodeMetaData();
return state.clone();
}
@Override
public BytesRef term() {
return null;
return term;
}
@Override
public int docFreq() throws IOException {
return 0;
return state.docFreq;
}
@Override
public long totalTermFreq() throws IOException {
return 0;
return state.totalTermFreq;
}
// Let PBF decodes metadata from long[] and byte[]
private void decodeMetaData() throws IOException {
if (!decoded && !seekPending) {
if (meta.bytes != null) {
bytesReader.reset(meta.bytes, 0, meta.bytes.length);
}
postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state);
decoded = true;
}
}
// Update current enum according to FSTEnum
private void updateEnum(final InputOutput<TempTermOutputs.TempMetaData> pair) {
if (pair == null) {
term = null;
} else {
term = pair.input;
meta = pair.output;
state.docFreq = meta.docFreq;
state.totalTermFreq = meta.totalTermFreq;
}
decoded = false;
seekPending = false;
}
// nocommit: reuse?
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
return null;
decodeMetaData();
return postingsReader.docs(fieldInfo, state, liveDocs, reuse, flags);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
return null;
if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
return null;
}
decodeMetaData();
return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
}
@Override
public BytesRef next() throws IOException {
if (seekPending) { // previously positioned, but termOutputs not fetched
seekPending = false;
if (seekCeil(term, false) != SeekStatus.FOUND) {
return term;
}
}
updateEnum(fstEnum.next());
return term;
}
@Override
public boolean seekExact(final BytesRef target, final boolean useCache) throws IOException {
updateEnum(fstEnum.seekExact(target));
return term != null;
}
// nocommit: when will we useCache?
@Override
public SeekStatus seekCeil(final BytesRef target, final boolean useCache) throws IOException {
updateEnum(fstEnum.seekCeil(target));
if (term == null) {
return SeekStatus.END;
} else {
return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
}
}
// nocommit: this method doesn't act as 'seekExact' right?
@Override
public void seekExact(BytesRef target, TermState otherState) {
if (term == null || target.compareTo(term) != 0) {
state.copyFrom(otherState);
term = BytesRef.deepCopyOf(target);
seekPending = true;
}
}
@Override
public TermState termState() throws IOException {
return null;
}
// nocommit: do we need this?
@Override
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long ord() {
return 0;
throw new UnsupportedOperationException();
}
}
}
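For orientation, a typical consumer of the enum above looks like this (sketch only; 'reader' and the field name are hypothetical):

Terms terms = reader.terms("body");          // AtomicReader.terms(String)
TermsEnum termsEnum = terms.iterator(null);
BytesRef term;
while ((term = termsEnum.next()) != null) {
  DocsEnum docs = termsEnum.docs(null, null, DocsEnum.FLAG_FREQS);
  // docs()/docsAndPositions() above call decodeMetaData() and then delegate to postingsReader
}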
@@ -46,6 +46,7 @@ import org.apache.lucene.codecs.CodecUtil;
/** FST based term dict, all the metadata held
* as output of FST */
// nocommit: where is 'TermStats' ???
public class TempFSTTermsWriter extends FieldsConsumer {
static final String TERMS_EXTENSION = "tmp";
static final String TERMS_CODEC_NAME = "FST_TERMS_DICT";
@@ -135,7 +136,7 @@ public class TempFSTTermsWriter extends FieldsConsumer {
}
}
class TermsWriter extends TermsConsumer {
final class TermsWriter extends TermsConsumer {
private final Builder<TempTermOutputs.TempMetaData> builder;
private final TempTermOutputs outputs;
private final FieldInfo fieldInfo;
@@ -143,13 +144,14 @@ public class TempFSTTermsWriter extends FieldsConsumer {
private long numTerms;
private final IntsRef scratchTerm = new IntsRef();
private final RAMOutputStream statsWriter = new RAMOutputStream();
private final RAMOutputStream metaWriter = new RAMOutputStream();
TermsWriter(FieldInfo fieldInfo) {
this.numTerms = 0;
this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = new TempTermOutputs(longsSize);
this.outputs = new TempTermOutputs(fieldInfo, longsSize);
this.builder = new Builder<TempTermOutputs.TempMetaData>(FST.INPUT_TYPE.BYTE1, outputs);
}
@@ -166,16 +168,14 @@ public class TempFSTTermsWriter extends FieldsConsumer {
@Override
public void finishTerm(BytesRef text, TermStats stats) throws IOException {
// write term meta data into fst
final TempTermOutputs.TempMetaData meta = new TempTermOutputs.TempMetaData();
meta.longs = new long[longsSize];
meta.bytes = null;
meta.docFreq = stats.docFreq;
meta.totalTermFreq = stats.totalTermFreq;
postingsWriter.finishTerm(meta.longs, metaWriter, stats);
/*
meta.bytes = new byte[(int)metaWriter.getFilePointer()];
metaWriter.writeTo(meta.bytes, 0);
metaWriter.reset();
*/
int bytesSize = (int)metaWriter.getFilePointer();
final int bytesSize = (int)metaWriter.getFilePointer();
if (bytesSize > 0) {
meta.bytes = new byte[bytesSize];
metaWriter.writeTo(meta.bytes, 0);
@@ -191,6 +191,7 @@ public class TempFSTTermsWriter extends FieldsConsumer {
// save FST dict
if (numTerms > 0) {
final FST<TempTermOutputs.TempMetaData> fst = builder.finish();
//fst.dump();
fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst));
}
}
@@ -71,7 +71,7 @@ public final class TempPostingsReader extends TempPostingsReaderBase {
IndexInput posIn = null;
IndexInput payIn = null;
try {
docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.DOC_EXTENSION),
docIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.DOC_EXTENSION),
ioContext);
CodecUtil.checkHeader(docIn,
TempPostingsWriter.DOC_CODEC,
@@ -80,7 +80,7 @@ public final class TempPostingsReader extends TempPostingsReaderBase {
forUtil = new ForUtil(docIn);
if (fieldInfos.hasProx()) {
posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.POS_EXTENSION),
posIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.POS_EXTENSION),
ioContext);
CodecUtil.checkHeader(posIn,
TempPostingsWriter.POS_CODEC,
@@ -88,7 +88,7 @@ public final class TempPostingsReader extends TempPostingsReaderBase {
TempPostingsWriter.VERSION_CURRENT);
if (fieldInfos.hasPayloads() || fieldInfos.hasOffsets()) {
payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempPostingsFormat.PAY_EXTENSION),
payIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, TempBlockPostingsFormat.PAY_EXTENSION),
ioContext);
CodecUtil.checkHeader(payIn,
TempPostingsWriter.PAY_CODEC,
@@ -119,7 +119,7 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
public TempPostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException {
super();
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.DOC_EXTENSION),
docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.DOC_EXTENSION),
state.context);
IndexOutput posOut = null;
IndexOutput payOut = null;
@@ -129,7 +129,7 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
forUtil = new ForUtil(acceptableOverheadRatio, docOut);
if (state.fieldInfos.hasProx()) {
posDeltaBuffer = new int[MAX_DATA_SIZE];
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.POS_EXTENSION),
posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.POS_EXTENSION),
state.context);
CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT);
@@ -150,7 +150,7 @@ public final class TempPostingsWriter extends TempPostingsWriterBase {
}
if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) {
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempPostingsFormat.PAY_EXTENSION),
payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempBlockPostingsFormat.PAY_EXTENSION),
state.context);
CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT);
}
@@ -20,6 +20,8 @@ package org.apache.lucene.codecs.temp;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.fst.Outputs;
@@ -31,18 +33,25 @@ import org.apache.lucene.util.LongsRef;
public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
private final static TempMetaData NO_OUTPUT = new TempMetaData();
private static boolean DEBUG = false;
private FieldInfo fieldInfo;
private int longsSize;
public static class TempMetaData {
public long[] longs;
public byte[] bytes;
int docFreq;
long totalTermFreq;
TempMetaData() {
this.longs = null;
this.bytes = null;
this.docFreq = 0;
this.totalTermFreq = -1;
}
TempMetaData(long[] longs, byte[] bytes) {
TempMetaData(long[] longs, byte[] bytes, int docFreq, long totalTermFreq) {
this.longs = longs;
this.bytes = bytes;
this.docFreq = docFreq;
this.totalTermFreq = totalTermFreq;
}
@Override
public int hashCode() {
@@ -79,12 +88,14 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
if (bytes != null) {
sb.append(" [ ");
for (int i = 0; i < bytes.length; i++) {
sb.append(bytes[i]+" ");
sb.append(Integer.toHexString((int)bytes[i] & 0xff)+" ");
}
sb.append("]");
} else {
sb.append(" null");
}
sb.append(" "+docFreq);
sb.append(" "+totalTermFreq);
return sb.toString();
}
}
@@ -92,7 +103,8 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
private TempTermOutputs() {
}
protected TempTermOutputs(int longsSize) {
protected TempTermOutputs(FieldInfo fieldInfo, int longsSize) {
this.fieldInfo = fieldInfo;
this.longsSize = longsSize;
}
@@ -102,13 +114,13 @@ public class TempTermOutputs extends Outputs<TempTermOutputs.TempMetaData> {
// i.e. when every value in long[] fits the same ordering, the smaller one
// will be the result.
//
// NOTE: only long[] is 'shared', i.e. after sharing common value,
// the output of smaller one will be a all-zero long[] with original byte[] blob.
// NOTE: only long[] is 'shared', i.e. if there are two byte[] on the successive
// arcs, only the last byte[] is valid. (this somewhat saves nodes, but might affect
// compression, since we'll have to load metadata block for other terms as well, currently,
// we don't support this)
//
// nocommit: Builder.add() doesn't immediatelly consumes the output data,
// which means, the longs after one add() should all be deeply copied
// instead of being reused? quite hairly to detect it here, so the caller
// must be careful about this.
// nocommit: get the byte[] from smaller one as well, so that
// byte[] is actually inherited
//
public TempMetaData common(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
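A small illustration of the sharing rule described in the comments above (an assumption-based reading, not code from this commit):

outputs.common(new TempMetaData(new long[]{4, 9}, null, 0, -1),
               new TempMetaData(new long[]{2, 3}, null, 0, -1));  // -> {2, 3}: one output dominates, the smaller long[] is shared
outputs.common(new TempMetaData(new long[]{3, 5}, null, 0, -1),
               new TempMetaData(new long[]{2, 7}, null, 0, -1));  // -> NO_OUTPUT: no consistent ordering, nothing is shared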
@@ -148,18 +160,18 @@
if (pos < longsSize || accum == 0) {
ret = NO_OUTPUT;
} else if (order) {
ret = new TempMetaData(longs2, null);
ret = new TempMetaData(longs2, null, 0, -1);
} else {
ret = new TempMetaData(longs1, null);
ret = new TempMetaData(longs1, null, 0, -1);
}
} else {
// equal
if (t1.bytes!= null && Arrays.equals(t1.bytes, t2.bytes)) { // all fields are equal
if (t1.bytes!= null && bytesEqual(t1, t2) && statsEqual(t1, t2)) { // all fields are equal
ret = t1;
} else if (accum == 0) { // all zero case
ret = NO_OUTPUT;
} else {
ret = new TempMetaData(longs1, null);
ret = new TempMetaData(longs1, null, 0, -1);
}
}
if (DEBUG) System.out.println("ret:"+ret);
@@ -189,21 +201,27 @@
}
TempMetaData ret;
if (diff == 0 && (t1.bytes == null || t1.bytes.length == 0)) {
if (diff == 0 && bytesEqual(t1, t2) && statsEqual(t1, t2)) {
ret = NO_OUTPUT;
} else {
ret = new TempMetaData(share, t1.bytes);
ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
return ret;
}
static boolean statsEqual(final TempMetaData t1, final TempMetaData t2) {
return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
}
static boolean bytesEqual(final TempMetaData t1, final TempMetaData t2) {
return Arrays.equals(t1.bytes, t2.bytes);
}
@Override
// nocommit: need to check all-zero case?
// so we can reuse one long[]
public TempMetaData add(TempMetaData t1, TempMetaData t2) {
if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
// nocommit: necessary?
if (t1 == NO_OUTPUT) {
if (DEBUG) System.out.println("ret:"+t2);
return t2;
@@ -215,17 +233,17 @@
assert t2.longs != null;
int pos = 0;
long[] accum = new long[longsSize]; // nocommit: reuse
long[] accum = new long[longsSize]; // nocommit: reuse?
while (pos < longsSize) {
accum[pos] = t1.longs[pos] + t2.longs[pos];
assert(accum[pos] >= 0);
pos++;
}
TempMetaData ret;
if (t2.bytes != null) {
ret = new TempMetaData(accum, t2.bytes);
if (t2.bytes != null || t2.docFreq > 0) {
ret = new TempMetaData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
} else {
ret = new TempMetaData(accum, t1.bytes);
ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
}
if (DEBUG) System.out.println("ret:"+ret);
return ret;
@@ -236,14 +254,20 @@
for (int pos = 0; pos < longsSize; pos++) {
out.writeVLong(data.longs[pos]);
}
int code = data.docFreq == 0 ? 0 : 1;
if (data.bytes != null) {
out.writeVInt(data.bytes.length);
out.writeVInt((data.bytes.length << 1) | code);
out.writeBytes(data.bytes, 0, data.bytes.length);
} else {
out.writeVInt(0);
out.writeVInt(code);
}
if (data.docFreq > 0) {
out.writeVInt(data.docFreq);
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
out.writeVLong(data.totalTermFreq - data.docFreq);
}
}
}
// nocommit: can this non-null byte case be used in Final Output?
@Override
public TempMetaData read(DataInput in) throws IOException {
@@ -251,13 +275,22 @@
for (int pos = 0; pos < longsSize; pos++) {
longs[pos] = in.readVLong();
}
int bytesSize = in.readVInt();
int code = in.readVInt();
int bytesSize = code >>> 1;
int docFreq = 0;
long totalTermFreq = -1;
byte[] bytes = null;
if (bytesSize > 0) {
bytes = new byte[bytesSize];
in.readBytes(bytes, 0, bytes.length);
}
TempMetaData meta = new TempMetaData(longs, bytes);
if ((code & 1) == 1) {
docFreq = in.readVInt();
if (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
totalTermFreq = docFreq + in.readVLong();
}
}
TempMetaData meta = new TempMetaData(longs, bytes, docFreq, totalTermFreq);
return meta;
}
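A note on the packing used by write()/read() above: the low bit of the VInt records whether term stats follow, and the remaining bits carry the byte[] length, so the two sides round-trip as (illustration only):

int code = (bytes.length << 1) | (docFreq == 0 ? 0 : 1);  // writer side
int bytesSize = code >>> 1;                               // reader side: byte[] length
boolean hasStats = (code & 1) == 1;                       // reader side: docFreq (and totalTermFreq delta) follow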
@@ -15,4 +15,5 @@
org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat
org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat
org.apache.lucene.codecs.temp.TempPostingsFormat
org.apache.lucene.codecs.temp.TempBlockPostingsFormat
org.apache.lucene.codecs.temp.TempFSTPostingsFormat
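Because the formats are registered through this service provider file, they can be looked up by the names passed to super() in their constructors (usage sketch, not part of the commit):

PostingsFormat block = PostingsFormat.forName("TempBlock");
PostingsFormat fst   = PostingsFormat.forName("TempFST");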
@@ -1015,7 +1015,7 @@ public class TestIndexWriterReader extends LuceneTestCase {
// Don't proceed if picked Codec is in the list of illegal ones.
final String format = _TestUtil.getPostingsFormat("f");
assumeFalse("Format: " + format + " does not support ReaderTermsIndexDivisor!",
(format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct")));
(format.equals("SimpleText") || format.equals("Memory") || format.equals("Direct") || format.equals("TempFST")));
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, conf);