LUCENE-5675: initial scaffolding for new IDVPF

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5675@1594971 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-05-15 16:32:21 +00:00
parent ad7e4bd450
commit a301ab5610
3 changed files with 332 additions and 0 deletions

View File

@ -0,0 +1,96 @@
package org.apache.lucene.codecs.idversion;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.IOUtils;
/** A PostingsFormat for primary-key (ID) fields, that associates a
* long version with each ID and enables fast (using only the terms index)
* lookup for whether a given ID may have a version > N.
*
* The field is indexed as DOCS_ONLY, but the user must feed in the
* version as a payload on the first token.
*
* The docID and version for each ID is inlined into the terms dict.
*
* @lucene.experimental */
public abstract class IDVersionPostingsFormat extends PostingsFormat {
private final int minTermsInBlock;
private final int maxTermsInBlock;
public IDVersionPostingsFormat() {
this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
}
public IDVersionPostingsFormat(int minTermsInBlock, int maxTermsInBlock) {
super("IDVersion");
this.minTermsInBlock = minTermsInBlock;
this.maxTermsInBlock = maxTermsInBlock;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
PostingsWriterBase postingsWriter = new IDVersionPostingsWriter();
boolean success = false;
try {
FieldsConsumer ret = new BlockTreeTermsWriter(state,
postingsWriter,
minTermsInBlock,
maxTermsInBlock);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsWriter);
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
PostingsReaderBase postingsReader = new IDVersionPostingsReader();
boolean success = false;
try {
FieldsProducer ret = new BlockTreeTermsReader(state.directory,
state.fieldInfos,
state.segmentInfo,
postingsReader,
state.context,
state.segmentSuffix);
success = true;
return ret;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(postingsReader);
}
}
}
}

View File

@ -0,0 +1,82 @@
package org.apache.lucene.codecs.idversion;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.codecs.idversion.IDVersionPostingsWriter.IDVersionTermState;
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
public final class IDVersionPostingsReader extends PostingsReaderBase {
@Override
public void init(IndexInput termsIn) throws IOException {
// Make sure we are talking to the matching postings writer
CodecUtil.checkHeader(termsIn,
IDVersionPostingsWriter.TERMS_CODEC,
IDVersionPostingsWriter.VERSION_START,
IDVersionPostingsWriter.VERSION_CURRENT);
}
@Override
public BlockTermState newTermState() {
return new IDVersionTermState();
}
@Override
public void close() throws IOException {
}
@Override
public void decodeTerm(long[] longs, DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute)
throws IOException {
final IDVersionTermState termState = (IDVersionTermState) _termState;
termState.idVersion = Long.MAX_VALUE - longs[0];
termState.docID = in.readVInt();
}
@Override
public DocsEnum docs(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
// nocommit todo -- need a SingleDocDocsEnum
return null;
}
@Override
public DocsAndPositionsEnum docsAndPositions(FieldInfo fieldInfo, BlockTermState termState, Bits liveDocs,
DocsAndPositionsEnum reuse, int flags) {
return null;
}
@Override
public long ramBytesUsed() {
return 0;
}
@Override
public void checkIntegrity() throws IOException {
}
}

View File

@ -0,0 +1,154 @@
package org.apache.lucene.codecs.idversion;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.PushPostingsWriterBase;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
public final class IDVersionPostingsWriter extends PushPostingsWriterBase {
final static String TERMS_CODEC = "IDVersionPostingsWriterTerms";
// Increment version to change it
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
final static IDVersionTermState emptyState = new IDVersionTermState();
IDVersionTermState lastState;
private int lastDocID;
private int lastPosition;
private long lastVersion;
final static class IDVersionTermState extends BlockTermState {
long idVersion;
int docID;
@Override
public IDVersionTermState clone() {
IDVersionTermState other = new IDVersionTermState();
other.copyFrom(this);
return other;
}
@Override
public void copyFrom(TermState _other) {
super.copyFrom(_other);
IDVersionTermState other = (IDVersionTermState) _other;
idVersion = other.idVersion;
docID = other.docID;
}
}
@Override
public IDVersionTermState newTermState() {
return new IDVersionTermState();
}
@Override
public void init(IndexOutput termsOut) throws IOException {
CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
}
@Override
public int setField(FieldInfo fieldInfo) {
super.setField(fieldInfo);
if (fieldInfo.getIndexOptions() != FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) {
throw new IllegalArgumentException("field must be index using IndexOptions.DOCS_AND_FREQS_AND_POSITIONS");
}
lastState = emptyState;
return 0;
}
@Override
public void startTerm() {
lastDocID = -1;
}
@Override
public void startDoc(int docID, int termDocFreq) throws IOException {
if (lastDocID != -1) {
// nocommit need test
throw new IllegalArgumentException("term appears in more than one document");
}
if (termDocFreq != 1) {
// nocommit need test
throw new IllegalArgumentException("term appears more than once in the document");
}
lastDocID = docID;
lastPosition = -1;
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException {
if (lastPosition != -1) {
// nocommit need test
throw new IllegalArgumentException("term appears more than once in document");
}
lastPosition = position;
if (payload == null) {
// nocommit need test
throw new IllegalArgumentException("missing payload");
}
if (payload.length == 0) {
// nocommit need test
throw new IllegalArgumentException("payload.length == 0");
}
// nocommit decode payload to long here ... PayloadHelper!? or keep as byte[]?
lastVersion = 0;
}
@Override
public void finishDoc() throws IOException {
if (lastPosition == -1) {
// nocommit need test
throw new IllegalArgumentException("missing addPosition");
}
}
/** Called when we are done adding docs to this term */
@Override
public void finishTerm(BlockTermState _state) throws IOException {
IDVersionTermState state = (IDVersionTermState) _state;
assert state.docFreq > 0;
assert lastDocID != -1;
state.docID = lastDocID;
state.idVersion = lastVersion;
}
@Override
public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
IDVersionTermState state = (IDVersionTermState) _state;
// nocommit must send version up to FST somehow ...
out.writeVInt(state.docID);
}
@Override
public void close() throws IOException {
}
}