mirror of https://github.com/apache/lucene.git
LUCENE-3613: split out 4.0/3.x term vectors implementations
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1232652 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c4f57c6081
commit
d159f25b63
|
@ -51,8 +51,7 @@ public class Lucene3xCodec extends Codec {
|
||||||
// TODO: this should really be a different impl
|
// TODO: this should really be a different impl
|
||||||
private final StoredFieldsFormat fieldsFormat = new Lucene40StoredFieldsFormat();
|
private final StoredFieldsFormat fieldsFormat = new Lucene40StoredFieldsFormat();
|
||||||
|
|
||||||
// TODO: this should really be a different impl
|
private final TermVectorsFormat vectorsFormat = new Lucene3xTermVectorsFormat();
|
||||||
private final TermVectorsFormat vectorsFormat = new Lucene40TermVectorsFormat();
|
|
||||||
|
|
||||||
private final FieldInfosFormat fieldInfosFormat = new Lucene3xFieldInfosFormat();
|
private final FieldInfosFormat fieldInfosFormat = new Lucene3xFieldInfosFormat();
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
package org.apache.lucene.codecs.lucene3x;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||||
|
import org.apache.lucene.codecs.TermVectorsReader;
|
||||||
|
import org.apache.lucene.codecs.TermVectorsWriter;
|
||||||
|
import org.apache.lucene.index.FieldInfos;
|
||||||
|
import org.apache.lucene.index.SegmentInfo;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
|
||||||
|
public class Lucene3xTermVectorsFormat extends TermVectorsFormat {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TermVectorsReader vectorsReader(Directory directory,SegmentInfo segmentInfo, FieldInfos fieldInfos, IOContext context) throws IOException {
|
||||||
|
return new Lucene3xTermVectorsReader(directory, segmentInfo, fieldInfos, context);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
|
||||||
|
// TODO all these IAEs in preflex should be UOEs?
|
||||||
|
throw new IllegalArgumentException("this codec can only be used for reading");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
|
||||||
|
Lucene3xTermVectorsReader.files(dir, info, files);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,672 @@
|
||||||
|
package org.apache.lucene.codecs.lucene3x;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.TermVectorsReader;
|
||||||
|
import org.apache.lucene.index.CorruptIndexException;
|
||||||
|
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||||
|
import org.apache.lucene.index.DocsEnum;
|
||||||
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
import org.apache.lucene.index.FieldInfos;
|
||||||
|
import org.apache.lucene.index.Fields;
|
||||||
|
import org.apache.lucene.index.FieldsEnum;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.index.IndexFormatTooNewException;
|
||||||
|
import org.apache.lucene.index.IndexFormatTooOldException;
|
||||||
|
import org.apache.lucene.index.SegmentInfo;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexInput;
|
||||||
|
import org.apache.lucene.util.Bits;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
|
||||||
|
public class Lucene3xTermVectorsReader extends TermVectorsReader {
|
||||||
|
|
||||||
|
// NOTE: if you make a new format, it must be larger than
|
||||||
|
// the current format
|
||||||
|
|
||||||
|
// Changed strings to UTF8 with length-in-bytes not length-in-chars
|
||||||
|
static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
|
||||||
|
|
||||||
|
// NOTE: always change this if you switch to a new format!
|
||||||
|
// whenever you add a new format, make it 1 larger (positive version logic)!
|
||||||
|
public static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
|
||||||
|
|
||||||
|
// when removing support for old versions, leave the last supported version here
|
||||||
|
public static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES;
|
||||||
|
|
||||||
|
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
|
||||||
|
static final int FORMAT_SIZE = 4;
|
||||||
|
|
||||||
|
public static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
|
||||||
|
|
||||||
|
public static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
|
||||||
|
|
||||||
|
/** Extension of vectors fields file */
|
||||||
|
public static final String VECTORS_FIELDS_EXTENSION = "tvf";
|
||||||
|
|
||||||
|
/** Extension of vectors documents file */
|
||||||
|
public static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
|
||||||
|
|
||||||
|
/** Extension of vectors index file */
|
||||||
|
public static final String VECTORS_INDEX_EXTENSION = "tvx";
|
||||||
|
|
||||||
|
private FieldInfos fieldInfos;
|
||||||
|
|
||||||
|
private IndexInput tvx;
|
||||||
|
private IndexInput tvd;
|
||||||
|
private IndexInput tvf;
|
||||||
|
private int size;
|
||||||
|
private int numTotalDocs;
|
||||||
|
|
||||||
|
// The docID offset where our docs begin in the index
|
||||||
|
// file. This will be 0 if we have our own private file.
|
||||||
|
private int docStoreOffset;
|
||||||
|
|
||||||
|
private final int format;
|
||||||
|
|
||||||
|
/**
 * Builds a reader directly from already-opened inputs and precomputed
 * values; nothing is read or validated here. Used by {@link #clone()}.
 */
Lucene3xTermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int docStoreOffset, int format) {
  // format and document bookkeeping
  this.format = format;
  this.docStoreOffset = docStoreOffset;
  this.numTotalDocs = numTotalDocs;
  this.size = size;

  // field metadata and the three file inputs
  this.fieldInfos = fieldInfos;
  this.tvf = tvf;
  this.tvd = tvd;
  this.tvx = tvx;
}
|
||||||
|
|
||||||
|
public Lucene3xTermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
|
||||||
|
throws CorruptIndexException, IOException {
|
||||||
|
final String segment = si.getDocStoreSegment();
|
||||||
|
final int docStoreOffset = si.getDocStoreOffset();
|
||||||
|
final int size = si.docCount;
|
||||||
|
|
||||||
|
boolean success = false;
|
||||||
|
|
||||||
|
try {
|
||||||
|
String idxName = IndexFileNames.segmentFileName(segment, "", VECTORS_INDEX_EXTENSION);
|
||||||
|
tvx = d.openInput(idxName, context);
|
||||||
|
format = checkValidFormat(tvx);
|
||||||
|
String fn = IndexFileNames.segmentFileName(segment, "", VECTORS_DOCUMENTS_EXTENSION);
|
||||||
|
tvd = d.openInput(fn, context);
|
||||||
|
final int tvdFormat = checkValidFormat(tvd);
|
||||||
|
fn = IndexFileNames.segmentFileName(segment, "", VECTORS_FIELDS_EXTENSION);
|
||||||
|
tvf = d.openInput(fn, context);
|
||||||
|
final int tvfFormat = checkValidFormat(tvf);
|
||||||
|
|
||||||
|
assert format == tvdFormat;
|
||||||
|
assert format == tvfFormat;
|
||||||
|
|
||||||
|
numTotalDocs = (int) (tvx.length() >> 4);
|
||||||
|
|
||||||
|
if (-1 == docStoreOffset) {
|
||||||
|
this.docStoreOffset = 0;
|
||||||
|
this.size = numTotalDocs;
|
||||||
|
assert size == 0 || numTotalDocs == size;
|
||||||
|
} else {
|
||||||
|
this.docStoreOffset = docStoreOffset;
|
||||||
|
this.size = size;
|
||||||
|
// Verify the file is long enough to hold all of our
|
||||||
|
// docs
|
||||||
|
assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
this.fieldInfos = fieldInfos;
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
// With lock-less commits, it's entirely possible (and
|
||||||
|
// fine) to hit a FileNotFound exception above. In
|
||||||
|
// this case, we want to explicitly close any subset
|
||||||
|
// of things that were opened so that we don't have to
|
||||||
|
// wait for a GC to do so.
|
||||||
|
if (!success) {
|
||||||
|
close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Not private to avoid synthetic access$NNN methods
|
||||||
|
void seekTvx(final int docNum) throws IOException {
|
||||||
|
tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
|
||||||
|
{
|
||||||
|
int format = in.readInt();
|
||||||
|
if (format < FORMAT_MINIMUM)
|
||||||
|
throw new IndexFormatTooOldException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
|
||||||
|
if (format > FORMAT_CURRENT)
|
||||||
|
throw new IndexFormatTooNewException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
|
||||||
|
return format;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws IOException {
|
||||||
|
IOUtils.close(tvx, tvd, tvf);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return The number of documents in the reader
|
||||||
|
*/
|
||||||
|
int size() {
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
private class TVFields extends Fields {
|
||||||
|
private final int[] fieldNumbers;
|
||||||
|
private final long[] fieldFPs;
|
||||||
|
private final Map<Integer,Integer> fieldNumberToIndex = new HashMap<Integer,Integer>();
|
||||||
|
|
||||||
|
public TVFields(int docID) throws IOException {
|
||||||
|
seekTvx(docID);
|
||||||
|
tvd.seek(tvx.readLong());
|
||||||
|
|
||||||
|
final int fieldCount = tvd.readVInt();
|
||||||
|
assert fieldCount >= 0;
|
||||||
|
if (fieldCount != 0) {
|
||||||
|
fieldNumbers = new int[fieldCount];
|
||||||
|
fieldFPs = new long[fieldCount];
|
||||||
|
for(int fieldUpto=0;fieldUpto<fieldCount;fieldUpto++) {
|
||||||
|
final int fieldNumber = tvd.readVInt();
|
||||||
|
fieldNumbers[fieldUpto] = fieldNumber;
|
||||||
|
fieldNumberToIndex.put(fieldNumber, fieldUpto);
|
||||||
|
}
|
||||||
|
|
||||||
|
long position = tvx.readLong();
|
||||||
|
fieldFPs[0] = position;
|
||||||
|
for(int fieldUpto=1;fieldUpto<fieldCount;fieldUpto++) {
|
||||||
|
position += tvd.readVLong();
|
||||||
|
fieldFPs[fieldUpto] = position;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// TODO: we can improve writer here, eg write 0 into
|
||||||
|
// tvx file, so we know on first read from tvx that
|
||||||
|
// this doc has no TVs
|
||||||
|
fieldNumbers = null;
|
||||||
|
fieldFPs = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FieldsEnum iterator() throws IOException {
|
||||||
|
|
||||||
|
return new FieldsEnum() {
|
||||||
|
private int fieldUpto;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String next() throws IOException {
|
||||||
|
if (fieldNumbers != null && fieldUpto < fieldNumbers.length) {
|
||||||
|
return fieldInfos.fieldName(fieldNumbers[fieldUpto++]);
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Terms terms() throws IOException {
|
||||||
|
return TVFields.this.terms(fieldInfos.fieldName(fieldNumbers[fieldUpto-1]));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Terms terms(String field) throws IOException {
|
||||||
|
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
|
||||||
|
if (fieldInfo == null) {
|
||||||
|
// No such field
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
final Integer fieldIndex = fieldNumberToIndex.get(fieldInfo.number);
|
||||||
|
if (fieldIndex == null) {
|
||||||
|
// Term vectors were not indexed for this field
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new TVTerms(fieldFPs[fieldIndex]);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getUniqueFieldCount() {
|
||||||
|
if (fieldNumbers == null) {
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
return fieldNumbers.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class TVTerms extends Terms {
|
||||||
|
private final int numTerms;
|
||||||
|
private final long tvfFPStart;
|
||||||
|
|
||||||
|
public TVTerms(long tvfFP) throws IOException {
|
||||||
|
tvf.seek(tvfFP);
|
||||||
|
numTerms = tvf.readVInt();
|
||||||
|
tvfFPStart = tvf.getFilePointer();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TermsEnum iterator(TermsEnum reuse) throws IOException {
|
||||||
|
TVTermsEnum termsEnum;
|
||||||
|
if (reuse instanceof TVTermsEnum) {
|
||||||
|
termsEnum = (TVTermsEnum) reuse;
|
||||||
|
if (!termsEnum.canReuse(tvf)) {
|
||||||
|
termsEnum = new TVTermsEnum();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
termsEnum = new TVTermsEnum();
|
||||||
|
}
|
||||||
|
termsEnum.reset(numTerms, tvfFPStart);
|
||||||
|
return termsEnum;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getUniqueTermCount() {
|
||||||
|
return numTerms;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumTotalTermFreq() {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getSumDocFreq() {
|
||||||
|
// Every term occurs in just one doc:
|
||||||
|
return numTerms;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getDocCount() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
// TODO: really indexer hardwires
|
||||||
|
// this...? I guess codec could buffer and re-sort...
|
||||||
|
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class TVTermsEnum extends TermsEnum {
|
||||||
|
private final IndexInput origTVF;
|
||||||
|
private final IndexInput tvf;
|
||||||
|
private int numTerms;
|
||||||
|
private int nextTerm;
|
||||||
|
private int freq;
|
||||||
|
private BytesRef lastTerm = new BytesRef();
|
||||||
|
private BytesRef term = new BytesRef();
|
||||||
|
private boolean storePositions;
|
||||||
|
private boolean storeOffsets;
|
||||||
|
private long tvfFP;
|
||||||
|
|
||||||
|
private int[] positions;
|
||||||
|
private int[] startOffsets;
|
||||||
|
private int[] endOffsets;
|
||||||
|
|
||||||
|
// NOTE: tvf is pre-positioned by caller
|
||||||
|
public TVTermsEnum() throws IOException {
|
||||||
|
this.origTVF = Lucene3xTermVectorsReader.this.tvf;
|
||||||
|
tvf = (IndexInput) origTVF.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean canReuse(IndexInput tvf) {
|
||||||
|
return tvf == origTVF;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset(int numTerms, long tvfFPStart) throws IOException {
|
||||||
|
this.numTerms = numTerms;
|
||||||
|
nextTerm = 0;
|
||||||
|
tvf.seek(tvfFPStart);
|
||||||
|
final byte bits = tvf.readByte();
|
||||||
|
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
|
||||||
|
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
|
||||||
|
tvfFP = 1+tvfFPStart;
|
||||||
|
positions = null;
|
||||||
|
startOffsets = null;
|
||||||
|
endOffsets = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: slow! (linear scan)
|
||||||
|
@Override
|
||||||
|
public SeekStatus seekCeil(BytesRef text, boolean useCache)
|
||||||
|
throws IOException {
|
||||||
|
if (nextTerm != 0 && text.compareTo(term) < 0) {
|
||||||
|
nextTerm = 0;
|
||||||
|
tvf.seek(tvfFP);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (next() != null) {
|
||||||
|
final int cmp = text.compareTo(term);
|
||||||
|
if (cmp < 0) {
|
||||||
|
return SeekStatus.NOT_FOUND;
|
||||||
|
} else if (cmp == 0) {
|
||||||
|
return SeekStatus.FOUND;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return SeekStatus.END;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void seekExact(long ord) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef next() throws IOException {
|
||||||
|
if (nextTerm >= numTerms) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
term.copyBytes(lastTerm);
|
||||||
|
final int start = tvf.readVInt();
|
||||||
|
final int deltaLen = tvf.readVInt();
|
||||||
|
term.length = start + deltaLen;
|
||||||
|
term.grow(term.length);
|
||||||
|
tvf.readBytes(term.bytes, start, deltaLen);
|
||||||
|
freq = tvf.readVInt();
|
||||||
|
|
||||||
|
if (storePositions) {
|
||||||
|
// TODO: we could maybe reuse last array, if we can
|
||||||
|
// somehow be careful about consumer never using two
|
||||||
|
// D&PEnums at once...
|
||||||
|
positions = new int[freq];
|
||||||
|
int pos = 0;
|
||||||
|
for(int posUpto=0;posUpto<freq;posUpto++) {
|
||||||
|
pos += tvf.readVInt();
|
||||||
|
positions[posUpto] = pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (storeOffsets) {
|
||||||
|
startOffsets = new int[freq];
|
||||||
|
endOffsets = new int[freq];
|
||||||
|
int offset = 0;
|
||||||
|
for(int posUpto=0;posUpto<freq;posUpto++) {
|
||||||
|
startOffsets[posUpto] = offset + tvf.readVInt();
|
||||||
|
offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lastTerm.copyBytes(term);
|
||||||
|
nextTerm++;
|
||||||
|
return term;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef term() {
|
||||||
|
return term;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long ord() {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docFreq() {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long totalTermFreq() {
|
||||||
|
return freq;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs /* ignored */) throws IOException {
|
||||||
|
TVDocsEnum docsEnum;
|
||||||
|
if (reuse != null && reuse instanceof TVDocsEnum) {
|
||||||
|
docsEnum = (TVDocsEnum) reuse;
|
||||||
|
} else {
|
||||||
|
docsEnum = new TVDocsEnum();
|
||||||
|
}
|
||||||
|
docsEnum.reset(liveDocs, freq);
|
||||||
|
return docsEnum;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
|
||||||
|
if (needsOffsets && !storeOffsets) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!storePositions && !storeOffsets) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
TVDocsAndPositionsEnum docsAndPositionsEnum;
|
||||||
|
if (reuse != null && reuse instanceof TVDocsAndPositionsEnum) {
|
||||||
|
docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
|
||||||
|
} else {
|
||||||
|
docsAndPositionsEnum = new TVDocsAndPositionsEnum();
|
||||||
|
}
|
||||||
|
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
|
||||||
|
return docsAndPositionsEnum;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Comparator<BytesRef> getComparator() {
|
||||||
|
// TODO: really indexer hardwires
|
||||||
|
// this...? I guess codec could buffer and re-sort...
|
||||||
|
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE: sort of a silly class, since you can get the
|
||||||
|
// freq() already by TermsEnum.totalTermFreq
|
||||||
|
private static class TVDocsEnum extends DocsEnum {
|
||||||
|
private boolean didNext;
|
||||||
|
private int doc = -1;
|
||||||
|
private int freq;
|
||||||
|
private Bits liveDocs;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int freq() {
|
||||||
|
return freq;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() {
|
||||||
|
if (!didNext && (liveDocs == null || liveDocs.get(0))) {
|
||||||
|
didNext = true;
|
||||||
|
return (doc = 0);
|
||||||
|
} else {
|
||||||
|
return (doc = NO_MORE_DOCS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) {
|
||||||
|
if (!didNext && target == 0) {
|
||||||
|
return nextDoc();
|
||||||
|
} else {
|
||||||
|
return (doc = NO_MORE_DOCS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset(Bits liveDocs, int freq) {
|
||||||
|
this.liveDocs = liveDocs;
|
||||||
|
this.freq = freq;
|
||||||
|
this.doc = -1;
|
||||||
|
didNext = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class TVDocsAndPositionsEnum extends DocsAndPositionsEnum {
|
||||||
|
private boolean didNext;
|
||||||
|
private int doc = -1;
|
||||||
|
private int nextPos;
|
||||||
|
private Bits liveDocs;
|
||||||
|
private int[] positions;
|
||||||
|
private int[] startOffsets;
|
||||||
|
private int[] endOffsets;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int freq() {
|
||||||
|
if (positions != null) {
|
||||||
|
return positions.length;
|
||||||
|
} else {
|
||||||
|
assert startOffsets != null;
|
||||||
|
return startOffsets.length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() {
|
||||||
|
if (!didNext && (liveDocs == null || liveDocs.get(0))) {
|
||||||
|
didNext = true;
|
||||||
|
return (doc = 0);
|
||||||
|
} else {
|
||||||
|
return (doc = NO_MORE_DOCS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) {
|
||||||
|
if (!didNext && target == 0) {
|
||||||
|
return nextDoc();
|
||||||
|
} else {
|
||||||
|
return (doc = NO_MORE_DOCS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
|
||||||
|
this.liveDocs = liveDocs;
|
||||||
|
this.positions = positions;
|
||||||
|
this.startOffsets = startOffsets;
|
||||||
|
this.endOffsets = endOffsets;
|
||||||
|
this.doc = -1;
|
||||||
|
didNext = false;
|
||||||
|
nextPos = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef getPayload() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasPayload() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextPosition() {
|
||||||
|
assert (positions != null && nextPos < positions.length) ||
|
||||||
|
startOffsets != null && nextPos < startOffsets.length;
|
||||||
|
|
||||||
|
if (positions != null) {
|
||||||
|
return positions[nextPos++];
|
||||||
|
} else {
|
||||||
|
nextPos++;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int startOffset() {
|
||||||
|
assert startOffsets != null;
|
||||||
|
return startOffsets[nextPos-1];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int endOffset() {
|
||||||
|
assert endOffsets != null;
|
||||||
|
return endOffsets[nextPos-1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Fields get(int docID) throws IOException {
|
||||||
|
if (docID < 0 || docID >= numTotalDocs) {
|
||||||
|
throw new IllegalArgumentException("doID=" + docID + " is out of bounds [0.." + (numTotalDocs-1) + "]");
|
||||||
|
}
|
||||||
|
if (tvx != null) {
|
||||||
|
Fields fields = new TVFields(docID);
|
||||||
|
if (fields.getUniqueFieldCount() == 0) {
|
||||||
|
// TODO: we can improve writer here, eg write 0 into
|
||||||
|
// tvx file, so we know on first read from tvx that
|
||||||
|
// this doc has no TVs
|
||||||
|
return null;
|
||||||
|
} else {
|
||||||
|
return fields;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TermVectorsReader clone() {
|
||||||
|
IndexInput cloneTvx = null;
|
||||||
|
IndexInput cloneTvd = null;
|
||||||
|
IndexInput cloneTvf = null;
|
||||||
|
|
||||||
|
// These are null when a TermVectorsReader was created
|
||||||
|
// on a segment that did not have term vectors saved
|
||||||
|
if (tvx != null && tvd != null && tvf != null) {
|
||||||
|
cloneTvx = (IndexInput) tvx.clone();
|
||||||
|
cloneTvd = (IndexInput) tvd.clone();
|
||||||
|
cloneTvf = (IndexInput) tvf.clone();
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Lucene3xTermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, docStoreOffset, format);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
|
||||||
|
if (info.getHasVectors()) {
|
||||||
|
if (info.getDocStoreOffset() != -1) {
|
||||||
|
assert info.getDocStoreSegment() != null;
|
||||||
|
if (!info.getDocStoreIsCompoundFile()) {
|
||||||
|
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_INDEX_EXTENSION));
|
||||||
|
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_FIELDS_EXTENSION));
|
||||||
|
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_DOCUMENTS_EXTENSION));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
|
||||||
|
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
|
||||||
|
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -74,8 +74,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
||||||
static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
|
static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
|
||||||
|
|
||||||
/** Extension of vectors index file */
|
/** Extension of vectors index file */
|
||||||
// TODO: shouldnt be visible to segments reader, preflex should do this itself somehow
|
static final String VECTORS_INDEX_EXTENSION = "tvx";
|
||||||
public static final String VECTORS_INDEX_EXTENSION = "tvx";
|
|
||||||
|
|
||||||
private FieldInfos fieldInfos;
|
private FieldInfos fieldInfos;
|
||||||
|
|
||||||
|
@ -85,28 +84,22 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
||||||
private int size;
|
private int size;
|
||||||
private int numTotalDocs;
|
private int numTotalDocs;
|
||||||
|
|
||||||
// The docID offset where our docs begin in the index
|
|
||||||
// file. This will be 0 if we have our own private file.
|
|
||||||
private int docStoreOffset;
|
|
||||||
|
|
||||||
private final int format;
|
private final int format;
|
||||||
|
|
||||||
// used by clone
|
// used by clone
|
||||||
Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int docStoreOffset, int format) {
|
Lucene40TermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int format) {
|
||||||
this.fieldInfos = fieldInfos;
|
this.fieldInfos = fieldInfos;
|
||||||
this.tvx = tvx;
|
this.tvx = tvx;
|
||||||
this.tvd = tvd;
|
this.tvd = tvd;
|
||||||
this.tvf = tvf;
|
this.tvf = tvf;
|
||||||
this.size = size;
|
this.size = size;
|
||||||
this.numTotalDocs = numTotalDocs;
|
this.numTotalDocs = numTotalDocs;
|
||||||
this.docStoreOffset = docStoreOffset;
|
|
||||||
this.format = format;
|
this.format = format;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Lucene40TermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
|
public Lucene40TermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
|
||||||
throws CorruptIndexException, IOException {
|
throws CorruptIndexException, IOException {
|
||||||
final String segment = si.getDocStoreSegment();
|
final String segment = si.name;
|
||||||
final int docStoreOffset = si.getDocStoreOffset();
|
|
||||||
final int size = si.docCount;
|
final int size = si.docCount;
|
||||||
|
|
||||||
boolean success = false;
|
boolean success = false;
|
||||||
|
@ -127,17 +120,8 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
||||||
|
|
||||||
numTotalDocs = (int) (tvx.length() >> 4);
|
numTotalDocs = (int) (tvx.length() >> 4);
|
||||||
|
|
||||||
if (-1 == docStoreOffset) {
|
this.size = numTotalDocs;
|
||||||
this.docStoreOffset = 0;
|
assert size == 0 || numTotalDocs == size;
|
||||||
this.size = numTotalDocs;
|
|
||||||
assert size == 0 || numTotalDocs == size;
|
|
||||||
} else {
|
|
||||||
this.docStoreOffset = docStoreOffset;
|
|
||||||
this.size = size;
|
|
||||||
// Verify the file is long enough to hold all of our
|
|
||||||
// docs
|
|
||||||
assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
this.fieldInfos = fieldInfos;
|
this.fieldInfos = fieldInfos;
|
||||||
success = true;
|
success = true;
|
||||||
|
@ -165,7 +149,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
||||||
|
|
||||||
// Not private to avoid synthetic access$NNN methods
|
// Not private to avoid synthetic access$NNN methods
|
||||||
void seekTvx(final int docNum) throws IOException {
|
void seekTvx(final int docNum) throws IOException {
|
||||||
tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE);
|
tvx.seek(docNum * 16L + FORMAT_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean canReadRawDocs() {
|
boolean canReadRawDocs() {
|
||||||
|
@ -201,7 +185,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
while (count < numDocs) {
|
while (count < numDocs) {
|
||||||
final int docID = docStoreOffset + startDocID + count + 1;
|
final int docID = startDocID + count + 1;
|
||||||
assert docID <= numTotalDocs;
|
assert docID <= numTotalDocs;
|
||||||
if (docID < numTotalDocs) {
|
if (docID < numTotalDocs) {
|
||||||
tvdPosition = tvx.readLong();
|
tvdPosition = tvx.readLong();
|
||||||
|
@ -712,23 +696,14 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
||||||
cloneTvf = (IndexInput) tvf.clone();
|
cloneTvf = (IndexInput) tvf.clone();
|
||||||
}
|
}
|
||||||
|
|
||||||
return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, docStoreOffset, format);
|
return new Lucene40TermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, format);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
|
public static void files(Directory dir, SegmentInfo info, Set<String> files) throws IOException {
|
||||||
if (info.getHasVectors()) {
|
if (info.getHasVectors()) {
|
||||||
if (info.getDocStoreOffset() != -1) {
|
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
|
||||||
assert info.getDocStoreSegment() != null;
|
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
|
||||||
if (!info.getDocStoreIsCompoundFile()) {
|
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
|
||||||
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_INDEX_EXTENSION));
|
|
||||||
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_FIELDS_EXTENSION));
|
|
||||||
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_DOCUMENTS_EXTENSION));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_INDEX_EXTENSION));
|
|
||||||
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_FIELDS_EXTENSION));
|
|
||||||
files.add(IndexFileNames.segmentFileName(info.name, "", VECTORS_DOCUMENTS_EXTENSION));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.codecs.preflexrw;
|
||||||
import org.apache.lucene.codecs.FieldInfosFormat;
|
import org.apache.lucene.codecs.FieldInfosFormat;
|
||||||
import org.apache.lucene.codecs.NormsFormat;
|
import org.apache.lucene.codecs.NormsFormat;
|
||||||
import org.apache.lucene.codecs.PostingsFormat;
|
import org.apache.lucene.codecs.PostingsFormat;
|
||||||
|
import org.apache.lucene.codecs.TermVectorsFormat;
|
||||||
import org.apache.lucene.codecs.lucene3x.Lucene3xCodec;
|
import org.apache.lucene.codecs.lucene3x.Lucene3xCodec;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
@ -31,6 +32,7 @@ public class PreFlexRWCodec extends Lucene3xCodec {
|
||||||
private final PostingsFormat postings = new PreFlexRWPostingsFormat();
|
private final PostingsFormat postings = new PreFlexRWPostingsFormat();
|
||||||
private final NormsFormat norms = new PreFlexRWNormsFormat();
|
private final NormsFormat norms = new PreFlexRWNormsFormat();
|
||||||
private final FieldInfosFormat fieldInfos = new PreFlexRWFieldInfosFormat();
|
private final FieldInfosFormat fieldInfos = new PreFlexRWFieldInfosFormat();
|
||||||
|
private final TermVectorsFormat termVectors = new PreFlexRWTermVectorsFormat();
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public PostingsFormat postingsFormat() {
|
public PostingsFormat postingsFormat() {
|
||||||
|
@ -58,4 +60,13 @@ public class PreFlexRWCodec extends Lucene3xCodec {
|
||||||
return super.fieldInfosFormat();
|
return super.fieldInfosFormat();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Chooses the term vectors format for this codec.
 *
 * @return the {@code termVectors} field (a {@code PreFlexRWTermVectorsFormat}) while
 *         {@code LuceneTestCase.PREFLEX_IMPERSONATION_IS_ACTIVE} is true; otherwise
 *         whatever the superclass returns
 */
@Override
|
||||||
|
public TermVectorsFormat termVectorsFormat() {
|
||||||
|
if (LuceneTestCase.PREFLEX_IMPERSONATION_IS_ACTIVE) {
|
||||||
|
return termVectors;
|
||||||
|
} else {
|
||||||
|
return super.termVectorsFormat();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
package org.apache.lucene.codecs.preflexrw;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.TermVectorsWriter;
|
||||||
|
import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsFormat;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
|
||||||
|
/**
 * A {@link Lucene3xTermVectorsFormat} whose only override is {@link #vectorsWriter}:
 * it hands out a {@code PreFlexRWTermVectorsWriter}. Everything else is inherited
 * from the superclass.
 */
public class PreFlexRWTermVectorsFormat extends Lucene3xTermVectorsFormat {
|
||||||
|
|
||||||
|
/**
 * Creates a writer for the term vector files of one segment.
 *
 * @param directory directory passed through to the writer
 * @param segment   segment name passed through to the writer
 * @param context   IO context passed through to the writer
 * @return a new {@code PreFlexRWTermVectorsWriter}
 * @throws IOException propagated from the writer's constructor
 */
@Override
|
||||||
|
public TermVectorsWriter vectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
|
||||||
|
return new PreFlexRWTermVectorsWriter(directory, segment, context);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,216 @@
|
||||||
|
package org.apache.lucene.codecs.preflexrw;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.codecs.TermVectorsWriter;
|
||||||
|
import org.apache.lucene.codecs.lucene3x.Lucene3xTermVectorsReader;
|
||||||
|
import org.apache.lucene.index.FieldInfo;
|
||||||
|
import org.apache.lucene.index.IndexFileNames;
|
||||||
|
import org.apache.lucene.store.DataInput;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.store.IOContext;
|
||||||
|
import org.apache.lucene.store.IndexOutput;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.StringHelper;
|
||||||
|
|
||||||
|
// TODO: surrogates dance!
|
||||||
|
public final class PreFlexRWTermVectorsWriter extends TermVectorsWriter {
|
||||||
|
private final Directory directory;
|
||||||
|
private final String segment;
|
||||||
|
private IndexOutput tvx = null, tvd = null, tvf = null;
|
||||||
|
|
||||||
|
/**
 * Creates the three term vector outputs for {@code segment} (index, documents and
 * fields extensions, in that order) and writes {@code FORMAT_CURRENT} as the first
 * int of each.
 * <p>
 * If an exception escapes before all three headers are written, {@link #abort()}
 * is invoked from the {@code finally} block, which closes whatever was opened and
 * deletes the three files.
 *
 * @param directory directory in which the outputs are created
 * @param segment   segment name used to build the three file names
 * @param context   IO context handed to {@code createOutput}
 * @throws IOException propagated from creating or writing to an output
 */
public PreFlexRWTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
|
||||||
|
this.directory = directory;
|
||||||
|
this.segment = segment;
|
||||||
|
// flipped to true only after every header has been written
boolean success = false;
|
||||||
|
try {
|
||||||
|
// Open files for TermVector storage
|
||||||
|
tvx = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION), context);
|
||||||
|
tvx.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
|
||||||
|
tvd = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION), context);
|
||||||
|
tvd.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
|
||||||
|
tvf = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION), context);
|
||||||
|
tvf.writeInt(Lucene3xTermVectorsReader.FORMAT_CURRENT);
|
||||||
|
success = true;
|
||||||
|
} finally {
|
||||||
|
if (!success) {
|
||||||
|
// clean up partially created files
abort();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Begins a new document. Appends one index entry to tvx (the current tvd file
 * pointer followed by the current tvf file pointer, each as a long), writes the
 * field count to tvd as a VInt, and resets the per-document state
 * ({@code lastFieldName}, {@code fieldCount}).
 *
 * @param numVectorFields number of fields that will be written for this document
 * @throws IOException propagated from the underlying outputs
 */
@Override
|
||||||
|
public void startDocument(int numVectorFields) throws IOException {
|
||||||
|
lastFieldName = null;
|
||||||
|
this.numVectorFields = numVectorFields;
|
||||||
|
tvx.writeLong(tvd.getFilePointer());
|
||||||
|
tvx.writeLong(tvf.getFilePointer());
|
||||||
|
tvd.writeVInt(numVectorFields);
|
||||||
|
fieldCount = 0;
|
||||||
|
// startField stores one tvf pointer per field in fps; size it for this document
// (presumably ArrayUtil.grow guarantees at least numVectorFields slots - confirm)
fps = ArrayUtil.grow(fps, numVectorFields);
|
||||||
|
}
|
||||||
|
|
||||||
|
private long fps[] = new long[10]; // pointers to the tvf before writing each field
|
||||||
|
private int fieldCount = 0; // number of fields we have written so far for this document
|
||||||
|
private int numVectorFields = 0; // total number of fields we will write for this document
|
||||||
|
private String lastFieldName;
|
||||||
|
|
||||||
|
/**
 * Begins a new field of the current document.
 * <p>
 * Remembers the current tvf file pointer in {@code fps}, writes the field number
 * to tvd and the term count plus a flags byte to tvf. Once the last field of the
 * document has been started ({@code fieldCount == numVectorFields}), the
 * differences between consecutive remembered tvf pointers are written to tvd as
 * VLongs ({@code fieldCount - 1} values).
 * <p>
 * With assertions enabled, field names must arrive in strictly increasing
 * {@code String.compareTo} order within a document.
 *
 * @param info      field being written; its {@code name} and {@code number} are used
 * @param numTerms  number of terms, written to tvf
 * @param positions whether the positions flag bit is set for this field
 * @param offsets   whether the offsets flag bit is set for this field
 * @throws IOException propagated from the underlying outputs
 */
@Override
|
||||||
|
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException {
|
||||||
|
assert lastFieldName == null || info.name.compareTo(lastFieldName) > 0: "fieldName=" + info.name + " lastFieldName=" + lastFieldName;
|
||||||
|
lastFieldName = info.name;
|
||||||
|
this.positions = positions;
|
||||||
|
this.offsets = offsets;
|
||||||
|
// empty lastTerm so the first term of this field shares no prefix with a previous field's term
lastTerm.length = 0;
|
||||||
|
fps[fieldCount++] = tvf.getFilePointer();
|
||||||
|
tvd.writeVInt(info.number);
|
||||||
|
tvf.writeVInt(numTerms);
|
||||||
|
byte bits = 0x0;
|
||||||
|
if (positions)
|
||||||
|
bits |= Lucene3xTermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
|
||||||
|
if (offsets)
|
||||||
|
bits |= Lucene3xTermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
|
||||||
|
tvf.writeByte(bits);
|
||||||
|
|
||||||
|
assert fieldCount <= numVectorFields;
|
||||||
|
if (fieldCount == numVectorFields) {
|
||||||
|
// last field of the document
|
||||||
|
// this is crazy because the file format is crazy!
|
||||||
|
// the pointer deltas can only be written now, after every field number is already in tvd
for (int i = 1; i < fieldCount; i++) {
|
||||||
|
tvd.writeVLong(fps[i] - fps[i-1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final BytesRef lastTerm = new BytesRef(10);
|
||||||
|
|
||||||
|
// NOTE: we override addProx, so we don't need to buffer when indexing.
|
||||||
|
// we also don't buffer during bulk merges.
|
||||||
|
private int offsetStartBuffer[] = new int[10];
|
||||||
|
private int offsetEndBuffer[] = new int[10];
|
||||||
|
private int offsetIndex = 0;
|
||||||
|
private int offsetFreq = 0;
|
||||||
|
private boolean positions = false;
|
||||||
|
private boolean offsets = false;
|
||||||
|
|
||||||
|
/**
 * Begins a new term of the current field. Writes to tvf, in order: the length of
 * the prefix taken from {@code StringHelper.bytesDifference(lastTerm, term)}
 * (presumably the number of leading bytes shared with the previous term - confirm),
 * the remaining suffix length, the suffix bytes, and {@code freq}. Then resets the
 * position/offset delta bases to 0.
 * <p>
 * When the field has both positions and offsets, the offset buffers are sized for
 * {@code freq} entries and the buffering counters are reset; {@code addPosition}
 * fills and flushes them.
 *
 * @param term term bytes
 * @param freq term frequency, written to tvf
 * @throws IOException propagated from tvf
 */
@Override
|
||||||
|
public void startTerm(BytesRef term, int freq) throws IOException {
|
||||||
|
final int prefix = StringHelper.bytesDifference(lastTerm, term);
|
||||||
|
final int suffix = term.length - prefix;
|
||||||
|
tvf.writeVInt(prefix);
|
||||||
|
tvf.writeVInt(suffix);
|
||||||
|
tvf.writeBytes(term.bytes, term.offset + prefix, suffix);
|
||||||
|
tvf.writeVInt(freq);
|
||||||
|
lastTerm.copyBytes(term);
|
||||||
|
lastPosition = lastOffset = 0;
|
||||||
|
|
||||||
|
if (offsets && positions) {
|
||||||
|
// we might need to buffer if it's a non-bulk merge
|
||||||
|
offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq);
|
||||||
|
offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq);
|
||||||
|
offsetIndex = 0;
|
||||||
|
offsetFreq = freq;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int lastPosition = 0;
|
||||||
|
int lastOffset = 0;
|
||||||
|
|
||||||
|
/**
 * Copies already-encoded proximity data for the current term into tvf without
 * buffering: {@code numProx} VInts from {@code positions} (if non-null), followed
 * by {@code 2 * numProx} VInts from {@code offsets} (if non-null). The values are
 * re-encoded as read; no delta arithmetic is applied here.
 *
 * @param numProx   number of position entries / offset pairs to copy
 * @param positions source of position VInts, or {@code null} to skip
 * @param offsets   source of offset VInt pairs, or {@code null} to skip
 * @throws IOException propagated from the inputs or from tvf
 */
@Override
|
||||||
|
public void addProx(int numProx, DataInput positions, DataInput offsets) throws IOException {
|
||||||
|
// TODO: technically we could just copy bytes and not re-encode if we knew the length...
|
||||||
|
if (positions != null) {
|
||||||
|
for (int i = 0; i < numProx; i++) {
|
||||||
|
tvf.writeVInt(positions.readVInt());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (offsets != null) {
|
||||||
|
for (int i = 0; i < numProx; i++) {
|
||||||
|
// two VInts per occurrence
tvf.writeVInt(offsets.readVInt());
|
||||||
|
tvf.writeVInt(offsets.readVInt());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Adds one occurrence of the current term. What is written depends on the flags
 * captured by {@code startField}:
 * <ul>
 *   <li>positions and offsets: the position delta is written immediately, while the
 *       offsets are buffered; once {@code offsetFreq} occurrences have been seen,
 *       all buffered offsets are written after the positions (start as a delta
 *       from the previous end offset, end as a delta from its own start);</li>
 *   <li>positions only: the position delta is written;</li>
 *   <li>offsets only: the start delta and the length are written;</li>
 *   <li>neither: nothing is written.</li>
 * </ul>
 *
 * @param position    position of this occurrence
 * @param startOffset start offset of this occurrence
 * @param endOffset   end offset of this occurrence
 * @throws IOException propagated from tvf
 */
@Override
|
||||||
|
public void addPosition(int position, int startOffset, int endOffset) throws IOException {
|
||||||
|
if (positions && offsets) {
|
||||||
|
// write position delta
|
||||||
|
tvf.writeVInt(position - lastPosition);
|
||||||
|
lastPosition = position;
|
||||||
|
|
||||||
|
// buffer offsets
|
||||||
|
offsetStartBuffer[offsetIndex] = startOffset;
|
||||||
|
offsetEndBuffer[offsetIndex] = endOffset;
|
||||||
|
offsetIndex++;
|
||||||
|
|
||||||
|
// dump buffer if we are done
|
||||||
|
if (offsetIndex == offsetFreq) {
|
||||||
|
for (int i = 0; i < offsetIndex; i++) {
|
||||||
|
tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
|
||||||
|
tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
|
||||||
|
lastOffset = offsetEndBuffer[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (positions) {
|
||||||
|
// write position delta
|
||||||
|
tvf.writeVInt(position - lastPosition);
|
||||||
|
lastPosition = position;
|
||||||
|
} else if (offsets) {
|
||||||
|
// write offset deltas
|
||||||
|
tvf.writeVInt(startOffset - lastOffset);
|
||||||
|
tvf.writeVInt(endOffset - startOffset);
|
||||||
|
lastOffset = endOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Best-effort cleanup: closes the outputs, discarding any {@code IOException}
 * from {@link #close()}, then asks {@code IOUtils.deleteFilesIgnoringExceptions}
 * to remove the segment's index, documents and fields vector files. Also called
 * by the constructor when setup fails.
 */
@Override
|
||||||
|
public void abort() {
|
||||||
|
try {
|
||||||
|
close();
|
||||||
|
// deliberately ignored: the files are deleted below regardless
} catch (IOException ignored) {}
|
||||||
|
IOUtils.deleteFilesIgnoringExceptions(directory, IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_INDEX_EXTENSION),
|
||||||
|
IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_DOCUMENTS_EXTENSION),
|
||||||
|
IndexFileNames.segmentFileName(segment, "", Lucene3xTermVectorsReader.VECTORS_FIELDS_EXTENSION));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Sanity-checks the index output after all documents were written: the tvx file
 * pointer must equal {@code 4 + numDocs * 16}, i.e. the int header written by the
 * constructor plus the two longs {@code startDocument} writes per document.
 *
 * @param numDocs number of documents that should have been written
 * @throws RuntimeException if the tvx length does not match {@code numDocs}
 * @throws IOException declared by the signature; not thrown explicitly by this body
 */
@Override
|
||||||
|
public void finish(int numDocs) throws IOException {
|
||||||
|
// the cast to long keeps numDocs * 16 from overflowing int
if (4+((long) numDocs)*16 != tvx.getFilePointer())
|
||||||
|
// This is most likely a bug in Sun JRE 1.6.0_04/_05;
|
||||||
|
// we detect that the bug has struck, here, and
|
||||||
|
// throw an exception to prevent the corruption from
|
||||||
|
// entering the index. See LUCENE-1282 for
|
||||||
|
// details.
|
||||||
|
throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + numDocs + " but tvx size is " + tvx.getFilePointer() + " file=" + tvx.toString() + "; now aborting this merge to prevent index corruption");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Closes all three outputs (tvx, tvd, tvf) through {@code IOUtils.close} and then
 * clears the fields.
 *
 * @throws IOException propagated from {@code IOUtils.close}; in that case the
 *         fields are not reset to {@code null}
 */
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
// make an effort to close all streams we can but remember and re-throw
|
||||||
|
// the first exception encountered in this process
|
||||||
|
IOUtils.close(tvx, tvd, tvf);
|
||||||
|
tvx = tvd = tvf = null;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue