mirror of https://github.com/apache/lucene.git
LUCENE-984: reduce TermVectorsWriter to just the merging use case
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@576807 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 299d6357dd
commit ddb17c6397
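The diff below removes the streaming document/field/term API (openDocument, openField, addTerm, closeField, closeDocument) from TermVectorsWriter, leaving only the constructor, addAllDocVectors, and close, which is all that segment merging needs. A hedged sketch of the resulting call pattern; the merge-side names (reader, maxDoc, mergedSegment) are illustrative assumptions, not code from this commit:

    TermVectorsWriter writer = new TermVectorsWriter(directory, mergedSegment, fieldInfos);
    try {
      for (int docNum = 0; docNum < maxDoc; docNum++) {
        // null is fine here: addAllDocVectors still writes the tvx index
        // entry for a document that stored no term vectors
        TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
        writer.addAllDocVectors(vectors);
      }
    } finally {
      writer.close();
    }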
@@ -733,12 +733,12 @@ final class DocumentsWriter {
         if (docHasVectors) {
           if (tvx == null) {
             assert docStoreSegment != null;
-            tvx = directory.createOutput(docStoreSegment + TermVectorsWriter.TVX_EXTENSION);
-            tvx.writeInt(TermVectorsWriter.FORMAT_VERSION);
-            tvd = directory.createOutput(docStoreSegment + TermVectorsWriter.TVD_EXTENSION);
-            tvd.writeInt(TermVectorsWriter.FORMAT_VERSION);
-            tvf = directory.createOutput(docStoreSegment + TermVectorsWriter.TVF_EXTENSION);
-            tvf.writeInt(TermVectorsWriter.FORMAT_VERSION);
+            tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+            tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+            tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+            tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+            tvf = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
+            tvf.writeInt(TermVectorsReader.FORMAT_VERSION);
             files = null;

             // We must "catch up" for all docIDs that had no
@@ -1613,9 +1613,9 @@ final class DocumentsWriter {
       tvfLocal.writeVInt(numPostingsVectors);
       byte bits = 0x0;
       if (doVectorPositions)
-        bits |= TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR;
+        bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
       if (doVectorOffsets)
-        bits |= TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR;
+        bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
       tvfLocal.writeByte(bits);

       doVectorSort(postingsVectors, numPostingsVectors);

@@ -27,6 +27,14 @@ import java.io.IOException;
  * @version $Id$
  */
 class TermVectorsReader implements Cloneable {
+
+  static final int FORMAT_VERSION = 2;
+  //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
+  static final int FORMAT_SIZE = 4;
+
+  static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
+  static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
+
   private FieldInfos fieldInfos;

   private IndexInput tvx;
@@ -56,12 +64,12 @@ class TermVectorsReader implements Cloneable {
     boolean success = false;

     try {
-      if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
-        tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION, readBufferSize);
+      if (d.fileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) {
+        tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize);
         checkValidFormat(tvx);
-        tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION, readBufferSize);
+        tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
         tvdFormat = checkValidFormat(tvd);
-        tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION, readBufferSize);
+        tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
         tvfFormat = checkValidFormat(tvf);
         if (-1 == docStoreOffset) {
           this.docStoreOffset = 0;
@@ -92,10 +100,10 @@ class TermVectorsReader implements Cloneable {
   private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
   {
     int format = in.readInt();
-    if (format > TermVectorsWriter.FORMAT_VERSION)
+    if (format > FORMAT_VERSION)
     {
       throw new CorruptIndexException("Incompatible format version: " + format + " expected "
-              + TermVectorsWriter.FORMAT_VERSION + " or less");
+              + FORMAT_VERSION + " or less");
     }
     return format;
   }
@@ -125,7 +133,7 @@ class TermVectorsReader implements Cloneable {
     //We don't need to do this in other seeks because we already have the
     // file pointer
     //that was written in another file
-    tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
+    tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
     //System.out.println("TVX Pointer: " + tvx.getFilePointer());
     long position = tvx.readLong();

@@ -138,7 +146,7 @@ class TermVectorsReader implements Cloneable {
     int number = 0;
     int found = -1;
     for (int i = 0; i < fieldCount; i++) {
-      if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
+      if(tvdFormat == FORMAT_VERSION)
         number = tvd.readVInt();
       else
         number += tvd.readVInt();
@@ -192,7 +200,7 @@ class TermVectorsReader implements Cloneable {
     TermFreqVector[] result = null;
     if (tvx != null) {
       //We need to offset by
-      tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
+      tvx.seek(((docNum + docStoreOffset) * 8L) + FORMAT_SIZE);
       long position = tvx.readLong();

       tvd.seek(position);
@@ -204,7 +212,7 @@ class TermVectorsReader implements Cloneable {
     String[] fields = new String[fieldCount];

     for (int i = 0; i < fieldCount; i++) {
-      if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
+      if(tvdFormat == FORMAT_VERSION)
         number = tvd.readVInt();
       else
         number += tvd.readVInt();
@@ -232,7 +240,7 @@ class TermVectorsReader implements Cloneable {
     // Check if no term vectors are available for this segment at all
     if (tvx != null) {
       //We need to offset by
-      tvx.seek((docNumber * 8L) + TermVectorsWriter.FORMAT_SIZE);
+      tvx.seek((docNumber * 8L) + FORMAT_SIZE);
       long position = tvx.readLong();

       tvd.seek(position);
@@ -244,7 +252,7 @@ class TermVectorsReader implements Cloneable {
     String[] fields = new String[fieldCount];

     for (int i = 0; i < fieldCount; i++) {
-      if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
+      if(tvdFormat == FORMAT_VERSION)
         number = tvd.readVInt();
       else
         number += tvd.readVInt();
@@ -313,10 +321,10 @@ class TermVectorsReader implements Cloneable {
     boolean storePositions;
     boolean storeOffsets;

-    if(tvfFormat == TermVectorsWriter.FORMAT_VERSION){
+    if(tvfFormat == FORMAT_VERSION){
       byte bits = tvf.readByte();
-      storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
-      storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
+      storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
+      storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
     }
     else{
       tvf.readVInt();
@@ -470,4 +478,4 @@ class ParallelArrayTermVectorMapper extends TermVectorMapper
       }
     return tv;
   }
-}
+}

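With the constants now living in TermVectorsReader, the format handling above is self-contained. A minimal standalone sketch of the same version-and-flags decoding, assuming only the constants shown in this diff (the helper method itself is illustrative, not part of the class):

    static final int FORMAT_VERSION = 2;
    static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
    static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;

    // Formats newer than FORMAT_VERSION are rejected outright; older ones stay
    // readable, and the per-field flags byte says what follows each term.
    static boolean[] decodeFlags(int format, byte bits) {
      if (format > FORMAT_VERSION)
        throw new IllegalArgumentException("Incompatible format version: " + format);
      return new boolean[] {
        (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0,  // positions stored?
        (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0      // offsets stored?
      };
    }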
@@ -24,161 +24,23 @@ import org.apache.lucene.util.StringHelper;
 import java.io.IOException;
-import java.util.Vector;

-/**
- * Writer works by opening a document and then opening the fields within the document and then
- * writing out the vectors for each field.
- *
- * Rough usage:
- *
- <CODE>
- for each document
- {
- writer.openDocument();
- for each field on the document
- {
- writer.openField(field);
- for all of the terms
- {
- writer.addTerm(...)
- }
- writer.closeField
- }
- writer.closeDocument()
- }
- </CODE>
- *
- * @version $Id$
- *
- */
 final class TermVectorsWriter {
-  static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
-  static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
-
-  static final int FORMAT_VERSION = 2;
-  //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
-  static final int FORMAT_SIZE = 4;
-
-  static final String TVX_EXTENSION = ".tvx";
-  static final String TVD_EXTENSION = ".tvd";
-  static final String TVF_EXTENSION = ".tvf";

   private IndexOutput tvx = null, tvd = null, tvf = null;
-  private Vector fields = null;
-  private Vector terms = null;
   private FieldInfos fieldInfos;

-  private TVField currentField = null;
-  private long currentDocPointer = -1;
-
   public TermVectorsWriter(Directory directory, String segment,
                            FieldInfos fieldInfos)
     throws IOException {
     // Open files for TermVector storage
-    tvx = directory.createOutput(segment + TVX_EXTENSION);
-    tvx.writeInt(FORMAT_VERSION);
-    tvd = directory.createOutput(segment + TVD_EXTENSION);
-    tvd.writeInt(FORMAT_VERSION);
-    tvf = directory.createOutput(segment + TVF_EXTENSION);
-    tvf.writeInt(FORMAT_VERSION);
+    tvx = directory.createOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+    tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvd = directory.createOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
+    tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
+    tvf = directory.createOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
+    tvf.writeInt(TermVectorsReader.FORMAT_VERSION);

     this.fieldInfos = fieldInfos;
-    fields = new Vector(fieldInfos.size());
-    terms = new Vector();
   }

-
-  public final void openDocument()
-    throws IOException {
-    closeDocument();
-    currentDocPointer = tvd.getFilePointer();
-  }
-
-
-  public final void closeDocument()
-    throws IOException {
-    if (isDocumentOpen()) {
-      closeField();
-      writeDoc();
-      fields.clear();
-      currentDocPointer = -1;
-    }
-  }
-
-
-  public final boolean isDocumentOpen() {
-    return currentDocPointer != -1;
-  }
-
-
-  /** Start processing a field. This can be followed by a number of calls to
-   *  addTerm, and a final call to closeField to indicate the end of
-   *  processing of this field.  If a field was previously open, it is
-   *  closed automatically.
-   */
-  public final void openField(String field) throws IOException {
-    FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
-    openField(fieldInfo.number, fieldInfo.storePositionWithTermVector, fieldInfo.storeOffsetWithTermVector);
-  }
-
-  private void openField(int fieldNumber, boolean storePositionWithTermVector,
-      boolean storeOffsetWithTermVector) throws IOException{
-    if (!isDocumentOpen())
-      throw new IllegalStateException("Cannot open field when no document is open.");
-    closeField();
-    currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
-  }
-
-  /** Finished processing current field. This should be followed by a call to
-   *  openField before future calls to addTerm.
-   */
-  public final void closeField()
-    throws IOException {
-    if (isFieldOpen()) {
-      /* DEBUG */
-      //System.out.println("closeField()");
-      /* DEBUG */
-
-      // save field and terms
-      writeField();
-      fields.add(currentField);
-      terms.clear();
-      currentField = null;
-    }
-  }
-
-  /** Return true if a field is currently open. */
-  public final boolean isFieldOpen() {
-    return currentField != null;
-  }
-
-  /** Add term to the field's term vector. Fieldable must already be open.
-   *  Terms should be added in
-   *  increasing order of terms, one call per unique termNum. ProxPointer
-   *  is a pointer into the TermPosition file (prx).  Freq is the number of
-   *  times this term appears in this field, in this document.
-   * @throws IllegalStateException if document or field is not open
-   */
-  public final void addTerm(String termText, int freq) {
-    addTerm(termText, freq, null, null);
-  }
-
-  public final void addTerm(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets)
-  {
-    if (!isDocumentOpen())
-      throw new IllegalStateException("Cannot add terms when document is not open");
-    if (!isFieldOpen())
-      throw new IllegalStateException("Cannot add terms when field is not open");
-
-    addTermInternal(termText, freq, positions, offsets);
-  }
-
-  private final void addTermInternal(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) {
-    TVTerm term = new TVTerm();
-    term.termText = termText;
-    term.freq = freq;
-    term.positions = positions;
-    term.offsets = offsets;
-    terms.add(term);
-  }

   /**
@@ -190,185 +52,135 @@ final class TermVectorsWriter {
    */
   public final void addAllDocVectors(TermFreqVector[] vectors)
       throws IOException {
-    openDocument();
+
+    tvx.writeLong(tvd.getFilePointer());

     if (vectors != null) {
-      for (int i = 0; i < vectors.length; i++) {
-        boolean storePositionWithTermVector = false;
-        boolean storeOffsetWithTermVector = false;
-
-        try {
-
-          TermPositionVector tpVector = (TermPositionVector) vectors[i];
-
-          if (tpVector.size() > 0 && tpVector.getTermPositions(0) != null)
-            storePositionWithTermVector = true;
-          if (tpVector.size() > 0 && tpVector.getOffsets(0) != null)
-            storeOffsetWithTermVector = true;
-
-          FieldInfo fieldInfo = fieldInfos.fieldInfo(tpVector.getField());
-          openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
-
-          for (int j = 0; j < tpVector.size(); j++)
-            addTermInternal(tpVector.getTerms()[j], tpVector.getTermFrequencies()[j], tpVector.getTermPositions(j),
-                tpVector.getOffsets(j));
-
-          closeField();
-
-        } catch (ClassCastException ignore) {
-
-          TermFreqVector tfVector = vectors[i];
-
-          FieldInfo fieldInfo = fieldInfos.fieldInfo(tfVector.getField());
-          openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
-
-          for (int j = 0; j < tfVector.size(); j++)
-            addTermInternal(tfVector.getTerms()[j], tfVector.getTermFrequencies()[j], null, null);
-
-          closeField();
-        }
+      final int numFields = vectors.length;
+      tvd.writeVInt(numFields);
+
+      long[] fieldPointers = new long[numFields];
+
+      for (int i=0; i<numFields; i++) {
+        fieldPointers[i] = tvf.getFilePointer();
+
+        final int fieldNumber = fieldInfos.fieldNumber(vectors[i].getField());
+
+        // 1st pass: write field numbers to tvd
+        tvd.writeVInt(fieldNumber);
+
+        final int numTerms = vectors[i].size();
+        tvf.writeVInt(numTerms);
+
+        final TermPositionVector tpVector;
+        final TermFreqVector tfVector;
+
+        final byte bits;
+        final boolean storePositions;
+        final boolean storeOffsets;
+
+        if (vectors[i] instanceof TermPositionVector) {
+          // May have positions & offsets
+          tpVector = (TermPositionVector) vectors[i];
+          tfVector = null;
+          storePositions = tpVector.size() > 0 && tpVector.getTermPositions(0) != null;
+          storeOffsets = tpVector.size() > 0 && tpVector.getOffsets(0) != null;
+          bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : 0) +
+                         (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : 0));
+        } else {
+          tpVector = null;
+          tfVector = vectors[i];
+          bits = 0;
+          storePositions = false;
+          storeOffsets = false;
+        }
+
+        tvf.writeVInt(bits);
+
+        final String[] terms = vectors[i].getTerms();
+        final int[] freqs = vectors[i].getTermFrequencies();
+
+        String lastTermText = "";
+        for (int j=0; j<numTerms; j++) {
+          final String termText = terms[j];
+          int start = StringHelper.stringDifference(lastTermText, termText);
+          int length = termText.length() - start;
+          tvf.writeVInt(start);                    // write shared prefix length
+          tvf.writeVInt(length);                   // write delta length
+          tvf.writeChars(termText, start, length); // write delta chars
+          lastTermText = termText;
+
+          final int termFreq = freqs[j];
+
+          tvf.writeVInt(termFreq);
+
+          if (storePositions) {
+            final int[] positions = tpVector.getTermPositions(j);
+            if (positions == null)
+              throw new IllegalStateException("Trying to write positions that are null!");
+            assert positions.length == termFreq;
+
+            // use delta encoding for positions
+            int lastPosition = 0;
+            for(int k=0;k<positions.length;k++) {
+              final int position = positions[k];
+              tvf.writeVInt(position-lastPosition);
+              lastPosition = position;
+            }
+          }
+
+          if (storeOffsets) {
+            final TermVectorOffsetInfo[] offsets = tpVector.getOffsets(j);
+            if (offsets == null)
+              throw new IllegalStateException("Trying to write offsets that are null!");
+            assert offsets.length == termFreq;
+
+            // use delta encoding for offsets
+            int lastEndOffset = 0;
+            for(int k=0;k<offsets.length;k++) {
+              final int startOffset = offsets[k].getStartOffset();
+              final int endOffset = offsets[k].getEndOffset();
+              tvf.writeVInt(startOffset-lastEndOffset);
+              tvf.writeVInt(endOffset-startOffset);
+              lastEndOffset = endOffset;
+            }
+          }
+        }
       }
-    }

-    closeDocument();
+      // 2nd pass: write field pointers to tvd
+      long lastFieldPointer = 0;
+      for (int i=0; i<numFields; i++) {
+        final long fieldPointer = fieldPointers[i];
+        tvd.writeVLong(fieldPointer-lastFieldPointer);
+        lastFieldPointer = fieldPointer;
+      }
+    }
   }

   /** Close all streams. */
   final void close() throws IOException {
-    try {
-      closeDocument();
-    } finally {
-      // make an effort to close all streams we can but remember and re-throw
-      // the first exception encountered in this process
-      IOException keep = null;
-      if (tvx != null)
-        try {
-          tvx.close();
-        } catch (IOException e) {
-          if (keep == null) keep = e;
-        }
-      if (tvd != null)
-        try {
-          tvd.close();
-        } catch (IOException e) {
-          if (keep == null) keep = e;
-        }
-      if (tvf != null)
-        try {
-          tvf.close();
-        } catch (IOException e) {
-          if (keep == null) keep = e;
-        }
-      if (keep != null) throw (IOException) keep.fillInStackTrace();
-    }
-  }
-
-
-
-  private void writeField() throws IOException {
-    // remember where this field is written
-    currentField.tvfPointer = tvf.getFilePointer();
-    //System.out.println("Fieldable Pointer: " + currentField.tvfPointer);
-
-    final int size = terms.size();
-    tvf.writeVInt(size);
-
-    boolean storePositions = currentField.storePositions;
-    boolean storeOffsets = currentField.storeOffsets;
-    byte bits = 0x0;
-    if (storePositions)
-      bits |= STORE_POSITIONS_WITH_TERMVECTOR;
-    if (storeOffsets)
-      bits |= STORE_OFFSET_WITH_TERMVECTOR;
-    tvf.writeByte(bits);
-
-    String lastTermText = "";
-    for (int i = 0; i < size; i++) {
-      TVTerm term = (TVTerm) terms.elementAt(i);
-      int start = StringHelper.stringDifference(lastTermText, term.termText);
-      int length = term.termText.length() - start;
-      tvf.writeVInt(start); // write shared prefix length
-      tvf.writeVInt(length); // write delta length
-      tvf.writeChars(term.termText, start, length); // write delta chars
-      tvf.writeVInt(term.freq);
-      lastTermText = term.termText;
-
-      if(storePositions){
-        if(term.positions == null)
-          throw new IllegalStateException("Trying to write positions that are null!");
-
-        // use delta encoding for positions
-        int position = 0;
-        for (int j = 0; j < term.freq; j++){
-          tvf.writeVInt(term.positions[j] - position);
-          position = term.positions[j];
-        }
-      }
-
-      if(storeOffsets){
-        if(term.offsets == null)
-          throw new IllegalStateException("Trying to write offsets that are null!");
-
-        // use delta encoding for offsets
-        int position = 0;
-        for (int j = 0; j < term.freq; j++) {
-          tvf.writeVInt(term.offsets[j].getStartOffset() - position);
-          tvf.writeVInt(term.offsets[j].getEndOffset() - term.offsets[j].getStartOffset()); //Save the diff between the two.
-          position = term.offsets[j].getEndOffset();
-        }
-      }
-    }
-  }
-
-  private void writeDoc() throws IOException {
-    if (isFieldOpen())
-      throw new IllegalStateException("Field is still open while writing document");
-    //System.out.println("Writing doc pointer: " + currentDocPointer);
-    // write document index record
-    tvx.writeLong(currentDocPointer);
-
-    // write document data record
-    final int size = fields.size();
-
-    // write the number of fields
-    tvd.writeVInt(size);
-
-    // write field numbers
-    for (int i = 0; i < size; i++) {
-      TVField field = (TVField) fields.elementAt(i);
-      tvd.writeVInt(field.number);
-    }
-
-    // write field pointers
-    long lastFieldPointer = 0;
-    for (int i = 0; i < size; i++) {
-      TVField field = (TVField) fields.elementAt(i);
-      tvd.writeVLong(field.tvfPointer - lastFieldPointer);
-      lastFieldPointer = field.tvfPointer;
-    }
-    //System.out.println("After writing doc pointer: " + tvx.getFilePointer());
-  }
-
-
-  private static class TVField {
-    int number;
-    long tvfPointer = 0;
-    boolean storePositions = false;
-    boolean storeOffsets = false;
-    TVField(int number, boolean storePos, boolean storeOff) {
-      this.number = number;
-      storePositions = storePos;
-      storeOffsets = storeOff;
-    }
-  }
-
-  private static class TVTerm {
-    String termText;
-    int freq = 0;
-    int positions[] = null;
-    TermVectorOffsetInfo [] offsets = null;
-  }
-
+    // make an effort to close all streams we can but remember and re-throw
+    // the first exception encountered in this process
+    IOException keep = null;
+    if (tvx != null)
+      try {
+        tvx.close();
+      } catch (IOException e) {
+        if (keep == null) keep = e;
+      }
+    if (tvd != null)
+      try {
+        tvd.close();
+      } catch (IOException e) {
+        if (keep == null) keep = e;
+      }
+    if (tvf != null)
+      try {
+        tvf.close();
+      } catch (IOException e) {
+        if (keep == null) keep = e;
+      }
+    if (keep != null) throw (IOException) keep.fillInStackTrace();
+  }
 }

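The rewritten addAllDocVectors keeps the on-disk conventions of the old writeField: term texts are prefix-compressed against the previous (sorted) term, and positions are delta-encoded. A self-contained sketch of that encoding; the List<Object> stands in for Lucene's IndexOutput purely for illustration:

    import java.util.ArrayList;
    import java.util.List;

    class TvfEncodingSketch {
      // same idea as StringHelper.stringDifference
      static int sharedPrefix(String a, String b) {
        int i = 0, end = Math.min(a.length(), b.length());
        while (i < end && a.charAt(i) == b.charAt(i)) i++;
        return i;
      }

      public static void main(String[] args) {
        String[] terms = {"apple", "applet", "apply"};      // must be sorted
        int[][] positions = {{1, 7}, {3}, {2, 5, 9}};       // ascending per term
        List<Object> out = new ArrayList<Object>();         // stand-in for tvf

        String last = "";
        for (int j = 0; j < terms.length; j++) {
          int start = sharedPrefix(last, terms[j]);
          out.add(start);                                   // shared prefix length
          out.add(terms[j].length() - start);               // delta length
          out.add(terms[j].substring(start));               // delta chars
          last = terms[j];

          out.add(positions[j].length);                     // term freq
          int lastPosition = 0;
          for (int k = 0; k < positions[j].length; k++) {
            out.add(positions[j][k] - lastPosition);        // delta-encoded position
            lastPosition = positions[j][k];
          }
        }
        // prints [0, 5, apple, 2, 1, 6, 5, 1, t, 1, 3, 4, 1, y, 3, 2, 3, 4]
        System.out.println(out);
      }
    }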
@@ -1444,6 +1444,26 @@ public class TestIndexWriter extends TestCase
     }
     dir.delete();
   }
+
+  /**
+   * Test that no NullPointerException will be raised,
+   * when adding one document with a single, empty field
+   * and term vectors enabled.
+   * @throws IOException
+   *
+   */
+  public void testBadSegment() throws IOException {
+    MockRAMDirectory dir = new MockRAMDirectory();
+    IndexWriter ir = new IndexWriter(dir, new StandardAnalyzer(), true);
+
+    Document document = new Document();
+    document.add(new Field("tvtest", "", Field.Store.NO, Field.Index.TOKENIZED,
+        Field.TermVector.YES));
+    ir.addDocument(document);
+    ir.close();
+    dir.close();
+  }
+
 }

@@ -18,16 +18,21 @@ package org.apache.lucene.index;
  */

 import junit.framework.TestCase;
-import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Document;

 import java.io.IOException;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.SortedSet;

 public class TestTermVectorsReader extends TestCase {
-  private TermVectorsWriter writer = null;
   //Must be lexicographically sorted, will do in setup, versus trying to maintain here
   private String[] testFields = {"f1", "f2", "f3", "f4"};
   private boolean[] testFieldsStorePos = {true, false, true, false};
@@ -35,49 +40,105 @@ public class TestTermVectorsReader extends TestCase {
   private String[] testTerms = {"this", "is", "a", "test"};
   private int[][] positions = new int[testTerms.length][];
   private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
-  private RAMDirectory dir = new RAMDirectory();
-  private String seg = "testSegment";
+  private MockRAMDirectory dir = new MockRAMDirectory();
+  private String seg;
   private FieldInfos fieldInfos = new FieldInfos();
+  private static int TERM_FREQ = 3;

   public TestTermVectorsReader(String s) {
     super(s);
   }

+  private class TestToken implements Comparable {
+    String text;
+    int pos;
+    int startOffset;
+    int endOffset;
+    public int compareTo(Object other) {
+      return pos - ((TestToken) other).pos;
+    }
+  }
+
+  TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];
+
   protected void setUp() throws IOException {
+    /*
     for (int i = 0; i < testFields.length; i++) {
       fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
     }
+    */

-    for (int i = 0; i < testTerms.length; i++) {
-      positions[i] = new int[3];
-      for (int j = 0; j < positions[i].length; j++) {
-        // poditions are always sorted in increasing order
-        positions[i][j] = (int) (j * 10 + Math.random() * 10);
-      }
-      offsets[i] = new TermVectorOffsetInfo[3];
-      for (int j = 0; j < offsets[i].length; j++) {
-        // ofsets are alway sorted in increasing order
-        offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
-      }
-    }
     Arrays.sort(testTerms);
-    //Create 5 documents for testing, they all have the same terms
-    writer = new TermVectorsWriter(dir, seg, fieldInfos);
-    for (int j = 0; j < 5; j++) {
-
-      writer.openDocument();
-
-      for (int k = 0; k < testFields.length; k++) {
-        writer.openField(testFields[k]);
-        for (int i = 0; i < testTerms.length; i++) {
-          writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
-        }
-        writer.closeField();
-      }
-      writer.closeDocument();
-
-    }
+
+    int tokenUpto = 0;
+    for (int i = 0; i < testTerms.length; i++) {
+      positions[i] = new int[TERM_FREQ];
+      offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];
+      // first position must be 0
+      for (int j = 0; j < TERM_FREQ; j++) {
+        // positions are always sorted in increasing order
+        positions[i][j] = (int) (j * 10 + Math.random() * 10);
+        // offsets are always sorted in increasing order
+        offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
+        TestToken token = tokens[tokenUpto++] = new TestToken();
+        token.text = testTerms[i];
+        token.pos = positions[i][j];
+        token.startOffset = offsets[i][j].getStartOffset();
+        token.endOffset = offsets[i][j].getEndOffset();
+      }
+    }
+    Arrays.sort(tokens);
+
+    IndexWriter writer = new IndexWriter(dir, new MyAnalyzer(), true);
+    writer.setUseCompoundFile(false);
+    Document doc = new Document();
+    for(int i=0;i<testFields.length;i++) {
+      final Field.TermVector tv;
+      if (testFieldsStorePos[i] && testFieldsStoreOff[i])
+        tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
+      else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
+        tv = Field.TermVector.WITH_POSITIONS;
+      else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
+        tv = Field.TermVector.WITH_OFFSETS;
+      else
+        tv = Field.TermVector.YES;
+      doc.add(new Field(testFields[i], "", Field.Store.NO, Field.Index.TOKENIZED, tv));
+    }
+
+    //Create 5 documents for testing, they all have the same
+    //terms
+    for(int j=0;j<5;j++)
+      writer.addDocument(doc);
+    writer.flush();
+    seg = writer.newestSegment().name;
+    writer.close();
+
+    fieldInfos = new FieldInfos(dir, seg + "." + IndexFileNames.FIELD_INFOS_EXTENSION);
+  }
+
+  private class MyTokenStream extends TokenStream {
+    int tokenUpto;
+    public Token next() {
+      if (tokenUpto >= tokens.length)
+        return null;
+      else {
+        final Token t = new Token();
+        final TestToken testToken = tokens[tokenUpto++];
+        t.setTermText(testToken.text);
+        if (tokenUpto > 1)
+          t.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
+        else
+          t.setPositionIncrement(testToken.pos+1);
+        t.setStartOffset(testToken.startOffset);
+        t.setEndOffset(testToken.endOffset);
+        return t;
+      }
+    }
+  }
+
+  private class MyAnalyzer extends Analyzer {
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      return new MyTokenStream();
+    }
   }

   protected void tearDown() {
@@ -86,9 +147,8 @@ public class TestTermVectorsReader extends TestCase {

   public void test() {
     //Check to see the files were created properly in setup
-    assertTrue(writer.isDocumentOpen() == false);
-    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
-    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
+    assertTrue(dir.fileExists(seg + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION));
+    assertTrue(dir.fileExists(seg + "." + IndexFileNames.VECTORS_INDEX_EXTENSION));
   }

   public void testReader() throws IOException {
@@ -106,8 +166,6 @@ public class TestTermVectorsReader extends TestCase {
       assertTrue(term.equals(testTerms[i]));
     }
   }
-
-

   public void testPositionReader() throws IOException {

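The MyTokenStream helper above encodes each token's absolute position as a position increment: the first token emits pos+1, later tokens emit the delta to the previous token. A tiny standalone sketch of how a consumer recovers absolute positions (plain Java, not the Lucene Token API):

    // Absolute positions start at -1 and advance by each increment, so an
    // increment of pos+1 on the first token lands exactly on pos.
    int[] increments = {3, 2, 4};  // first is pos+1, the rest are deltas
    int pos = -1;
    for (int i = 0; i < increments.length; i++) {
      pos += increments[i];
      System.out.println("token " + i + " at position " + pos);  // 2, 4, 8
    }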
@@ -1,207 +0,0 @@
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import junit.framework.TestCase;
-
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.store.RAMDirectory;
-
-import java.io.IOException;
-
-public class TestTermVectorsWriter extends TestCase {
-
-  private String[] testTerms = {"this", "is", "a", "test"};
-  private String [] testFields = {"f1", "f2", "f3"};
-  private int[][] positions = new int[testTerms.length][];
-  private RAMDirectory dir = new RAMDirectory();
-  private String seg = "testSegment";
-  private FieldInfos fieldInfos = new FieldInfos();
-
-  public TestTermVectorsWriter(String s) {
-    super(s);
-  }
-
-  protected void setUp() {
-
-    for (int i = 0; i < testFields.length; i++) {
-      fieldInfos.add(testFields[i], true, true);
-    }
-
-
-    for (int i = 0; i < testTerms.length; i++) {
-      positions[i] = new int[5];
-      for (int j = 0; j < positions[i].length; j++) {
-        positions[i][j] = j * 10;
-      }
-    }
-  }
-
-  protected void tearDown() {
-  }
-
-  public void test() {
-    assertTrue(dir != null);
-    assertTrue(positions != null);
-  }
-
-  /*public void testWriteNoPositions() {
-    try {
-      TermVectorsWriter writer = new TermVectorsWriter(dir, seg, 50);
-      writer.openDocument();
-      assertTrue(writer.isDocumentOpen() == true);
-      writer.openField(0);
-      assertTrue(writer.isFieldOpen() == true);
-      for (int i = 0; i < testTerms.length; i++) {
-        writer.addTerm(testTerms[i], i);
-      }
-      writer.closeField();
-
-      writer.closeDocument();
-      writer.close();
-      assertTrue(writer.isDocumentOpen() == false);
-      //Check to see the files were created
-      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
-      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
-      //Now read it back in
-      TermVectorsReader reader = new TermVectorsReader(dir, seg);
-      assertTrue(reader != null);
-      checkTermVector(reader, 0, 0);
-    } catch (IOException e) {
-      e.printStackTrace();
-      assertTrue(false);
-    }
-  } */
-
-  public void testWriter() throws IOException {
-    TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
-    writer.openDocument();
-    assertTrue(writer.isDocumentOpen() == true);
-    writeField(writer, testFields[0]);
-    writer.closeDocument();
-    writer.close();
-    assertTrue(writer.isDocumentOpen() == false);
-    //Check to see the files were created
-    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
-    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
-    //Now read it back in
-    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
-    assertTrue(reader != null);
-    checkTermVector(reader, 0, testFields[0]);
-  }
-
-  private void checkTermVector(TermVectorsReader reader, int docNum, String field) throws IOException {
-    TermFreqVector vector = reader.get(docNum, field);
-    assertTrue(vector != null);
-    String[] terms = vector.getTerms();
-    assertTrue(terms != null);
-    assertTrue(terms.length == testTerms.length);
-    for (int i = 0; i < terms.length; i++) {
-      String term = terms[i];
-      assertTrue(term.equals(testTerms[i]));
-    }
-  }
-
-  /**
-   * Test one document, multiple fields
-   * @throws IOException
-   */
-  public void testMultipleFields() throws IOException {
-    TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
-    writeDocument(writer, testFields.length);
-
-    writer.close();
-
-    assertTrue(writer.isDocumentOpen() == false);
-    //Check to see the files were created
-    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
-    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
-    //Now read it back in
-    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
-    assertTrue(reader != null);
-
-    for (int j = 0; j < testFields.length; j++) {
-      checkTermVector(reader, 0, testFields[j]);
-    }
-  }
-
-  private void writeDocument(TermVectorsWriter writer, int numFields) throws IOException {
-    writer.openDocument();
-    assertTrue(writer.isDocumentOpen() == true);
-
-    for (int j = 0; j < numFields; j++) {
-      writeField(writer, testFields[j]);
-    }
-    writer.closeDocument();
-    assertTrue(writer.isDocumentOpen() == false);
-  }
-
-  /**
-   *
-   * @param writer The writer to write to
-   * @param f The field name
-   * @throws IOException
-   */
-  private void writeField(TermVectorsWriter writer, String f) throws IOException {
-    writer.openField(f);
-    assertTrue(writer.isFieldOpen() == true);
-    for (int i = 0; i < testTerms.length; i++) {
-      writer.addTerm(testTerms[i], i);
-    }
-    writer.closeField();
-  }
-
-
-  public void testMultipleDocuments() throws IOException {
-    TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
-    assertTrue(writer != null);
-    for (int i = 0; i < 10; i++) {
-      writeDocument(writer, testFields.length);
-    }
-    writer.close();
-    //Do some arbitrary tests
-    TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
-    for (int i = 0; i < 10; i++) {
-      assertTrue(reader != null);
-      checkTermVector(reader, 5, testFields[0]);
-      checkTermVector(reader, 2, testFields[2]);
-    }
-  }
-
-  /**
-   * Test that no NullPointerException will be raised,
-   * when adding one document with a single, empty field
-   * and term vectors enabled.
-   * @throws IOException
-   *
-   */
-  public void testBadSegment() throws IOException {
-    dir = new RAMDirectory();
-    IndexWriter ir = new IndexWriter(dir, new StandardAnalyzer(), true);
-
-    Document document = new Document();
-    document.add(new Field("tvtest", "", Field.Store.NO, Field.Index.TOKENIZED,
-        Field.TermVector.YES));
-    ir.addDocument(document);
-    ir.close();
-    dir.close();
-  }
-
-}

@@ -47,7 +47,7 @@ public class MockRAMDirectory extends RAMDirectory {
   // like super is called, then our members are initialized:
   Map openFiles;

-  public MockRAMDirectory() throws IOException {
+  public MockRAMDirectory() {
     super();
     if (openFiles == null) {
       openFiles = new HashMap();