Grant's new termvector patch (Bug #18927) applied with

some modifications.


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150566 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christoph Goller 2004-10-05 17:30:48 +00:00
parent c6d6a390df
commit f1667be0fc
22 changed files with 1046 additions and 296 deletions

View File

@ -38,6 +38,8 @@ public final class Field implements java.io.Serializable {
private Object fieldsData = null; private Object fieldsData = null;
private boolean storeTermVector = false; private boolean storeTermVector = false;
private boolean storeOffsetWithTermVector = false;
private boolean storePositionWithTermVector = false;
private boolean isStored = false; private boolean isStored = false;
private boolean isIndexed = true; private boolean isIndexed = true;
private boolean isTokenized = true; private boolean isTokenized = true;
@ -55,16 +57,19 @@ public final class Field implements java.io.Serializable {
public String toString() { public String toString() {
return name; return name;
} }
/** Store the original field value in the index in a compressed form. This is /** Store the original field value in the index in a compressed form. This is
* useful for long documents and for binary valued fields. * useful for long documents and for binary valued fields.
*/ */
public static final Store COMPRESS = new Store("COMPRESS"); public static final Store COMPRESS = new Store("COMPRESS");
/** Store the original field value in the index. This is useful for short texts /** Store the original field value in the index. This is useful for short texts
* like a document's title which should be displayed with the results. The * like a document's title which should be displayed with the results. The
* value is stored in its original form, i.e. no analyzer is used before it is * value is stored in its original form, i.e. no analyzer is used before it is
* stored. * stored.
*/ */
public static final Store YES = new Store("YES"); public static final Store YES = new Store("YES");
/** Do not store the field value in the index. */ /** Do not store the field value in the index. */
public static final Store NO = new Store("NO"); public static final Store NO = new Store("NO");
} }
@ -100,15 +105,41 @@ public final class Field implements java.io.Serializable {
private TermVector(String name) { private TermVector(String name) {
this.name = name; this.name = name;
} }
public String toString() { public String toString() {
return name; return name;
} }
/** Do not store term vectors. /** Do not store term vectors.
*/ */
public static final TermVector NO = new TermVector("NO"); public static final TermVector NO = new TermVector("NO");
/** Store the term vectors of each document. A term vector is a list /** Store the term vectors of each document. A term vector is a list
* of the document's terms and their number of occurrences in that document. */ * of the document's terms and their number of occurrences in that document. */
public static final TermVector YES = new TermVector("YES"); public static final TermVector YES = new TermVector("YES");
/**
* Store the term vector + token position information
*
* @see #YES
*/
public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS");
/**
* Store the term vector + Token offset information
*
* @see #YES
*/
public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS");
/**
* Store the term vector + Token position and offset information
*
* @see #YES
* @see #WITH_POSITIONS
* @see #WITH_OFFSETS
*/
public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS");
} }
/** Sets the boost factor hits on this field. This value will be /** Sets the boost factor hits on this field. This value will be
@ -290,14 +321,18 @@ public final class Field implements java.io.Serializable {
this.name = name.intern(); // field names are interned this.name = name.intern(); // field names are interned
this.fieldsData = value; this.fieldsData = value;
if (store == Store.YES) if (store == Store.YES){
this.isStored = true; this.isStored = true;
this.isCompressed = false;
}
else if (store == Store.COMPRESS) { else if (store == Store.COMPRESS) {
this.isStored = true; this.isStored = true;
this.isCompressed = true; this.isCompressed = true;
} }
else if (store == Store.NO) else if (store == Store.NO){
this.isStored = false; this.isStored = false;
this.isCompressed = false;
}
else else
throw new IllegalArgumentException("unknown store parameter " + store); throw new IllegalArgumentException("unknown store parameter " + store);
@ -314,6 +349,8 @@ public final class Field implements java.io.Serializable {
throw new IllegalArgumentException("unknown index parameter " + index); throw new IllegalArgumentException("unknown index parameter " + index);
} }
this.isBinary = false;
setStoreTermVector(termVector); setStoreTermVector(termVector);
} }
@ -343,11 +380,18 @@ public final class Field implements java.io.Serializable {
throw new NullPointerException("name cannot be null"); throw new NullPointerException("name cannot be null");
if (reader == null) if (reader == null)
throw new NullPointerException("reader cannot be null"); throw new NullPointerException("reader cannot be null");
this.name = name.intern(); // field names are interned this.name = name.intern(); // field names are interned
this.fieldsData = reader; this.fieldsData = reader;
this.isStored = false; this.isStored = false;
this.isCompressed = false;
this.isIndexed = true; this.isIndexed = true;
this.isTokenized = true; this.isTokenized = true;
this.isBinary = false;
setStoreTermVector(termVector); setStoreTermVector(termVector);
} }
@ -374,21 +418,29 @@ public final class Field implements java.io.Serializable {
throw new IllegalArgumentException("name cannot be null"); throw new IllegalArgumentException("name cannot be null");
if (value == null) if (value == null)
throw new IllegalArgumentException("value cannot be null"); throw new IllegalArgumentException("value cannot be null");
if (store == Store.NO)
throw new IllegalArgumentException("binary values can't be unstored");
if (store == Store.COMPRESS)
this.isCompressed = true;
this.name = name.intern(); this.name = name.intern();
//wrap the byte[] to a ByteBuffer object
this.fieldsData = value; this.fieldsData = value;
this.isBinary = true; if (store == Store.YES){
this.isStored = true; this.isStored = true;
this.isCompressed = false;
}
else if (store == Store.COMPRESS) {
this.isStored = true;
this.isCompressed = true;
}
else if (store == Store.NO)
throw new IllegalArgumentException("binary values can't be unstored");
else
throw new IllegalArgumentException("unknown store parameter " + store);
this.isIndexed = false; this.isIndexed = false;
this.isTokenized = false; this.isTokenized = false;
this.storeTermVector = false;
this.isBinary = true;
setStoreTermVector(TermVector.NO);
} }
/** /**
@ -422,9 +474,30 @@ public final class Field implements java.io.Serializable {
private void setStoreTermVector(TermVector termVector) { private void setStoreTermVector(TermVector termVector) {
if (termVector == TermVector.NO) { if (termVector == TermVector.NO) {
this.storeTermVector = false; this.storeTermVector = false;
} else if (termVector == TermVector.YES) { this.storePositionWithTermVector = false;
this.storeOffsetWithTermVector = false;
}
else if (termVector == TermVector.YES) {
this.storeTermVector = true; this.storeTermVector = true;
} else { this.storePositionWithTermVector = false;
this.storeOffsetWithTermVector = false;
}
else if (termVector == TermVector.WITH_POSITIONS) {
this.storeTermVector = true;
this.storePositionWithTermVector = true;
this.storeOffsetWithTermVector = false;
}
else if (termVector == TermVector.WITH_OFFSETS) {
this.storeTermVector = true;
this.storePositionWithTermVector = false;
this.storeOffsetWithTermVector = true;
}
else if (termVector == TermVector.WITH_POSITIONS_OFFSETS) {
this.storeTermVector = true;
this.storePositionWithTermVector = true;
this.storeOffsetWithTermVector = true;
}
else {
throw new IllegalArgumentException("unknown termVector parameter " + termVector); throw new IllegalArgumentException("unknown termVector parameter " + termVector);
} }
} }
@ -456,6 +529,23 @@ public final class Field implements java.io.Serializable {
*/ */
public final boolean isTermVectorStored() { return storeTermVector; } public final boolean isTermVectorStored() { return storeTermVector; }
/**
* True iff terms are stored as term vector together with their offsets
* (start and end position in source text).
* @return
*/
public boolean isStoreOffsetWithTermVector(){
return storeOffsetWithTermVector;
}
/**
* True iff terms are stored as term vector together with their token positions.
* @return
*/
public boolean isStorePositionWithTermVector(){
return storePositionWithTermVector;
}
/** True iff the value of the field is stored as binary */ /** True iff the value of the field is stored as binary */
public final boolean isBinary() { return isBinary; } public final boolean isBinary() { return isBinary; }
@ -479,6 +569,16 @@ public final class Field implements java.io.Serializable {
result.append(","); result.append(",");
result.append("termVector"); result.append("termVector");
} }
if (storeOffsetWithTermVector) {
if (result.length() > 0)
result.append(",");
result.append("termVectorOffsets");
}
if (storePositionWithTermVector) {
if (result.length() > 0)
result.append(",");
result.append("termVectorPosition");
}
if (isBinary) { if (isBinary) {
if (result.length() > 0) if (result.length() > 0)
result.append(","); result.append(",");

View File

@ -74,6 +74,7 @@ final class DocumentWriter {
postingTable.clear(); // clear postingTable postingTable.clear(); // clear postingTable
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets
fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
Arrays.fill(fieldBoosts, doc.getBoost()); Arrays.fill(fieldBoosts, doc.getBoost());
@ -100,7 +101,7 @@ final class DocumentWriter {
writePostings(postings, segment); writePostings(postings, segment);
// write norms of indexed fields // write norms of indexed fields
writeNorms(doc, segment); writeNorms(segment);
} }
@ -109,6 +110,7 @@ final class DocumentWriter {
private final Hashtable postingTable = new Hashtable(); private final Hashtable postingTable = new Hashtable();
private int[] fieldLengths; private int[] fieldLengths;
private int[] fieldPositions; private int[] fieldPositions;
private int[] fieldOffsets;
private float[] fieldBoosts; private float[] fieldBoosts;
// Tokenizes the fields of a document into Postings. // Tokenizes the fields of a document into Postings.
@ -122,12 +124,19 @@ final class DocumentWriter {
int length = fieldLengths[fieldNumber]; // length of field int length = fieldLengths[fieldNumber]; // length of field
int position = fieldPositions[fieldNumber]; // position in field int position = fieldPositions[fieldNumber]; // position in field
int offset = fieldOffsets[fieldNumber]; // offset field
if (field.isIndexed()) { if (field.isIndexed()) {
if (!field.isTokenized()) { // un-tokenized field if (!field.isTokenized()) { // un-tokenized field
addPosition(fieldName, field.stringValue(), position++); String stringValue = field.stringValue();
if(field.isStoreOffsetWithTermVector())
addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
else
addPosition(fieldName, stringValue, position++, null);
offset += stringValue.length();
length++; length++;
} else { } else
{
Reader reader; // find or make Reader Reader reader; // find or make Reader
if (field.readerValue() != null) if (field.readerValue() != null)
reader = field.readerValue(); reader = field.readerValue();
@ -140,11 +149,23 @@ final class DocumentWriter {
// Tokenize field and add to postingTable // Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader); TokenStream stream = analyzer.tokenStream(fieldName, reader);
try { try {
Token lastToken = null;
for (Token t = stream.next(); t != null; t = stream.next()) { for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1); position += (t.getPositionIncrement() - 1);
addPosition(fieldName, t.termText(), position++);
if (++length > maxFieldLength) break; if(field.isStoreOffsetWithTermVector())
addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
else
addPosition(fieldName, t.termText(), position++, null);
lastToken = t;
if (++length > maxFieldLength)
break;
} }
if(lastToken != null)
offset += lastToken.endOffset() + 1;
} finally { } finally {
stream.close(); stream.close();
} }
@ -153,14 +174,16 @@ final class DocumentWriter {
fieldLengths[fieldNumber] = length; // save field length fieldLengths[fieldNumber] = length; // save field length
fieldPositions[fieldNumber] = position; // save field position fieldPositions[fieldNumber] = position; // save field position
fieldBoosts[fieldNumber] *= field.getBoost(); fieldBoosts[fieldNumber] *= field.getBoost();
fieldOffsets[fieldNumber] = offset;
} }
} }
} }
private final Term termBuffer = new Term("", ""); // avoid consing private final Term termBuffer = new Term("", ""); // avoid consing
private final void addPosition(String field, String text, int position) { private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
termBuffer.set(field, text); termBuffer.set(field, text);
//System.out.println("Offset: " + offset);
Posting ti = (Posting) postingTable.get(termBuffer); Posting ti = (Posting) postingTable.get(termBuffer);
if (ti != null) { // word seen before if (ti != null) { // word seen before
int freq = ti.freq; int freq = ti.freq;
@ -172,10 +195,23 @@ final class DocumentWriter {
ti.positions = newPositions; ti.positions = newPositions;
} }
ti.positions[freq] = position; // add new position ti.positions[freq] = position; // add new position
if (offset != null) {
if (ti.offsets.length == freq){
TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
TermVectorOffsetInfo [] offsets = ti.offsets;
for (int i = 0; i < freq; i++)
{
newOffsets[i] = offsets[i];
}
ti.offsets = newOffsets;
}
ti.offsets[freq] = offset;
}
ti.freq = freq + 1; // update frequency ti.freq = freq + 1; // update frequency
} else { // word not seen before } else { // word not seen before
Term term = new Term(field, text, false); Term term = new Term(field, text, false);
postingTable.put(term, new Posting(term, position)); postingTable.put(term, new Posting(term, position, offset));
} }
} }
@ -294,12 +330,13 @@ final class DocumentWriter {
termVectorWriter.openDocument(); termVectorWriter.openDocument();
} }
termVectorWriter.openField(currentField); termVectorWriter.openField(currentField);
} else if (termVectorWriter != null) { } else if (termVectorWriter != null) {
termVectorWriter.closeField(); termVectorWriter.closeField();
} }
} }
if (termVectorWriter != null && termVectorWriter.isFieldOpen()) { if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
termVectorWriter.addTerm(posting.term.text(), postingFreq); termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
} }
} }
if (termVectorWriter != null) if (termVectorWriter != null)
@ -316,7 +353,7 @@ final class DocumentWriter {
} }
} }
private final void writeNorms(Document doc, String segment) throws IOException { private final void writeNorms(String segment) throws IOException {
for(int n = 0; n < fieldInfos.size(); n++){ for(int n = 0; n < fieldInfos.size(); n++){
FieldInfo fi = fieldInfos.fieldInfo(n); FieldInfo fi = fieldInfos.fieldInfo(n);
if(fi.isIndexed){ if(fi.isIndexed){
@ -336,11 +373,18 @@ final class Posting { // info about a Term in a doc
Term term; // the Term Term term; // the Term
int freq; // its frequency in doc int freq; // its frequency in doc
int[] positions; // positions it occurs at int[] positions; // positions it occurs at
TermVectorOffsetInfo [] offsets;
Posting(Term t, int position) { Posting(Term t, int position, TermVectorOffsetInfo offset) {
term = t; term = t;
freq = 1; freq = 1;
positions = new int[1]; positions = new int[1];
positions[0] = position; positions[0] = position;
if(offset != null){
offsets = new TermVectorOffsetInfo[1];
offsets[0] = offset;
}
else
offsets = null;
} }
} }

View File

@ -23,11 +23,16 @@ final class FieldInfo {
// true if term vector for this field should be stored // true if term vector for this field should be stored
boolean storeTermVector; boolean storeTermVector;
boolean storeOffsetWithTermVector;
boolean storePositionWithTermVector;
FieldInfo(String na, boolean tk, int nu, boolean storeTermVector) { FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
name = na; name = na;
isIndexed = tk; isIndexed = tk;
number = nu; number = nu;
this.storeTermVector = storeTermVector; this.storeTermVector = storeTermVector;
this.storeOffsetWithTermVector = storeOffsetWithTermVector;
this.storePositionWithTermVector = storePositionWithTermVector;
} }
} }

View File

@ -33,6 +33,12 @@ import org.apache.lucene.store.IndexInput;
* accessing this object. * accessing this object.
*/ */
final class FieldInfos { final class FieldInfos {
static final byte IS_INDEXED = 0x1;
static final byte STORE_TERMVECTOR = 0x2;
static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
private ArrayList byNumber = new ArrayList(); private ArrayList byNumber = new ArrayList();
private HashMap byName = new HashMap(); private HashMap byName = new HashMap();
@ -61,23 +67,30 @@ final class FieldInfos {
Enumeration fields = doc.fields(); Enumeration fields = doc.fields();
while (fields.hasMoreElements()) { while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement(); Field field = (Field) fields.nextElement();
add(field.name(), field.isIndexed(), field.isTermVectorStored()); add(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(),
field.isStoreOffsetWithTermVector());
} }
} }
/** /**
* Add fields that are indexed. Whether they have termvectors has to be specified.
*
* @param names The names of the fields * @param names The names of the fields
* @param storeTermVectors Whether the fields store term vectors or not * @param storeTermVectors Whether the fields store term vectors or not
* @param storePositionWithTermVector true if positions should be stored.
* @param storeOffsetWithTermVector true if offsets should be stored
*/ */
public void addIndexed(Collection names, boolean storeTermVectors) { public void addIndexed(Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
boolean storeOffsetWithTermVector) {
Iterator i = names.iterator(); Iterator i = names.iterator();
while (i.hasNext()) { while (i.hasNext()) {
add((String)i.next(), true, storeTermVectors); add((String)i.next(), true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector);
} }
} }
/** /**
* Assumes the field is not storing term vectors * Assumes the fields are not storing term vectors.
*
* @param names The names of the fields * @param names The names of the fields
* @param isIndexed Whether the fields are indexed or not * @param isIndexed Whether the fields are indexed or not
* *
@ -91,28 +104,43 @@ final class FieldInfos {
} }
/** /**
* Calls three parameter add with false for the storeTermVector parameter * Calls 5 parameter add with false for all TermVector parameters.
*
* @param name The name of the Field * @param name The name of the Field
* @param isIndexed true if the field is indexed * @param isIndexed true if the field is indexed
* @see #add(String, boolean, boolean) * @see #add(String, boolean, boolean, boolean, boolean)
*/ */
public void add(String name, boolean isIndexed) { public void add(String name, boolean isIndexed) {
add(name, isIndexed, false); add(name, isIndexed, false, false, false);
} }
/**
/** If the field is not yet known, adds it. If it is known, checks to make * Calls 5 parameter add with false for term vector positions and offsets.
* sure that the isIndexed flag is the same as was given previously for this
* field. If not - marks it as being indexed. Same goes for storeTermVector
* *
* @param name The name of the field * @param name The name of the field
* @param isIndexed true if the field is indexed * @param isIndexed true if the field is indexed
* @param storeTermVector true if the term vector should be stored * @param storeTermVector true if the term vector should be stored
*/ */
public void add(String name, boolean isIndexed, boolean storeTermVector){ public void add(String name, boolean isIndexed, boolean storeTermVector){
add(name, isIndexed, storeTermVector, false, false);
}
/** If the field is not yet known, adds it. If it is known, checks to make
* sure that the isIndexed flag is the same as was given previously for this
* field. If not - marks it as being indexed. Same goes for the TermVector
* parameters.
*
* @param name The name of the field
* @param isIndexed true if the field is indexed
* @param storeTermVector true if the term vector should be stored
* @param storePositionWithTermVector true if the term vector with positions should be stored
* @param storeOffsetWithTermVector true if the term vector with offsets should be stored
*/
public void add(String name, boolean isIndexed, boolean storeTermVector,
boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
FieldInfo fi = fieldInfo(name); FieldInfo fi = fieldInfo(name);
if (fi == null) { if (fi == null) {
addInternal(name, isIndexed, storeTermVector); addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector);
} else { } else {
if (fi.isIndexed != isIndexed) { if (fi.isIndexed != isIndexed) {
fi.isIndexed = true; // once indexed, always index fi.isIndexed = true; // once indexed, always index
@ -120,13 +148,21 @@ final class FieldInfos {
if (fi.storeTermVector != storeTermVector) { if (fi.storeTermVector != storeTermVector) {
fi.storeTermVector = true; // once vector, always vector fi.storeTermVector = true; // once vector, always vector
} }
if (fi.storePositionWithTermVector != storePositionWithTermVector) {
fi.storePositionWithTermVector = true; // once vector, always vector
}
if (fi.storeOffsetWithTermVector != storeOffsetWithTermVector) {
fi.storeOffsetWithTermVector = true; // once vector, always vector
}
} }
} }
private void addInternal(String name, boolean isIndexed, private void addInternal(String name, boolean isIndexed,
boolean storeTermVector) { boolean storeTermVector, boolean storePositionWithTermVector,
boolean storeOffsetWithTermVector) {
FieldInfo fi = FieldInfo fi =
new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector); new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
storeOffsetWithTermVector);
byNumber.add(fi); byNumber.add(fi);
byName.put(name, fi); byName.put(name, fi);
} }
@ -180,11 +216,11 @@ final class FieldInfos {
for (int i = 0; i < size(); i++) { for (int i = 0; i < size(); i++) {
FieldInfo fi = fieldInfo(i); FieldInfo fi = fieldInfo(i);
byte bits = 0x0; byte bits = 0x0;
if (fi.isIndexed) bits |= 0x1; if (fi.isIndexed) bits |= IS_INDEXED;
if (fi.storeTermVector) bits |= 0x2; if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
output.writeString(fi.name); output.writeString(fi.name);
//Was REMOVE
//output.writeByte((byte)(fi.isIndexed ? 1 : 0));
output.writeByte(bits); output.writeByte(bits);
} }
} }
@ -194,9 +230,11 @@ final class FieldInfos {
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
String name = input.readString().intern(); String name = input.readString().intern();
byte bits = input.readByte(); byte bits = input.readByte();
boolean isIndexed = (bits & 0x1) != 0; boolean isIndexed = (bits & IS_INDEXED) != 0;
boolean storeTermVector = (bits & 0x2) != 0; boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
addInternal(name, isIndexed, storeTermVector); boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector);
} }
} }

View File

@ -16,11 +16,12 @@ package org.apache.lucene.index;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.IOException; import java.io.IOException;
import java.util.Collection; import java.util.Collection;
import org.apache.lucene.document.Document;
/** A <code>FilterIndexReader</code> contains another IndexReader, which it /** A <code>FilterIndexReader</code> contains another IndexReader, which it
* uses as its basic source of data, possibly transforming the data along the * uses as its basic source of data, possibly transforming the data along the
* way or providing additional functionality. The class * way or providing additional functionality. The class
@ -146,4 +147,8 @@ public class FilterIndexReader extends IndexReader {
public Collection getIndexedFieldNames(boolean storedTermVector) { public Collection getIndexedFieldNames(boolean storedTermVector) {
return in.getIndexedFieldNames(storedTermVector); return in.getIndexedFieldNames(storedTermVector);
} }
public Collection getIndexedFieldNames (Field.TermVector tvSpec){
return in.getIndexedFieldNames(tvSpec);
}
} }

View File

@ -16,16 +16,16 @@ package org.apache.lucene.index;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException; import org.apache.lucene.document.Document;
import java.io.File; import org.apache.lucene.document.Field;
import java.util.Collection; import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock; import org.apache.lucene.store.Lock;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; // for javadoc import java.io.File;
import org.apache.lucene.search.Similarity; import java.io.IOException;
import java.util.Collection;
/** IndexReader is an abstract class, providing an interface for accessing an /** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface, index. Search of an index is done entirely through this abstract interface,
@ -209,23 +209,37 @@ public abstract class IndexReader {
return SegmentInfos.readCurrentVersion(directory); return SegmentInfos.readCurrentVersion(directory);
} }
/** Return an array of term frequency vectors for the specified document. /**
* Return an array of term frequency vectors for the specified document.
* The array contains a vector for each vectorized field in the document. * The array contains a vector for each vectorized field in the document.
* Each vector contains terms and frequencies for all terms * Each vector contains terms and frequencies for all terms in a given vectorized field.
* in a given vectorized field. * If no such fields existed, the method returns null. The term vectors that are
* If no such fields existed, the method returns null. * returned may either be of type TermFreqVector or of type TermPositionsVector if
* positions or offsets have been stored.
* *
* @see Field#isTermVectorStored() * @param docNumber document for which term frequency vectors are returned
* @return array of term frequency vectors. May be null if no term vectors have been
* stored for the specified document.
* @throws IOException if index cannot be accessed
* @see Field#TermVector
*/ */
abstract public TermFreqVector[] getTermFreqVectors(int docNumber) abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
throws IOException; throws IOException;
/** Return a term frequency vector for the specified document and field. The
* vector returned contains terms and frequencies for those terms in /**
* the specified field of this document, if the field had storeTermVector * Return a term frequency vector for the specified document and field. The
* flag set. If the flag was not set, the method returns null. * returned vector contains terms and frequencies for the terms in
* the specified field of this document, if the field had the storeTermVector
* flag set. If termvectors had been stored with positions or offsets, a
* TermPositionsVector is returned.
* *
* @see Field#isTermVectorStored() * @param docNumber document for which the term frequency vector is returned
* @param field field for which the term frequency vector is returned.
* @return term frequency vector May be null if field does not exist in the specified
* document or term vector was not stored.
* @throws IOException if index cannot be accessed
* @see Field#TermVector
*/ */
abstract public TermFreqVector getTermFreqVector(int docNumber, String field) abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
throws IOException; throws IOException;
@ -547,9 +561,20 @@ public abstract class IndexReader {
* @param storedTermVector if true, returns only Indexed fields that have term vector info, * @param storedTermVector if true, returns only Indexed fields that have term vector info,
* else only indexed fields without term vector info * else only indexed fields without term vector info
* @return Collection of Strings indicating the names of the fields * @return Collection of Strings indicating the names of the fields
*
* @deprecated Replaced by {@link #getIndexedFieldNames (Field.TermVector tvSpec)}
*/ */
public abstract Collection getIndexedFieldNames(boolean storedTermVector); public abstract Collection getIndexedFieldNames(boolean storedTermVector);
/**
* Get a list of unique field names that exist in this index, are indexed, and have
* the specified term vector information.
*
* @param tvSpec specifies which term vector information should be available for the fields
* @return Collection of Strings indicating the names of the fields
*/
public abstract Collection getIndexedFieldNames(Field.TermVector tvSpec);
/** /**
* Returns <code>true</code> iff the index in the named directory is * Returns <code>true</code> iff the index in the named directory is
* currently locked. * currently locked.
@ -560,7 +585,6 @@ public abstract class IndexReader {
return return
directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked() || directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked() ||
directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).isLocked(); directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).isLocked();
} }
/** /**

View File

@ -16,16 +16,13 @@ package org.apache.lucene.index;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException;
import java.util.Collection;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Set;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import java.io.IOException;
import java.util.*;
/** An IndexReader which reads multiple indexes, appending their content. /** An IndexReader which reads multiple indexes, appending their content.
* *
* @version $Id$ * @version $Id$
@ -219,11 +216,7 @@ public class MultiReader extends IndexReader {
for (int i = 0; i < subReaders.length; i++) { for (int i = 0; i < subReaders.length; i++) {
IndexReader reader = subReaders[i]; IndexReader reader = subReaders[i];
Collection names = reader.getFieldNames(); Collection names = reader.getFieldNames();
// iterate through the field names and add them to the set fieldSet.addAll(names);
for (Iterator iterator = names.iterator(); iterator.hasNext();) {
String s = (String) iterator.next();
fieldSet.add(s);
}
} }
return fieldSet; return fieldSet;
} }
@ -253,6 +246,17 @@ public class MultiReader extends IndexReader {
return fieldSet; return fieldSet;
} }
public Collection getIndexedFieldNames (Field.TermVector tvSpec){
// maintain a unique set of field names
Set fieldSet = new HashSet();
for (int i = 0; i < subReaders.length; i++) {
IndexReader reader = subReaders[i];
Collection names = reader.getIndexedFieldNames(tvSpec);
fieldSet.addAll(names);
}
return fieldSet;
}
} }
class MultiTermEnum extends TermEnum { class MultiTermEnum extends TermEnum {

View File

@ -20,6 +20,7 @@ import java.util.Vector;
import java.util.Iterator; import java.util.Iterator;
import java.io.IOException; import java.io.IOException;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.store.RAMOutputStream;
@ -157,8 +158,11 @@ final class SegmentMerger {
int docCount = 0; int docCount = 0;
for (int i = 0; i < readers.size(); i++) { for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i); IndexReader reader = (IndexReader) readers.elementAt(i);
fieldInfos.addIndexed(reader.getIndexedFieldNames(true), true); fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_POSITIONS_OFFSETS), true, true, true);
fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false); fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_POSITIONS), true, true, false);
fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_OFFSETS), true, false, true);
fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.YES), true, false, false);
fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.NO), false, false, false);
fieldInfos.add(reader.getFieldNames(false), false); fieldInfos.add(reader.getFieldNames(false), false);
} }
fieldInfos.write(directory, segment + ".fnm"); fieldInfos.write(directory, segment + ".fnm");
@ -195,29 +199,9 @@ final class SegmentMerger {
int maxDoc = reader.maxDoc(); int maxDoc = reader.maxDoc();
for (int docNum = 0; docNum < maxDoc; docNum++) { for (int docNum = 0; docNum < maxDoc; docNum++) {
// skip deleted docs // skip deleted docs
if (reader.isDeleted(docNum)) { if (reader.isDeleted(docNum))
continue; continue;
} termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
termVectorsWriter.openDocument();
// get all term vectors
TermFreqVector[] sourceTermVector =
reader.getTermFreqVectors(docNum);
if (sourceTermVector != null) {
for (int f = 0; f < sourceTermVector.length; f++) {
// translate field numbers
TermFreqVector termVector = sourceTermVector[f];
termVectorsWriter.openField(termVector.getField());
String [] terms = termVector.getTerms();
int [] freqs = termVector.getTermFrequencies();
for (int t = 0; t < terms.length; t++) {
termVectorsWriter.addTerm(terms[t], freqs[t]);
}
}
termVectorsWriter.closeDocument();
}
} }
} }
} finally { } finally {

View File

@ -25,6 +25,7 @@ import java.util.Set;
import java.util.Vector; import java.util.Vector;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
@ -191,7 +192,9 @@ class SegmentReader extends IndexReader {
proxStream.close(); proxStream.close();
closeNorms(); closeNorms();
if (termVectorsReader != null) termVectorsReader.close();
if (termVectorsReader != null)
termVectorsReader.close();
if (cfsReader != null) if (cfsReader != null)
cfsReader.close(); cfsReader.close();
@ -342,16 +345,63 @@ class SegmentReader extends IndexReader {
* @return Collection of Strings indicating the names of the fields * @return Collection of Strings indicating the names of the fields
*/ */
public Collection getIndexedFieldNames(boolean storedTermVector) { public Collection getIndexedFieldNames(boolean storedTermVector) {
if(storedTermVector){
Set fieldSet = new HashSet();
fieldSet.addAll(getIndexedFieldNames(Field.TermVector.YES));
fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_POSITIONS));
fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_OFFSETS));
fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_POSITIONS_OFFSETS));
return fieldSet;
}
else
return getIndexedFieldNames(Field.TermVector.NO);
}
public Collection getIndexedFieldNames (Field.TermVector tvSpec){
boolean storedTermVector;
boolean storePositionWithTermVector;
boolean storeOffsetWithTermVector;
if(tvSpec == Field.TermVector.NO){
storedTermVector = false;
storePositionWithTermVector = false;
storeOffsetWithTermVector = false;
}
else if(tvSpec == Field.TermVector.YES){
storedTermVector = true;
storePositionWithTermVector = false;
storeOffsetWithTermVector = false;
}
else if(tvSpec == Field.TermVector.WITH_POSITIONS){
storedTermVector = true;
storePositionWithTermVector = true;
storeOffsetWithTermVector = false;
}
else if(tvSpec == Field.TermVector.WITH_OFFSETS){
storedTermVector = true;
storePositionWithTermVector = false;
storeOffsetWithTermVector = true;
}
else if(tvSpec == Field.TermVector.WITH_POSITIONS_OFFSETS){
storedTermVector = true;
storePositionWithTermVector = true;
storeOffsetWithTermVector = true;
}
else{
throw new IllegalArgumentException("unknown termVector parameter " + tvSpec);
}
// maintain a unique set of field names // maintain a unique set of field names
Set fieldSet = new HashSet(); Set fieldSet = new HashSet();
for (int i = 0; i < fieldInfos.size(); i++) { for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i); FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed == true && fi.storeTermVector == storedTermVector){ if (fi.isIndexed && fi.storeTermVector == storedTermVector &&
fi.storePositionWithTermVector == storePositionWithTermVector &&
fi.storeOffsetWithTermVector == storeOffsetWithTermVector){
fieldSet.add(fi.name); fieldSet.add(fi.name);
} }
} }
return fieldSet; return fieldSet;
} }
public synchronized byte[] norms(String field) throws IOException { public synchronized byte[] norms(String field) throws IOException {
@ -429,11 +479,13 @@ class SegmentReader extends IndexReader {
* vector returned contains term numbers and frequencies for all terms in * vector returned contains term numbers and frequencies for all terms in
* the specified field of this document, if the field had storeTermVector * the specified field of this document, if the field had storeTermVector
* flag set. If the flag was not set, the method returns null. * flag set. If the flag was not set, the method returns null.
* @throws IOException
*/ */
public TermFreqVector getTermFreqVector(int docNumber, String field) { public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
// Check if this field is invalid or has no stored term vector // Check if this field is invalid or has no stored term vector
FieldInfo fi = fieldInfos.fieldInfo(field); FieldInfo fi = fieldInfos.fieldInfo(field);
if (fi == null || !fi.storeTermVector) return null; if (fi == null || !fi.storeTermVector || termVectorsReader == null)
return null;
return termVectorsReader.get(docNumber, field); return termVectorsReader.get(docNumber, field);
} }
@ -444,8 +496,9 @@ class SegmentReader extends IndexReader {
* Each vector vector contains term numbers and frequencies for all terms * Each vector vector contains term numbers and frequencies for all terms
* in a given vectorized field. * in a given vectorized field.
* If no such fields existed, the method returns null. * If no such fields existed, the method returns null.
* @throws IOException
*/ */
public TermFreqVector[] getTermFreqVectors(int docNumber) { public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
if (termVectorsReader == null) if (termVectorsReader == null)
return null; return null;

View File

@ -0,0 +1,64 @@
package org.apache.lucene.index;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** A term-frequency vector that additionally carries, per term, the positions
 * and/or character offsets at which the term occurred. Either of the two
 * arrays may be absent (null) if the corresponding information was not stored.
 */
public class SegmentTermPositionVector extends SegmentTermVector implements TermPositionVector {
  protected int[][] positions;
  protected TermVectorOffsetInfo[][] offsets;
  public static final int[] EMPTY_TERM_POS = new int[0];

  /** Creates a vector over the given field.
   *
   * @param field     name of the field this vector belongs to
   * @param terms     the terms of the field, parallel to termFreqs
   * @param termFreqs frequency of each term, parallel to terms
   * @param positions per-term position lists, or null if positions were not stored
   * @param offsets   per-term offset lists, or null if offsets were not stored
   */
  public SegmentTermPositionVector(String field, String terms[], int termFreqs[], int[][] positions, TermVectorOffsetInfo[][] offsets) {
    super(field, terms, termFreqs);
    this.positions = positions;
    this.offsets = offsets;
  }

  /**
   * Returns the offsets recorded for the term at the given index.
   *
   * @param index The position in the array to get the offsets from
   * @return the stored offsets, an empty array for an out-of-range index,
   *         or null if no offsets were stored at all
   * @see org.apache.lucene.analysis.Token
   */
  public TermVectorOffsetInfo[] getOffsets(int index) {
    if (offsets == null)
      return null;
    // Out-of-range indexes degrade to the shared empty array rather than throwing.
    return (index >= 0 && index < offsets.length)
        ? offsets[index]
        : TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
  }

  /**
   * Returns the positions recorded for the term at the given index.
   * Terms are identified by the index at which its number appears in the
   * term String array obtained from the <code>indexOf</code> method.
   *
   * @return the stored positions, an empty array for an out-of-range index,
   *         or null if no positions were stored at all
   */
  public int[] getTermPositions(int index) {
    if (positions == null)
      return null;
    // Same out-of-range policy as getOffsets: return the shared empty array.
    return (index >= 0 && index < positions.length)
        ? positions[index]
        : EMPTY_TERM_POS;
  }
}

View File

@ -1,4 +1,21 @@
package org.apache.lucene.index; package org.apache.lucene.index;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.*; import java.util.*;
/** /**
@ -26,11 +43,14 @@ class SegmentTermVector implements TermFreqVector {
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
sb.append('{'); sb.append('{');
sb.append(field).append(": "); sb.append(field).append(": ");
if(terms != null){
for (int i=0; i<terms.length; i++) { for (int i=0; i<terms.length; i++) {
if (i>0) sb.append(", "); if (i>0) sb.append(", ");
sb.append(terms[i]).append('/').append(termFreqs[i]); sb.append(terms[i]).append('/').append(termFreqs[i]);
} }
}
sb.append('}'); sb.append('}');
return sb.toString(); return sb.toString();
} }
@ -47,6 +67,8 @@ class SegmentTermVector implements TermFreqVector {
} }
public int indexOf(String termText) { public int indexOf(String termText) {
if(terms == null)
return -1;
int res = Arrays.binarySearch(terms, termText); int res = Arrays.binarySearch(terms, termText);
return res >= 0 ? res : -1; return res >= 0 ? res : -1;
} }
@ -60,7 +82,7 @@ class SegmentTermVector implements TermFreqVector {
int res[] = new int[len]; int res[] = new int[len];
for (int i=0; i < len; i++) { for (int i=0; i < len; i++) {
res[i] = indexOf(termNumbers[i]); res[i] = indexOf(termNumbers[start+ i]);
} }
return res; return res;
} }

View File

@ -1,5 +1,21 @@
package org.apache.lucene.index; package org.apache.lucene.index;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Provides access to stored term vector of /** Provides access to stored term vector of
* a document field. * a document field.
*/ */

View File

@ -1,13 +1,42 @@
package org.apache.lucene.index; package org.apache.lucene.index;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Extends <code>TermFreqVector</code> to provide additional information about /** Extends <code>TermFreqVector</code> to provide additional information about
* positions in which each of the terms is found. * positions in which each of the terms is found. A TermPositionVector not necessarily
* contains both positions and offsets, but at least one of these arrays exists.
*/ */
public interface TermPositionVector extends TermFreqVector { public interface TermPositionVector extends TermFreqVector {
/** Returns an array of positions in which the term is found. /** Returns an array of positions in which the term is found.
* Terms are identified by the index at which its number appears in the * Terms are identified by the index at which its number appears in the
* term number array obtained from <code>getTermNumbers</code> method. * term String array obtained from the <code>indexOf</code> method.
* May return null if positions have not been stored.
*/ */
public int[] getTermPositions(int index); public int[] getTermPositions(int index);
/**
* Returns an array of TermVectorOffsetInfo in which the term is found.
* May return null if offsets have not been stored.
*
* @see org.apache.lucene.analysis.Token
*
* @param index The position in the array to get the offsets from
* @return An array of TermVectorOffsetInfo objects or the empty list
*/
public TermVectorOffsetInfo [] getOffsets(int index);
} }

View File

@ -0,0 +1,66 @@
package org.apache.lucene.index;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TermVectorOffsetInfo {
public static final TermVectorOffsetInfo [] EMPTY_OFFSET_INFO = new TermVectorOffsetInfo[0];
private int startOffset;
private int endOffset;
public TermVectorOffsetInfo() {
}
public TermVectorOffsetInfo(int startOffset, int endOffset) {
this.endOffset = endOffset;
this.startOffset = startOffset;
}
public int getEndOffset() {
return endOffset;
}
public void setEndOffset(int endOffset) {
this.endOffset = endOffset;
}
public int getStartOffset() {
return startOffset;
}
public void setStartOffset(int startOffset) {
this.startOffset = startOffset;
}
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof TermVectorOffsetInfo)) return false;
final TermVectorOffsetInfo termVectorOffsetInfo = (TermVectorOffsetInfo) o;
if (endOffset != termVectorOffsetInfo.endOffset) return false;
if (startOffset != termVectorOffsetInfo.startOffset) return false;
return true;
}
public int hashCode() {
int result;
result = startOffset;
result = 29 * result + endOffset;
return result;
}
}

View File

@ -34,22 +34,25 @@ class TermVectorsReader {
private IndexInput tvf; private IndexInput tvf;
private int size; private int size;
private int tvdFormat;
private int tvfFormat;
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos) TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
throws IOException { throws IOException {
if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) { if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION); tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION);
checkValidFormat(tvx); checkValidFormat(tvx);
tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION); tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION);
checkValidFormat(tvd); tvdFormat = checkValidFormat(tvd);
tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION); tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION);
checkValidFormat(tvf); tvfFormat = checkValidFormat(tvf);
size = (int) tvx.length() / 8; size = (int) tvx.length() / 8;
} }
this.fieldInfos = fieldInfos; this.fieldInfos = fieldInfos;
} }
private void checkValidFormat(IndexInput in) throws IOException private int checkValidFormat(IndexInput in) throws IOException
{ {
int format = in.readInt(); int format = in.readInt();
if (format > TermVectorsWriter.FORMAT_VERSION) if (format > TermVectorsWriter.FORMAT_VERSION)
@ -57,7 +60,7 @@ class TermVectorsReader {
throw new IOException("Incompatible format version: " + format + " expected " throw new IOException("Incompatible format version: " + format + " expected "
+ TermVectorsWriter.FORMAT_VERSION + " or less"); + TermVectorsWriter.FORMAT_VERSION + " or less");
} }
return format;
} }
void close() throws IOException { void close() throws IOException {
@ -82,16 +85,17 @@ class TermVectorsReader {
* Retrieve the term vector for the given document and field * Retrieve the term vector for the given document and field
* @param docNum The document number to retrieve the vector for * @param docNum The document number to retrieve the vector for
* @param field The field within the document to retrieve * @param field The field within the document to retrieve
* @return The TermFreqVector for the document and field or null * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
* @throws IOException
*/ */
synchronized TermFreqVector get(int docNum, String field) { synchronized TermFreqVector get(int docNum, String field) throws IOException {
// Check if no term vectors are available for this segment at all // Check if no term vectors are available for this segment at all
int fieldNumber = fieldInfos.fieldNumber(field); int fieldNumber = fieldInfos.fieldNumber(field);
TermFreqVector result = null; TermFreqVector result = null;
if (tvx != null) { if (tvx != null) {
try {
//We need to account for the FORMAT_SIZE at when seeking in the tvx //We need to account for the FORMAT_SIZE at when seeking in the tvx
//We don't need to do this in other seeks because we already have the file pointer //We don't need to do this in other seeks because we already have the
// file pointer
//that was written in another file //that was written in another file
tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE); tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
//System.out.println("TVX Pointer: " + tvx.getFilePointer()); //System.out.println("TVX Pointer: " + tvx.getFilePointer());
@ -106,42 +110,43 @@ class TermVectorsReader {
int number = 0; int number = 0;
int found = -1; int found = -1;
for (int i = 0; i < fieldCount; i++) { for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt(); number += tvd.readVInt();
if (number == fieldNumber) found = i;
if (number == fieldNumber)
found = i;
} }
// This field, although valid in the segment, was not found in this document // This field, although valid in the segment, was not found in this
// document
if (found != -1) { if (found != -1) {
// Compute position in the tvf file // Compute position in the tvf file
position = 0; position = 0;
for (int i = 0; i <= found; i++) for (int i = 0; i <= found; i++)
{
position += tvd.readVLong(); position += tvd.readVLong();
}
result = readTermVector(field, position); result = readTermVector(field, position);
} } else {
else {
//System.out.println("Field not found"); //System.out.println("Field not found");
} }
} else {
} catch (Exception e) { //System.out.println("No tvx file");
//e.printStackTrace();
}
}
else
{
System.out.println("No tvx file");
} }
return result; return result;
} }
/** Return all term vectors stored for this document or null if the could not be read in. */ /**
synchronized TermFreqVector[] get(int docNum) { * Return all term vectors stored for this document or null if there are no term vectors
* for the document.
* @throws IOException
*/
synchronized TermFreqVector[] get(int docNum) throws IOException {
TermFreqVector[] result = null; TermFreqVector[] result = null;
// Check if no term vectors are available for this segment at all // Check if no term vectors are available for this segment at all
if (tvx != null) { if (tvx != null) {
try {
//We need to offset by //We need to offset by
tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE); tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
long position = tvx.readLong(); long position = tvx.readLong();
@ -155,7 +160,11 @@ class TermVectorsReader {
String[] fields = new String[fieldCount]; String[] fields = new String[fieldCount];
for (int i = 0; i < fieldCount; i++) { for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt(); number += tvd.readVInt();
fields[i] = fieldInfos.fieldName(number); fields[i] = fieldInfos.fieldName(number);
} }
@ -169,13 +178,8 @@ class TermVectorsReader {
result = readTermVectors(fields, tvfPointers); result = readTermVectors(fields, tvfPointers);
} }
} catch (IOException e) { } else {
e.printStackTrace(); //System.out.println("No tvx file");
}
}
else
{
System.out.println("No tvx file");
} }
return result; return result;
} }
@ -206,20 +210,41 @@ class TermVectorsReader {
int numTerms = tvf.readVInt(); int numTerms = tvf.readVInt();
//System.out.println("Num Terms: " + numTerms); //System.out.println("Num Terms: " + numTerms);
// If no terms - return a constant empty termvector // If no terms - return a constant empty termvector. However, this should never occur!
if (numTerms == 0) return new SegmentTermVector(field, null, null); if (numTerms == 0)
return new SegmentTermVector(field, null, null);
boolean storePositions;
boolean storeOffsets;
if(tvfFormat == TermVectorsWriter.FORMAT_VERSION){
byte bits = tvf.readByte();
storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
}
else{
tvf.readVInt(); tvf.readVInt();
storePositions = false;
storeOffsets = false;
}
String terms[] = new String[numTerms]; String terms[] = new String[numTerms];
int termFreqs[] = new int[numTerms]; int termFreqs[] = new int[numTerms];
// we may not need these, but declare them
int positions[][] = null;
TermVectorOffsetInfo offsets[][] = null;
if(storePositions)
positions = new int[numTerms][];
if(storeOffsets)
offsets = new TermVectorOffsetInfo[numTerms][];
int start = 0; int start = 0;
int deltaLength = 0; int deltaLength = 0;
int totalLength = 0; int totalLength = 0;
char [] buffer = {}; char [] buffer = {};
String previousString = ""; String previousString = "";
for (int i = 0; i < numTerms; i++) { for (int i = 0; i < numTerms; i++) {
start = tvf.readVInt(); start = tvf.readVInt();
deltaLength = tvf.readVInt(); deltaLength = tvf.readVInt();
@ -233,9 +258,40 @@ class TermVectorsReader {
tvf.readChars(buffer, start, deltaLength); tvf.readChars(buffer, start, deltaLength);
terms[i] = new String(buffer, 0, totalLength); terms[i] = new String(buffer, 0, totalLength);
previousString = terms[i]; previousString = terms[i];
termFreqs[i] = tvf.readVInt(); int freq = tvf.readVInt();
termFreqs[i] = freq;
if (storePositions) { //read in the positions
int [] pos = new int[freq];
positions[i] = pos;
int prevPosition = 0;
for (int j = 0; j < freq; j++)
{
pos[j] = prevPosition + tvf.readVInt();
prevPosition = pos[j];
}
}
if (storeOffsets) {
TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
offsets[i] = offs;
int prevOffset = 0;
for (int j = 0; j < freq; j++) {
int startOffset = prevOffset + tvf.readVInt();
int endOffset = startOffset + tvf.readVInt();
offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
prevOffset = endOffset;
}
}
}
SegmentTermVector tv;
if (storePositions || storeOffsets){
tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
}
else {
tv = new SegmentTermVector(field, terms, termFreqs);
} }
SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs);
return tv; return tv;
} }

View File

@ -50,14 +50,17 @@ import java.util.Vector;
* *
*/ */
final class TermVectorsWriter { final class TermVectorsWriter {
public static final int FORMAT_VERSION = 1; public static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
public static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
public static final int FORMAT_VERSION = 2;
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
public static final int FORMAT_SIZE = 4; public static final int FORMAT_SIZE = 4;
//TODO: Figure out how to write with or w/o position information and read back in
public static final String TVX_EXTENSION = ".tvx"; public static final String TVX_EXTENSION = ".tvx";
public static final String TVD_EXTENSION = ".tvd"; public static final String TVD_EXTENSION = ".tvd";
public static final String TVF_EXTENSION = ".tvf"; public static final String TVF_EXTENSION = ".tvf";
private IndexOutput tvx = null, tvd = null, tvf = null; private IndexOutput tvx = null, tvd = null, tvf = null;
private Vector fields = null; private Vector fields = null;
private Vector terms = null; private Vector terms = null;
@ -66,13 +69,6 @@ final class TermVectorsWriter {
private TVField currentField = null; private TVField currentField = null;
private long currentDocPointer = -1; private long currentDocPointer = -1;
/** Create term vectors writer for the specified segment in specified
* directory. A new TermVectorsWriter should be created for each
* segment. The parameter <code>maxFields</code> indicates how many total
* fields are found in this document. Not all of these fields may require
* termvectors to be stored, so the number of calls to
* <code>openField</code> is less or equal to this number.
*/
public TermVectorsWriter(Directory directory, String segment, public TermVectorsWriter(Directory directory, String segment,
FieldInfos fieldInfos) FieldInfos fieldInfos)
throws IOException { throws IOException {
@ -93,7 +89,6 @@ final class TermVectorsWriter {
public final void openDocument() public final void openDocument()
throws IOException { throws IOException {
closeDocument(); closeDocument();
currentDocPointer = tvd.getFilePointer(); currentDocPointer = tvd.getFilePointer();
} }
@ -119,12 +114,17 @@ final class TermVectorsWriter {
* processing of this field. If a field was previously open, it is * processing of this field. If a field was previously open, it is
* closed automatically. * closed automatically.
*/ */
public final void openField(String field) public final void openField(String field) throws IOException {
throws IOException { FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open."); openField(fieldInfo.number, fieldInfo.storePositionWithTermVector, fieldInfo.storeOffsetWithTermVector);
}
private void openField(int fieldNumber, boolean storePositionWithTermVector,
boolean storeOffsetWithTermVector) throws IOException{
if (!isDocumentOpen())
throw new IllegalStateException("Cannot open field when no document is open.");
closeField(); closeField();
currentField = new TVField(fieldInfos.fieldNumber(field)); currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
} }
/** Finished processing current field. This should be followed by a call to /** Finished processing current field. This should be followed by a call to
@ -157,57 +157,80 @@ final class TermVectorsWriter {
* times this term appears in this field, in this document. * times this term appears in this field, in this document.
*/ */
public final void addTerm(String termText, int freq) { public final void addTerm(String termText, int freq) {
if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open"); addTerm(termText, freq, null, null);
if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open");
addTermInternal(termText, freq);
} }
private final void addTermInternal(String termText, int freq) { public final void addTerm(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets)
currentField.length += freq; {
if (!isDocumentOpen())
throw new IllegalStateException("Cannot add terms when document is not open");
if (!isFieldOpen())
throw new IllegalStateException("Cannot add terms when field is not open");
addTermInternal(termText, freq, positions, offsets);
}
private final void addTermInternal(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) {
TVTerm term = new TVTerm(); TVTerm term = new TVTerm();
term.termText = termText; term.termText = termText;
term.freq = freq; term.freq = freq;
term.positions = positions;
term.offsets = offsets;
terms.add(term); terms.add(term);
} }
/**
/** Add specified vectors to the document. * Add a complete document specified by all its term vectors. If document has no
* term vectors, add value for tvx.
*
* @param vectors
* @throws IOException
*/ */
public final void addVectors(TermFreqVector[] vectors) public final void addAllDocVectors(TermFreqVector[] vectors)
throws IOException { throws IOException {
if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open"); openDocument();
if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open");
if (vectors != null) {
for (int i = 0; i < vectors.length; i++) { for (int i = 0; i < vectors.length; i++) {
addTermFreqVector(vectors[i]); boolean storePositionWithTermVector = false;
} boolean storeOffsetWithTermVector = false;
}
try {
/** Add specified vector to the document. Document must be open but no field TermPositionVector tpVector = (TermPositionVector) vectors[i];
* should be open or exception is thrown. The same document can have <code>addTerm</code>
* and <code>addVectors</code> calls mixed, however a given field must either be if (tpVector.size() > 0 && tpVector.getTermPositions(0) != null)
* populated with <code>addTerm</code> or with <code>addVector</code>. * storePositionWithTermVector = true;
*/ if (tpVector.size() > 0 && tpVector.getOffsets(0) != null)
public final void addTermFreqVector(TermFreqVector vector) storeOffsetWithTermVector = true;
throws IOException {
if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open"); FieldInfo fieldInfo = fieldInfos.fieldInfo(tpVector.getField());
if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open"); openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
addTermFreqVectorInternal(vector);
} for (int j = 0; j < tpVector.size(); j++)
addTermInternal(tpVector.getTerms()[j], tpVector.getTermFrequencies()[j], tpVector.getTermPositions(j),
tpVector.getOffsets(j));
private final void addTermFreqVectorInternal(TermFreqVector vector)
throws IOException {
openField(vector.getField());
for (int i = 0; i < vector.size(); i++) {
addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]);
}
closeField(); closeField();
} catch (ClassCastException ignore) {
TermFreqVector tfVector = vectors[i];
FieldInfo fieldInfo = fieldInfos.fieldInfo(tfVector.getField());
openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
for (int j = 0; j < tfVector.size(); j++)
addTermInternal(tfVector.getTerms()[j], tfVector.getTermFrequencies()[j], null, null);
closeField();
}
}
} }
closeDocument();
}
/** Close all streams. */ /** Close all streams. */
final void close() throws IOException { final void close() throws IOException {
@ -245,15 +268,22 @@ final class TermVectorsWriter {
// remember where this field is written // remember where this field is written
currentField.tvfPointer = tvf.getFilePointer(); currentField.tvfPointer = tvf.getFilePointer();
//System.out.println("Field Pointer: " + currentField.tvfPointer); //System.out.println("Field Pointer: " + currentField.tvfPointer);
final int size;
tvf.writeVInt(size = terms.size()); final int size = terms.size();
tvf.writeVInt(currentField.length - size); tvf.writeVInt(size);
boolean storePositions = currentField.storePositions;
boolean storeOffsets = currentField.storeOffsets;
byte bits = 0x0;
if (storePositions)
bits |= STORE_POSITIONS_WITH_TERMVECTOR;
if (storeOffsets)
bits |= STORE_OFFSET_WITH_TERMVECTOR;
tvf.writeByte(bits);
String lastTermText = ""; String lastTermText = "";
// write term ids and positions
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
TVTerm term = (TVTerm) terms.elementAt(i); TVTerm term = (TVTerm) terms.elementAt(i);
//tvf.writeString(term.termText);
int start = StringHelper.stringDifference(lastTermText, term.termText); int start = StringHelper.stringDifference(lastTermText, term.termText);
int length = term.termText.length() - start; int length = term.termText.length() - start;
tvf.writeVInt(start); // write shared prefix length tvf.writeVInt(start); // write shared prefix length
@ -261,31 +291,51 @@ final class TermVectorsWriter {
tvf.writeChars(term.termText, start, length); // write delta chars tvf.writeChars(term.termText, start, length); // write delta chars
tvf.writeVInt(term.freq); tvf.writeVInt(term.freq);
lastTermText = term.termText; lastTermText = term.termText;
if(storePositions){
if(term.positions == null)
throw new IllegalStateException("Trying to write positions that are null!");
// use delta encoding for positions
int position = 0;
for (int j = 0; j < term.freq; j++){
tvf.writeVInt(term.positions[j] - position);
position = term.positions[j];
} }
} }
if(storeOffsets){
if(term.offsets == null)
throw new IllegalStateException("Trying to write offsets that are null!");
// use delta encoding for offsets
int position = 0;
for (int j = 0; j < term.freq; j++) {
tvf.writeVInt(term.offsets[j].getStartOffset() - position);
tvf.writeVInt(term.offsets[j].getEndOffset() - term.offsets[j].getStartOffset()); //Save the diff between the two.
position = term.offsets[j].getEndOffset();
}
}
}
}
private void writeDoc() throws IOException { private void writeDoc() throws IOException {
if (isFieldOpen()) throw new IllegalStateException("Field is still open while writing document"); if (isFieldOpen())
throw new IllegalStateException("Field is still open while writing document");
//System.out.println("Writing doc pointer: " + currentDocPointer); //System.out.println("Writing doc pointer: " + currentDocPointer);
// write document index record // write document index record
tvx.writeLong(currentDocPointer); tvx.writeLong(currentDocPointer);
// write document data record // write document data record
final int size; final int size = fields.size();
// write the number of fields // write the number of fields
tvd.writeVInt(size = fields.size()); tvd.writeVInt(size);
// write field numbers // write field numbers
int lastFieldNumber = 0;
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
TVField field = (TVField) fields.elementAt(i); TVField field = (TVField) fields.elementAt(i);
tvd.writeVInt(field.number - lastFieldNumber); tvd.writeVInt(field.number);
lastFieldNumber = field.number;
} }
// write field pointers // write field pointers
@ -293,7 +343,6 @@ final class TermVectorsWriter {
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
TVField field = (TVField) fields.elementAt(i); TVField field = (TVField) fields.elementAt(i);
tvd.writeVLong(field.tvfPointer - lastFieldPointer); tvd.writeVLong(field.tvfPointer - lastFieldPointer);
lastFieldPointer = field.tvfPointer; lastFieldPointer = field.tvfPointer;
} }
//System.out.println("After writing doc pointer: " + tvx.getFilePointer()); //System.out.println("After writing doc pointer: " + tvx.getFilePointer());
@ -303,17 +352,20 @@ final class TermVectorsWriter {
private static class TVField { private static class TVField {
int number; int number;
long tvfPointer = 0; long tvfPointer = 0;
int length = 0; // number of distinct term positions boolean storePositions = false;
boolean storeOffsets = false;
TVField(int number) { TVField(int number, boolean storePos, boolean storeOff) {
this.number = number; this.number = number;
storePositions = storePos;
storeOffsets = storeOff;
} }
} }
private static class TVTerm { private static class TVTerm {
String termText; String termText;
int freq = 0; int freq = 0;
//int positions[] = null; int positions[] = null;
TermVectorOffsetInfo [] offsets = null;
} }

View File

@ -34,7 +34,7 @@ class DocHelper {
//Fields will be lexicographically sorted. So, the order is: field, text, two //Fields will be lexicographically sorted. So, the order is: field, text, two
public static final int [] FIELD_2_FREQS = {3, 1, 1}; public static final int [] FIELD_2_FREQS = {3, 1, 1};
public static final String TEXT_FIELD_2_KEY = "textField2"; public static final String TEXT_FIELD_2_KEY = "textField2";
public static Field textField2 = Field.Text(TEXT_FIELD_2_KEY, FIELD_2_TEXT, true); public static Field textField2 = new Field(TEXT_FIELD_2_KEY, FIELD_2_TEXT, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
public static final String KEYWORD_TEXT = "Keyword"; public static final String KEYWORD_TEXT = "Keyword";
public static final String KEYWORD_FIELD_KEY = "keyField"; public static final String KEYWORD_FIELD_KEY = "keyField";
@ -135,7 +135,7 @@ class DocHelper {
Enumeration fields = doc.fields(); Enumeration fields = doc.fields();
int result = 0; int result = 0;
while (fields.hasMoreElements()) { while (fields.hasMoreElements()) {
fields.nextElement(); String name = fields.nextElement().toString();
result++; result++;
} }
return result; return result;

View File

@ -109,6 +109,7 @@ public class TestSegmentMerger extends TestCase {
int [] freqs = vector.getTermFrequencies(); int [] freqs = vector.getTermFrequencies();
assertTrue(freqs != null); assertTrue(freqs != null);
//System.out.println("Freqs size: " + freqs.length); //System.out.println("Freqs size: " + freqs.length);
assertTrue(vector instanceof TermPositionVector == true);
for (int i = 0; i < terms.length; i++) { for (int i = 0; i < terms.length; i++) {
String term = terms[i]; String term = terms[i];

View File

@ -178,7 +178,7 @@ public class TestSegmentReader extends TestCase {
} }
public void testTermVectors() { public void testTermVectors() throws IOException {
TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY); TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
assertTrue(result != null); assertTrue(result != null);
String [] terms = result.getTerms(); String [] terms = result.getTerms();

View File

@ -11,7 +11,11 @@ public class TestTermVectorsReader extends TestCase {
private TermVectorsWriter writer = null; private TermVectorsWriter writer = null;
//Must be lexicographically sorted, will do in setup, versus trying to maintain here //Must be lexicographically sorted, will do in setup, versus trying to maintain here
private String [] testFields = {"f1", "f2", "f3"}; private String [] testFields = {"f1", "f2", "f3"};
private boolean [] testFieldsStorePos = {true, false, true, false};
private boolean [] testFieldsStoreOff = {true, false, false, true};
private String [] testTerms = {"this", "is", "a", "test"}; private String [] testTerms = {"this", "is", "a", "test"};
private int [][] positions = new int[testTerms.length][];
private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][];
private RAMDirectory dir = new RAMDirectory(); private RAMDirectory dir = new RAMDirectory();
private String seg = "testSegment"; private String seg = "testSegment";
private FieldInfos fieldInfos = new FieldInfos(); private FieldInfos fieldInfos = new FieldInfos();
@ -22,9 +26,22 @@ public class TestTermVectorsReader extends TestCase {
protected void setUp() { protected void setUp() {
for (int i = 0; i < testFields.length; i++) { for (int i = 0; i < testFields.length; i++) {
fieldInfos.add(testFields[i], true, true); fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
} }
for (int i = 0; i < testTerms.length; i++)
{
positions[i] = new int[3];
for (int j = 0; j < positions[i].length; j++) {
// poditions are always sorted in increasing order
positions[i][j] = (int)(j * 10 + Math.random() * 10);
}
offsets[i] = new TermVectorOffsetInfo[3];
for (int j = 0; j < offsets[i].length; j++){
// ofsets are alway sorted in increasing order
offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
}
}
try { try {
Arrays.sort(testTerms); Arrays.sort(testTerms);
for (int j = 0; j < 5; j++) { for (int j = 0; j < 5; j++) {
@ -34,7 +51,7 @@ public class TestTermVectorsReader extends TestCase {
for (int k = 0; k < testFields.length; k++) { for (int k = 0; k < testFields.length; k++) {
writer.openField(testFields[k]); writer.openField(testFields[k]);
for (int i = 0; i < testTerms.length; i++) { for (int i = 0; i < testTerms.length; i++) {
writer.addTerm(testTerms[i], i); writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
} }
writer.closeField(); writer.closeField();
} }
@ -80,6 +97,103 @@ public class TestTermVectorsReader extends TestCase {
} }
} }
public void testPositionReader() {
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
TermPositionVector vector;
String [] terms;
vector = (TermPositionVector)reader.get(0, testFields[0]);
assertTrue(vector != null);
terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
int [] positions = vector.getTermPositions(i);
assertTrue(positions != null);
assertTrue(positions.length == this.positions[i].length);
for (int j = 0; j < positions.length; j++) {
int position = positions[j];
assertTrue(position == this.positions[i][j]);
}
TermVectorOffsetInfo [] offset = vector.getOffsets(i);
assertTrue(offset != null);
assertTrue(offset.length == this.offsets[i].length);
for (int j = 0; j < offset.length; j++) {
TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
}
}
TermFreqVector freqVector = (TermFreqVector)reader.get(0, testFields[1]); //no pos, no offset
assertTrue(freqVector != null);
assertTrue(freqVector instanceof TermPositionVector == false);
terms = freqVector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
catch (ClassCastException cce)
{
cce.printStackTrace();
assertTrue(false);
}
}
public void testOffsetReader() {
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]);
assertTrue(vector != null);
String [] terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
int [] positions = vector.getTermPositions(i);
assertTrue(positions != null);
assertTrue(positions.length == this.positions[i].length);
for (int j = 0; j < positions.length; j++) {
int position = positions[j];
assertTrue(position == this.positions[i][j]);
}
TermVectorOffsetInfo [] offset = vector.getOffsets(i);
assertTrue(offset != null);
assertTrue(offset.length == this.offsets[i].length);
for (int j = 0; j < offset.length; j++) {
TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
}
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
catch (ClassCastException cce)
{
cce.printStackTrace();
assertTrue(false);
}
}
/** /**
* Make sure exceptions and bad params are handled appropriately * Make sure exceptions and bad params are handled appropriately
*/ */
@ -89,9 +203,9 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(reader != null); assertTrue(reader != null);
//Bad document number, good field number //Bad document number, good field number
TermFreqVector vector = reader.get(50, testFields[0]); TermFreqVector vector = reader.get(50, testFields[0]);
assertTrue(vector == null);
} catch (Exception e) {
assertTrue(false); assertTrue(false);
} catch (Exception e) {
assertTrue(true);
} }
try { try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);

View File

@ -44,7 +44,7 @@ public class TestTermVectorsWriter extends TestCase {
for (int i = 0; i < testTerms.length; i++) { for (int i = 0; i < testTerms.length; i++) {
positions[i] = new int[5]; positions[i] = new int[5];
for (int j = 0; j < positions[i].length; j++) { for (int j = 0; j < positions[i].length; j++) {
positions[i][j] = i * 100; positions[i][j] = j * 10;
} }
} }
} }
@ -107,7 +107,7 @@ public class TestTermVectorsWriter extends TestCase {
} }
} }
private void checkTermVector(TermVectorsReader reader, int docNum, String field) { private void checkTermVector(TermVectorsReader reader, int docNum, String field) throws IOException {
TermFreqVector vector = reader.get(docNum, field); TermFreqVector vector = reader.get(docNum, field);
assertTrue(vector != null); assertTrue(vector != null);
String[] terms = vector.getTerms(); String[] terms = vector.getTerms();

View File

@ -43,8 +43,23 @@ public class TestTermVectors extends TestCase {
//writer.infoStream = System.out; //writer.infoStream = System.out;
for (int i = 0; i < 1000; i++) { for (int i = 0; i < 1000; i++) {
Document doc = new Document(); Document doc = new Document();
Field.TermVector termVector;
int mod3 = i % 3;
int mod2 = i % 2;
if (mod2 == 0 && mod3 == 0){
termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
}
else if (mod2 == 0){
termVector = Field.TermVector.WITH_POSITIONS;
}
else if (mod3 == 0){
termVector = Field.TermVector.WITH_OFFSETS;
}
else {
termVector = Field.TermVector.YES;
}
doc.add(new Field("field", English.intToEnglish(i), doc.add(new Field("field", English.intToEnglish(i),
Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES)); Field.Store.YES, Field.Index.TOKENIZED, termVector));
writer.addDocument(doc); writer.addDocument(doc);
} }
writer.close(); writer.close();
@ -70,17 +85,74 @@ public class TestTermVectors extends TestCase {
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
assertTrue(vector != null); assertTrue(vector != null);
assertTrue(vector.length == 1); assertTrue(vector.length == 1);
//assertTrue();
} }
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(50));
//System.out.println("Explain: " + searcher.explain(query, hits.id(50)));
//System.out.println("Vector: " + vector[0].toString());
} catch (IOException e) { } catch (IOException e) {
assertTrue(false); assertTrue(false);
} }
} }
public void testTermPositionVectors() { public void testTermPositionVectors() {
Query query = new TermQuery(new Term("field", "zero"));
try {
Hits hits = searcher.search(query);
assertEquals(1, hits.length());
for (int i = 0; i < hits.length(); i++)
{
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
assertTrue(vector != null);
assertTrue(vector.length == 1);
boolean shouldBePosVector = (hits.id(i) % 2 == 0) ? true : false;
assertTrue((shouldBePosVector == false) || (shouldBePosVector == true && (vector[0] instanceof TermPositionVector == true)));
boolean shouldBeOffVector = (hits.id(i) % 3 == 0) ? true : false;
assertTrue((shouldBeOffVector == false) || (shouldBeOffVector == true && (vector[0] instanceof TermPositionVector == true)));
if(shouldBePosVector || shouldBeOffVector){
TermPositionVector posVec = (TermPositionVector)vector[0];
String [] terms = posVec.getTerms();
assertTrue(terms != null && terms.length > 0);
for (int j = 0; j < terms.length; j++) {
int [] positions = posVec.getTermPositions(j);
TermVectorOffsetInfo [] offsets = posVec.getOffsets(j);
if(shouldBePosVector){
assertTrue(positions != null);
assertTrue(positions.length > 0);
}
else
assertTrue(positions == null);
if(shouldBeOffVector){
assertTrue(offsets != null);
assertTrue(offsets.length > 0);
}
else
assertTrue(offsets == null);
}
}
else{
try{
TermPositionVector posVec = (TermPositionVector)vector[0];
assertTrue(false);
}
catch(ClassCastException ignore){
TermFreqVector freqVec = vector[0];
String [] terms = freqVec.getTerms();
assertTrue(terms != null && terms.length > 0);
}
}
}
} catch (IOException e) {
assertTrue(false);
}
}
public void testTermOffsetVectors() {
Query query = new TermQuery(new Term("field", "fifty")); Query query = new TermQuery(new Term("field", "fifty"));
try { try {
Hits hits = searcher.search(query); Hits hits = searcher.search(query);
@ -91,6 +163,7 @@ public class TestTermVectors extends TestCase {
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
assertTrue(vector != null); assertTrue(vector != null);
assertTrue(vector.length == 1); assertTrue(vector.length == 1);
//assertTrue(); //assertTrue();
} }
} catch (IOException e) { } catch (IOException e) {
@ -164,7 +237,7 @@ public class TestTermVectors extends TestCase {
int [] freqs = vector.getTermFrequencies(); int [] freqs = vector.getTermFrequencies();
for (int i = 0; i < vTerms.length; i++) for (int i = 0; i < vTerms.length; i++)
{ {
if (term.text().equals(vTerms[i]) == true) if (term.text().equals(vTerms[i]))
{ {
assertTrue(freqs[i] == freq); assertTrue(freqs[i] == freq);
} }
@ -184,9 +257,9 @@ public class TestTermVectors extends TestCase {
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString()); System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
assertTrue(testDoc3.toString().equals(hits.doc(0).toString())); assertTrue(hits.id(0) == 2);
assertTrue(testDoc4.toString().equals(hits.doc(1).toString())); assertTrue(hits.id(1) == 3);
assertTrue(testDoc1.toString().equals(hits.doc(2).toString())); assertTrue(hits.id(2) == 0);
TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field"); TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field");
assertTrue(vector != null); assertTrue(vector != null);
//System.out.println("Vector: " + vector); //System.out.println("Vector: " + vector);