diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java
index 5757f7aa1a9..d95a6e6ac97 100644
--- a/src/java/org/apache/lucene/document/Field.java
+++ b/src/java/org/apache/lucene/document/Field.java
@@ -38,6 +38,8 @@ public final class Field implements java.io.Serializable {
private Object fieldsData = null;
private boolean storeTermVector = false;
+ private boolean storeOffsetWithTermVector = false;
+ private boolean storePositionWithTermVector = false;
private boolean isStored = false;
private boolean isIndexed = true;
private boolean isTokenized = true;
@@ -55,16 +57,19 @@ public final class Field implements java.io.Serializable {
public String toString() {
return name;
}
+
/** Store the original field value in the index in a compressed form. This is
* useful for long documents and for binary valued fields.
*/
public static final Store COMPRESS = new Store("COMPRESS");
+
/** Store the original field value in the index. This is useful for short texts
* like a document's title which should be displayed with the results. The
* value is stored in its original form, i.e. no analyzer is used before it is
* stored.
*/
public static final Store YES = new Store("YES");
+
/** Do not store the field value in the index. */
public static final Store NO = new Store("NO");
}
@@ -100,15 +105,41 @@ public final class Field implements java.io.Serializable {
private TermVector(String name) {
this.name = name;
}
+
public String toString() {
return name;
}
+
/** Do not store term vectors.
*/
public static final TermVector NO = new TermVector("NO");
+
/** Store the term vectors of each document. A term vector is a list
* of the document's terms and their number of occurences in that document. */
public static final TermVector YES = new TermVector("YES");
+
+ /**
+ * Store the term vector + token position information
+ *
+ * @see #YES
+ */
+ public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS");
+
+ /**
+ * Store the term vector + Token offset information
+ *
+ * @see #YES
+ */
+ public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS");
+
+ /**
+ * Store the term vector + Token position and offset information
+ *
+ * @see #YES
+ * @see #WITH_POSITIONS
+ * @see #WITH_OFFSETS
+ */
+ public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS");
}
/** Sets the boost factor hits on this field. This value will be
@@ -290,14 +321,18 @@ public final class Field implements java.io.Serializable {
this.name = name.intern(); // field names are interned
this.fieldsData = value;
- if (store == Store.YES)
+ if (store == Store.YES){
this.isStored = true;
+ this.isCompressed = false;
+ }
else if (store == Store.COMPRESS) {
this.isStored = true;
this.isCompressed = true;
}
- else if (store == Store.NO)
+ else if (store == Store.NO){
this.isStored = false;
+ this.isCompressed = false;
+ }
else
throw new IllegalArgumentException("unknown store parameter " + store);
@@ -313,6 +348,8 @@ public final class Field implements java.io.Serializable {
} else {
throw new IllegalArgumentException("unknown index parameter " + index);
}
+
+ this.isBinary = false;
setStoreTermVector(termVector);
}
@@ -343,11 +380,18 @@ public final class Field implements java.io.Serializable {
throw new NullPointerException("name cannot be null");
if (reader == null)
throw new NullPointerException("reader cannot be null");
+
this.name = name.intern(); // field names are interned
this.fieldsData = reader;
+
this.isStored = false;
+ this.isCompressed = false;
+
this.isIndexed = true;
this.isTokenized = true;
+
+ this.isBinary = false;
+
setStoreTermVector(termVector);
}
@@ -374,21 +418,29 @@ public final class Field implements java.io.Serializable {
throw new IllegalArgumentException("name cannot be null");
if (value == null)
throw new IllegalArgumentException("value cannot be null");
- if (store == Store.NO)
- throw new IllegalArgumentException("binary values can't be unstored");
- if (store == Store.COMPRESS)
- this.isCompressed = true;
this.name = name.intern();
- //wrap the byte[] to a ByteBuffer object
this.fieldsData = value;
- this.isBinary = true;
- this.isStored = true;
+ if (store == Store.YES){
+ this.isStored = true;
+ this.isCompressed = false;
+ }
+ else if (store == Store.COMPRESS) {
+ this.isStored = true;
+ this.isCompressed = true;
+ }
+ else if (store == Store.NO)
+ throw new IllegalArgumentException("binary values can't be unstored");
+ else
+ throw new IllegalArgumentException("unknown store parameter " + store);
this.isIndexed = false;
this.isTokenized = false;
- this.storeTermVector = false;
+
+ this.isBinary = true;
+
+ setStoreTermVector(TermVector.NO);
}
/**
@@ -422,9 +474,30 @@ public final class Field implements java.io.Serializable {
private void setStoreTermVector(TermVector termVector) {
if (termVector == TermVector.NO) {
this.storeTermVector = false;
- } else if (termVector == TermVector.YES) {
+ this.storePositionWithTermVector = false;
+ this.storeOffsetWithTermVector = false;
+ }
+ else if (termVector == TermVector.YES) {
this.storeTermVector = true;
- } else {
+ this.storePositionWithTermVector = false;
+ this.storeOffsetWithTermVector = false;
+ }
+ else if (termVector == TermVector.WITH_POSITIONS) {
+ this.storeTermVector = true;
+ this.storePositionWithTermVector = true;
+ this.storeOffsetWithTermVector = false;
+ }
+ else if (termVector == TermVector.WITH_OFFSETS) {
+ this.storeTermVector = true;
+ this.storePositionWithTermVector = false;
+ this.storeOffsetWithTermVector = true;
+ }
+ else if (termVector == TermVector.WITH_POSITIONS_OFFSETS) {
+ this.storeTermVector = true;
+ this.storePositionWithTermVector = true;
+ this.storeOffsetWithTermVector = true;
+ }
+ else {
throw new IllegalArgumentException("unknown termVector parameter " + termVector);
}
}
@@ -455,7 +528,24 @@ public final class Field implements java.io.Serializable {
* @see IndexReader#getTermFreqVector(int, String)
*/
public final boolean isTermVectorStored() { return storeTermVector; }
-
+
+ /**
+ * True iff terms are stored as term vector together with their offsets
+   * (start and end position in source text).
+   * @return true if offsets are stored together with the term vector
+ */
+ public boolean isStoreOffsetWithTermVector(){
+ return storeOffsetWithTermVector;
+ }
+
+ /**
+ * True iff terms are stored as term vector together with their token positions.
+   * @return true if token positions are stored together with the term vector
+ */
+ public boolean isStorePositionWithTermVector(){
+ return storePositionWithTermVector;
+ }
+
/** True iff the value of the filed is stored as binary */
public final boolean isBinary() { return isBinary; }
@@ -479,6 +569,16 @@ public final class Field implements java.io.Serializable {
result.append(",");
result.append("termVector");
}
+ if (storeOffsetWithTermVector) {
+ if (result.length() > 0)
+ result.append(",");
+ result.append("termVectorOffsets");
+ }
+ if (storePositionWithTermVector) {
+ if (result.length() > 0)
+ result.append(",");
+ result.append("termVectorPosition");
+ }
if (isBinary) {
if (result.length() > 0)
result.append(",");
diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java
index 9ec7a735c49..4952d227b69 100644
--- a/src/java/org/apache/lucene/index/DocumentWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentWriter.java
@@ -74,6 +74,7 @@ final class DocumentWriter {
postingTable.clear(); // clear postingTable
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
+ fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets
fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
Arrays.fill(fieldBoosts, doc.getBoost());
@@ -100,7 +101,7 @@ final class DocumentWriter {
writePostings(postings, segment);
// write norms of indexed fields
- writeNorms(doc, segment);
+ writeNorms(segment);
}
@@ -109,6 +110,7 @@ final class DocumentWriter {
private final Hashtable postingTable = new Hashtable();
private int[] fieldLengths;
private int[] fieldPositions;
+ private int[] fieldOffsets;
private float[] fieldBoosts;
// Tokenizes the fields of a document into Postings.
@@ -122,12 +124,19 @@ final class DocumentWriter {
int length = fieldLengths[fieldNumber]; // length of field
int position = fieldPositions[fieldNumber]; // position in field
+ int offset = fieldOffsets[fieldNumber]; // offset field
if (field.isIndexed()) {
if (!field.isTokenized()) { // un-tokenized field
- addPosition(fieldName, field.stringValue(), position++);
+ String stringValue = field.stringValue();
+ if(field.isStoreOffsetWithTermVector())
+ addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
+ else
+ addPosition(fieldName, stringValue, position++, null);
+ offset += stringValue.length();
length++;
- } else {
+ } else
+ {
Reader reader; // find or make Reader
if (field.readerValue() != null)
reader = field.readerValue();
@@ -140,11 +149,23 @@ final class DocumentWriter {
// Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader);
try {
+ Token lastToken = null;
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
- addPosition(fieldName, t.termText(), position++);
- if (++length > maxFieldLength) break;
+
+ if(field.isStoreOffsetWithTermVector())
+ addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()));
+ else
+ addPosition(fieldName, t.termText(), position++, null);
+
+ lastToken = t;
+ if (++length > maxFieldLength)
+ break;
}
+
+ if(lastToken != null)
+ offset += lastToken.endOffset() + 1;
+
} finally {
stream.close();
}
@@ -153,14 +174,16 @@ final class DocumentWriter {
fieldLengths[fieldNumber] = length; // save field length
fieldPositions[fieldNumber] = position; // save field position
fieldBoosts[fieldNumber] *= field.getBoost();
+ fieldOffsets[fieldNumber] = offset;
}
}
}
private final Term termBuffer = new Term("", ""); // avoid consing
- private final void addPosition(String field, String text, int position) {
+ private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) {
termBuffer.set(field, text);
+ //System.out.println("Offset: " + offset);
Posting ti = (Posting) postingTable.get(termBuffer);
if (ti != null) { // word seen before
int freq = ti.freq;
@@ -172,10 +195,23 @@ final class DocumentWriter {
ti.positions = newPositions;
}
ti.positions[freq] = position; // add new position
+
+ if (offset != null) {
+ if (ti.offsets.length == freq){
+ TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
+ TermVectorOffsetInfo [] offsets = ti.offsets;
+ for (int i = 0; i < freq; i++)
+ {
+ newOffsets[i] = offsets[i];
+ }
+ ti.offsets = newOffsets;
+ }
+ ti.offsets[freq] = offset;
+ }
ti.freq = freq + 1; // update frequency
} else { // word not seen before
Term term = new Term(field, text, false);
- postingTable.put(term, new Posting(term, position));
+ postingTable.put(term, new Posting(term, position, offset));
}
}
@@ -294,12 +330,13 @@ final class DocumentWriter {
termVectorWriter.openDocument();
}
termVectorWriter.openField(currentField);
+
} else if (termVectorWriter != null) {
termVectorWriter.closeField();
}
}
if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
- termVectorWriter.addTerm(posting.term.text(), postingFreq);
+ termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
}
}
if (termVectorWriter != null)
@@ -316,7 +353,7 @@ final class DocumentWriter {
}
}
- private final void writeNorms(Document doc, String segment) throws IOException {
+ private final void writeNorms(String segment) throws IOException {
for(int n = 0; n < fieldInfos.size(); n++){
FieldInfo fi = fieldInfos.fieldInfo(n);
if(fi.isIndexed){
@@ -336,11 +373,18 @@ final class Posting { // info about a Term in a doc
Term term; // the Term
int freq; // its frequency in doc
int[] positions; // positions it occurs at
+ TermVectorOffsetInfo [] offsets;
- Posting(Term t, int position) {
+ Posting(Term t, int position, TermVectorOffsetInfo offset) {
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
+ if(offset != null){
+ offsets = new TermVectorOffsetInfo[1];
+ offsets[0] = offset;
+ }
+ else
+ offsets = null;
}
}
diff --git a/src/java/org/apache/lucene/index/FieldInfo.java b/src/java/org/apache/lucene/index/FieldInfo.java
index 47102c9be69..2b575fbb1ce 100644
--- a/src/java/org/apache/lucene/index/FieldInfo.java
+++ b/src/java/org/apache/lucene/index/FieldInfo.java
@@ -23,11 +23,16 @@ final class FieldInfo {
// true if term vector for this field should be stored
boolean storeTermVector;
+ boolean storeOffsetWithTermVector;
+ boolean storePositionWithTermVector;
- FieldInfo(String na, boolean tk, int nu, boolean storeTermVector) {
+ FieldInfo(String na, boolean tk, int nu, boolean storeTermVector,
+ boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
name = na;
isIndexed = tk;
number = nu;
this.storeTermVector = storeTermVector;
+ this.storeOffsetWithTermVector = storeOffsetWithTermVector;
+ this.storePositionWithTermVector = storePositionWithTermVector;
}
}
diff --git a/src/java/org/apache/lucene/index/FieldInfos.java b/src/java/org/apache/lucene/index/FieldInfos.java
index 35bdee8ca36..b20b8d56663 100644
--- a/src/java/org/apache/lucene/index/FieldInfos.java
+++ b/src/java/org/apache/lucene/index/FieldInfos.java
@@ -33,6 +33,12 @@ import org.apache.lucene.store.IndexInput;
* accessing this object.
*/
final class FieldInfos {
+
+ static final byte IS_INDEXED = 0x1;
+ static final byte STORE_TERMVECTOR = 0x2;
+ static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4;
+ static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8;
+
private ArrayList byNumber = new ArrayList();
private HashMap byName = new HashMap();
@@ -61,23 +67,30 @@ final class FieldInfos {
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement();
- add(field.name(), field.isIndexed(), field.isTermVectorStored());
+ add(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(),
+ field.isStoreOffsetWithTermVector());
}
}
-
+
/**
+ * Add fields that are indexed. Whether they have termvectors has to be specified.
+ *
* @param names The names of the fields
* @param storeTermVectors Whether the fields store term vectors or not
+   * @param storePositionWithTermVector true if positions should be stored.
+ * @param storeOffsetWithTermVector true if offsets should be stored
*/
- public void addIndexed(Collection names, boolean storeTermVectors) {
+ public void addIndexed(Collection names, boolean storeTermVectors, boolean storePositionWithTermVector,
+ boolean storeOffsetWithTermVector) {
Iterator i = names.iterator();
while (i.hasNext()) {
- add((String)i.next(), true, storeTermVectors);
+ add((String)i.next(), true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector);
}
}
/**
- * Assumes the field is not storing term vectors
+ * Assumes the fields are not storing term vectors.
+ *
* @param names The names of the fields
* @param isIndexed Whether the fields are indexed or not
*
@@ -91,28 +104,43 @@ final class FieldInfos {
}
/**
- * Calls three parameter add with false for the storeTermVector parameter
+ * Calls 5 parameter add with false for all TermVector parameters.
+ *
* @param name The name of the Field
* @param isIndexed true if the field is indexed
- * @see #add(String, boolean, boolean)
+ * @see #add(String, boolean, boolean, boolean, boolean)
*/
public void add(String name, boolean isIndexed) {
- add(name, isIndexed, false);
+ add(name, isIndexed, false, false, false);
}
-
+ /**
+ * Calls 5 parameter add with false for term vector positions and offsets.
+ *
+ * @param name The name of the field
+ * @param isIndexed true if the field is indexed
+ * @param storeTermVector true if the term vector should be stored
+ */
+ public void add(String name, boolean isIndexed, boolean storeTermVector){
+ add(name, isIndexed, storeTermVector, false, false);
+ }
+
/** If the field is not yet known, adds it. If it is known, checks to make
* sure that the isIndexed flag is the same as was given previously for this
- * field. If not - marks it as being indexed. Same goes for storeTermVector
+ * field. If not - marks it as being indexed. Same goes for the TermVector
+ * parameters.
*
* @param name The name of the field
* @param isIndexed true if the field is indexed
* @param storeTermVector true if the term vector should be stored
+ * @param storePositionWithTermVector true if the term vector with positions should be stored
+ * @param storeOffsetWithTermVector true if the term vector with offsets should be stored
*/
- public void add(String name, boolean isIndexed, boolean storeTermVector) {
+ public void add(String name, boolean isIndexed, boolean storeTermVector,
+ boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) {
FieldInfo fi = fieldInfo(name);
if (fi == null) {
- addInternal(name, isIndexed, storeTermVector);
+ addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector);
} else {
if (fi.isIndexed != isIndexed) {
fi.isIndexed = true; // once indexed, always index
@@ -120,13 +148,21 @@ final class FieldInfos {
if (fi.storeTermVector != storeTermVector) {
fi.storeTermVector = true; // once vector, always vector
}
+ if (fi.storePositionWithTermVector != storePositionWithTermVector) {
+ fi.storePositionWithTermVector = true; // once vector, always vector
+ }
+ if (fi.storeOffsetWithTermVector != storeOffsetWithTermVector) {
+ fi.storeOffsetWithTermVector = true; // once vector, always vector
+ }
}
}
private void addInternal(String name, boolean isIndexed,
- boolean storeTermVector) {
+ boolean storeTermVector, boolean storePositionWithTermVector,
+ boolean storeOffsetWithTermVector) {
FieldInfo fi =
- new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector);
+ new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector,
+ storeOffsetWithTermVector);
byNumber.add(fi);
byName.put(name, fi);
}
@@ -180,11 +216,11 @@ final class FieldInfos {
for (int i = 0; i < size(); i++) {
FieldInfo fi = fieldInfo(i);
byte bits = 0x0;
- if (fi.isIndexed) bits |= 0x1;
- if (fi.storeTermVector) bits |= 0x2;
+ if (fi.isIndexed) bits |= IS_INDEXED;
+ if (fi.storeTermVector) bits |= STORE_TERMVECTOR;
+ if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
+ if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR;
output.writeString(fi.name);
- //Was REMOVE
- //output.writeByte((byte)(fi.isIndexed ? 1 : 0));
output.writeByte(bits);
}
}
@@ -194,9 +230,11 @@ final class FieldInfos {
for (int i = 0; i < size; i++) {
String name = input.readString().intern();
byte bits = input.readByte();
- boolean isIndexed = (bits & 0x1) != 0;
- boolean storeTermVector = (bits & 0x2) != 0;
- addInternal(name, isIndexed, storeTermVector);
+ boolean isIndexed = (bits & IS_INDEXED) != 0;
+ boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
+ boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
+ boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
+ addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector);
}
}
diff --git a/src/java/org/apache/lucene/index/FilterIndexReader.java b/src/java/org/apache/lucene/index/FilterIndexReader.java
index 61c5de86595..797e00afd1a 100644
--- a/src/java/org/apache/lucene/index/FilterIndexReader.java
+++ b/src/java/org/apache/lucene/index/FilterIndexReader.java
@@ -16,11 +16,12 @@ package org.apache.lucene.index;
* limitations under the License.
*/
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
import java.io.IOException;
import java.util.Collection;
-import org.apache.lucene.document.Document;
-
/** A FilterIndexReader
contains another IndexReader, which it
* uses as its basic source of data, possibly transforming the data along the
* way or providing additional functionality. The class
@@ -146,4 +147,8 @@ public class FilterIndexReader extends IndexReader {
public Collection getIndexedFieldNames(boolean storedTermVector) {
return in.getIndexedFieldNames(storedTermVector);
}
+
+ public Collection getIndexedFieldNames (Field.TermVector tvSpec){
+ return in.getIndexedFieldNames(tvSpec);
+ }
}
diff --git a/src/java/org/apache/lucene/index/IndexReader.java b/src/java/org/apache/lucene/index/IndexReader.java
index dd617290343..1ac28b392ef 100644
--- a/src/java/org/apache/lucene/index/IndexReader.java
+++ b/src/java/org/apache/lucene/index/IndexReader.java
@@ -16,16 +16,16 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import java.io.IOException;
-import java.io.File;
-import java.util.Collection;
-
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field; // for javadoc
-import org.apache.lucene.search.Similarity;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
@@ -209,23 +209,37 @@ public abstract class IndexReader {
return SegmentInfos.readCurrentVersion(directory);
}
- /** Return an array of term frequency vectors for the specified document.
+ /**
+ * Return an array of term frequency vectors for the specified document.
* The array contains a vector for each vectorized field in the document.
- * Each vector contains terms and frequencies for all terms
- * in a given vectorized field.
- * If no such fields existed, the method returns null.
- *
- * @see Field#isTermVectorStored()
+ * Each vector contains terms and frequencies for all terms in a given vectorized field.
+ * If no such fields existed, the method returns null. The term vectors that are
+   * returned may either be of type TermFreqVector or of type TermPositionsVector if
+ * positions or offsets have been stored.
+ *
+ * @param docNumber document for which term frequency vectors are returned
+ * @return array of term frequency vectors. May be null if no term vectors have been
+ * stored for the specified document.
+ * @throws IOException if index cannot be accessed
+ * @see Field#TermVector
*/
abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
throws IOException;
- /** Return a term frequency vector for the specified document and field. The
- * vector returned contains terms and frequencies for those terms in
- * the specified field of this document, if the field had storeTermVector
- * flag set. If the flag was not set, the method returns null.
- *
- * @see Field#isTermVectorStored()
+
+ /**
+ * Return a term frequency vector for the specified document and field. The
+ * returned vector contains terms and frequencies for the terms in
+ * the specified field of this document, if the field had the storeTermVector
+ * flag set. If termvectors had been stored with positions or offsets, a
+ * TermPositionsVector is returned.
+ *
+ * @param docNumber document for which the term frequency vector is returned
+ * @param field field for which the term frequency vector is returned.
+   * @return term frequency vector. May be null if field does not exist in the specified
+ * document or term vector was not stored.
+ * @throws IOException if index cannot be accessed
+ * @see Field#TermVector
*/
abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
throws IOException;
@@ -547,8 +561,19 @@ public abstract class IndexReader {
* @param storedTermVector if true, returns only Indexed fields that have term vector info,
* else only indexed fields without term vector info
* @return Collection of Strings indicating the names of the fields
+ *
+   * @deprecated Replaced by {@link #getIndexedFieldNames(Field.TermVector)}
*/
public abstract Collection getIndexedFieldNames(boolean storedTermVector);
+
+ /**
+ * Get a list of unique field names that exist in this index, are indexed, and have
+ * the specified term vector information.
+ *
+   * @param tvSpec specifies which term vector information should be available for the fields
+ * @return Collection of Strings indicating the names of the fields
+ */
+ public abstract Collection getIndexedFieldNames(Field.TermVector tvSpec);
/**
* Returns true
iff the index in the named directory is
@@ -560,7 +585,6 @@ public abstract class IndexReader {
return
directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked() ||
directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).isLocked();
-
}
/**
diff --git a/src/java/org/apache/lucene/index/MultiReader.java b/src/java/org/apache/lucene/index/MultiReader.java
index ffa438854d8..09886ccb539 100644
--- a/src/java/org/apache/lucene/index/MultiReader.java
+++ b/src/java/org/apache/lucene/index/MultiReader.java
@@ -16,16 +16,13 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import java.io.IOException;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Iterator;
-import java.util.Set;
-
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
+import java.io.IOException;
+import java.util.*;
+
/** An IndexReader which reads multiple indexes, appending their content.
*
* @version $Id$
@@ -219,11 +216,7 @@ public class MultiReader extends IndexReader {
for (int i = 0; i < subReaders.length; i++) {
IndexReader reader = subReaders[i];
Collection names = reader.getFieldNames();
- // iterate through the field names and add them to the set
- for (Iterator iterator = names.iterator(); iterator.hasNext();) {
- String s = (String) iterator.next();
- fieldSet.add(s);
- }
+ fieldSet.addAll(names);
}
return fieldSet;
}
@@ -253,6 +246,17 @@ public class MultiReader extends IndexReader {
return fieldSet;
}
+ public Collection getIndexedFieldNames (Field.TermVector tvSpec){
+ // maintain a unique set of field names
+ Set fieldSet = new HashSet();
+ for (int i = 0; i < subReaders.length; i++) {
+ IndexReader reader = subReaders[i];
+ Collection names = reader.getIndexedFieldNames(tvSpec);
+ fieldSet.addAll(names);
+ }
+ return fieldSet;
+ }
+
}
class MultiTermEnum extends TermEnum {
diff --git a/src/java/org/apache/lucene/index/SegmentMerger.java b/src/java/org/apache/lucene/index/SegmentMerger.java
index b97144c8bb0..b9402977761 100644
--- a/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/src/java/org/apache/lucene/index/SegmentMerger.java
@@ -20,6 +20,7 @@ import java.util.Vector;
import java.util.Iterator;
import java.io.IOException;
+import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
@@ -157,8 +158,11 @@ final class SegmentMerger {
int docCount = 0;
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader) readers.elementAt(i);
- fieldInfos.addIndexed(reader.getIndexedFieldNames(true), true);
- fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false);
+ fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_POSITIONS_OFFSETS), true, true, true);
+ fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_POSITIONS), true, true, false);
+ fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_OFFSETS), true, false, true);
+ fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.YES), true, false, false);
+ fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.NO), false, false, false);
fieldInfos.add(reader.getFieldNames(false), false);
}
fieldInfos.write(directory, segment + ".fnm");
@@ -195,29 +199,9 @@ final class SegmentMerger {
int maxDoc = reader.maxDoc();
for (int docNum = 0; docNum < maxDoc; docNum++) {
// skip deleted docs
- if (reader.isDeleted(docNum)) {
+ if (reader.isDeleted(docNum))
continue;
- }
- termVectorsWriter.openDocument();
-
- // get all term vectors
- TermFreqVector[] sourceTermVector =
- reader.getTermFreqVectors(docNum);
-
- if (sourceTermVector != null) {
- for (int f = 0; f < sourceTermVector.length; f++) {
- // translate field numbers
- TermFreqVector termVector = sourceTermVector[f];
- termVectorsWriter.openField(termVector.getField());
- String [] terms = termVector.getTerms();
- int [] freqs = termVector.getTermFrequencies();
-
- for (int t = 0; t < terms.length; t++) {
- termVectorsWriter.addTerm(terms[t], freqs[t]);
- }
- }
- termVectorsWriter.closeDocument();
- }
+ termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
}
}
} finally {
diff --git a/src/java/org/apache/lucene/index/SegmentReader.java b/src/java/org/apache/lucene/index/SegmentReader.java
index 3dd7fd2956c..9cfcdfad38c 100644
--- a/src/java/org/apache/lucene/index/SegmentReader.java
+++ b/src/java/org/apache/lucene/index/SegmentReader.java
@@ -25,6 +25,7 @@ import java.util.Set;
import java.util.Vector;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Directory;
@@ -191,7 +192,9 @@ class SegmentReader extends IndexReader {
proxStream.close();
closeNorms();
- if (termVectorsReader != null) termVectorsReader.close();
+
+ if (termVectorsReader != null)
+ termVectorsReader.close();
if (cfsReader != null)
cfsReader.close();
@@ -342,16 +345,63 @@ class SegmentReader extends IndexReader {
* @return Collection of Strings indicating the names of the fields
*/
public Collection getIndexedFieldNames(boolean storedTermVector) {
+ if(storedTermVector){
+ Set fieldSet = new HashSet();
+ fieldSet.addAll(getIndexedFieldNames(Field.TermVector.YES));
+ fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_POSITIONS));
+ fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_OFFSETS));
+ fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_POSITIONS_OFFSETS));
+ return fieldSet;
+ }
+ else
+ return getIndexedFieldNames(Field.TermVector.NO);
+ }
+
+ public Collection getIndexedFieldNames (Field.TermVector tvSpec){
+ boolean storedTermVector;
+ boolean storePositionWithTermVector;
+ boolean storeOffsetWithTermVector;
+
+ if(tvSpec == Field.TermVector.NO){
+ storedTermVector = false;
+ storePositionWithTermVector = false;
+ storeOffsetWithTermVector = false;
+ }
+ else if(tvSpec == Field.TermVector.YES){
+ storedTermVector = true;
+ storePositionWithTermVector = false;
+ storeOffsetWithTermVector = false;
+ }
+ else if(tvSpec == Field.TermVector.WITH_POSITIONS){
+ storedTermVector = true;
+ storePositionWithTermVector = true;
+ storeOffsetWithTermVector = false;
+ }
+ else if(tvSpec == Field.TermVector.WITH_OFFSETS){
+ storedTermVector = true;
+ storePositionWithTermVector = false;
+ storeOffsetWithTermVector = true;
+ }
+ else if(tvSpec == Field.TermVector.WITH_POSITIONS_OFFSETS){
+ storedTermVector = true;
+ storePositionWithTermVector = true;
+ storeOffsetWithTermVector = true;
+ }
+ else{
+ throw new IllegalArgumentException("unknown termVector parameter " + tvSpec);
+ }
+
// maintain a unique set of field names
Set fieldSet = new HashSet();
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
- if (fi.isIndexed == true && fi.storeTermVector == storedTermVector){
+ if (fi.isIndexed && fi.storeTermVector == storedTermVector &&
+ fi.storePositionWithTermVector == storePositionWithTermVector &&
+ fi.storeOffsetWithTermVector == storeOffsetWithTermVector){
fieldSet.add(fi.name);
}
}
- return fieldSet;
-
+ return fieldSet;
}
public synchronized byte[] norms(String field) throws IOException {
@@ -429,11 +479,13 @@ class SegmentReader extends IndexReader {
* vector returned contains term numbers and frequencies for all terms in
* the specified field of this document, if the field had storeTermVector
* flag set. If the flag was not set, the method returns null.
+ * @throws IOException
*/
- public TermFreqVector getTermFreqVector(int docNumber, String field) {
+ public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException {
// Check if this field is invalid or has no stored term vector
FieldInfo fi = fieldInfos.fieldInfo(field);
- if (fi == null || !fi.storeTermVector) return null;
+ if (fi == null || !fi.storeTermVector || termVectorsReader == null)
+ return null;
return termVectorsReader.get(docNumber, field);
}
@@ -444,8 +496,9 @@ class SegmentReader extends IndexReader {
* Each vector vector contains term numbers and frequencies for all terms
* in a given vectorized field.
* If no such fields existed, the method returns null.
+ * @throws IOException
*/
- public TermFreqVector[] getTermFreqVectors(int docNumber) {
+ public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
if (termVectorsReader == null)
return null;
diff --git a/src/java/org/apache/lucene/index/SegmentTermPositionVector.java b/src/java/org/apache/lucene/index/SegmentTermPositionVector.java
new file mode 100644
index 00000000000..3f916fb0892
--- /dev/null
+++ b/src/java/org/apache/lucene/index/SegmentTermPositionVector.java
@@ -0,0 +1,64 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class SegmentTermPositionVector extends SegmentTermVector implements TermPositionVector {
+ protected int[][] positions;
+ protected TermVectorOffsetInfo[][] offsets;
+ public static final int[] EMPTY_TERM_POS = new int[0];
+
+ public SegmentTermPositionVector(String field, String terms[], int termFreqs[], int[][] positions, TermVectorOffsetInfo[][] offsets) {
+ super(field, terms, termFreqs);
+ this.offsets = offsets;
+ this.positions = positions;
+ }
+
+ /**
+ * Returns an array of TermVectorOffsetInfo in which the term is found.
+ *
+ * @param index The position in the array to get the offsets from
+ * @return An array of TermVectorOffsetInfo objects or the empty list
+ * @see org.apache.lucene.analysis.Token
+ */
+ public TermVectorOffsetInfo[] getOffsets(int index) {
+ TermVectorOffsetInfo[] result = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
+ if(offsets == null)
+ return null;
+ if (index >=0 && index < offsets.length)
+ {
+ result = offsets[index];
+ }
+ return result;
+ }
+
+ /**
+ * Returns an array of positions in which the term is found.
+ * Terms are identified by the index at which its number appears in the
+   * term String array obtained from the <code>indexOf</code> method.
+ */
+ public int[] getTermPositions(int index) {
+ int[] result = EMPTY_TERM_POS;
+ if(positions == null)
+ return null;
+ if (index >=0 && index < positions.length)
+ {
+ result = positions[index];
+ }
+
+ return result;
+ }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/lucene/index/SegmentTermVector.java b/src/java/org/apache/lucene/index/SegmentTermVector.java
index 81d7237a3dd..c70560e1e72 100644
--- a/src/java/org/apache/lucene/index/SegmentTermVector.java
+++ b/src/java/org/apache/lucene/index/SegmentTermVector.java
@@ -1,4 +1,21 @@
package org.apache.lucene.index;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
import java.util.*;
/**
@@ -26,11 +43,14 @@ class SegmentTermVector implements TermFreqVector {
StringBuffer sb = new StringBuffer();
sb.append('{');
sb.append(field).append(": ");
-    for (int i=0; i<terms.length; i++) {
-      if (i>0) sb.append(", ");
-      sb.append(terms[i]).append('/').append(termFreqs[i]);
+    if(terms != null){
+      for (int i=0; i<terms.length; i++) {
+        if (i>0) sb.append(", ");
+        sb.append(terms[i]).append('/').append(termFreqs[i]);
+ }
}
sb.append('}');
+
return sb.toString();
}
@@ -47,6 +67,8 @@ class SegmentTermVector implements TermFreqVector {
}
public int indexOf(String termText) {
+ if(terms == null)
+ return -1;
int res = Arrays.binarySearch(terms, termText);
return res >= 0 ? res : -1;
}
@@ -60,7 +82,7 @@ class SegmentTermVector implements TermFreqVector {
int res[] = new int[len];
for (int i=0; i < len; i++) {
- res[i] = indexOf(termNumbers[i]);
+ res[i] = indexOf(termNumbers[start+ i]);
}
return res;
}
diff --git a/src/java/org/apache/lucene/index/TermFreqVector.java b/src/java/org/apache/lucene/index/TermFreqVector.java
index 89565848b80..a919e5140de 100644
--- a/src/java/org/apache/lucene/index/TermFreqVector.java
+++ b/src/java/org/apache/lucene/index/TermFreqVector.java
@@ -1,5 +1,21 @@
package org.apache.lucene.index;
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/** Provides access to stored term vector of
* a document field.
*/
diff --git a/src/java/org/apache/lucene/index/TermPositionVector.java b/src/java/org/apache/lucene/index/TermPositionVector.java
index 9ec2cd5dafe..1ce31b9217c 100644
--- a/src/java/org/apache/lucene/index/TermPositionVector.java
+++ b/src/java/org/apache/lucene/index/TermPositionVector.java
@@ -1,13 +1,42 @@
package org.apache.lucene.index;
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
/** Extends TermFreqVector
to provide additional information about
- * positions in which each of the terms is found.
+ * positions in which each of the terms is found. A TermPositionVector not necessarily
+ * contains both positions and offsets, but at least one of these arrays exists.
*/
public interface TermPositionVector extends TermFreqVector {
-
+
/** Returns an array of positions in which the term is found.
* Terms are identified by the index at which its number appears in the
-   * term number array obtained from <code>getTermNumbers</code> method.
+   * term String array obtained from the <code>indexOf</code> method.
+ * May return null if positions have not been stored.
*/
public int[] getTermPositions(int index);
+
+ /**
+ * Returns an array of TermVectorOffsetInfo in which the term is found.
+ * May return null if offsets have not been stored.
+ *
+ * @see org.apache.lucene.analysis.Token
+ *
+ * @param index The position in the array to get the offsets from
+ * @return An array of TermVectorOffsetInfo objects or the empty list
+ */
+ public TermVectorOffsetInfo [] getOffsets(int index);
}
\ No newline at end of file
diff --git a/src/java/org/apache/lucene/index/TermVectorOffsetInfo.java b/src/java/org/apache/lucene/index/TermVectorOffsetInfo.java
new file mode 100644
index 00000000000..1008351540a
--- /dev/null
+++ b/src/java/org/apache/lucene/index/TermVectorOffsetInfo.java
@@ -0,0 +1,66 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class TermVectorOffsetInfo {
+ public static final TermVectorOffsetInfo [] EMPTY_OFFSET_INFO = new TermVectorOffsetInfo[0];
+ private int startOffset;
+ private int endOffset;
+
+ public TermVectorOffsetInfo() {
+ }
+
+ public TermVectorOffsetInfo(int startOffset, int endOffset) {
+ this.endOffset = endOffset;
+ this.startOffset = startOffset;
+ }
+
+ public int getEndOffset() {
+ return endOffset;
+ }
+
+ public void setEndOffset(int endOffset) {
+ this.endOffset = endOffset;
+ }
+
+ public int getStartOffset() {
+ return startOffset;
+ }
+
+ public void setStartOffset(int startOffset) {
+ this.startOffset = startOffset;
+ }
+
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (!(o instanceof TermVectorOffsetInfo)) return false;
+
+ final TermVectorOffsetInfo termVectorOffsetInfo = (TermVectorOffsetInfo) o;
+
+ if (endOffset != termVectorOffsetInfo.endOffset) return false;
+ if (startOffset != termVectorOffsetInfo.startOffset) return false;
+
+ return true;
+ }
+
+ public int hashCode() {
+ int result;
+ result = startOffset;
+ result = 29 * result + endOffset;
+ return result;
+ }
+}
diff --git a/src/java/org/apache/lucene/index/TermVectorsReader.java b/src/java/org/apache/lucene/index/TermVectorsReader.java
index 56c288caff6..d6852e9b8c4 100644
--- a/src/java/org/apache/lucene/index/TermVectorsReader.java
+++ b/src/java/org/apache/lucene/index/TermVectorsReader.java
@@ -33,6 +33,9 @@ class TermVectorsReader {
private IndexInput tvd;
private IndexInput tvf;
private int size;
+
+ private int tvdFormat;
+ private int tvfFormat;
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
throws IOException {
@@ -40,16 +43,16 @@ class TermVectorsReader {
tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION);
checkValidFormat(tvx);
tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION);
- checkValidFormat(tvd);
+ tvdFormat = checkValidFormat(tvd);
tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION);
- checkValidFormat(tvf);
+ tvfFormat = checkValidFormat(tvf);
size = (int) tvx.length() / 8;
}
this.fieldInfos = fieldInfos;
}
- private void checkValidFormat(IndexInput in) throws IOException
+ private int checkValidFormat(IndexInput in) throws IOException
{
int format = in.readInt();
if (format > TermVectorsWriter.FORMAT_VERSION)
@@ -57,7 +60,7 @@ class TermVectorsReader {
throw new IOException("Incompatible format version: " + format + " expected "
+ TermVectorsWriter.FORMAT_VERSION + " or less");
}
-
+ return format;
}
void close() throws IOException {
@@ -82,100 +85,101 @@ class TermVectorsReader {
* Retrieve the term vector for the given document and field
* @param docNum The document number to retrieve the vector for
* @param field The field within the document to retrieve
- * @return The TermFreqVector for the document and field or null
+ * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
+ * @throws IOException
*/
- synchronized TermFreqVector get(int docNum, String field) {
+ synchronized TermFreqVector get(int docNum, String field) throws IOException {
// Check if no term vectors are available for this segment at all
int fieldNumber = fieldInfos.fieldNumber(field);
TermFreqVector result = null;
if (tvx != null) {
- try {
- //We need to account for the FORMAT_SIZE at when seeking in the tvx
- //We don't need to do this in other seeks because we already have the file pointer
- //that was written in another file
- tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
- //System.out.println("TVX Pointer: " + tvx.getFilePointer());
- long position = tvx.readLong();
+ //We need to account for the FORMAT_SIZE at when seeking in the tvx
+ //We don't need to do this in other seeks because we already have the
+ // file pointer
+ //that was written in another file
+ tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
+ //System.out.println("TVX Pointer: " + tvx.getFilePointer());
+ long position = tvx.readLong();
- tvd.seek(position);
- int fieldCount = tvd.readVInt();
- //System.out.println("Num Fields: " + fieldCount);
- // There are only a few fields per document. We opt for a full scan
- // rather then requiring that they be ordered. We need to read through
- // all of the fields anyway to get to the tvf pointers.
- int number = 0;
- int found = -1;
- for (int i = 0; i < fieldCount; i++) {
+ tvd.seek(position);
+ int fieldCount = tvd.readVInt();
+ //System.out.println("Num Fields: " + fieldCount);
+ // There are only a few fields per document. We opt for a full scan
+ // rather then requiring that they be ordered. We need to read through
+ // all of the fields anyway to get to the tvf pointers.
+ int number = 0;
+ int found = -1;
+ for (int i = 0; i < fieldCount; i++) {
+ if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
+ number = tvd.readVInt();
+ else
number += tvd.readVInt();
- if (number == fieldNumber) found = i;
- }
-
- // This field, although valid in the segment, was not found in this document
- if (found != -1) {
- // Compute position in the tvf file
- position = 0;
- for (int i = 0; i <= found; i++)
- {
- position += tvd.readVLong();
- }
- result = readTermVector(field, position);
- }
- else {
- //System.out.println("Field not found");
- }
-
- } catch (Exception e) {
- //e.printStackTrace();
+
+ if (number == fieldNumber)
+ found = i;
}
- }
- else
- {
- System.out.println("No tvx file");
+
+ // This field, although valid in the segment, was not found in this
+ // document
+ if (found != -1) {
+ // Compute position in the tvf file
+ position = 0;
+ for (int i = 0; i <= found; i++)
+ position += tvd.readVLong();
+
+ result = readTermVector(field, position);
+ } else {
+ //System.out.println("Field not found");
+ }
+ } else {
+ //System.out.println("No tvx file");
}
return result;
}
- /** Return all term vectors stored for this document or null if the could not be read in. */
- synchronized TermFreqVector[] get(int docNum) {
+ /**
+ * Return all term vectors stored for this document or null if there are no term vectors
+ * for the document.
+ * @throws IOException
+ */
+ synchronized TermFreqVector[] get(int docNum) throws IOException {
TermFreqVector[] result = null;
// Check if no term vectors are available for this segment at all
if (tvx != null) {
- try {
- //We need to offset by
- tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
- long position = tvx.readLong();
+ //We need to offset by
+ tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
+ long position = tvx.readLong();
- tvd.seek(position);
- int fieldCount = tvd.readVInt();
+ tvd.seek(position);
+ int fieldCount = tvd.readVInt();
- // No fields are vectorized for this document
- if (fieldCount != 0) {
- int number = 0;
- String[] fields = new String[fieldCount];
-
- for (int i = 0; i < fieldCount; i++) {
+ // No fields are vectorized for this document
+ if (fieldCount != 0) {
+ int number = 0;
+ String[] fields = new String[fieldCount];
+
+ for (int i = 0; i < fieldCount; i++) {
+ if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
+ number = tvd.readVInt();
+ else
number += tvd.readVInt();
- fields[i] = fieldInfos.fieldName(number);
- }
-
- // Compute position in the tvf file
- position = 0;
- long[] tvfPointers = new long[fieldCount];
- for (int i = 0; i < fieldCount; i++) {
- position += tvd.readVLong();
- tvfPointers[i] = position;
- }
- result = readTermVectors(fields, tvfPointers);
+ fields[i] = fieldInfos.fieldName(number);
}
- } catch (IOException e) {
- e.printStackTrace();
+
+ // Compute position in the tvf file
+ position = 0;
+ long[] tvfPointers = new long[fieldCount];
+ for (int i = 0; i < fieldCount; i++) {
+ position += tvd.readVLong();
+ tvfPointers[i] = position;
+ }
+
+ result = readTermVectors(fields, tvfPointers);
}
- }
- else
- {
- System.out.println("No tvx file");
+ } else {
+ //System.out.println("No tvx file");
}
return result;
}
@@ -206,20 +210,41 @@ class TermVectorsReader {
int numTerms = tvf.readVInt();
//System.out.println("Num Terms: " + numTerms);
- // If no terms - return a constant empty termvector
- if (numTerms == 0) return new SegmentTermVector(field, null, null);
-
- tvf.readVInt();
+ // If no terms - return a constant empty termvector. However, this should never occur!
+ if (numTerms == 0)
+ return new SegmentTermVector(field, null, null);
+ boolean storePositions;
+ boolean storeOffsets;
+
+ if(tvfFormat == TermVectorsWriter.FORMAT_VERSION){
+ byte bits = tvf.readByte();
+ storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
+ storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
+ }
+ else{
+ tvf.readVInt();
+ storePositions = false;
+ storeOffsets = false;
+ }
+
String terms[] = new String[numTerms];
-
int termFreqs[] = new int[numTerms];
-
+
+ // we may not need these, but declare them
+ int positions[][] = null;
+ TermVectorOffsetInfo offsets[][] = null;
+ if(storePositions)
+ positions = new int[numTerms][];
+ if(storeOffsets)
+ offsets = new TermVectorOffsetInfo[numTerms][];
+
int start = 0;
int deltaLength = 0;
int totalLength = 0;
char [] buffer = {};
String previousString = "";
+
for (int i = 0; i < numTerms; i++) {
start = tvf.readVInt();
deltaLength = tvf.readVInt();
@@ -233,9 +258,40 @@ class TermVectorsReader {
tvf.readChars(buffer, start, deltaLength);
terms[i] = new String(buffer, 0, totalLength);
previousString = terms[i];
- termFreqs[i] = tvf.readVInt();
+ int freq = tvf.readVInt();
+ termFreqs[i] = freq;
+
+ if (storePositions) { //read in the positions
+ int [] pos = new int[freq];
+ positions[i] = pos;
+ int prevPosition = 0;
+ for (int j = 0; j < freq; j++)
+ {
+ pos[j] = prevPosition + tvf.readVInt();
+ prevPosition = pos[j];
+ }
+ }
+
+ if (storeOffsets) {
+ TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
+ offsets[i] = offs;
+ int prevOffset = 0;
+ for (int j = 0; j < freq; j++) {
+ int startOffset = prevOffset + tvf.readVInt();
+ int endOffset = startOffset + tvf.readVInt();
+ offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
+ prevOffset = endOffset;
+ }
+ }
+ }
+
+ SegmentTermVector tv;
+ if (storePositions || storeOffsets){
+ tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
+ }
+ else {
+ tv = new SegmentTermVector(field, terms, termFreqs);
}
- SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs);
return tv;
}
diff --git a/src/java/org/apache/lucene/index/TermVectorsWriter.java b/src/java/org/apache/lucene/index/TermVectorsWriter.java
index 2b4de123074..f61e5a2db44 100644
--- a/src/java/org/apache/lucene/index/TermVectorsWriter.java
+++ b/src/java/org/apache/lucene/index/TermVectorsWriter.java
@@ -50,14 +50,17 @@ import java.util.Vector;
*
*/
final class TermVectorsWriter {
- public static final int FORMAT_VERSION = 1;
+ public static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
+ public static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
+
+ public static final int FORMAT_VERSION = 2;
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
public static final int FORMAT_SIZE = 4;
- //TODO: Figure out how to write with or w/o position information and read back in
public static final String TVX_EXTENSION = ".tvx";
public static final String TVD_EXTENSION = ".tvd";
public static final String TVF_EXTENSION = ".tvf";
+
private IndexOutput tvx = null, tvd = null, tvf = null;
private Vector fields = null;
private Vector terms = null;
@@ -66,13 +69,6 @@ final class TermVectorsWriter {
private TVField currentField = null;
private long currentDocPointer = -1;
- /** Create term vectors writer for the specified segment in specified
- * directory. A new TermVectorsWriter should be created for each
-   * segment. The parameter <code>maxFields</code> indicates how many total
-   * fields are found in this document. Not all of these fields may require
-   * termvectors to be stored, so the number of calls to
-   * <code>openField</code> is less or equal to this number.
- */
public TermVectorsWriter(Directory directory, String segment,
FieldInfos fieldInfos)
throws IOException {
@@ -93,7 +89,6 @@ final class TermVectorsWriter {
public final void openDocument()
throws IOException {
closeDocument();
-
currentDocPointer = tvd.getFilePointer();
}
@@ -119,12 +114,17 @@ final class TermVectorsWriter {
* processing of this field. If a field was previously open, it is
* closed automatically.
*/
- public final void openField(String field)
- throws IOException {
- if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open.");
-
+ public final void openField(String field) throws IOException {
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
+ openField(fieldInfo.number, fieldInfo.storePositionWithTermVector, fieldInfo.storeOffsetWithTermVector);
+ }
+
+ private void openField(int fieldNumber, boolean storePositionWithTermVector,
+ boolean storeOffsetWithTermVector) throws IOException{
+ if (!isDocumentOpen())
+ throw new IllegalStateException("Cannot open field when no document is open.");
closeField();
- currentField = new TVField(fieldInfos.fieldNumber(field));
+ currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
}
/** Finished processing current field. This should be followed by a call to
@@ -157,57 +157,80 @@ final class TermVectorsWriter {
* times this term appears in this field, in this document.
*/
public final void addTerm(String termText, int freq) {
- if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open");
- if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open");
-
- addTermInternal(termText, freq);
+ addTerm(termText, freq, null, null);
+ }
+
+ public final void addTerm(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets)
+ {
+ if (!isDocumentOpen())
+ throw new IllegalStateException("Cannot add terms when document is not open");
+ if (!isFieldOpen())
+ throw new IllegalStateException("Cannot add terms when field is not open");
+
+ addTermInternal(termText, freq, positions, offsets);
}
- private final void addTermInternal(String termText, int freq) {
- currentField.length += freq;
+ private final void addTermInternal(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) {
TVTerm term = new TVTerm();
term.termText = termText;
term.freq = freq;
+ term.positions = positions;
+ term.offsets = offsets;
terms.add(term);
}
-
- /** Add specified vectors to the document.
+ /**
+ * Add a complete document specified by all its term vectors. If document has no
+ * term vectors, add value for tvx.
+ *
+ * @param vectors
+ * @throws IOException
*/
- public final void addVectors(TermFreqVector[] vectors)
- throws IOException {
- if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open");
- if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open");
+ public final void addAllDocVectors(TermFreqVector[] vectors)
+ throws IOException {
+ openDocument();
- for (int i = 0; i < vectors.length; i++) {
- addTermFreqVector(vectors[i]);
+ if (vectors != null) {
+ for (int i = 0; i < vectors.length; i++) {
+ boolean storePositionWithTermVector = false;
+ boolean storeOffsetWithTermVector = false;
+
+ try {
+
+ TermPositionVector tpVector = (TermPositionVector) vectors[i];
+
+ if (tpVector.size() > 0 && tpVector.getTermPositions(0) != null)
+ storePositionWithTermVector = true;
+ if (tpVector.size() > 0 && tpVector.getOffsets(0) != null)
+ storeOffsetWithTermVector = true;
+
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(tpVector.getField());
+ openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
+
+ for (int j = 0; j < tpVector.size(); j++)
+ addTermInternal(tpVector.getTerms()[j], tpVector.getTermFrequencies()[j], tpVector.getTermPositions(j),
+ tpVector.getOffsets(j));
+
+ closeField();
+
+ } catch (ClassCastException ignore) {
+
+ TermFreqVector tfVector = vectors[i];
+
+ FieldInfo fieldInfo = fieldInfos.fieldInfo(tfVector.getField());
+ openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);
+
+ for (int j = 0; j < tfVector.size(); j++)
+ addTermInternal(tfVector.getTerms()[j], tfVector.getTermFrequencies()[j], null, null);
+
+ closeField();
+
+ }
+ }
}
+
+ closeDocument();
}
-
-
- /** Add specified vector to the document. Document must be open but no field
-   * should be open or exception is thrown. The same document can have <code>addTerm</code>
-   * and <code>addVectors</code> calls mixed, however a given field must either be
-   * populated with <code>addTerm</code> or with <code>addVector</code>.
-   */
- */
- public final void addTermFreqVector(TermFreqVector vector)
- throws IOException {
- if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open");
- if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open");
- addTermFreqVectorInternal(vector);
- }
-
- private final void addTermFreqVectorInternal(TermFreqVector vector)
- throws IOException {
- openField(vector.getField());
- for (int i = 0; i < vector.size(); i++) {
- addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]);
- }
- closeField();
- }
-
-
-
/** Close all streams. */
final void close() throws IOException {
@@ -245,47 +268,74 @@ final class TermVectorsWriter {
// remember where this field is written
currentField.tvfPointer = tvf.getFilePointer();
//System.out.println("Field Pointer: " + currentField.tvfPointer);
- final int size;
-
- tvf.writeVInt(size = terms.size());
- tvf.writeVInt(currentField.length - size);
+
+ final int size = terms.size();
+ tvf.writeVInt(size);
+
+ boolean storePositions = currentField.storePositions;
+ boolean storeOffsets = currentField.storeOffsets;
+ byte bits = 0x0;
+ if (storePositions)
+ bits |= STORE_POSITIONS_WITH_TERMVECTOR;
+ if (storeOffsets)
+ bits |= STORE_OFFSET_WITH_TERMVECTOR;
+ tvf.writeByte(bits);
+
String lastTermText = "";
- // write term ids and positions
for (int i = 0; i < size; i++) {
TVTerm term = (TVTerm) terms.elementAt(i);
- //tvf.writeString(term.termText);
int start = StringHelper.stringDifference(lastTermText, term.termText);
int length = term.termText.length() - start;
- tvf.writeVInt(start); // write shared prefix length
- tvf.writeVInt(length); // write delta length
+ tvf.writeVInt(start); // write shared prefix length
+ tvf.writeVInt(length); // write delta length
tvf.writeChars(term.termText, start, length); // write delta chars
tvf.writeVInt(term.freq);
lastTermText = term.termText;
+
+ if(storePositions){
+ if(term.positions == null)
+ throw new IllegalStateException("Trying to write positions that are null!");
+
+ // use delta encoding for positions
+ int position = 0;
+ for (int j = 0; j < term.freq; j++){
+ tvf.writeVInt(term.positions[j] - position);
+ position = term.positions[j];
+ }
+ }
+
+ if(storeOffsets){
+ if(term.offsets == null)
+ throw new IllegalStateException("Trying to write offsets that are null!");
+
+ // use delta encoding for offsets
+ int position = 0;
+ for (int j = 0; j < term.freq; j++) {
+ tvf.writeVInt(term.offsets[j].getStartOffset() - position);
+ tvf.writeVInt(term.offsets[j].getEndOffset() - term.offsets[j].getStartOffset()); //Save the diff between the two.
+ position = term.offsets[j].getEndOffset();
+ }
+ }
}
}
-
-
-
private void writeDoc() throws IOException {
- if (isFieldOpen()) throw new IllegalStateException("Field is still open while writing document");
+ if (isFieldOpen())
+ throw new IllegalStateException("Field is still open while writing document");
//System.out.println("Writing doc pointer: " + currentDocPointer);
// write document index record
tvx.writeLong(currentDocPointer);
// write document data record
- final int size;
+ final int size = fields.size();
// write the number of fields
- tvd.writeVInt(size = fields.size());
+ tvd.writeVInt(size);
// write field numbers
- int lastFieldNumber = 0;
for (int i = 0; i < size; i++) {
TVField field = (TVField) fields.elementAt(i);
- tvd.writeVInt(field.number - lastFieldNumber);
-
- lastFieldNumber = field.number;
+ tvd.writeVInt(field.number);
}
// write field pointers
@@ -293,7 +343,6 @@ final class TermVectorsWriter {
for (int i = 0; i < size; i++) {
TVField field = (TVField) fields.elementAt(i);
tvd.writeVLong(field.tvfPointer - lastFieldPointer);
-
lastFieldPointer = field.tvfPointer;
}
//System.out.println("After writing doc pointer: " + tvx.getFilePointer());
@@ -303,17 +352,20 @@ final class TermVectorsWriter {
private static class TVField {
int number;
long tvfPointer = 0;
- int length = 0; // number of distinct term positions
-
- TVField(int number) {
+ boolean storePositions = false;
+ boolean storeOffsets = false;
+ TVField(int number, boolean storePos, boolean storeOff) {
this.number = number;
+ storePositions = storePos;
+ storeOffsets = storeOff;
}
}
private static class TVTerm {
String termText;
int freq = 0;
- //int positions[] = null;
+ int positions[] = null;
+ TermVectorOffsetInfo [] offsets = null;
}
diff --git a/src/test/org/apache/lucene/index/DocHelper.java b/src/test/org/apache/lucene/index/DocHelper.java
index 2a8c3876b31..5f3c6daf956 100644
--- a/src/test/org/apache/lucene/index/DocHelper.java
+++ b/src/test/org/apache/lucene/index/DocHelper.java
@@ -34,7 +34,7 @@ class DocHelper {
//Fields will be lexicographically sorted. So, the order is: field, text, two
public static final int [] FIELD_2_FREQS = {3, 1, 1};
public static final String TEXT_FIELD_2_KEY = "textField2";
- public static Field textField2 = Field.Text(TEXT_FIELD_2_KEY, FIELD_2_TEXT, true);
+ public static Field textField2 = new Field(TEXT_FIELD_2_KEY, FIELD_2_TEXT, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
public static final String KEYWORD_TEXT = "Keyword";
public static final String KEYWORD_FIELD_KEY = "keyField";
@@ -135,7 +135,7 @@ class DocHelper {
Enumeration fields = doc.fields();
int result = 0;
while (fields.hasMoreElements()) {
- fields.nextElement();
+ String name = fields.nextElement().toString();
result++;
}
return result;
diff --git a/src/test/org/apache/lucene/index/TestSegmentMerger.java b/src/test/org/apache/lucene/index/TestSegmentMerger.java
index c4b43472ce3..12da850ce60 100644
--- a/src/test/org/apache/lucene/index/TestSegmentMerger.java
+++ b/src/test/org/apache/lucene/index/TestSegmentMerger.java
@@ -109,6 +109,7 @@ public class TestSegmentMerger extends TestCase {
int [] freqs = vector.getTermFrequencies();
assertTrue(freqs != null);
//System.out.println("Freqs size: " + freqs.length);
+ assertTrue(vector instanceof TermPositionVector == true);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
diff --git a/src/test/org/apache/lucene/index/TestSegmentReader.java b/src/test/org/apache/lucene/index/TestSegmentReader.java
index fce8e1cb823..507ac3cddab 100644
--- a/src/test/org/apache/lucene/index/TestSegmentReader.java
+++ b/src/test/org/apache/lucene/index/TestSegmentReader.java
@@ -178,7 +178,7 @@ public class TestSegmentReader extends TestCase {
}
- public void testTermVectors() {
+ public void testTermVectors() throws IOException {
TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
assertTrue(result != null);
String [] terms = result.getTerms();
diff --git a/src/test/org/apache/lucene/index/TestTermVectorsReader.java b/src/test/org/apache/lucene/index/TestTermVectorsReader.java
index c742eededfc..40fdd818f55 100644
--- a/src/test/org/apache/lucene/index/TestTermVectorsReader.java
+++ b/src/test/org/apache/lucene/index/TestTermVectorsReader.java
@@ -11,7 +11,11 @@ public class TestTermVectorsReader extends TestCase {
private TermVectorsWriter writer = null;
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
private String [] testFields = {"f1", "f2", "f3"};
+ private boolean [] testFieldsStorePos = {true, false, true, false};
+ private boolean [] testFieldsStoreOff = {true, false, false, true};
private String [] testTerms = {"this", "is", "a", "test"};
+ private int [][] positions = new int[testTerms.length][];
+ private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][];
private RAMDirectory dir = new RAMDirectory();
private String seg = "testSegment";
private FieldInfos fieldInfos = new FieldInfos();
@@ -22,9 +26,22 @@ public class TestTermVectorsReader extends TestCase {
protected void setUp() {
for (int i = 0; i < testFields.length; i++) {
- fieldInfos.add(testFields[i], true, true);
+ fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
}
+ for (int i = 0; i < testTerms.length; i++)
+ {
+ positions[i] = new int[3];
+ for (int j = 0; j < positions[i].length; j++) {
+ // positions are always sorted in increasing order
+ positions[i][j] = (int)(j * 10 + Math.random() * 10);
+ }
+ offsets[i] = new TermVectorOffsetInfo[3];
+ for (int j = 0; j < offsets[i].length; j++){
+ // offsets are always sorted in increasing order
+ offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
+ }
+ }
try {
Arrays.sort(testTerms);
for (int j = 0; j < 5; j++) {
@@ -34,7 +51,7 @@ public class TestTermVectorsReader extends TestCase {
for (int k = 0; k < testFields.length; k++) {
writer.openField(testFields[k]);
for (int i = 0; i < testTerms.length; i++) {
- writer.addTerm(testTerms[i], i);
+ writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
}
writer.closeField();
}
@@ -79,6 +96,103 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(false);
}
}
+
+ public void testPositionReader() {
+ try {
+ TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+ assertTrue(reader != null);
+ TermPositionVector vector;
+ String [] terms;
+ vector = (TermPositionVector)reader.get(0, testFields[0]);
+ assertTrue(vector != null);
+ terms = vector.getTerms();
+ assertTrue(terms != null);
+ assertTrue(terms.length == testTerms.length);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ assertTrue(term.equals(testTerms[i]));
+ int [] positions = vector.getTermPositions(i);
+ assertTrue(positions != null);
+ assertTrue(positions.length == this.positions[i].length);
+ for (int j = 0; j < positions.length; j++) {
+ int position = positions[j];
+ assertTrue(position == this.positions[i][j]);
+ }
+ TermVectorOffsetInfo [] offset = vector.getOffsets(i);
+ assertTrue(offset != null);
+ assertTrue(offset.length == this.offsets[i].length);
+ for (int j = 0; j < offset.length; j++) {
+ TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
+ assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
+ }
+ }
+
+ TermFreqVector freqVector = (TermFreqVector)reader.get(0, testFields[1]); //no pos, no offset
+ assertTrue(freqVector != null);
+ assertTrue(freqVector instanceof TermPositionVector == false);
+ terms = freqVector.getTerms();
+ assertTrue(terms != null);
+ assertTrue(terms.length == testTerms.length);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ assertTrue(term.equals(testTerms[i]));
+ }
+
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ catch (ClassCastException cce)
+ {
+ cce.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
+ public void testOffsetReader() {
+ try {
+ TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+ assertTrue(reader != null);
+ TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]);
+ assertTrue(vector != null);
+ String [] terms = vector.getTerms();
+ assertTrue(terms != null);
+ assertTrue(terms.length == testTerms.length);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ assertTrue(term.equals(testTerms[i]));
+ int [] positions = vector.getTermPositions(i);
+ assertTrue(positions != null);
+ assertTrue(positions.length == this.positions[i].length);
+ for (int j = 0; j < positions.length; j++) {
+ int position = positions[j];
+ assertTrue(position == this.positions[i][j]);
+ }
+ TermVectorOffsetInfo [] offset = vector.getOffsets(i);
+ assertTrue(offset != null);
+ assertTrue(offset.length == this.offsets[i].length);
+ for (int j = 0; j < offset.length; j++) {
+ TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
+ assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
+ }
+ }
+
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ assertTrue(false);
+ }
+ catch (ClassCastException cce)
+ {
+ cce.printStackTrace();
+ assertTrue(false);
+ }
+ }
+
/**
* Make sure exceptions and bad params are handled appropriately
@@ -89,9 +203,9 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(reader != null);
//Bad document number, good field number
TermFreqVector vector = reader.get(50, testFields[0]);
- assertTrue(vector == null);
+ assertTrue(false);
} catch (Exception e) {
- assertTrue(false);
+ assertTrue(true);
}
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
diff --git a/src/test/org/apache/lucene/index/TestTermVectorsWriter.java b/src/test/org/apache/lucene/index/TestTermVectorsWriter.java
index a36207444d0..b759c65f578 100644
--- a/src/test/org/apache/lucene/index/TestTermVectorsWriter.java
+++ b/src/test/org/apache/lucene/index/TestTermVectorsWriter.java
@@ -44,7 +44,7 @@ public class TestTermVectorsWriter extends TestCase {
for (int i = 0; i < testTerms.length; i++) {
positions[i] = new int[5];
for (int j = 0; j < positions[i].length; j++) {
- positions[i][j] = i * 100;
+ positions[i][j] = j * 10;
}
}
}
@@ -107,7 +107,7 @@ public class TestTermVectorsWriter extends TestCase {
}
}
- private void checkTermVector(TermVectorsReader reader, int docNum, String field) {
+ private void checkTermVector(TermVectorsReader reader, int docNum, String field) throws IOException {
TermFreqVector vector = reader.get(docNum, field);
assertTrue(vector != null);
String[] terms = vector.getTerms();
diff --git a/src/test/org/apache/lucene/search/TestTermVectors.java b/src/test/org/apache/lucene/search/TestTermVectors.java
index 0fcb35228da..106a36d311a 100644
--- a/src/test/org/apache/lucene/search/TestTermVectors.java
+++ b/src/test/org/apache/lucene/search/TestTermVectors.java
@@ -43,8 +43,23 @@ public class TestTermVectors extends TestCase {
//writer.infoStream = System.out;
for (int i = 0; i < 1000; i++) {
Document doc = new Document();
+ Field.TermVector termVector;
+ int mod3 = i % 3;
+ int mod2 = i % 2;
+ if (mod2 == 0 && mod3 == 0){
+ termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
+ }
+ else if (mod2 == 0){
+ termVector = Field.TermVector.WITH_POSITIONS;
+ }
+ else if (mod3 == 0){
+ termVector = Field.TermVector.WITH_OFFSETS;
+ }
+ else {
+ termVector = Field.TermVector.YES;
+ }
doc.add(new Field("field", English.intToEnglish(i),
- Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
+ Field.Store.YES, Field.Index.TOKENIZED, termVector));
writer.addDocument(doc);
}
writer.close();
@@ -70,17 +85,74 @@ public class TestTermVectors extends TestCase {
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
assertTrue(vector != null);
assertTrue(vector.length == 1);
- //assertTrue();
}
- TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(50));
- //System.out.println("Explain: " + searcher.explain(query, hits.id(50)));
- //System.out.println("Vector: " + vector[0].toString());
} catch (IOException e) {
assertTrue(false);
}
}
public void testTermPositionVectors() {
+ Query query = new TermQuery(new Term("field", "zero"));
+ try {
+ Hits hits = searcher.search(query);
+ assertEquals(1, hits.length());
+
+ for (int i = 0; i < hits.length(); i++)
+ {
+ TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
+ assertTrue(vector != null);
+ assertTrue(vector.length == 1);
+
+ boolean shouldBePosVector = (hits.id(i) % 2 == 0) ? true : false;
+ assertTrue((shouldBePosVector == false) || (shouldBePosVector == true && (vector[0] instanceof TermPositionVector == true)));
+
+ boolean shouldBeOffVector = (hits.id(i) % 3 == 0) ? true : false;
+ assertTrue((shouldBeOffVector == false) || (shouldBeOffVector == true && (vector[0] instanceof TermPositionVector == true)));
+
+ if(shouldBePosVector || shouldBeOffVector){
+ TermPositionVector posVec = (TermPositionVector)vector[0];
+ String [] terms = posVec.getTerms();
+ assertTrue(terms != null && terms.length > 0);
+
+ for (int j = 0; j < terms.length; j++) {
+ int [] positions = posVec.getTermPositions(j);
+ TermVectorOffsetInfo [] offsets = posVec.getOffsets(j);
+
+ if(shouldBePosVector){
+ assertTrue(positions != null);
+ assertTrue(positions.length > 0);
+ }
+ else
+ assertTrue(positions == null);
+
+ if(shouldBeOffVector){
+ assertTrue(offsets != null);
+ assertTrue(offsets.length > 0);
+ }
+ else
+ assertTrue(offsets == null);
+ }
+ }
+ else{
+ try{
+ TermPositionVector posVec = (TermPositionVector)vector[0];
+ assertTrue(false);
+ }
+ catch(ClassCastException ignore){
+ TermFreqVector freqVec = vector[0];
+ String [] terms = freqVec.getTerms();
+ assertTrue(terms != null && terms.length > 0);
+ }
+
+ }
+
+ }
+ } catch (IOException e) {
+ assertTrue(false);
+ }
+ }
+
+ public void testTermOffsetVectors() {
Query query = new TermQuery(new Term("field", "fifty"));
try {
Hits hits = searcher.search(query);
@@ -91,6 +163,7 @@ public class TestTermVectors extends TestCase {
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
assertTrue(vector != null);
assertTrue(vector.length == 1);
+
//assertTrue();
}
} catch (IOException e) {
@@ -164,7 +237,7 @@ public class TestTermVectors extends TestCase {
int [] freqs = vector.getTermFrequencies();
for (int i = 0; i < vTerms.length; i++)
{
- if (term.text().equals(vTerms[i]) == true)
+ if (term.text().equals(vTerms[i]))
{
assertTrue(freqs[i] == freq);
}
@@ -184,9 +257,9 @@ public class TestTermVectors extends TestCase {
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
- assertTrue(testDoc3.toString().equals(hits.doc(0).toString()));
- assertTrue(testDoc4.toString().equals(hits.doc(1).toString()));
- assertTrue(testDoc1.toString().equals(hits.doc(2).toString()));
+ assertTrue(hits.id(0) == 2);
+ assertTrue(hits.id(1) == 3);
+ assertTrue(hits.id(2) == 0);
TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field");
assertTrue(vector != null);
//System.out.println("Vector: " + vector);