From f1667be0fc27f733c6bf4ad0a96fa799278aa501 Mon Sep 17 00:00:00 2001 From: Christoph Goller Date: Tue, 5 Oct 2004 17:30:48 +0000 Subject: [PATCH] Grant's nw termvector patch (Bug #18927) applied with some modifications. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150566 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/document/Field.java | 126 +++++++++- .../apache/lucene/index/DocumentWriter.java | 64 ++++- .../org/apache/lucene/index/FieldInfo.java | 7 +- .../org/apache/lucene/index/FieldInfos.java | 80 +++++-- .../lucene/index/FilterIndexReader.java | 9 +- .../org/apache/lucene/index/IndexReader.java | 64 +++-- .../org/apache/lucene/index/MultiReader.java | 28 ++- .../apache/lucene/index/SegmentMerger.java | 32 +-- .../apache/lucene/index/SegmentReader.java | 67 +++++- .../index/SegmentTermPositionVector.java | 64 +++++ .../lucene/index/SegmentTermVector.java | 30 ++- .../apache/lucene/index/TermFreqVector.java | 16 ++ .../lucene/index/TermPositionVector.java | 35 ++- .../lucene/index/TermVectorOffsetInfo.java | 66 ++++++ .../lucene/index/TermVectorsReader.java | 224 +++++++++++------- .../lucene/index/TermVectorsWriter.java | 206 ++++++++++------ .../org/apache/lucene/index/DocHelper.java | 4 +- .../lucene/index/TestSegmentMerger.java | 1 + .../lucene/index/TestSegmentReader.java | 2 +- .../lucene/index/TestTermVectorsReader.java | 122 +++++++++- .../lucene/index/TestTermVectorsWriter.java | 4 +- .../apache/lucene/search/TestTermVectors.java | 91 ++++++- 22 files changed, 1046 insertions(+), 296 deletions(-) create mode 100644 src/java/org/apache/lucene/index/SegmentTermPositionVector.java create mode 100644 src/java/org/apache/lucene/index/TermVectorOffsetInfo.java diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java index 5757f7aa1a9..d95a6e6ac97 100644 --- a/src/java/org/apache/lucene/document/Field.java +++ b/src/java/org/apache/lucene/document/Field.java @@ -38,6 
+38,8 @@ public final class Field implements java.io.Serializable { private Object fieldsData = null; private boolean storeTermVector = false; + private boolean storeOffsetWithTermVector = false; + private boolean storePositionWithTermVector = false; private boolean isStored = false; private boolean isIndexed = true; private boolean isTokenized = true; @@ -55,16 +57,19 @@ public final class Field implements java.io.Serializable { public String toString() { return name; } + /** Store the original field value in the index in a compressed form. This is * useful for long documents and for binary valued fields. */ public static final Store COMPRESS = new Store("COMPRESS"); + /** Store the original field value in the index. This is useful for short texts * like a document's title which should be displayed with the results. The * value is stored in its original form, i.e. no analyzer is used before it is * stored. */ public static final Store YES = new Store("YES"); + /** Do not store the field value in the index. */ public static final Store NO = new Store("NO"); } @@ -100,15 +105,41 @@ public final class Field implements java.io.Serializable { private TermVector(String name) { this.name = name; } + public String toString() { return name; } + /** Do not store term vectors. */ public static final TermVector NO = new TermVector("NO"); + /** Store the term vectors of each document. A term vector is a list * of the document's terms and their number of occurences in that document. 
*/ public static final TermVector YES = new TermVector("YES"); + + /** + * Store the term vector + token position information + * + * @see #YES + */ + public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS"); + + /** + * Store the term vector + Token offset information + * + * @see #YES + */ + public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS"); + + /** + * Store the term vector + Token position and offset information + * + * @see #YES + * @see #WITH_POSITIONS + * @see #WITH_OFFSETS + */ + public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS"); } /** Sets the boost factor hits on this field. This value will be @@ -290,14 +321,18 @@ public final class Field implements java.io.Serializable { this.name = name.intern(); // field names are interned this.fieldsData = value; - if (store == Store.YES) + if (store == Store.YES){ this.isStored = true; + this.isCompressed = false; + } else if (store == Store.COMPRESS) { this.isStored = true; this.isCompressed = true; } - else if (store == Store.NO) + else if (store == Store.NO){ this.isStored = false; + this.isCompressed = false; + } else throw new IllegalArgumentException("unknown store parameter " + store); @@ -313,6 +348,8 @@ public final class Field implements java.io.Serializable { } else { throw new IllegalArgumentException("unknown index parameter " + index); } + + this.isBinary = false; setStoreTermVector(termVector); } @@ -343,11 +380,18 @@ public final class Field implements java.io.Serializable { throw new NullPointerException("name cannot be null"); if (reader == null) throw new NullPointerException("reader cannot be null"); + this.name = name.intern(); // field names are interned this.fieldsData = reader; + this.isStored = false; + this.isCompressed = false; + this.isIndexed = true; this.isTokenized = true; + + this.isBinary = false; + setStoreTermVector(termVector); } @@ -374,21 +418,29 @@ public final class Field 
implements java.io.Serializable { throw new IllegalArgumentException("name cannot be null"); if (value == null) throw new IllegalArgumentException("value cannot be null"); - if (store == Store.NO) - throw new IllegalArgumentException("binary values can't be unstored"); - if (store == Store.COMPRESS) - this.isCompressed = true; this.name = name.intern(); - //wrap the byte[] to a ByteBuffer object this.fieldsData = value; - this.isBinary = true; - this.isStored = true; + if (store == Store.YES){ + this.isStored = true; + this.isCompressed = false; + } + else if (store == Store.COMPRESS) { + this.isStored = true; + this.isCompressed = true; + } + else if (store == Store.NO) + throw new IllegalArgumentException("binary values can't be unstored"); + else + throw new IllegalArgumentException("unknown store parameter " + store); this.isIndexed = false; this.isTokenized = false; - this.storeTermVector = false; + + this.isBinary = true; + + setStoreTermVector(TermVector.NO); } /** @@ -422,9 +474,30 @@ public final class Field implements java.io.Serializable { private void setStoreTermVector(TermVector termVector) { if (termVector == TermVector.NO) { this.storeTermVector = false; - } else if (termVector == TermVector.YES) { + this.storePositionWithTermVector = false; + this.storeOffsetWithTermVector = false; + } + else if (termVector == TermVector.YES) { this.storeTermVector = true; - } else { + this.storePositionWithTermVector = false; + this.storeOffsetWithTermVector = false; + } + else if (termVector == TermVector.WITH_POSITIONS) { + this.storeTermVector = true; + this.storePositionWithTermVector = true; + this.storeOffsetWithTermVector = false; + } + else if (termVector == TermVector.WITH_OFFSETS) { + this.storeTermVector = true; + this.storePositionWithTermVector = false; + this.storeOffsetWithTermVector = true; + } + else if (termVector == TermVector.WITH_POSITIONS_OFFSETS) { + this.storeTermVector = true; + this.storePositionWithTermVector = true; + 
this.storeOffsetWithTermVector = true; + } + else { throw new IllegalArgumentException("unknown termVector parameter " + termVector); } } @@ -455,7 +528,24 @@ public final class Field implements java.io.Serializable { * @see IndexReader#getTermFreqVector(int, String) */ public final boolean isTermVectorStored() { return storeTermVector; } - + + /** + * True iff terms are stored as term vector together with their offsets + * (start and end position in source text). + * @return true iff offsets are stored with the term vector + */ + public boolean isStoreOffsetWithTermVector(){ + return storeOffsetWithTermVector; + } + + /** + * True iff terms are stored as term vector together with their token positions. + * @return true iff token positions are stored with the term vector + */ + public boolean isStorePositionWithTermVector(){ + return storePositionWithTermVector; + } + /** True iff the value of the filed is stored as binary */ public final boolean isBinary() { return isBinary; } @@ -479,6 +569,16 @@ public final class Field implements java.io.Serializable { result.append(","); result.append("termVector"); } + if (storeOffsetWithTermVector) { + if (result.length() > 0) + result.append(","); + result.append("termVectorOffsets"); + } + if (storePositionWithTermVector) { + if (result.length() > 0) + result.append(","); + result.append("termVectorPosition"); + } if (isBinary) { if (result.length() > 0) result.append(","); diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java index 9ec7a735c49..4952d227b69 100644 --- a/src/java/org/apache/lucene/index/DocumentWriter.java +++ b/src/java/org/apache/lucene/index/DocumentWriter.java @@ -74,6 +74,7 @@ final class DocumentWriter { postingTable.clear(); // clear postingTable fieldLengths = new int[fieldInfos.size()]; // init fieldLengths fieldPositions = new int[fieldInfos.size()]; // init fieldPositions + fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts Arrays.fill(fieldBoosts, 
doc.getBoost()); @@ -100,7 +101,7 @@ final class DocumentWriter { writePostings(postings, segment); // write norms of indexed fields - writeNorms(doc, segment); + writeNorms(segment); } @@ -109,6 +110,7 @@ final class DocumentWriter { private final Hashtable postingTable = new Hashtable(); private int[] fieldLengths; private int[] fieldPositions; + private int[] fieldOffsets; private float[] fieldBoosts; // Tokenizes the fields of a document into Postings. @@ -122,12 +124,19 @@ final class DocumentWriter { int length = fieldLengths[fieldNumber]; // length of field int position = fieldPositions[fieldNumber]; // position in field + int offset = fieldOffsets[fieldNumber]; // offset field if (field.isIndexed()) { if (!field.isTokenized()) { // un-tokenized field - addPosition(fieldName, field.stringValue(), position++); + String stringValue = field.stringValue(); + if(field.isStoreOffsetWithTermVector()) + addPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.length())); + else + addPosition(fieldName, stringValue, position++, null); + offset += stringValue.length(); length++; - } else { + } else + { Reader reader; // find or make Reader if (field.readerValue() != null) reader = field.readerValue(); @@ -140,11 +149,23 @@ final class DocumentWriter { // Tokenize field and add to postingTable TokenStream stream = analyzer.tokenStream(fieldName, reader); try { + Token lastToken = null; for (Token t = stream.next(); t != null; t = stream.next()) { position += (t.getPositionIncrement() - 1); - addPosition(fieldName, t.termText(), position++); - if (++length > maxFieldLength) break; + + if(field.isStoreOffsetWithTermVector()) + addPosition(fieldName, t.termText(), position++, new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset())); + else + addPosition(fieldName, t.termText(), position++, null); + + lastToken = t; + if (++length > maxFieldLength) + break; } + + if(lastToken != null) + offset += 
lastToken.endOffset() + 1; + } finally { stream.close(); } @@ -153,14 +174,16 @@ final class DocumentWriter { fieldLengths[fieldNumber] = length; // save field length fieldPositions[fieldNumber] = position; // save field position fieldBoosts[fieldNumber] *= field.getBoost(); + fieldOffsets[fieldNumber] = offset; } } } private final Term termBuffer = new Term("", ""); // avoid consing - private final void addPosition(String field, String text, int position) { + private final void addPosition(String field, String text, int position, TermVectorOffsetInfo offset) { termBuffer.set(field, text); + //System.out.println("Offset: " + offset); Posting ti = (Posting) postingTable.get(termBuffer); if (ti != null) { // word seen before int freq = ti.freq; @@ -172,10 +195,23 @@ final class DocumentWriter { ti.positions = newPositions; } ti.positions[freq] = position; // add new position + + if (offset != null) { + if (ti.offsets.length == freq){ + TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2]; + TermVectorOffsetInfo [] offsets = ti.offsets; + for (int i = 0; i < freq; i++) + { + newOffsets[i] = offsets[i]; + } + ti.offsets = newOffsets; + } + ti.offsets[freq] = offset; + } ti.freq = freq + 1; // update frequency } else { // word not seen before Term term = new Term(field, text, false); - postingTable.put(term, new Posting(term, position)); + postingTable.put(term, new Posting(term, position, offset)); } } @@ -294,12 +330,13 @@ final class DocumentWriter { termVectorWriter.openDocument(); } termVectorWriter.openField(currentField); + } else if (termVectorWriter != null) { termVectorWriter.closeField(); } } if (termVectorWriter != null && termVectorWriter.isFieldOpen()) { - termVectorWriter.addTerm(posting.term.text(), postingFreq); + termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets); } } if (termVectorWriter != null) @@ -316,7 +353,7 @@ final class DocumentWriter { } } - private final void writeNorms(Document 
doc, String segment) throws IOException { + private final void writeNorms(String segment) throws IOException { for(int n = 0; n < fieldInfos.size(); n++){ FieldInfo fi = fieldInfos.fieldInfo(n); if(fi.isIndexed){ @@ -336,11 +373,18 @@ final class Posting { // info about a Term in a doc Term term; // the Term int freq; // its frequency in doc int[] positions; // positions it occurs at + TermVectorOffsetInfo [] offsets; - Posting(Term t, int position) { + Posting(Term t, int position, TermVectorOffsetInfo offset) { term = t; freq = 1; positions = new int[1]; positions[0] = position; + if(offset != null){ + offsets = new TermVectorOffsetInfo[1]; + offsets[0] = offset; + } + else + offsets = null; } } diff --git a/src/java/org/apache/lucene/index/FieldInfo.java b/src/java/org/apache/lucene/index/FieldInfo.java index 47102c9be69..2b575fbb1ce 100644 --- a/src/java/org/apache/lucene/index/FieldInfo.java +++ b/src/java/org/apache/lucene/index/FieldInfo.java @@ -23,11 +23,16 @@ final class FieldInfo { // true if term vector for this field should be stored boolean storeTermVector; + boolean storeOffsetWithTermVector; + boolean storePositionWithTermVector; - FieldInfo(String na, boolean tk, int nu, boolean storeTermVector) { + FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) { name = na; isIndexed = tk; number = nu; this.storeTermVector = storeTermVector; + this.storeOffsetWithTermVector = storeOffsetWithTermVector; + this.storePositionWithTermVector = storePositionWithTermVector; } } diff --git a/src/java/org/apache/lucene/index/FieldInfos.java b/src/java/org/apache/lucene/index/FieldInfos.java index 35bdee8ca36..b20b8d56663 100644 --- a/src/java/org/apache/lucene/index/FieldInfos.java +++ b/src/java/org/apache/lucene/index/FieldInfos.java @@ -33,6 +33,12 @@ import org.apache.lucene.store.IndexInput; * accessing this object. 
*/ final class FieldInfos { + + static final byte IS_INDEXED = 0x1; + static final byte STORE_TERMVECTOR = 0x2; + static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4; + static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8; + private ArrayList byNumber = new ArrayList(); private HashMap byName = new HashMap(); @@ -61,23 +67,30 @@ final class FieldInfos { Enumeration fields = doc.fields(); while (fields.hasMoreElements()) { Field field = (Field) fields.nextElement(); - add(field.name(), field.isIndexed(), field.isTermVectorStored()); + add(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), + field.isStoreOffsetWithTermVector()); } } - + /** + * Add fields that are indexed. Whether they have termvectors has to be specified. + * * @param names The names of the fields * @param storeTermVectors Whether the fields store term vectors or not + * @param storePositionWithTermVector true if positions should be stored + * @param storeOffsetWithTermVector true if offsets should be stored */ - public void addIndexed(Collection names, boolean storeTermVectors) { + public void addIndexed(Collection names, boolean storeTermVectors, boolean storePositionWithTermVector, + boolean storeOffsetWithTermVector) { Iterator i = names.iterator(); while (i.hasNext()) { - add((String)i.next(), true, storeTermVectors); + add((String)i.next(), true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector); } } /** - * Assumes the field is not storing term vectors + * Assumes the fields are not storing term vectors. + * * @param names The names of the fields * @param isIndexed Whether the fields are indexed or not * @@ -91,28 +104,43 @@ final class FieldInfos { } /** - * Calls three parameter add with false for the storeTermVector parameter + * Calls 5 parameter add with false for all TermVector parameters. 
+ * * @param name The name of the Field * @param isIndexed true if the field is indexed - * @see #add(String, boolean, boolean) + * @see #add(String, boolean, boolean, boolean, boolean) */ public void add(String name, boolean isIndexed) { - add(name, isIndexed, false); + add(name, isIndexed, false, false, false); } - + /** + * Calls 5 parameter add with false for term vector positions and offsets. + * + * @param name The name of the field + * @param isIndexed true if the field is indexed + * @param storeTermVector true if the term vector should be stored + */ + public void add(String name, boolean isIndexed, boolean storeTermVector){ + add(name, isIndexed, storeTermVector, false, false); + } + /** If the field is not yet known, adds it. If it is known, checks to make * sure that the isIndexed flag is the same as was given previously for this - * field. If not - marks it as being indexed. Same goes for storeTermVector + * field. If not - marks it as being indexed. Same goes for the TermVector + * parameters. 
* * @param name The name of the field * @param isIndexed true if the field is indexed * @param storeTermVector true if the term vector should be stored + * @param storePositionWithTermVector true if the term vector with positions should be stored + * @param storeOffsetWithTermVector true if the term vector with offsets should be stored */ - public void add(String name, boolean isIndexed, boolean storeTermVector) { + public void add(String name, boolean isIndexed, boolean storeTermVector, + boolean storePositionWithTermVector, boolean storeOffsetWithTermVector) { FieldInfo fi = fieldInfo(name); if (fi == null) { - addInternal(name, isIndexed, storeTermVector); + addInternal(name, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector); } else { if (fi.isIndexed != isIndexed) { fi.isIndexed = true; // once indexed, always index @@ -120,13 +148,21 @@ final class FieldInfos { if (fi.storeTermVector != storeTermVector) { fi.storeTermVector = true; // once vector, always vector } + if (fi.storePositionWithTermVector != storePositionWithTermVector) { + fi.storePositionWithTermVector = true; // once vector, always vector + } + if (fi.storeOffsetWithTermVector != storeOffsetWithTermVector) { + fi.storeOffsetWithTermVector = true; // once vector, always vector + } } } private void addInternal(String name, boolean isIndexed, - boolean storeTermVector) { + boolean storeTermVector, boolean storePositionWithTermVector, + boolean storeOffsetWithTermVector) { FieldInfo fi = - new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector); + new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector, storePositionWithTermVector, + storeOffsetWithTermVector); byNumber.add(fi); byName.put(name, fi); } @@ -180,11 +216,11 @@ final class FieldInfos { for (int i = 0; i < size(); i++) { FieldInfo fi = fieldInfo(i); byte bits = 0x0; - if (fi.isIndexed) bits |= 0x1; - if (fi.storeTermVector) bits |= 0x2; + if (fi.isIndexed) bits |= IS_INDEXED; + if 
(fi.storeTermVector) bits |= STORE_TERMVECTOR; + if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR; + if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR; output.writeString(fi.name); - //Was REMOVE - //output.writeByte((byte)(fi.isIndexed ? 1 : 0)); output.writeByte(bits); } } @@ -194,9 +230,11 @@ final class FieldInfos { for (int i = 0; i < size; i++) { String name = input.readString().intern(); byte bits = input.readByte(); - boolean isIndexed = (bits & 0x1) != 0; - boolean storeTermVector = (bits & 0x2) != 0; - addInternal(name, isIndexed, storeTermVector); + boolean isIndexed = (bits & IS_INDEXED) != 0; + boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; + boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; + boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; + addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector); } } diff --git a/src/java/org/apache/lucene/index/FilterIndexReader.java b/src/java/org/apache/lucene/index/FilterIndexReader.java index 61c5de86595..797e00afd1a 100644 --- a/src/java/org/apache/lucene/index/FilterIndexReader.java +++ b/src/java/org/apache/lucene/index/FilterIndexReader.java @@ -16,11 +16,12 @@ package org.apache.lucene.index; * limitations under the License. */ +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + import java.io.IOException; import java.util.Collection; -import org.apache.lucene.document.Document; - /** A FilterIndexReader contains another IndexReader, which it * uses as its basic source of data, possibly transforming the data along the * way or providing additional functionality. 
The class @@ -146,4 +147,8 @@ public class FilterIndexReader extends IndexReader { public Collection getIndexedFieldNames(boolean storedTermVector) { return in.getIndexedFieldNames(storedTermVector); } + + public Collection getIndexedFieldNames (Field.TermVector tvSpec){ + return in.getIndexedFieldNames(tvSpec); + } } diff --git a/src/java/org/apache/lucene/index/IndexReader.java b/src/java/org/apache/lucene/index/IndexReader.java index dd617290343..1ac28b392ef 100644 --- a/src/java/org/apache/lucene/index/IndexReader.java +++ b/src/java/org/apache/lucene/index/IndexReader.java @@ -16,16 +16,16 @@ package org.apache.lucene.index; * limitations under the License. */ -import java.io.IOException; -import java.io.File; -import java.util.Collection; - +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.search.Similarity; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Lock; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; // for javadoc -import org.apache.lucene.search.Similarity; + +import java.io.File; +import java.io.IOException; +import java.util.Collection; /** IndexReader is an abstract class, providing an interface for accessing an index. Search of an index is done entirely through this abstract interface, @@ -209,23 +209,37 @@ public abstract class IndexReader { return SegmentInfos.readCurrentVersion(directory); } - /** Return an array of term frequency vectors for the specified document. + /** + * Return an array of term frequency vectors for the specified document. * The array contains a vector for each vectorized field in the document. - * Each vector contains terms and frequencies for all terms - * in a given vectorized field. - * If no such fields existed, the method returns null. 
- * - * @see Field#isTermVectorStored() + * Each vector contains terms and frequencies for all terms in a given vectorized field. + * If no such fields existed, the method returns null. The term vectors that are + * returned may either be of type TermFreqVector or of type TermPositionVector if + * positions or offsets have been stored. + * + * @param docNumber document for which term frequency vectors are returned + * @return array of term frequency vectors. May be null if no term vectors have been + * stored for the specified document. + * @throws IOException if index cannot be accessed + * @see Field.TermVector */ abstract public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException; - /** Return a term frequency vector for the specified document and field. The - * vector returned contains terms and frequencies for those terms in - * the specified field of this document, if the field had storeTermVector - * flag set. If the flag was not set, the method returns null. - * - * @see Field#isTermVectorStored() + + /** + * Return a term frequency vector for the specified document and field. The + * returned vector contains terms and frequencies for the terms in + * the specified field of this document, if the field had the storeTermVector + * flag set. If termvectors had been stored with positions or offsets, a + * TermPositionVector is returned. + * + * @param docNumber document for which the term frequency vector is returned + * @param field field for which the term frequency vector is returned. + * @return term frequency vector. May be null if field does not exist in the specified + * document or term vector was not stored. 
+ * @throws IOException if index cannot be accessed + * @see Field.TermVector */ abstract public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException; @@ -547,8 +561,19 @@ public abstract class IndexReader { * @param storedTermVector if true, returns only Indexed fields that have term vector info, * else only indexed fields without term vector info * @return Collection of Strings indicating the names of the fields + * + * @deprecated Replaced by {@link #getIndexedFieldNames(Field.TermVector)} */ public abstract Collection getIndexedFieldNames(boolean storedTermVector); + + /** + * Get a list of unique field names that exist in this index, are indexed, and have + * the specified term vector information. + * + * @param tvSpec specifies which term vector information should be available for the fields + * @return Collection of Strings indicating the names of the fields + */ + public abstract Collection getIndexedFieldNames(Field.TermVector tvSpec); /** * Returns true iff the index in the named directory is @@ -560,7 +585,6 @@ public abstract class IndexReader { return directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked() || directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).isLocked(); - } /** diff --git a/src/java/org/apache/lucene/index/MultiReader.java b/src/java/org/apache/lucene/index/MultiReader.java index ffa438854d8..09886ccb539 100644 --- a/src/java/org/apache/lucene/index/MultiReader.java +++ b/src/java/org/apache/lucene/index/MultiReader.java @@ -16,16 +16,13 @@ package org.apache.lucene.index; * limitations under the License. 
*/ -import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; -import java.util.Hashtable; -import java.util.Iterator; -import java.util.Set; - import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.store.Directory; +import java.io.IOException; +import java.util.*; + /** An IndexReader which reads multiple indexes, appending their content. * * @version $Id$ @@ -219,11 +216,7 @@ public class MultiReader extends IndexReader { for (int i = 0; i < subReaders.length; i++) { IndexReader reader = subReaders[i]; Collection names = reader.getFieldNames(); - // iterate through the field names and add them to the set - for (Iterator iterator = names.iterator(); iterator.hasNext();) { - String s = (String) iterator.next(); - fieldSet.add(s); - } + fieldSet.addAll(names); } return fieldSet; } @@ -253,6 +246,17 @@ public class MultiReader extends IndexReader { return fieldSet; } + public Collection getIndexedFieldNames (Field.TermVector tvSpec){ + // maintain a unique set of field names + Set fieldSet = new HashSet(); + for (int i = 0; i < subReaders.length; i++) { + IndexReader reader = subReaders[i]; + Collection names = reader.getIndexedFieldNames(tvSpec); + fieldSet.addAll(names); + } + return fieldSet; + } + } class MultiTermEnum extends TermEnum { diff --git a/src/java/org/apache/lucene/index/SegmentMerger.java b/src/java/org/apache/lucene/index/SegmentMerger.java index b97144c8bb0..b9402977761 100644 --- a/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/src/java/org/apache/lucene/index/SegmentMerger.java @@ -20,6 +20,7 @@ import java.util.Vector; import java.util.Iterator; import java.io.IOException; +import org.apache.lucene.document.Field; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; @@ -157,8 +158,11 @@ final class SegmentMerger { int docCount = 0; for (int i = 0; i < readers.size(); i++) { 
IndexReader reader = (IndexReader) readers.elementAt(i); - fieldInfos.addIndexed(reader.getIndexedFieldNames(true), true); - fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false); + fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_POSITIONS_OFFSETS), true, true, true); + fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_POSITIONS), true, true, false); + fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.WITH_OFFSETS), true, false, true); + fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.YES), true, false, false); + fieldInfos.addIndexed(reader.getIndexedFieldNames(Field.TermVector.NO), false, false, false); fieldInfos.add(reader.getFieldNames(false), false); } fieldInfos.write(directory, segment + ".fnm"); @@ -195,29 +199,9 @@ final class SegmentMerger { int maxDoc = reader.maxDoc(); for (int docNum = 0; docNum < maxDoc; docNum++) { // skip deleted docs - if (reader.isDeleted(docNum)) { + if (reader.isDeleted(docNum)) continue; - } - termVectorsWriter.openDocument(); - - // get all term vectors - TermFreqVector[] sourceTermVector = - reader.getTermFreqVectors(docNum); - - if (sourceTermVector != null) { - for (int f = 0; f < sourceTermVector.length; f++) { - // translate field numbers - TermFreqVector termVector = sourceTermVector[f]; - termVectorsWriter.openField(termVector.getField()); - String [] terms = termVector.getTerms(); - int [] freqs = termVector.getTermFrequencies(); - - for (int t = 0; t < terms.length; t++) { - termVectorsWriter.addTerm(terms[t], freqs[t]); - } - } - termVectorsWriter.closeDocument(); - } + termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum)); } } } finally { diff --git a/src/java/org/apache/lucene/index/SegmentReader.java b/src/java/org/apache/lucene/index/SegmentReader.java index 3dd7fd2956c..9cfcdfad38c 100644 --- a/src/java/org/apache/lucene/index/SegmentReader.java +++ b/src/java/org/apache/lucene/index/SegmentReader.java 
@@ -25,6 +25,7 @@ import java.util.Set; import java.util.Vector; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Directory; @@ -191,7 +192,9 @@ class SegmentReader extends IndexReader { proxStream.close(); closeNorms(); - if (termVectorsReader != null) termVectorsReader.close(); + + if (termVectorsReader != null) + termVectorsReader.close(); if (cfsReader != null) cfsReader.close(); @@ -342,16 +345,63 @@ class SegmentReader extends IndexReader { * @return Collection of Strings indicating the names of the fields */ public Collection getIndexedFieldNames(boolean storedTermVector) { + if(storedTermVector){ + Set fieldSet = new HashSet(); + fieldSet.addAll(getIndexedFieldNames(Field.TermVector.YES)); + fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_POSITIONS)); + fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_OFFSETS)); + fieldSet.addAll(getIndexedFieldNames(Field.TermVector.WITH_POSITIONS_OFFSETS)); + return fieldSet; + } + else + return getIndexedFieldNames(Field.TermVector.NO); + } + + public Collection getIndexedFieldNames (Field.TermVector tvSpec){ + boolean storedTermVector; + boolean storePositionWithTermVector; + boolean storeOffsetWithTermVector; + + if(tvSpec == Field.TermVector.NO){ + storedTermVector = false; + storePositionWithTermVector = false; + storeOffsetWithTermVector = false; + } + else if(tvSpec == Field.TermVector.YES){ + storedTermVector = true; + storePositionWithTermVector = false; + storeOffsetWithTermVector = false; + } + else if(tvSpec == Field.TermVector.WITH_POSITIONS){ + storedTermVector = true; + storePositionWithTermVector = true; + storeOffsetWithTermVector = false; + } + else if(tvSpec == Field.TermVector.WITH_OFFSETS){ + storedTermVector = true; + storePositionWithTermVector = false; + storeOffsetWithTermVector = true; + } + else if(tvSpec == 
Field.TermVector.WITH_POSITIONS_OFFSETS){ + storedTermVector = true; + storePositionWithTermVector = true; + storeOffsetWithTermVector = true; + } + else{ + throw new IllegalArgumentException("unknown termVector parameter " + tvSpec); + } + // maintain a unique set of field names Set fieldSet = new HashSet(); for (int i = 0; i < fieldInfos.size(); i++) { FieldInfo fi = fieldInfos.fieldInfo(i); - if (fi.isIndexed == true && fi.storeTermVector == storedTermVector){ + if (fi.isIndexed && fi.storeTermVector == storedTermVector && + fi.storePositionWithTermVector == storePositionWithTermVector && + fi.storeOffsetWithTermVector == storeOffsetWithTermVector){ fieldSet.add(fi.name); } } - return fieldSet; - + return fieldSet; } public synchronized byte[] norms(String field) throws IOException { @@ -429,11 +479,13 @@ class SegmentReader extends IndexReader { * vector returned contains term numbers and frequencies for all terms in * the specified field of this document, if the field had storeTermVector * flag set. If the flag was not set, the method returns null. + * @throws IOException */ - public TermFreqVector getTermFreqVector(int docNumber, String field) { + public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException { // Check if this field is invalid or has no stored term vector FieldInfo fi = fieldInfos.fieldInfo(field); - if (fi == null || !fi.storeTermVector) return null; + if (fi == null || !fi.storeTermVector || termVectorsReader == null) + return null; return termVectorsReader.get(docNumber, field); } @@ -444,8 +496,9 @@ class SegmentReader extends IndexReader { * Each vector vector contains term numbers and frequencies for all terms * in a given vectorized field. * If no such fields existed, the method returns null. 
+ * @throws IOException */ - public TermFreqVector[] getTermFreqVectors(int docNumber) { + public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException { if (termVectorsReader == null) return null; diff --git a/src/java/org/apache/lucene/index/SegmentTermPositionVector.java b/src/java/org/apache/lucene/index/SegmentTermPositionVector.java new file mode 100644 index 00000000000..3f916fb0892 --- /dev/null +++ b/src/java/org/apache/lucene/index/SegmentTermPositionVector.java @@ -0,0 +1,64 @@ +package org.apache.lucene.index; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class SegmentTermPositionVector extends SegmentTermVector implements TermPositionVector { + protected int[][] positions; + protected TermVectorOffsetInfo[][] offsets; + public static final int[] EMPTY_TERM_POS = new int[0]; + + public SegmentTermPositionVector(String field, String terms[], int termFreqs[], int[][] positions, TermVectorOffsetInfo[][] offsets) { + super(field, terms, termFreqs); + this.offsets = offsets; + this.positions = positions; + } + + /** + * Returns an array of TermVectorOffsetInfo in which the term is found. 
+ * + * @param index The position in the array to get the offsets from + * @return An array of TermVectorOffsetInfo objects or the empty list + * @see org.apache.lucene.analysis.Token + */ + public TermVectorOffsetInfo[] getOffsets(int index) { + TermVectorOffsetInfo[] result = TermVectorOffsetInfo.EMPTY_OFFSET_INFO; + if(offsets == null) + return null; + if (index >=0 && index < offsets.length) + { + result = offsets[index]; + } + return result; + } + + /** + * Returns an array of positions in which the term is found. + * Terms are identified by the index at which its number appears in the + * term String array obtained from the indexOf method. + */ + public int[] getTermPositions(int index) { + int[] result = EMPTY_TERM_POS; + if(positions == null) + return null; + if (index >=0 && index < positions.length) + { + result = positions[index]; + } + + return result; + } +} \ No newline at end of file diff --git a/src/java/org/apache/lucene/index/SegmentTermVector.java b/src/java/org/apache/lucene/index/SegmentTermVector.java index 81d7237a3dd..c70560e1e72 100644 --- a/src/java/org/apache/lucene/index/SegmentTermVector.java +++ b/src/java/org/apache/lucene/index/SegmentTermVector.java @@ -1,4 +1,21 @@ package org.apache.lucene.index; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + import java.util.*; /** @@ -26,11 +43,14 @@ class SegmentTermVector implements TermFreqVector { StringBuffer sb = new StringBuffer(); sb.append('{'); sb.append(field).append(": "); - for (int i=0; i0) sb.append(", "); - sb.append(terms[i]).append('/').append(termFreqs[i]); + if(terms != null){ + for (int i=0; i0) sb.append(", "); + sb.append(terms[i]).append('/').append(termFreqs[i]); + } } sb.append('}'); + return sb.toString(); } @@ -47,6 +67,8 @@ class SegmentTermVector implements TermFreqVector { } public int indexOf(String termText) { + if(terms == null) + return -1; int res = Arrays.binarySearch(terms, termText); return res >= 0 ? res : -1; } @@ -60,7 +82,7 @@ class SegmentTermVector implements TermFreqVector { int res[] = new int[len]; for (int i=0; i < len; i++) { - res[i] = indexOf(termNumbers[i]); + res[i] = indexOf(termNumbers[start+ i]); } return res; } diff --git a/src/java/org/apache/lucene/index/TermFreqVector.java b/src/java/org/apache/lucene/index/TermFreqVector.java index 89565848b80..a919e5140de 100644 --- a/src/java/org/apache/lucene/index/TermFreqVector.java +++ b/src/java/org/apache/lucene/index/TermFreqVector.java @@ -1,5 +1,21 @@ package org.apache.lucene.index; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /** Provides access to stored term vector of * a document field. 
*/ diff --git a/src/java/org/apache/lucene/index/TermPositionVector.java b/src/java/org/apache/lucene/index/TermPositionVector.java index 9ec2cd5dafe..1ce31b9217c 100644 --- a/src/java/org/apache/lucene/index/TermPositionVector.java +++ b/src/java/org/apache/lucene/index/TermPositionVector.java @@ -1,13 +1,42 @@ package org.apache.lucene.index; +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /** Extends TermFreqVector to provide additional information about - * positions in which each of the terms is found. + * positions in which each of the terms is found. A TermPositionVector not necessarily + * contains both positions and offsets, but at least one of these arrays exists. */ public interface TermPositionVector extends TermFreqVector { - + /** Returns an array of positions in which the term is found. * Terms are identified by the index at which its number appears in the - * term number array obtained from getTermNumbers method. + * term String array obtained from the indexOf method. + * May return null if positions have not been stored. */ public int[] getTermPositions(int index); + + /** + * Returns an array of TermVectorOffsetInfo in which the term is found. + * May return null if offsets have not been stored. 
+ * + * @see org.apache.lucene.analysis.Token + * + * @param index The position in the array to get the offsets from + * @return An array of TermVectorOffsetInfo objects or the empty list + */ + public TermVectorOffsetInfo [] getOffsets(int index); } \ No newline at end of file diff --git a/src/java/org/apache/lucene/index/TermVectorOffsetInfo.java b/src/java/org/apache/lucene/index/TermVectorOffsetInfo.java new file mode 100644 index 00000000000..1008351540a --- /dev/null +++ b/src/java/org/apache/lucene/index/TermVectorOffsetInfo.java @@ -0,0 +1,66 @@ +package org.apache.lucene.index; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class TermVectorOffsetInfo { + public static final TermVectorOffsetInfo [] EMPTY_OFFSET_INFO = new TermVectorOffsetInfo[0]; + private int startOffset; + private int endOffset; + + public TermVectorOffsetInfo() { + } + + public TermVectorOffsetInfo(int startOffset, int endOffset) { + this.endOffset = endOffset; + this.startOffset = startOffset; + } + + public int getEndOffset() { + return endOffset; + } + + public void setEndOffset(int endOffset) { + this.endOffset = endOffset; + } + + public int getStartOffset() { + return startOffset; + } + + public void setStartOffset(int startOffset) { + this.startOffset = startOffset; + } + + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof TermVectorOffsetInfo)) return false; + + final TermVectorOffsetInfo termVectorOffsetInfo = (TermVectorOffsetInfo) o; + + if (endOffset != termVectorOffsetInfo.endOffset) return false; + if (startOffset != termVectorOffsetInfo.startOffset) return false; + + return true; + } + + public int hashCode() { + int result; + result = startOffset; + result = 29 * result + endOffset; + return result; + } +} diff --git a/src/java/org/apache/lucene/index/TermVectorsReader.java b/src/java/org/apache/lucene/index/TermVectorsReader.java index 56c288caff6..d6852e9b8c4 100644 --- a/src/java/org/apache/lucene/index/TermVectorsReader.java +++ b/src/java/org/apache/lucene/index/TermVectorsReader.java @@ -33,6 +33,9 @@ class TermVectorsReader { private IndexInput tvd; private IndexInput tvf; private int size; + + private int tvdFormat; + private int tvfFormat; TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos) throws IOException { @@ -40,16 +43,16 @@ class TermVectorsReader { tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION); checkValidFormat(tvx); tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION); - checkValidFormat(tvd); + tvdFormat = checkValidFormat(tvd); tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION); - 
checkValidFormat(tvf); + tvfFormat = checkValidFormat(tvf); size = (int) tvx.length() / 8; } this.fieldInfos = fieldInfos; } - private void checkValidFormat(IndexInput in) throws IOException + private int checkValidFormat(IndexInput in) throws IOException { int format = in.readInt(); if (format > TermVectorsWriter.FORMAT_VERSION) @@ -57,7 +60,7 @@ class TermVectorsReader { throw new IOException("Incompatible format version: " + format + " expected " + TermVectorsWriter.FORMAT_VERSION + " or less"); } - + return format; } void close() throws IOException { @@ -82,100 +85,101 @@ class TermVectorsReader { * Retrieve the term vector for the given document and field * @param docNum The document number to retrieve the vector for * @param field The field within the document to retrieve - * @return The TermFreqVector for the document and field or null + * @return The TermFreqVector for the document and field or null if there is no termVector for this field. + * @throws IOException */ - synchronized TermFreqVector get(int docNum, String field) { + synchronized TermFreqVector get(int docNum, String field) throws IOException { // Check if no term vectors are available for this segment at all int fieldNumber = fieldInfos.fieldNumber(field); TermFreqVector result = null; if (tvx != null) { - try { - //We need to account for the FORMAT_SIZE at when seeking in the tvx - //We don't need to do this in other seeks because we already have the file pointer - //that was written in another file - tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE); - //System.out.println("TVX Pointer: " + tvx.getFilePointer()); - long position = tvx.readLong(); + //We need to account for the FORMAT_SIZE at when seeking in the tvx + //We don't need to do this in other seeks because we already have the + // file pointer + //that was written in another file + tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE); + //System.out.println("TVX Pointer: " + tvx.getFilePointer()); + long position = 
tvx.readLong(); - tvd.seek(position); - int fieldCount = tvd.readVInt(); - //System.out.println("Num Fields: " + fieldCount); - // There are only a few fields per document. We opt for a full scan - // rather then requiring that they be ordered. We need to read through - // all of the fields anyway to get to the tvf pointers. - int number = 0; - int found = -1; - for (int i = 0; i < fieldCount; i++) { + tvd.seek(position); + int fieldCount = tvd.readVInt(); + //System.out.println("Num Fields: " + fieldCount); + // There are only a few fields per document. We opt for a full scan + // rather then requiring that they be ordered. We need to read through + // all of the fields anyway to get to the tvf pointers. + int number = 0; + int found = -1; + for (int i = 0; i < fieldCount; i++) { + if(tvdFormat == TermVectorsWriter.FORMAT_VERSION) + number = tvd.readVInt(); + else number += tvd.readVInt(); - if (number == fieldNumber) found = i; - } - - // This field, although valid in the segment, was not found in this document - if (found != -1) { - // Compute position in the tvf file - position = 0; - for (int i = 0; i <= found; i++) - { - position += tvd.readVLong(); - } - result = readTermVector(field, position); - } - else { - //System.out.println("Field not found"); - } - - } catch (Exception e) { - //e.printStackTrace(); + + if (number == fieldNumber) + found = i; } - } - else - { - System.out.println("No tvx file"); + + // This field, although valid in the segment, was not found in this + // document + if (found != -1) { + // Compute position in the tvf file + position = 0; + for (int i = 0; i <= found; i++) + position += tvd.readVLong(); + + result = readTermVector(field, position); + } else { + //System.out.println("Field not found"); + } + } else { + //System.out.println("No tvx file"); } return result; } - /** Return all term vectors stored for this document or null if the could not be read in. 
*/ - synchronized TermFreqVector[] get(int docNum) { + /** + * Return all term vectors stored for this document or null if there are no term vectors + * for the document. + * @throws IOException + */ + synchronized TermFreqVector[] get(int docNum) throws IOException { TermFreqVector[] result = null; // Check if no term vectors are available for this segment at all if (tvx != null) { - try { - //We need to offset by - tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE); - long position = tvx.readLong(); + //We need to offset by + tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE); + long position = tvx.readLong(); - tvd.seek(position); - int fieldCount = tvd.readVInt(); + tvd.seek(position); + int fieldCount = tvd.readVInt(); - // No fields are vectorized for this document - if (fieldCount != 0) { - int number = 0; - String[] fields = new String[fieldCount]; - - for (int i = 0; i < fieldCount; i++) { + // No fields are vectorized for this document + if (fieldCount != 0) { + int number = 0; + String[] fields = new String[fieldCount]; + + for (int i = 0; i < fieldCount; i++) { + if(tvdFormat == TermVectorsWriter.FORMAT_VERSION) + number = tvd.readVInt(); + else number += tvd.readVInt(); - fields[i] = fieldInfos.fieldName(number); - } - - // Compute position in the tvf file - position = 0; - long[] tvfPointers = new long[fieldCount]; - for (int i = 0; i < fieldCount; i++) { - position += tvd.readVLong(); - tvfPointers[i] = position; - } - result = readTermVectors(fields, tvfPointers); + fields[i] = fieldInfos.fieldName(number); } - } catch (IOException e) { - e.printStackTrace(); + + // Compute position in the tvf file + position = 0; + long[] tvfPointers = new long[fieldCount]; + for (int i = 0; i < fieldCount; i++) { + position += tvd.readVLong(); + tvfPointers[i] = position; + } + + result = readTermVectors(fields, tvfPointers); } - } - else - { - System.out.println("No tvx file"); + } else { + //System.out.println("No tvx file"); } return result; } @@ 
-206,20 +210,41 @@ class TermVectorsReader { int numTerms = tvf.readVInt(); //System.out.println("Num Terms: " + numTerms); - // If no terms - return a constant empty termvector - if (numTerms == 0) return new SegmentTermVector(field, null, null); - - tvf.readVInt(); + // If no terms - return a constant empty termvector. However, this should never occur! + if (numTerms == 0) + return new SegmentTermVector(field, null, null); + boolean storePositions; + boolean storeOffsets; + + if(tvfFormat == TermVectorsWriter.FORMAT_VERSION){ + byte bits = tvf.readByte(); + storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0; + storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0; + } + else{ + tvf.readVInt(); + storePositions = false; + storeOffsets = false; + } + String terms[] = new String[numTerms]; - int termFreqs[] = new int[numTerms]; - + + // we may not need these, but declare them + int positions[][] = null; + TermVectorOffsetInfo offsets[][] = null; + if(storePositions) + positions = new int[numTerms][]; + if(storeOffsets) + offsets = new TermVectorOffsetInfo[numTerms][]; + int start = 0; int deltaLength = 0; int totalLength = 0; char [] buffer = {}; String previousString = ""; + for (int i = 0; i < numTerms; i++) { start = tvf.readVInt(); deltaLength = tvf.readVInt(); @@ -233,9 +258,40 @@ class TermVectorsReader { tvf.readChars(buffer, start, deltaLength); terms[i] = new String(buffer, 0, totalLength); previousString = terms[i]; - termFreqs[i] = tvf.readVInt(); + int freq = tvf.readVInt(); + termFreqs[i] = freq; + + if (storePositions) { //read in the positions + int [] pos = new int[freq]; + positions[i] = pos; + int prevPosition = 0; + for (int j = 0; j < freq; j++) + { + pos[j] = prevPosition + tvf.readVInt(); + prevPosition = pos[j]; + } + } + + if (storeOffsets) { + TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq]; + offsets[i] = offs; + int prevOffset = 0; + for (int j = 0; j < freq; j++) { + int 
startOffset = prevOffset + tvf.readVInt(); + int endOffset = startOffset + tvf.readVInt(); + offs[j] = new TermVectorOffsetInfo(startOffset, endOffset); + prevOffset = endOffset; + } + } + } + + SegmentTermVector tv; + if (storePositions || storeOffsets){ + tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets); + } + else { + tv = new SegmentTermVector(field, terms, termFreqs); } - SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs); return tv; } diff --git a/src/java/org/apache/lucene/index/TermVectorsWriter.java b/src/java/org/apache/lucene/index/TermVectorsWriter.java index 2b4de123074..f61e5a2db44 100644 --- a/src/java/org/apache/lucene/index/TermVectorsWriter.java +++ b/src/java/org/apache/lucene/index/TermVectorsWriter.java @@ -50,14 +50,17 @@ import java.util.Vector; * */ final class TermVectorsWriter { - public static final int FORMAT_VERSION = 1; + public static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1; + public static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2; + + public static final int FORMAT_VERSION = 2; //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file public static final int FORMAT_SIZE = 4; - //TODO: Figure out how to write with or w/o position information and read back in public static final String TVX_EXTENSION = ".tvx"; public static final String TVD_EXTENSION = ".tvd"; public static final String TVF_EXTENSION = ".tvf"; + private IndexOutput tvx = null, tvd = null, tvf = null; private Vector fields = null; private Vector terms = null; @@ -66,13 +69,6 @@ final class TermVectorsWriter { private TVField currentField = null; private long currentDocPointer = -1; - /** Create term vectors writer for the specified segment in specified - * directory. A new TermVectorsWriter should be created for each - * segment. The parameter maxFields indicates how many total - * fields are found in this document. 
Not all of these fields may require - * termvectors to be stored, so the number of calls to - * openField is less or equal to this number. - */ public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos) throws IOException { @@ -93,7 +89,6 @@ final class TermVectorsWriter { public final void openDocument() throws IOException { closeDocument(); - currentDocPointer = tvd.getFilePointer(); } @@ -119,12 +114,17 @@ final class TermVectorsWriter { * processing of this field. If a field was previously open, it is * closed automatically. */ - public final void openField(String field) - throws IOException { - if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open."); - + public final void openField(String field) throws IOException { + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + openField(fieldInfo.number, fieldInfo.storePositionWithTermVector, fieldInfo.storeOffsetWithTermVector); + } + + private void openField(int fieldNumber, boolean storePositionWithTermVector, + boolean storeOffsetWithTermVector) throws IOException{ + if (!isDocumentOpen()) + throw new IllegalStateException("Cannot open field when no document is open."); closeField(); - currentField = new TVField(fieldInfos.fieldNumber(field)); + currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector); } /** Finished processing current field. This should be followed by a call to @@ -157,57 +157,80 @@ final class TermVectorsWriter { * times this term appears in this field, in this document. 
*/ public final void addTerm(String termText, int freq) { - if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open"); - if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open"); - - addTermInternal(termText, freq); + addTerm(termText, freq, null, null); + } + + public final void addTerm(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) + { + if (!isDocumentOpen()) + throw new IllegalStateException("Cannot add terms when document is not open"); + if (!isFieldOpen()) + throw new IllegalStateException("Cannot add terms when field is not open"); + + addTermInternal(termText, freq, positions, offsets); } - private final void addTermInternal(String termText, int freq) { - currentField.length += freq; + private final void addTermInternal(String termText, int freq, int [] positions, TermVectorOffsetInfo [] offsets) { TVTerm term = new TVTerm(); term.termText = termText; term.freq = freq; + term.positions = positions; + term.offsets = offsets; terms.add(term); } - - /** Add specified vectors to the document. + /** + * Add a complete document specified by all its term vectors. If document has no + * term vectors, add value for tvx. 
+ * + * @param vectors + * @throws IOException */ - public final void addVectors(TermFreqVector[] vectors) - throws IOException { - if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open"); - if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open"); + public final void addAllDocVectors(TermFreqVector[] vectors) + throws IOException { + openDocument(); - for (int i = 0; i < vectors.length; i++) { - addTermFreqVector(vectors[i]); + if (vectors != null) { + for (int i = 0; i < vectors.length; i++) { + boolean storePositionWithTermVector = false; + boolean storeOffsetWithTermVector = false; + + try { + + TermPositionVector tpVector = (TermPositionVector) vectors[i]; + + if (tpVector.size() > 0 && tpVector.getTermPositions(0) != null) + storePositionWithTermVector = true; + if (tpVector.size() > 0 && tpVector.getOffsets(0) != null) + storeOffsetWithTermVector = true; + + FieldInfo fieldInfo = fieldInfos.fieldInfo(tpVector.getField()); + openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector); + + for (int j = 0; j < tpVector.size(); j++) + addTermInternal(tpVector.getTerms()[j], tpVector.getTermFrequencies()[j], tpVector.getTermPositions(j), + tpVector.getOffsets(j)); + + closeField(); + + } catch (ClassCastException ignore) { + + TermFreqVector tfVector = vectors[i]; + + FieldInfo fieldInfo = fieldInfos.fieldInfo(tfVector.getField()); + openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector); + + for (int j = 0; j < tfVector.size(); j++) + addTermInternal(tfVector.getTerms()[j], tfVector.getTermFrequencies()[j], null, null); + + closeField(); + + } + } } + + closeDocument(); } - - - /** Add specified vector to the document. Document must be open but no field - * should be open or exception is thrown. 
The same document can have addTerm - * and addVectors calls mixed, however a given field must either be - * populated with addTerm or with addVector. * - */ - public final void addTermFreqVector(TermFreqVector vector) - throws IOException { - if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open"); - if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open"); - addTermFreqVectorInternal(vector); - } - - private final void addTermFreqVectorInternal(TermFreqVector vector) - throws IOException { - openField(vector.getField()); - for (int i = 0; i < vector.size(); i++) { - addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]); - } - closeField(); - } - - - /** Close all streams. */ final void close() throws IOException { @@ -245,47 +268,74 @@ final class TermVectorsWriter { // remember where this field is written currentField.tvfPointer = tvf.getFilePointer(); //System.out.println("Field Pointer: " + currentField.tvfPointer); - final int size; - - tvf.writeVInt(size = terms.size()); - tvf.writeVInt(currentField.length - size); + + final int size = terms.size(); + tvf.writeVInt(size); + + boolean storePositions = currentField.storePositions; + boolean storeOffsets = currentField.storeOffsets; + byte bits = 0x0; + if (storePositions) + bits |= STORE_POSITIONS_WITH_TERMVECTOR; + if (storeOffsets) + bits |= STORE_OFFSET_WITH_TERMVECTOR; + tvf.writeByte(bits); + String lastTermText = ""; - // write term ids and positions for (int i = 0; i < size; i++) { TVTerm term = (TVTerm) terms.elementAt(i); - //tvf.writeString(term.termText); int start = StringHelper.stringDifference(lastTermText, term.termText); int length = term.termText.length() - start; - tvf.writeVInt(start); // write shared prefix length - tvf.writeVInt(length); // write delta length + tvf.writeVInt(start); // write shared prefix length + tvf.writeVInt(length); // write delta length 
tvf.writeChars(term.termText, start, length); // write delta chars tvf.writeVInt(term.freq); lastTermText = term.termText; + + if(storePositions){ + if(term.positions == null) + throw new IllegalStateException("Trying to write positions that are null!"); + + // use delta encoding for positions + int position = 0; + for (int j = 0; j < term.freq; j++){ + tvf.writeVInt(term.positions[j] - position); + position = term.positions[j]; + } + } + + if(storeOffsets){ + if(term.offsets == null) + throw new IllegalStateException("Trying to write offsets that are null!"); + + // use delta encoding for offsets + int position = 0; + for (int j = 0; j < term.freq; j++) { + tvf.writeVInt(term.offsets[j].getStartOffset() - position); + tvf.writeVInt(term.offsets[j].getEndOffset() - term.offsets[j].getStartOffset()); //Save the diff between the two. + position = term.offsets[j].getEndOffset(); + } + } } } - - - private void writeDoc() throws IOException { - if (isFieldOpen()) throw new IllegalStateException("Field is still open while writing document"); + if (isFieldOpen()) + throw new IllegalStateException("Field is still open while writing document"); //System.out.println("Writing doc pointer: " + currentDocPointer); // write document index record tvx.writeLong(currentDocPointer); // write document data record - final int size; + final int size = fields.size(); // write the number of fields - tvd.writeVInt(size = fields.size()); + tvd.writeVInt(size); // write field numbers - int lastFieldNumber = 0; for (int i = 0; i < size; i++) { TVField field = (TVField) fields.elementAt(i); - tvd.writeVInt(field.number - lastFieldNumber); - - lastFieldNumber = field.number; + tvd.writeVInt(field.number); } // write field pointers @@ -293,7 +343,6 @@ final class TermVectorsWriter { for (int i = 0; i < size; i++) { TVField field = (TVField) fields.elementAt(i); tvd.writeVLong(field.tvfPointer - lastFieldPointer); - lastFieldPointer = field.tvfPointer; } //System.out.println("After writing doc 
pointer: " + tvx.getFilePointer()); @@ -303,17 +352,20 @@ final class TermVectorsWriter { private static class TVField { int number; long tvfPointer = 0; - int length = 0; // number of distinct term positions - - TVField(int number) { + boolean storePositions = false; + boolean storeOffsets = false; + TVField(int number, boolean storePos, boolean storeOff) { this.number = number; + storePositions = storePos; + storeOffsets = storeOff; } } private static class TVTerm { String termText; int freq = 0; - //int positions[] = null; + int positions[] = null; + TermVectorOffsetInfo [] offsets = null; } diff --git a/src/test/org/apache/lucene/index/DocHelper.java b/src/test/org/apache/lucene/index/DocHelper.java index 2a8c3876b31..5f3c6daf956 100644 --- a/src/test/org/apache/lucene/index/DocHelper.java +++ b/src/test/org/apache/lucene/index/DocHelper.java @@ -34,7 +34,7 @@ class DocHelper { //Fields will be lexicographically sorted. So, the order is: field, text, two public static final int [] FIELD_2_FREQS = {3, 1, 1}; public static final String TEXT_FIELD_2_KEY = "textField2"; - public static Field textField2 = Field.Text(TEXT_FIELD_2_KEY, FIELD_2_TEXT, true); + public static Field textField2 = new Field(TEXT_FIELD_2_KEY, FIELD_2_TEXT, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); public static final String KEYWORD_TEXT = "Keyword"; public static final String KEYWORD_FIELD_KEY = "keyField"; @@ -135,7 +135,7 @@ class DocHelper { Enumeration fields = doc.fields(); int result = 0; while (fields.hasMoreElements()) { - fields.nextElement(); + String name = fields.nextElement().toString(); result++; } return result; diff --git a/src/test/org/apache/lucene/index/TestSegmentMerger.java b/src/test/org/apache/lucene/index/TestSegmentMerger.java index c4b43472ce3..12da850ce60 100644 --- a/src/test/org/apache/lucene/index/TestSegmentMerger.java +++ b/src/test/org/apache/lucene/index/TestSegmentMerger.java @@ -109,6 +109,7 @@ public class 
TestSegmentMerger extends TestCase { int [] freqs = vector.getTermFrequencies(); assertTrue(freqs != null); //System.out.println("Freqs size: " + freqs.length); + assertTrue(vector instanceof TermPositionVector == true); for (int i = 0; i < terms.length; i++) { String term = terms[i]; diff --git a/src/test/org/apache/lucene/index/TestSegmentReader.java b/src/test/org/apache/lucene/index/TestSegmentReader.java index fce8e1cb823..507ac3cddab 100644 --- a/src/test/org/apache/lucene/index/TestSegmentReader.java +++ b/src/test/org/apache/lucene/index/TestSegmentReader.java @@ -178,7 +178,7 @@ public class TestSegmentReader extends TestCase { } - public void testTermVectors() { + public void testTermVectors() throws IOException { TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY); assertTrue(result != null); String [] terms = result.getTerms(); diff --git a/src/test/org/apache/lucene/index/TestTermVectorsReader.java b/src/test/org/apache/lucene/index/TestTermVectorsReader.java index c742eededfc..40fdd818f55 100644 --- a/src/test/org/apache/lucene/index/TestTermVectorsReader.java +++ b/src/test/org/apache/lucene/index/TestTermVectorsReader.java @@ -11,7 +11,11 @@ public class TestTermVectorsReader extends TestCase { private TermVectorsWriter writer = null; //Must be lexicographically sorted, will do in setup, versus trying to maintain here private String [] testFields = {"f1", "f2", "f3"}; + private boolean [] testFieldsStorePos = {true, false, true, false}; + private boolean [] testFieldsStoreOff = {true, false, false, true}; private String [] testTerms = {"this", "is", "a", "test"}; + private int [][] positions = new int[testTerms.length][]; + private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][]; private RAMDirectory dir = new RAMDirectory(); private String seg = "testSegment"; private FieldInfos fieldInfos = new FieldInfos(); @@ -22,9 +26,22 @@ public class TestTermVectorsReader extends TestCase { 
protected void setUp() { for (int i = 0; i < testFields.length; i++) { - fieldInfos.add(testFields[i], true, true); + fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]); } + for (int i = 0; i < testTerms.length; i++) + { + positions[i] = new int[3]; + for (int j = 0; j < positions[i].length; j++) { + // positions are always sorted in increasing order + positions[i][j] = (int)(j * 10 + Math.random() * 10); + } + offsets[i] = new TermVectorOffsetInfo[3]; + for (int j = 0; j < offsets[i].length; j++){ + // offsets are always sorted in increasing order + offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length()); + } + } try { Arrays.sort(testTerms); for (int j = 0; j < 5; j++) { @@ -34,7 +51,7 @@ public class TestTermVectorsReader extends TestCase { for (int k = 0; k < testFields.length; k++) { writer.openField(testFields[k]); for (int i = 0; i < testTerms.length; i++) { - writer.addTerm(testTerms[i], i); + writer.addTerm(testTerms[i], 3, positions[i], offsets[i]); } writer.closeField(); } @@ -79,6 +96,103 @@ public class TestTermVectorsReader extends TestCase { assertTrue(false); } } + + public void testPositionReader() { + try { + TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); + assertTrue(reader != null); + TermPositionVector vector; + String [] terms; + vector = (TermPositionVector)reader.get(0, testFields[0]); + assertTrue(vector != null); + terms = vector.getTerms(); + assertTrue(terms != null); + assertTrue(terms.length == testTerms.length); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + //System.out.println("Term: " + term); + assertTrue(term.equals(testTerms[i])); + int [] positions = vector.getTermPositions(i); + assertTrue(positions != null); + assertTrue(positions.length == this.positions[i].length); + for (int j = 0; j < positions.length; j++) { + int position = positions[j]; + assertTrue(position == this.positions[i][j]); + } + TermVectorOffsetInfo 
[] offset = vector.getOffsets(i); + assertTrue(offset != null); + assertTrue(offset.length == this.offsets[i].length); + for (int j = 0; j < offset.length; j++) { + TermVectorOffsetInfo termVectorOffsetInfo = offset[j]; + assertTrue(termVectorOffsetInfo.equals(offsets[i][j])); + } + } + + TermFreqVector freqVector = (TermFreqVector)reader.get(0, testFields[1]); //no pos, no offset + assertTrue(freqVector != null); + assertTrue(freqVector instanceof TermPositionVector == false); + terms = freqVector.getTerms(); + assertTrue(terms != null); + assertTrue(terms.length == testTerms.length); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + //System.out.println("Term: " + term); + assertTrue(term.equals(testTerms[i])); + } + + + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + catch (ClassCastException cce) + { + cce.printStackTrace(); + assertTrue(false); + } + } + + public void testOffsetReader() { + try { + TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); + assertTrue(reader != null); + TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]); + assertTrue(vector != null); + String [] terms = vector.getTerms(); + assertTrue(terms != null); + assertTrue(terms.length == testTerms.length); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + //System.out.println("Term: " + term); + assertTrue(term.equals(testTerms[i])); + int [] positions = vector.getTermPositions(i); + assertTrue(positions != null); + assertTrue(positions.length == this.positions[i].length); + for (int j = 0; j < positions.length; j++) { + int position = positions[j]; + assertTrue(position == this.positions[i][j]); + } + TermVectorOffsetInfo [] offset = vector.getOffsets(i); + assertTrue(offset != null); + assertTrue(offset.length == this.offsets[i].length); + for (int j = 0; j < offset.length; j++) { + TermVectorOffsetInfo termVectorOffsetInfo = offset[j]; + 
assertTrue(termVectorOffsetInfo.equals(offsets[i][j])); + } + } + + + } catch (IOException e) { + e.printStackTrace(); + assertTrue(false); + } + catch (ClassCastException cce) + { + cce.printStackTrace(); + assertTrue(false); + } + } + /** * Make sure exceptions and bad params are handled appropriately @@ -89,9 +203,9 @@ public class TestTermVectorsReader extends TestCase { assertTrue(reader != null); //Bad document number, good field number TermFreqVector vector = reader.get(50, testFields[0]); - assertTrue(vector == null); + assertTrue(false); } catch (Exception e) { - assertTrue(false); + assertTrue(true); } try { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); diff --git a/src/test/org/apache/lucene/index/TestTermVectorsWriter.java b/src/test/org/apache/lucene/index/TestTermVectorsWriter.java index a36207444d0..b759c65f578 100644 --- a/src/test/org/apache/lucene/index/TestTermVectorsWriter.java +++ b/src/test/org/apache/lucene/index/TestTermVectorsWriter.java @@ -44,7 +44,7 @@ public class TestTermVectorsWriter extends TestCase { for (int i = 0; i < testTerms.length; i++) { positions[i] = new int[5]; for (int j = 0; j < positions[i].length; j++) { - positions[i][j] = i * 100; + positions[i][j] = j * 10; } } } @@ -107,7 +107,7 @@ public class TestTermVectorsWriter extends TestCase { } } - private void checkTermVector(TermVectorsReader reader, int docNum, String field) { + private void checkTermVector(TermVectorsReader reader, int docNum, String field) throws IOException { TermFreqVector vector = reader.get(docNum, field); assertTrue(vector != null); String[] terms = vector.getTerms(); diff --git a/src/test/org/apache/lucene/search/TestTermVectors.java b/src/test/org/apache/lucene/search/TestTermVectors.java index 0fcb35228da..106a36d311a 100644 --- a/src/test/org/apache/lucene/search/TestTermVectors.java +++ b/src/test/org/apache/lucene/search/TestTermVectors.java @@ -43,8 +43,23 @@ public class TestTermVectors extends TestCase { 
//writer.infoStream = System.out; for (int i = 0; i < 1000; i++) { Document doc = new Document(); + Field.TermVector termVector; + int mod3 = i % 3; + int mod2 = i % 2; + if (mod2 == 0 && mod3 == 0){ + termVector = Field.TermVector.WITH_POSITIONS_OFFSETS; + } + else if (mod2 == 0){ + termVector = Field.TermVector.WITH_POSITIONS; + } + else if (mod3 == 0){ + termVector = Field.TermVector.WITH_OFFSETS; + } + else { + termVector = Field.TermVector.YES; + } doc.add(new Field("field", English.intToEnglish(i), - Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES)); + Field.Store.YES, Field.Index.TOKENIZED, termVector)); writer.addDocument(doc); } writer.close(); @@ -70,17 +85,74 @@ public class TestTermVectors extends TestCase { TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); assertTrue(vector != null); assertTrue(vector.length == 1); - //assertTrue(); } - TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(50)); - //System.out.println("Explain: " + searcher.explain(query, hits.id(50))); - //System.out.println("Vector: " + vector[0].toString()); } catch (IOException e) { assertTrue(false); } } public void testTermPositionVectors() { + Query query = new TermQuery(new Term("field", "zero")); + try { + Hits hits = searcher.search(query); + assertEquals(1, hits.length()); + + for (int i = 0; i < hits.length(); i++) + { + TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); + assertTrue(vector != null); + assertTrue(vector.length == 1); + + boolean shouldBePosVector = (hits.id(i) % 2 == 0) ? true : false; + assertTrue((shouldBePosVector == false) || (shouldBePosVector == true && (vector[0] instanceof TermPositionVector == true))); + + boolean shouldBeOffVector = (hits.id(i) % 3 == 0) ? 
true : false; + assertTrue((shouldBeOffVector == false) || (shouldBeOffVector == true && (vector[0] instanceof TermPositionVector == true))); + + if(shouldBePosVector || shouldBeOffVector){ + TermPositionVector posVec = (TermPositionVector)vector[0]; + String [] terms = posVec.getTerms(); + assertTrue(terms != null && terms.length > 0); + + for (int j = 0; j < terms.length; j++) { + int [] positions = posVec.getTermPositions(j); + TermVectorOffsetInfo [] offsets = posVec.getOffsets(j); + + if(shouldBePosVector){ + assertTrue(positions != null); + assertTrue(positions.length > 0); + } + else + assertTrue(positions == null); + + if(shouldBeOffVector){ + assertTrue(offsets != null); + assertTrue(offsets.length > 0); + } + else + assertTrue(offsets == null); + } + } + else{ + try{ + TermPositionVector posVec = (TermPositionVector)vector[0]; + assertTrue(false); + } + catch(ClassCastException ignore){ + TermFreqVector freqVec = vector[0]; + String [] terms = freqVec.getTerms(); + assertTrue(terms != null && terms.length > 0); + } + + } + + } + } catch (IOException e) { + assertTrue(false); + } + } + + public void testTermOffsetVectors() { Query query = new TermQuery(new Term("field", "fifty")); try { Hits hits = searcher.search(query); @@ -91,6 +163,7 @@ public class TestTermVectors extends TestCase { TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i)); assertTrue(vector != null); assertTrue(vector.length == 1); + //assertTrue(); } } catch (IOException e) { @@ -164,7 +237,7 @@ public class TestTermVectors extends TestCase { int [] freqs = vector.getTermFrequencies(); for (int i = 0; i < vTerms.length; i++) { - if (term.text().equals(vTerms[i]) == true) + if (term.text().equals(vTerms[i])) { assertTrue(freqs[i] == freq); } @@ -184,9 +257,9 @@ public class TestTermVectors extends TestCase { System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1))); System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + 
hits.doc(2).toString()); System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/ - assertTrue(testDoc3.toString().equals(hits.doc(0).toString())); - assertTrue(testDoc4.toString().equals(hits.doc(1).toString())); - assertTrue(testDoc1.toString().equals(hits.doc(2).toString())); + assertTrue(hits.id(0) == 2); + assertTrue(hits.id(1) == 3); + assertTrue(hits.id(2) == 0); TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field"); assertTrue(vector != null); //System.out.println("Vector: " + vector);