diff --git a/CHANGES.txt b/CHANGES.txt
index 36cce30e670..dfa0564508a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -54,6 +54,10 @@ New features
2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll)
+ 3. LUCENE-868: Added new Term Vector access features. New callback mechanism allows application to define how and where to read Term Vectors from disk.
+ This implementation contains several extensions of the new abstract TermVectorMapper class. The new API should be back-compatible. No changes in the
+    actual storage of Term Vectors have taken place.
+
Optimizations
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index 11f90be8374..d141da761da 100644
--- a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -17,6 +17,16 @@ package org.apache.lucene.index.memory;
* limitations under the License.
*/
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
@@ -30,22 +40,13 @@ import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
-import java.io.IOException;
-import java.io.Serializable;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-
/**
* High-performance single-document main memory Apache Lucene fulltext search index.
*
@@ -935,8 +936,47 @@ public class MemoryIndex {
}
return vectors;
}
-
- public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
+
+ public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException
+ {
+ if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors");
+
+ // if (vectors.length == 0) return null;
+ for (Iterator iterator = fields.keySet().iterator(); iterator.hasNext();)
+ {
+ String fieldName = (String) iterator.next();
+ getTermFreqVector(docNumber, fieldName, mapper);
+ }
+ }
+
+ public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException
+ {
+ if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
+ final Info info = getInfo(field);
+ if (info == null){
+ return;
+ }
+ info.sortTerms();
+ mapper.setExpectations(field, info.sortedTerms.length, stride != 1, true);
+ for (int i = info.sortedTerms.length; --i >=0;){
+
+ ArrayIntList positions = (ArrayIntList) info.sortedTerms[i].getValue();
+ int size = positions.size();
+      // Offsets exist only when stride != 1; otherwise pass null to the mapper.
+      org.apache.lucene.index.TermVectorOffsetInfo[] offsets = stride == 1 ? null :
+              new org.apache.lucene.index.TermVectorOffsetInfo[size / stride];
+      for (int k=0, j=1; offsets != null && j < size; k++, j += stride) {
+        int start = positions.get(j);
+        int end = positions.get(j+1);
+        offsets[k] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end);
+      }
+ mapper.map((String)info.sortedTerms[i].getKey(),
+ numPositions((ArrayIntList) info.sortedTerms[i].getValue()),
+ offsets, ((ArrayIntList) info.sortedTerms[i].getValue()).toArray(stride));
+ }
+ }
+
+ public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
final Info info = getInfo(fieldName);
if (info == null) return null; // TODO: or return empty vector impl???
diff --git a/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java b/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java
new file mode 100644
index 00000000000..7f54850a4d8
--- /dev/null
+++ b/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java
@@ -0,0 +1,70 @@
+package org.apache.lucene.index;
+
+import java.util.*;
+
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * For each Field, store a sorted collection of {@link TermVectorEntry}s
+ *
+ * This is not thread-safe.
+ */
+public class FieldSortedTermVectorMapper extends TermVectorMapper{
+ private Map fieldToTerms = new HashMap();
+ private SortedSet currentSet;
+ private String currentField;
+ private Comparator comparator;
+
+ /**
+ *
+ * @param comparator A Comparator for sorting {@link TermVectorEntry}s
+ */
+ public FieldSortedTermVectorMapper(Comparator comparator) {
+ this(false, false, comparator);
+ }
+
+
+ public FieldSortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) {
+ super(ignoringPositions, ignoringOffsets);
+ this.comparator = comparator;
+ }
+
+ public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+ TermVectorEntry entry = new TermVectorEntry(currentField, term, frequency, offsets, positions);
+ currentSet.add(entry);
+ }
+
+ public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+ currentSet = new TreeSet(comparator);
+ currentField = field;
+ fieldToTerms.put(field, currentSet);
+ }
+
+ /**
+ * Get the mapping between fields and terms, sorted by the comparator
+ *
+ * @return A map between field names and {@link java.util.SortedSet}s per field. SortedSet entries are {@link TermVectorEntry}
+ */
+ public Map getFieldToTerms() {
+ return fieldToTerms;
+ }
+
+
+ public Comparator getComparator() {
+ return comparator;
+ }
+}
diff --git a/src/java/org/apache/lucene/index/FilterIndexReader.java b/src/java/org/apache/lucene/index/FilterIndexReader.java
index 887e5da33b8..4b9b9d9a694 100644
--- a/src/java/org/apache/lucene/index/FilterIndexReader.java
+++ b/src/java/org/apache/lucene/index/FilterIndexReader.java
@@ -115,6 +115,18 @@ public class FilterIndexReader extends IndexReader {
return in.getTermFreqVector(docNumber, field);
}
+
+ public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+ ensureOpen();
+ in.getTermFreqVector(docNumber, field, mapper);
+
+ }
+
+ public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+ ensureOpen();
+ in.getTermFreqVector(docNumber, mapper);
+ }
+
public int numDocs() {
// Don't call ensureOpen() here (it could affect performance)
return in.numDocs();
diff --git a/src/java/org/apache/lucene/index/IndexReader.java b/src/java/org/apache/lucene/index/IndexReader.java
index 30315a10933..84d6fd9cacb 100644
--- a/src/java/org/apache/lucene/index/IndexReader.java
+++ b/src/java/org/apache/lucene/index/IndexReader.java
@@ -20,12 +20,7 @@ package org.apache.lucene.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.search.Similarity;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.Lock;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.store.*;
import java.io.File;
import java.io.FileOutputStream;
@@ -385,6 +380,25 @@ public abstract class IndexReader {
abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
throws IOException;
+ /**
+ * Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
+ * the {@link TermFreqVector}.
+ * @param docNumber The number of the document to load the vector for
+ * @param field The name of the field to load
+ * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
+ * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
+ *
+ */
+ abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException;
+
+ /**
+ * Map all the term vectors for all fields in a Document
+ * @param docNumber The number of the document to load the vector for
+ * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
+ * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
+ */
+ abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException;
+
/**
* Returns true
if an index exists at the specified directory.
* If the directory does not exist or if there is no index in it.
diff --git a/src/java/org/apache/lucene/index/MultiReader.java b/src/java/org/apache/lucene/index/MultiReader.java
index ef305443c54..3e1d8009c23 100644
--- a/src/java/org/apache/lucene/index/MultiReader.java
+++ b/src/java/org/apache/lucene/index/MultiReader.java
@@ -85,6 +85,19 @@ public class MultiReader extends IndexReader {
return subReaders[i].getTermFreqVector(n - starts[i], field);
}
+
+ public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+ ensureOpen();
+ int i = readerIndex(docNumber); // find segment num
+ subReaders[i].getTermFreqVector(docNumber - starts[i], field, mapper);
+ }
+
+ public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+ ensureOpen();
+ int i = readerIndex(docNumber); // find segment num
+ subReaders[i].getTermFreqVector(docNumber - starts[i], mapper);
+ }
+
public synchronized int numDocs() {
// Don't call ensureOpen() here (it could affect performance)
if (numDocs == -1) { // check cache
diff --git a/src/java/org/apache/lucene/index/ParallelReader.java b/src/java/org/apache/lucene/index/ParallelReader.java
index 4b68ca81eb3..ef424e20ba6 100644
--- a/src/java/org/apache/lucene/index/ParallelReader.java
+++ b/src/java/org/apache/lucene/index/ParallelReader.java
@@ -194,6 +194,29 @@ public class ParallelReader extends IndexReader {
return reader==null ? null : reader.getTermFreqVector(n, field);
}
+
+ public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+ ensureOpen();
+ IndexReader reader = ((IndexReader)fieldToReader.get(field));
+ if (reader != null) {
+ reader.getTermFreqVector(docNumber, field, mapper);
+ }
+ }
+
+ public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+ ensureOpen();
+
+
+ Iterator i = fieldToReader.entrySet().iterator();
+ while (i.hasNext()) {
+ Map.Entry e = (Map.Entry)i.next();
+ String field = (String)e.getKey();
+ IndexReader reader = (IndexReader)e.getValue();
+ reader.getTermFreqVector(docNumber, field, mapper);
+ }
+
+ }
+
public boolean hasNorms(String field) throws IOException {
ensureOpen();
IndexReader reader = ((IndexReader)fieldToReader.get(field));
diff --git a/src/java/org/apache/lucene/index/SegmentReader.java b/src/java/org/apache/lucene/index/SegmentReader.java
index 2ac08861e5b..4f8d5488211 100644
--- a/src/java/org/apache/lucene/index/SegmentReader.java
+++ b/src/java/org/apache/lucene/index/SegmentReader.java
@@ -20,10 +20,10 @@ package org.apache.lucene.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.util.BitVector;
import java.io.IOException;
@@ -643,6 +643,35 @@ class SegmentReader extends IndexReader {
}
+ public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
+ ensureOpen();
+ FieldInfo fi = fieldInfos.fieldInfo(field);
+ if (fi == null || !fi.storeTermVector || termVectorsReaderOrig == null)
+ throw new IOException("field does not contain term vectors");
+
+ TermVectorsReader termVectorsReader = getTermVectorsReader();
+ if (termVectorsReader == null)
+ {
+ throw new IOException("Cannot open a reader for the term vectors");
+ }
+
+
+ termVectorsReader.get(docNumber, field, mapper);
+ }
+
+
+ public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
+ ensureOpen();
+ if (termVectorsReaderOrig == null)
+ return;
+
+ TermVectorsReader termVectorsReader = getTermVectorsReader();
+ if (termVectorsReader == null)
+ return;
+
+ termVectorsReader.get(docNumber, mapper);
+ }
+
/** Return an array of term frequency vectors for the specified document.
* The array contains a vector for each vectorized field in the document.
* Each vector vector contains term numbers and frequencies for all terms
diff --git a/src/java/org/apache/lucene/index/SortedTermVectorMapper.java b/src/java/org/apache/lucene/index/SortedTermVectorMapper.java
new file mode 100644
index 00000000000..61ced004ef7
--- /dev/null
+++ b/src/java/org/apache/lucene/index/SortedTermVectorMapper.java
@@ -0,0 +1,129 @@
+package org.apache.lucene.index;
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.*;
+
+/**
+ * Store a sorted collection of {@link org.apache.lucene.index.TermVectorEntry}s. Collects all term information
+ * into a single, SortedSet.
+ *
+ * NOTE: This Mapper ignores all Field information for the Document. This means that if you are using offset/positions you will not
+ * know what Fields they correlate with.
+ *
+ * This is not thread-safe
+ */
+public class SortedTermVectorMapper extends TermVectorMapper{
+
+
+ private SortedSet currentSet;
+ private Map termToTVE = new HashMap();
+ private boolean storeOffsets;
+ private boolean storePositions;
+ /**
+ * Stand-in name for the field in {@link TermVectorEntry}.
+ */
+ public static final String ALL = "_ALL_";
+
+ /**
+ *
+ * @param comparator A Comparator for sorting {@link TermVectorEntry}s
+ */
+ public SortedTermVectorMapper(Comparator comparator) {
+ this(false, false, comparator);
+ }
+
+
+ public SortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) {
+ super(ignoringPositions, ignoringOffsets);
+ currentSet = new TreeSet(comparator);
+ }
+
+ /**
+ *
+ * @param term The term to map
+ * @param frequency The frequency of the term
+ * @param offsets Offset information, may be null
+ * @param positions Position information, may be null
+ */
+ //We need to combine any previous mentions of the term
+ public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+ TermVectorEntry entry = (TermVectorEntry) termToTVE.get(term);
+ if (entry == null) {
+ entry = new TermVectorEntry(ALL, term, frequency,
+ storeOffsets == true ? offsets : null,
+ storePositions == true ? positions : null);
+ termToTVE.put(term, entry);
+ currentSet.add(entry);
+ } else {
+ entry.setFrequency(entry.getFrequency() + frequency);
+ if (storeOffsets)
+ {
+ TermVectorOffsetInfo [] existingOffsets = entry.getOffsets();
+ //A few diff. cases here: offsets is null, existing offsets is null, both are null, same for positions
+ if (existingOffsets != null && offsets != null && offsets.length > 0)
+ {
+ //copy over the existing offsets
+ TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[existingOffsets.length + offsets.length];
+ System.arraycopy(existingOffsets, 0, newOffsets, 0, existingOffsets.length);
+ System.arraycopy(offsets, 0, newOffsets, existingOffsets.length, offsets.length);
+ entry.setOffsets(newOffsets);
+ }
+ else if (existingOffsets == null && offsets != null && offsets.length > 0)
+ {
+ entry.setOffsets(offsets);
+ }
+ //else leave it alone
+ }
+ if (storePositions)
+ {
+ int [] existingPositions = entry.getPositions();
+ if (existingPositions != null && positions != null && positions.length > 0)
+ {
+ int [] newPositions = new int[existingPositions.length + positions.length];
+ System.arraycopy(existingPositions, 0, newPositions, 0, existingPositions.length);
+ System.arraycopy(positions, 0, newPositions, existingPositions.length, positions.length);
+ entry.setPositions(newPositions);
+ }
+ else if (existingPositions == null && positions != null && positions.length > 0)
+ {
+ entry.setPositions(positions);
+ }
+ }
+ }
+
+
+ }
+
+ public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+
+ this.storeOffsets = storeOffsets;
+ this.storePositions = storePositions;
+ }
+
+ /**
+ * The TermVectorEntrySet. A SortedSet of {@link TermVectorEntry} objects. Sort is by the comparator passed into the constructor.
+ *
+ * This set will be empty until after the mapping process takes place.
+ *
+ * @return The SortedSet of {@link TermVectorEntry}.
+ */
+ public SortedSet getTermVectorEntrySet()
+ {
+ return currentSet;
+ }
+
+}
diff --git a/src/java/org/apache/lucene/index/TermVectorEntry.java b/src/java/org/apache/lucene/index/TermVectorEntry.java
new file mode 100644
index 00000000000..ee66c6f1a2b
--- /dev/null
+++ b/src/java/org/apache/lucene/index/TermVectorEntry.java
@@ -0,0 +1,98 @@
+package org.apache.lucene.index;
+
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Convenience class for holding TermVector information.
+ */
+public class TermVectorEntry {
+ private String field;
+ private String term;
+ private int frequency;
+ private TermVectorOffsetInfo [] offsets;
+ int [] positions;
+
+
+ public TermVectorEntry() {
+ }
+
+ public TermVectorEntry(String field, String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+ this.field = field;
+ this.term = term;
+ this.frequency = frequency;
+ this.offsets = offsets;
+ this.positions = positions;
+ }
+
+
+ public String getField() {
+ return field;
+ }
+
+ public int getFrequency() {
+ return frequency;
+ }
+
+ public TermVectorOffsetInfo[] getOffsets() {
+ return offsets;
+ }
+
+ public int[] getPositions() {
+ return positions;
+ }
+
+ public String getTerm() {
+ return term;
+ }
+
+ //Keep package local
+ void setFrequency(int frequency) {
+ this.frequency = frequency;
+ }
+
+ void setOffsets(TermVectorOffsetInfo[] offsets) {
+ this.offsets = offsets;
+ }
+
+ void setPositions(int[] positions) {
+ this.positions = positions;
+ }
+
+
+ public boolean equals(Object o) {
+ if (this == o) return true;
+ if (o == null || getClass() != o.getClass()) return false;
+
+ TermVectorEntry that = (TermVectorEntry) o;
+
+ if (term != null ? !term.equals(that.term) : that.term != null) return false;
+
+ return true;
+ }
+
+ public int hashCode() {
+ return (term != null ? term.hashCode() : 0);
+ }
+
+ public String toString() {
+ return "TermVectorEntry{" +
+ "field='" + field + '\'' +
+ ", term='" + term + '\'' +
+ ", frequency=" + frequency +
+ '}';
+ }
+}
diff --git a/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java b/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java
new file mode 100644
index 00000000000..2f2807162b8
--- /dev/null
+++ b/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java
@@ -0,0 +1,42 @@
+package org.apache.lucene.index;
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.Comparator;
+
+/**
+ * Compares {@link org.apache.lucene.index.TermVectorEntry}s first by frequency and then by
+ * the term (case-sensitive)
+ *
+ **/
+public class TermVectorEntryFreqSortedComparator implements Comparator {
+ public int compare(Object object, Object object1) {
+ int result = 0;
+ TermVectorEntry entry = (TermVectorEntry) object;
+ TermVectorEntry entry1 = (TermVectorEntry) object1;
+ result = entry1.getFrequency() - entry.getFrequency();
+ if (result == 0)
+ {
+ result = entry.getTerm().compareTo(entry1.getTerm());
+ if (result == 0)
+ {
+ result = entry.getField().compareTo(entry1.getField());
+ }
+ }
+ return result;
+ }
+}
diff --git a/src/java/org/apache/lucene/index/TermVectorMapper.java b/src/java/org/apache/lucene/index/TermVectorMapper.java
new file mode 100644
index 00000000000..53f284bbc55
--- /dev/null
+++ b/src/java/org/apache/lucene/index/TermVectorMapper.java
@@ -0,0 +1,88 @@
+package org.apache.lucene.index;
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ * The TermVectorMapper can be used to map Term Vectors into your own
+ * structure instead of the parallel array structure used by
+ * {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
+ *
+ * It is up to the implementation to make sure it is thread-safe.
+ *
+ *
+ **/
+public abstract class TermVectorMapper {
+
+ private boolean ignoringPositions;
+ private boolean ignoringOffsets;
+
+
+ protected TermVectorMapper() {
+ }
+
+ /**
+ *
+ * @param ignoringPositions true if this mapper should tell Lucene to ignore positions even if they are stored
+ * @param ignoringOffsets similar to ignoringPositions
+ */
+ protected TermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets) {
+ this.ignoringPositions = ignoringPositions;
+ this.ignoringOffsets = ignoringOffsets;
+ }
+
+ /**
+ * Tell the mapper what to expect in regards to field, number of terms, offset and position storage.
+ * This method will be called once before retrieving the vector for a field.
+ *
+ * This method will be called before {@link #map(String,int,TermVectorOffsetInfo[],int[])}.
+ * @param field The field the vector is for
+ * @param numTerms The number of terms that need to be mapped
+ * @param storeOffsets true if the mapper should expect offset information
+ * @param storePositions true if the mapper should expect positions info
+ */
+ public abstract void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions);
+ /**
+ * Map the Term Vector information into your own structure
+ * @param term The term to add to the vector
+ * @param frequency The frequency of the term in the document
+ * @param offsets null if the offset is not specified, otherwise the offset into the field of the term
+ * @param positions null if the position is not specified, otherwise the position in the field of the term
+ */
+ public abstract void map(String term, int frequency, TermVectorOffsetInfo [] offsets, int [] positions);
+
+ /**
+ * Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they
+ * can be skipped over. Derived classes should set this to true if they want to ignore positions. The default
+ * is false, meaning positions will be loaded if they are stored.
+   * @return true if positions are being ignored, false otherwise (default)
+ */
+ public boolean isIgnoringPositions()
+ {
+ return ignoringPositions;
+ }
+
+ /**
+ *
+ * @see #isIgnoringPositions() Same principal as {@link #isIgnoringPositions()}, but applied to offsets. false by default.
+   * @return true if offsets are being ignored, false otherwise (default)
+ */
+ public boolean isIgnoringOffsets()
+ {
+ return ignoringOffsets;
+ }
+
+}
diff --git a/src/java/org/apache/lucene/index/TermVectorsReader.java b/src/java/org/apache/lucene/index/TermVectorsReader.java
index a03a729802b..45e9f5a294e 100644
--- a/src/java/org/apache/lucene/index/TermVectorsReader.java
+++ b/src/java/org/apache/lucene/index/TermVectorsReader.java
@@ -17,9 +17,9 @@ package org.apache.lucene.index;
* limitations under the License.
*/
+import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.BufferedIndexInput;
import java.io.IOException;
@@ -104,18 +104,9 @@ class TermVectorsReader implements Cloneable {
return size;
}
- /**
- * Retrieve the term vector for the given document and field
- * @param docNum The document number to retrieve the vector for
- * @param field The field within the document to retrieve
- * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
- * @throws IOException if there is an error reading the term vector files
- */
- TermFreqVector get(int docNum, String field) throws IOException {
- // Check if no term vectors are available for this segment at all
- int fieldNumber = fieldInfos.fieldNumber(field);
- TermFreqVector result = null;
+ public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
if (tvx != null) {
+ int fieldNumber = fieldInfos.fieldNumber(field);
//We need to account for the FORMAT_SIZE at when seeking in the tvx
//We don't need to do this in other seeks because we already have the
// file pointer
@@ -137,7 +128,7 @@ class TermVectorsReader implements Cloneable {
number = tvd.readVInt();
else
number += tvd.readVInt();
-
+
if (number == fieldNumber)
found = i;
}
@@ -150,14 +141,30 @@ class TermVectorsReader implements Cloneable {
for (int i = 0; i <= found; i++)
position += tvd.readVLong();
- result = readTermVector(field, position);
+ readTermVector(field, position, mapper);
} else {
//System.out.println("Fieldable not found");
}
} else {
//System.out.println("No tvx file");
}
- return result;
+ }
+
+
+
+ /**
+ * Retrieve the term vector for the given document and field
+ * @param docNum The document number to retrieve the vector for
+ * @param field The field within the document to retrieve
+ * @return The TermFreqVector for the document and field or null if there is no termVector for this field.
+ * @throws IOException if there is an error reading the term vector files
+ */
+ TermFreqVector get(int docNum, String field) throws IOException {
+ // Check if no term vectors are available for this segment at all
+ ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
+ get(docNum, field, mapper);
+
+ return mapper.materializeVector();
}
/**
@@ -169,7 +176,6 @@ class TermVectorsReader implements Cloneable {
*/
TermFreqVector[] get(int docNum) throws IOException {
TermFreqVector[] result = null;
- // Check if no term vectors are available for this segment at all
if (tvx != null) {
//We need to offset by
tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
@@ -182,7 +188,7 @@ class TermVectorsReader implements Cloneable {
if (fieldCount != 0) {
int number = 0;
String[] fields = new String[fieldCount];
-
+
for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
number = tvd.readVInt();
@@ -208,24 +214,76 @@ class TermVectorsReader implements Cloneable {
return result;
}
+ public void get(int docNumber, TermVectorMapper mapper) throws IOException {
+ // Check if no term vectors are available for this segment at all
+ if (tvx != null) {
+ //We need to offset by
+      tvx.seek(((docNumber + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
+ long position = tvx.readLong();
+
+ tvd.seek(position);
+ int fieldCount = tvd.readVInt();
+
+ // No fields are vectorized for this document
+ if (fieldCount != 0) {
+ int number = 0;
+ String[] fields = new String[fieldCount];
+
+ for (int i = 0; i < fieldCount; i++) {
+ if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
+ number = tvd.readVInt();
+ else
+ number += tvd.readVInt();
+
+ fields[i] = fieldInfos.fieldName(number);
+ }
+
+ // Compute position in the tvf file
+ position = 0;
+ long[] tvfPointers = new long[fieldCount];
+ for (int i = 0; i < fieldCount; i++) {
+ position += tvd.readVLong();
+ tvfPointers[i] = position;
+ }
+
+ readTermVectors(fields, tvfPointers, mapper);
+ }
+ } else {
+ //System.out.println("No tvx file");
+ }
+ }
+
private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
throws IOException {
SegmentTermVector res[] = new SegmentTermVector[fields.length];
for (int i = 0; i < fields.length; i++) {
- res[i] = readTermVector(fields[i], tvfPointers[i]);
+ ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
+ readTermVector(fields[i], tvfPointers[i], mapper);
+ res[i] = (SegmentTermVector) mapper.materializeVector();
}
return res;
}
+ private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
+ throws IOException {
+ for (int i = 0; i < fields.length; i++) {
+ readTermVector(fields[i], tvfPointers[i], mapper);
+ }
+
+ }
+
+
/**
*
* @param field The field to read in
* @param tvfPointer The pointer within the tvf file where we should start reading
+ * @param mapper The mapper used to map the TermVector
+   * The vector found at the given position is delivered through the mapper.
* @throws IOException
+
*/
- private SegmentTermVector readTermVector(String field, long tvfPointer)
+ private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
throws IOException {
// Now read the data from specified position
@@ -236,7 +294,7 @@ class TermVectorsReader implements Cloneable {
//System.out.println("Num Terms: " + numTerms);
// If no terms - return a constant empty termvector. However, this should never occur!
if (numTerms == 0)
- return new SegmentTermVector(field, null, null);
+ return;
boolean storePositions;
boolean storeOffsets;
@@ -251,18 +309,7 @@ class TermVectorsReader implements Cloneable {
storePositions = false;
storeOffsets = false;
}
-
- String terms[] = new String[numTerms];
- int termFreqs[] = new int[numTerms];
-
- // we may not need these, but declare them
- int positions[][] = null;
- TermVectorOffsetInfo offsets[][] = null;
- if(storePositions)
- positions = new int[numTerms][];
- if(storeOffsets)
- offsets = new TermVectorOffsetInfo[numTerms][];
-
+ mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
int start = 0;
int deltaLength = 0;
int totalLength = 0;
@@ -282,45 +329,54 @@ class TermVectorsReader implements Cloneable {
}
tvf.readChars(buffer, start, deltaLength);
- terms[i] = new String(buffer, 0, totalLength);
+ String term = new String(buffer, 0, totalLength);
previousBuffer = buffer;
int freq = tvf.readVInt();
- termFreqs[i] = freq;
-
+ int [] positions = null;
if (storePositions) { //read in the positions
- int [] pos = new int[freq];
- positions[i] = pos;
- int prevPosition = 0;
- for (int j = 0; j < freq; j++)
- {
- pos[j] = prevPosition + tvf.readVInt();
- prevPosition = pos[j];
+ //does the mapper even care about positions?
+ if (mapper.isIgnoringPositions() == false) {
+ positions = new int[freq];
+ int prevPosition = 0;
+ for (int j = 0; j < freq; j++)
+ {
+ positions[j] = prevPosition + tvf.readVInt();
+ prevPosition = positions[j];
+ }
+ } else {
+            //we need to skip over the positions. Since these are VInts, there is no way to know how far to skip other than reading them
+ //
+ for (int j = 0; j < freq; j++)
+ {
+ tvf.readVInt();
+ }
}
}
-
+ TermVectorOffsetInfo[] offsets = null;
if (storeOffsets) {
- TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
- offsets[i] = offs;
- int prevOffset = 0;
- for (int j = 0; j < freq; j++) {
- int startOffset = prevOffset + tvf.readVInt();
- int endOffset = startOffset + tvf.readVInt();
- offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
- prevOffset = endOffset;
+ //does the mapper even care about offsets?
+ if (mapper.isIgnoringOffsets() == false) {
+ offsets = new TermVectorOffsetInfo[freq];
+ int prevOffset = 0;
+ for (int j = 0; j < freq; j++) {
+ int startOffset = prevOffset + tvf.readVInt();
+ int endOffset = startOffset + tvf.readVInt();
+ offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
+ prevOffset = endOffset;
+ }
+ } else {
+ for (int j = 0; j < freq; j++){
+ tvf.readVInt();
+ tvf.readVInt();
+ }
}
}
+ mapper.map(term, freq, offsets, positions);
}
-
- SegmentTermVector tv;
- if (storePositions || storeOffsets){
- tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
- }
- else {
- tv = new SegmentTermVector(field, terms, termFreqs);
- }
- return tv;
}
+
+
protected Object clone() {
if (tvx == null || tvd == null || tvf == null)
@@ -337,4 +393,67 @@ class TermVectorsReader implements Cloneable {
return clone;
}
+
+
+
}
+
+/**
+ * Models the existing parallel array structure
+ */
+class ParallelArrayTermVectorMapper extends TermVectorMapper
+{
+
+ private int numTerms;
+ private String[] terms;
+ private int[] termFreqs;
+ private int positions[][] = null;
+ private TermVectorOffsetInfo offsets[][] = null;
+ private int currentPosition;
+ private boolean storingOffsets;
+ private boolean storingPositions;
+ private String field;
+
+ public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
+ this.numTerms = numTerms;
+ this.field = field;
+ terms = new String[numTerms];
+ termFreqs = new int[numTerms];
+ this.storingOffsets = storeOffsets;
+ this.storingPositions = storePositions;
+ if(storePositions)
+ this.positions = new int[numTerms][];
+ if(storeOffsets)
+ this.offsets = new TermVectorOffsetInfo[numTerms][];
+ }
+
+ public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
+ terms[currentPosition] = term;
+ termFreqs[currentPosition] = frequency;
+ if (storingOffsets)
+ {
+ this.offsets[currentPosition] = offsets;
+ }
+ if (storingPositions)
+ {
+ this.positions[currentPosition] = positions;
+ }
+ currentPosition++;
+ }
+
+ /**
+ * Construct the vector
+   * @return the materialized TermFreqVector, or null if setExpectations was never called (no field data was mapped)
+ */
+ public TermFreqVector materializeVector() {
+ SegmentTermVector tv = null;
+ if (field != null && terms != null) {
+ if (storingPositions || storingOffsets) {
+ tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
+ } else {
+ tv = new SegmentTermVector(field, terms, termFreqs);
+ }
+ }
+ return tv;
+ }
+}
\ No newline at end of file
diff --git a/src/test/org/apache/lucene/index/TestIndexReader.java b/src/test/org/apache/lucene/index/TestIndexReader.java
index a3df3a2c89c..acd288a839e 100644
--- a/src/test/org/apache/lucene/index/TestIndexReader.java
+++ b/src/test/org/apache/lucene/index/TestIndexReader.java
@@ -21,29 +21,20 @@ package org.apache.lucene.index;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
-
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.store.LockObtainFailedException;
-import org.apache.lucene.store.AlreadyClosedException;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
-
-import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.*;
import org.apache.lucene.util._TestUtil;
-import java.util.Collection;
-import java.util.Arrays;
-import java.io.IOException;
-import java.io.FileNotFoundException;
import java.io.File;
-
-import org.apache.lucene.store.MockRAMDirectory;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.*;
public class TestIndexReader extends TestCase
{
@@ -180,8 +171,43 @@ public class TestIndexReader extends TestCase
d.close();
}
+ public void testTermVectors() throws Exception {
+ RAMDirectory d = new MockRAMDirectory();
+ // set up writer
+ IndexWriter writer = new IndexWriter(d, new StandardAnalyzer(), true);
+ // want to get some more segments here
+ // new termvector fields
+ for (int i = 0; i < 5 * writer.getMergeFactor(); i++) {
+ Document doc = new Document();
+ doc.add(new Field("tvnot","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO));
+ doc.add(new Field("termvector","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
+ doc.add(new Field("tvoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
+ doc.add(new Field("tvposition","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
+ doc.add(new Field("tvpositionoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
- private void assertTermDocsCount(String msg,
+ writer.addDocument(doc);
+ }
+ writer.close();
+ IndexReader reader = IndexReader.open(d);
+ FieldSortedTermVectorMapper mapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+ reader.getTermFreqVector(0, mapper);
+ Map map = mapper.getFieldToTerms();
+ assertTrue("map is null and it shouldn't be", map != null);
+ assertTrue("map Size: " + map.size() + " is not: " + 4, map.size() == 4);
+ Set set = (Set) map.get("termvector");
+ for (Iterator iterator = set.iterator(); iterator.hasNext();) {
+ TermVectorEntry entry = (TermVectorEntry) iterator.next();
+ assertTrue("entry is null and it shouldn't be", entry != null);
+ System.out.println("Entry: " + entry);
+ }
+
+
+
+
+
+ }
+
+ private void assertTermDocsCount(String msg,
IndexReader reader,
Term term,
int expected)
diff --git a/src/test/org/apache/lucene/index/TestTermVectorsReader.java b/src/test/org/apache/lucene/index/TestTermVectorsReader.java
index 6aa9dd64288..4827f548313 100644
--- a/src/test/org/apache/lucene/index/TestTermVectorsReader.java
+++ b/src/test/org/apache/lucene/index/TestTermVectorsReader.java
@@ -22,16 +22,19 @@ import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.SortedSet;
public class TestTermVectorsReader extends TestCase {
private TermVectorsWriter writer = null;
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
- private String [] testFields = {"f1", "f2", "f3"};
- private boolean [] testFieldsStorePos = {true, false, true, false};
- private boolean [] testFieldsStoreOff = {true, false, false, true};
- private String [] testTerms = {"this", "is", "a", "test"};
- private int [][] positions = new int[testTerms.length][];
- private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][];
+ private String[] testFields = {"f1", "f2", "f3", "f4"};
+ private boolean[] testFieldsStorePos = {true, false, true, false};
+ private boolean[] testFieldsStoreOff = {true, false, false, true};
+ private String[] testTerms = {"this", "is", "a", "test"};
+ private int[][] positions = new int[testTerms.length][];
+ private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
private RAMDirectory dir = new RAMDirectory();
private String seg = "testSegment";
private FieldInfos fieldInfos = new FieldInfos();
@@ -44,35 +47,37 @@ public class TestTermVectorsReader extends TestCase {
for (int i = 0; i < testFields.length; i++) {
fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
}
-
- for (int i = 0; i < testTerms.length; i++)
- {
+
+ for (int i = 0; i < testTerms.length; i++) {
positions[i] = new int[3];
for (int j = 0; j < positions[i].length; j++) {
// poditions are always sorted in increasing order
- positions[i][j] = (int)(j * 10 + Math.random() * 10);
+ positions[i][j] = (int) (j * 10 + Math.random() * 10);
}
offsets[i] = new TermVectorOffsetInfo[3];
- for (int j = 0; j < offsets[i].length; j++){
+ for (int j = 0; j < offsets[i].length; j++) {
// ofsets are alway sorted in increasing order
offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
- }
+ }
}
Arrays.sort(testTerms);
+ //Create 5 documents for testing, they all have the same terms
+ writer = new TermVectorsWriter(dir, seg, fieldInfos);
for (int j = 0; j < 5; j++) {
- writer = new TermVectorsWriter(dir, seg, fieldInfos);
+
writer.openDocument();
for (int k = 0; k < testFields.length; k++) {
writer.openField(testFields[k]);
for (int i = 0; i < testTerms.length; i++) {
- writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
+ writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
}
writer.closeField();
}
writer.closeDocument();
- writer.close();
+
}
+ writer.close();
}
protected void tearDown() {
@@ -80,34 +85,38 @@ public class TestTermVectorsReader extends TestCase {
}
public void test() {
- //Check to see the files were created properly in setup
- assertTrue(writer.isDocumentOpen() == false);
- assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
- assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
+ //Check to see the files were created properly in setup
+ assertTrue(writer.isDocumentOpen() == false);
+ assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
+ assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
}
-
+
public void testReader() throws IOException {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
- TermFreqVector vector = reader.get(0, testFields[0]);
- assertTrue(vector != null);
- String [] terms = vector.getTerms();
- assertTrue(terms != null);
- assertTrue(terms.length == testTerms.length);
- for (int i = 0; i < terms.length; i++) {
- String term = terms[i];
- //System.out.println("Term: " + term);
- assertTrue(term.equals(testTerms[i]));
+ for (int j = 0; j < 5; j++) {
+ TermFreqVector vector = reader.get(j, testFields[0]);
+ assertTrue(vector != null);
+ String[] terms = vector.getTerms();
+ assertTrue(terms != null);
+ assertTrue(terms.length == testTerms.length);
+ for (int i = 0; i < terms.length; i++) {
+ String term = terms[i];
+ //System.out.println("Term: " + term);
+ assertTrue(term.equals(testTerms[i]));
+ }
}
- }
-
+
+
+ }
+
public void testPositionReader() throws IOException {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
TermPositionVector vector;
- String [] terms;
- vector = (TermPositionVector)reader.get(0, testFields[0]);
- assertTrue(vector != null);
+ String[] terms;
+ vector = (TermPositionVector) reader.get(0, testFields[0]);
+ assertTrue(vector != null);
terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
@@ -115,14 +124,14 @@ public class TestTermVectorsReader extends TestCase {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
- int [] positions = vector.getTermPositions(i);
+ int[] positions = vector.getTermPositions(i);
assertTrue(positions != null);
assertTrue(positions.length == this.positions[i].length);
for (int j = 0; j < positions.length; j++) {
int position = positions[j];
assertTrue(position == this.positions[i][j]);
}
- TermVectorOffsetInfo [] offset = vector.getOffsets(i);
+ TermVectorOffsetInfo[] offset = vector.getOffsets(i);
assertTrue(offset != null);
assertTrue(offset.length == this.offsets[i].length);
for (int j = 0; j < offset.length; j++) {
@@ -130,9 +139,9 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
}
}
-
+
TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
- assertTrue(freqVector != null);
+ assertTrue(freqVector != null);
assertTrue(freqVector instanceof TermPositionVector == false);
terms = freqVector.getTerms();
assertTrue(terms != null);
@@ -140,30 +149,30 @@ public class TestTermVectorsReader extends TestCase {
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
- assertTrue(term.equals(testTerms[i]));
+ assertTrue(term.equals(testTerms[i]));
}
}
-
+
public void testOffsetReader() throws IOException {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
- TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]);
+ TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]);
assertTrue(vector != null);
- String [] terms = vector.getTerms();
+ String[] terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
- int [] positions = vector.getTermPositions(i);
+ int[] positions = vector.getTermPositions(i);
assertTrue(positions != null);
assertTrue(positions.length == this.positions[i].length);
for (int j = 0; j < positions.length; j++) {
int position = positions[j];
assertTrue(position == this.positions[i][j]);
}
- TermVectorOffsetInfo [] offset = vector.getOffsets(i);
+ TermVectorOffsetInfo[] offset = vector.getOffsets(i);
assertTrue(offset != null);
assertTrue(offset.length == this.offsets[i].length);
for (int j = 0; j < offset.length; j++) {
@@ -172,18 +181,112 @@ public class TestTermVectorsReader extends TestCase {
}
}
}
-
+
+ public void testMapper() throws IOException {
+ TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
+ assertTrue(reader != null);
+ SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+ reader.get(0, mapper);
+ SortedSet set = mapper.getTermVectorEntrySet();
+ assertTrue("set is null and it shouldn't be", set != null);
+    //four fields, 4 terms, all terms are the same
+ assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
+ //Check offsets and positions
+ for (Iterator iterator = set.iterator(); iterator.hasNext();) {
+ TermVectorEntry tve = (TermVectorEntry) iterator.next();
+ assertTrue("tve is null and it shouldn't be", tve != null);
+ assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
+ assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
+
+ }
+
+ mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+ reader.get(1, mapper);
+ set = mapper.getTermVectorEntrySet();
+ assertTrue("set is null and it shouldn't be", set != null);
+    //four fields, 4 terms, all terms are the same
+ assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
+ //Should have offsets and positions b/c we are munging all the fields together
+ for (Iterator iterator = set.iterator(); iterator.hasNext();) {
+ TermVectorEntry tve = (TermVectorEntry) iterator.next();
+ assertTrue("tve is null and it shouldn't be", tve != null);
+ assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
+ assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
+
+ }
+
+
+ FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+ reader.get(0, fsMapper);
+ Map map = fsMapper.getFieldToTerms();
+ assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
+ for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
+ Map.Entry entry = (Map.Entry) iterator.next();
+ SortedSet sortedSet = (SortedSet) entry.getValue();
+ assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
+ for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
+ TermVectorEntry tve = (TermVectorEntry) inner.next();
+ assertTrue("tve is null and it shouldn't be", tve != null);
+ //Check offsets and positions.
+ assertTrue("tve is null and it shouldn't be", tve != null);
+ String field = tve.getField();
+ if (field.equals(testFields[0])) {
+ //should have offsets
+
+ assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
+ assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
+ }
+ else if (field.equals(testFields[1])) {
+ //should not have offsets
+
+ assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
+ assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
+ }
+ }
+ }
+ //Try mapper that ignores offs and positions
+ fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
+ reader.get(0, fsMapper);
+ map = fsMapper.getFieldToTerms();
+ assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
+ for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
+ Map.Entry entry = (Map.Entry) iterator.next();
+ SortedSet sortedSet = (SortedSet) entry.getValue();
+ assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
+ for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
+ TermVectorEntry tve = (TermVectorEntry) inner.next();
+ assertTrue("tve is null and it shouldn't be", tve != null);
+ //Check offsets and positions.
+ assertTrue("tve is null and it shouldn't be", tve != null);
+ String field = tve.getField();
+ if (field.equals(testFields[0])) {
+          //offsets and positions should have been ignored by this mapper
+
+          assertTrue("tve.getOffsets() is not null and it should be", tve.getOffsets() == null);
+          assertTrue("tve.getPositions() is not null and it should be", tve.getPositions() == null);
+ }
+ else if (field.equals(testFields[1])) {
+ //should not have offsets
+
+ assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
+ assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
+ }
+ }
+ }
+
+ }
+
/**
* Make sure exceptions and bad params are handled appropriately
- */
+ */
public void testBadParams() {
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
//Bad document number, good field number
reader.get(50, testFields[0]);
- fail();
+ fail();
} catch (IOException e) {
// expected exception
}
@@ -192,7 +295,7 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(reader != null);
//Bad document number, no field
reader.get(50);
- fail();
+ fail();
} catch (IOException e) {
// expected exception
}
@@ -201,9 +304,9 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(reader != null);
//good document number, bad field number
TermFreqVector vector = reader.get(0, "f50");
- assertTrue(vector == null);
+ assertTrue(vector == null);
} catch (IOException e) {
fail();
}
- }
+ }
}
diff --git a/src/test/org/apache/lucene/search/TestTermVectors.java b/src/test/org/apache/lucene/search/TestTermVectors.java
index 67c260d1f77..c31671b8f53 100644
--- a/src/test/org/apache/lucene/search/TestTermVectors.java
+++ b/src/test/org/apache/lucene/search/TestTermVectors.java
@@ -28,7 +28,9 @@ import org.apache.lucene.util.English;
import java.io.IOException;
import java.util.HashMap;
+import java.util.Iterator;
import java.util.Map;
+import java.util.SortedSet;
public class TestTermVectors extends TestCase {
private IndexSearcher searcher;
@@ -171,7 +173,7 @@ public class TestTermVectors extends TestCase {
assertTrue(false);
}
}
-
+
public void testKnownSetOfDocuments() {
String test1 = "eating chocolate in a computer lab"; //6 terms
String test2 = "computer in a computer lab"; //5 terms
@@ -275,20 +277,45 @@ public class TestTermVectors extends TestCase {
Integer freqInt = (Integer)test4Map.get(term);
assertTrue(freqInt != null);
assertTrue(freqInt.intValue() == freq);
- }
+ }
+ SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+ knownSearcher.reader.getTermFreqVector(hits.id(1), mapper);
+ SortedSet vectorEntrySet = mapper.getTermVectorEntrySet();
+ assertTrue("mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10);
+ TermVectorEntry last = null;
+ for (Iterator iterator = vectorEntrySet.iterator(); iterator.hasNext();) {
+ TermVectorEntry tve = (TermVectorEntry) iterator.next();
+ if (tve != null && last != null)
+ {
+ assertTrue("terms are not properly sorted", last.getFrequency() >= tve.getFrequency());
+ Integer expectedFreq = (Integer) test4Map.get(tve.getTerm());
+ //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
+ assertTrue("Frequency is not correct:", tve.getFrequency() == 2*expectedFreq.intValue());
+ }
+ last = tve;
+
+ }
+
+ FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
+ knownSearcher.reader.getTermFreqVector(hits.id(1), fieldMapper);
+ Map map = fieldMapper.getFieldToTerms();
+ assertTrue("map Size: " + map.size() + " is not: " + 2, map.size() == 2);
+ vectorEntrySet = (SortedSet) map.get("field");
+ assertTrue("vectorEntrySet is null and it shouldn't be", vectorEntrySet != null);
+ assertTrue("vectorEntrySet Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10);
knownSearcher.close();
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
-
-
}
private void setupDoc(Document doc, String text)
{
doc.add(new Field("field", text, Field.Store.YES,
Field.Index.TOKENIZED, Field.TermVector.YES));
+ doc.add(new Field("field2", text, Field.Store.YES,
+ Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
//System.out.println("Document: " + doc);
}