diff --git a/CHANGES.txt b/CHANGES.txt index 36cce30e670..dfa0564508a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -54,6 +54,10 @@ New features 2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll) + 3. LUCENE-868: Added new Term Vector access features. New callback mechanism allows application to define how and where to read Term Vectors from disk. + This implementation contains several extensions of the new abstract TermVectorMapper class. The new API should be back-compatible. No changes in the + actual storage of Term Vectors has taken place. + Optimizations 1. LUCENE-937: CachingTokenFilter now uses an iterator to access the diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 11f90be8374..d141da761da 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -17,6 +17,16 @@ package org.apache.lucene.index.memory; * limitations under the License. 
*/ +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; @@ -30,22 +40,13 @@ import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.TermVectorMapper; import org.apache.lucene.search.HitCollector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Similarity; -import java.io.IOException; -import java.io.Serializable; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; - /** * High-performance single-document main memory Apache Lucene fulltext search index. 
* @@ -935,8 +936,47 @@ public class MemoryIndex { } return vectors; } - - public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) { + + public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException + { + if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors"); + + // if (vectors.length == 0) return null; + for (Iterator iterator = fields.keySet().iterator(); iterator.hasNext();) + { + String fieldName = (String) iterator.next(); + getTermFreqVector(docNumber, fieldName, mapper); + } + } + + public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException + { + if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector"); + final Info info = getInfo(field); + if (info == null){ + return; + } + info.sortTerms(); + mapper.setExpectations(field, info.sortedTerms.length, stride != 1, true); + for (int i = info.sortedTerms.length; --i >=0;){ + + ArrayIntList positions = (ArrayIntList) info.sortedTerms[i].getValue(); + int size = positions.size(); + org.apache.lucene.index.TermVectorOffsetInfo[] offsets = + new org.apache.lucene.index.TermVectorOffsetInfo[size / stride]; + + for (int k=0, j=1; j < size; k++, j += stride) { + int start = positions.get(j); + int end = positions.get(j+1); + offsets[k] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end); + } + mapper.map((String)info.sortedTerms[i].getKey(), + numPositions((ArrayIntList) info.sortedTerms[i].getValue()), + offsets, ((ArrayIntList) info.sortedTerms[i].getValue()).toArray(stride)); + } + } + + public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) { if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector"); final Info info = getInfo(fieldName); if (info == null) return null; // TODO: or return empty vector impl??? 
diff --git a/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java b/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java new file mode 100644 index 00000000000..7f54850a4d8 --- /dev/null +++ b/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java @@ -0,0 +1,70 @@ +package org.apache.lucene.index; + +import java.util.*; + +/** + * Copyright 2007 The Apache Software Foundation + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * For each Field, store a sorted collection of {@link TermVectorEntry}s + *

+ * This is not thread-safe. + */ +public class FieldSortedTermVectorMapper extends TermVectorMapper{ + private Map fieldToTerms = new HashMap(); + private SortedSet currentSet; + private String currentField; + private Comparator comparator; + + /** + * + * @param comparator A Comparator for sorting {@link TermVectorEntry}s + */ + public FieldSortedTermVectorMapper(Comparator comparator) { + this(false, false, comparator); + } + + + public FieldSortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) { + super(ignoringPositions, ignoringOffsets); + this.comparator = comparator; + } + + public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + TermVectorEntry entry = new TermVectorEntry(currentField, term, frequency, offsets, positions); + currentSet.add(entry); + } + + public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { + currentSet = new TreeSet(comparator); + currentField = field; + fieldToTerms.put(field, currentSet); + } + + /** + * Get the mapping between fields and terms, sorted by the comparator + * + * @return A map between field names and {@link java.util.SortedSet}s per field. 
SortedSet entries are {@link TermVectorEntry} + */ + public Map getFieldToTerms() { + return fieldToTerms; + } + + + public Comparator getComparator() { + return comparator; + } +} diff --git a/src/java/org/apache/lucene/index/FilterIndexReader.java b/src/java/org/apache/lucene/index/FilterIndexReader.java index 887e5da33b8..4b9b9d9a694 100644 --- a/src/java/org/apache/lucene/index/FilterIndexReader.java +++ b/src/java/org/apache/lucene/index/FilterIndexReader.java @@ -115,6 +115,18 @@ public class FilterIndexReader extends IndexReader { return in.getTermFreqVector(docNumber, field); } + + public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { + ensureOpen(); + in.getTermFreqVector(docNumber, field, mapper); + + } + + public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { + ensureOpen(); + in.getTermFreqVector(docNumber, mapper); + } + public int numDocs() { // Don't call ensureOpen() here (it could affect performance) return in.numDocs(); diff --git a/src/java/org/apache/lucene/index/IndexReader.java b/src/java/org/apache/lucene/index/IndexReader.java index 30315a10933..84d6fd9cacb 100644 --- a/src/java/org/apache/lucene/index/IndexReader.java +++ b/src/java/org/apache/lucene/index/IndexReader.java @@ -20,12 +20,7 @@ package org.apache.lucene.index; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.Similarity; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.Lock; -import org.apache.lucene.store.LockObtainFailedException; -import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.*; import java.io.File; import java.io.FileOutputStream; @@ -385,6 +380,25 @@ public abstract class IndexReader { abstract public TermFreqVector getTermFreqVector(int docNumber, String 
field) throws IOException; + /** + * Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of + * the {@link TermFreqVector}. + * @param docNumber The number of the document to load the vector for + * @param field The name of the field to load + * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null + * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified. + * + */ + abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException; + + /** + * Map all the term vectors for all fields in a Document + * @param docNumber The number of the document to load the vector for + * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null + * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified. + */ + abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException; + /** * Returns true if an index exists at the specified directory. * If the directory does not exist or if there is no index in it. 
diff --git a/src/java/org/apache/lucene/index/MultiReader.java b/src/java/org/apache/lucene/index/MultiReader.java index ef305443c54..3e1d8009c23 100644 --- a/src/java/org/apache/lucene/index/MultiReader.java +++ b/src/java/org/apache/lucene/index/MultiReader.java @@ -85,6 +85,19 @@ public class MultiReader extends IndexReader { return subReaders[i].getTermFreqVector(n - starts[i], field); } + + public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { + ensureOpen(); + int i = readerIndex(docNumber); // find segment num + subReaders[i].getTermFreqVector(docNumber - starts[i], field, mapper); + } + + public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { + ensureOpen(); + int i = readerIndex(docNumber); // find segment num + subReaders[i].getTermFreqVector(docNumber - starts[i], mapper); + } + public synchronized int numDocs() { // Don't call ensureOpen() here (it could affect performance) if (numDocs == -1) { // check cache diff --git a/src/java/org/apache/lucene/index/ParallelReader.java b/src/java/org/apache/lucene/index/ParallelReader.java index 4b68ca81eb3..ef424e20ba6 100644 --- a/src/java/org/apache/lucene/index/ParallelReader.java +++ b/src/java/org/apache/lucene/index/ParallelReader.java @@ -194,6 +194,29 @@ public class ParallelReader extends IndexReader { return reader==null ? 
null : reader.getTermFreqVector(n, field); } + + public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { + ensureOpen(); + IndexReader reader = ((IndexReader)fieldToReader.get(field)); + if (reader != null) { + reader.getTermFreqVector(docNumber, field, mapper); + } + } + + public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { + ensureOpen(); + ensureOpen(); + + Iterator i = fieldToReader.entrySet().iterator(); + while (i.hasNext()) { + Map.Entry e = (Map.Entry)i.next(); + String field = (String)e.getKey(); + IndexReader reader = (IndexReader)e.getValue(); + reader.getTermFreqVector(docNumber, field, mapper); + } + + } + public boolean hasNorms(String field) throws IOException { ensureOpen(); IndexReader reader = ((IndexReader)fieldToReader.get(field)); diff --git a/src/java/org/apache/lucene/index/SegmentReader.java b/src/java/org/apache/lucene/index/SegmentReader.java index 2ac08861e5b..4f8d5488211 100644 --- a/src/java/org/apache/lucene/index/SegmentReader.java +++ b/src/java/org/apache/lucene/index/SegmentReader.java @@ -20,10 +20,10 @@ package org.apache.lucene.index; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.util.BitVector; import java.io.IOException; @@ -643,6 +643,35 @@ class SegmentReader extends IndexReader { } + public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { + ensureOpen(); + FieldInfo fi = fieldInfos.fieldInfo(field); + if (fi == null || !fi.storeTermVector || termVectorsReaderOrig == null) + throw new IOException("field does not contain term vectors"); + + 
TermVectorsReader termVectorsReader = getTermVectorsReader(); + if (termVectorsReader == null) + { + throw new IOException("Cannot open a reader for the term vectors"); + } + + + termVectorsReader.get(docNumber, field, mapper); + } + + + public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { + ensureOpen(); + if (termVectorsReaderOrig == null) + return; + + TermVectorsReader termVectorsReader = getTermVectorsReader(); + if (termVectorsReader == null) + return; + + termVectorsReader.get(docNumber, mapper); + } + /** Return an array of term frequency vectors for the specified document. * The array contains a vector for each vectorized field in the document. * Each vector vector contains term numbers and frequencies for all terms diff --git a/src/java/org/apache/lucene/index/SortedTermVectorMapper.java b/src/java/org/apache/lucene/index/SortedTermVectorMapper.java new file mode 100644 index 00000000000..61ced004ef7 --- /dev/null +++ b/src/java/org/apache/lucene/index/SortedTermVectorMapper.java @@ -0,0 +1,129 @@ +package org.apache.lucene.index; +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.*; + +/** + * Store a sorted collection of {@link org.apache.lucene.index.TermVectorEntry}s. Collects all term information + * into a single, SortedSet. + *
+ * NOTE: This Mapper ignores all Field information for the Document. This means that if you are using offset/positions you will not + * know what Fields they correlate with. + *
+ * This is not thread-safe + */ +public class SortedTermVectorMapper extends TermVectorMapper{ + + + private SortedSet currentSet; + private Map termToTVE = new HashMap(); + private boolean storeOffsets; + private boolean storePositions; + /** + * Stand-in name for the field in {@link TermVectorEntry}. + */ + public static final String ALL = "_ALL_"; + + /** + * + * @param comparator A Comparator for sorting {@link TermVectorEntry}s + */ + public SortedTermVectorMapper(Comparator comparator) { + this(false, false, comparator); + } + + + public SortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) { + super(ignoringPositions, ignoringOffsets); + currentSet = new TreeSet(comparator); + } + + /** + * + * @param term The term to map + * @param frequency The frequency of the term + * @param offsets Offset information, may be null + * @param positions Position information, may be null + */ + //We need to combine any previous mentions of the term + public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + TermVectorEntry entry = (TermVectorEntry) termToTVE.get(term); + if (entry == null) { + entry = new TermVectorEntry(ALL, term, frequency, + storeOffsets == true ? offsets : null, + storePositions == true ? positions : null); + termToTVE.put(term, entry); + currentSet.add(entry); + } else { + entry.setFrequency(entry.getFrequency() + frequency); + if (storeOffsets) + { + TermVectorOffsetInfo [] existingOffsets = entry.getOffsets(); + //A few diff. 
cases here: offsets is null, existing offsets is null, both are null, same for positions + if (existingOffsets != null && offsets != null && offsets.length > 0) + { + //copy over the existing offsets + TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[existingOffsets.length + offsets.length]; + System.arraycopy(existingOffsets, 0, newOffsets, 0, existingOffsets.length); + System.arraycopy(offsets, 0, newOffsets, existingOffsets.length, offsets.length); + entry.setOffsets(newOffsets); + } + else if (existingOffsets == null && offsets != null && offsets.length > 0) + { + entry.setOffsets(offsets); + } + //else leave it alone + } + if (storePositions) + { + int [] existingPositions = entry.getPositions(); + if (existingPositions != null && positions != null && positions.length > 0) + { + int [] newPositions = new int[existingPositions.length + positions.length]; + System.arraycopy(existingPositions, 0, newPositions, 0, existingPositions.length); + System.arraycopy(positions, 0, newPositions, existingPositions.length, positions.length); + entry.setPositions(newPositions); + } + else if (existingPositions == null && positions != null && positions.length > 0) + { + entry.setPositions(positions); + } + } + } + + + } + + public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { + + this.storeOffsets = storeOffsets; + this.storePositions = storePositions; + } + + /** + * The TermVectorEntrySet. A SortedSet of {@link TermVectorEntry} objects. Sort is by the comparator passed into the constructor. + *
+ * This set will be empty until after the mapping process takes place. + * + * @return The SortedSet of {@link TermVectorEntry}. + */ + public SortedSet getTermVectorEntrySet() + { + return currentSet; + } + +} diff --git a/src/java/org/apache/lucene/index/TermVectorEntry.java b/src/java/org/apache/lucene/index/TermVectorEntry.java new file mode 100644 index 00000000000..ee66c6f1a2b --- /dev/null +++ b/src/java/org/apache/lucene/index/TermVectorEntry.java @@ -0,0 +1,98 @@ +package org.apache.lucene.index; + +/** + * Copyright 2007 The Apache Software Foundation + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Convenience class for holding TermVector information. + */ +public class TermVectorEntry { + private String field; + private String term; + private int frequency; + private TermVectorOffsetInfo [] offsets; + int [] positions; + + + public TermVectorEntry() { + } + + public TermVectorEntry(String field, String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + this.field = field; + this.term = term; + this.frequency = frequency; + this.offsets = offsets; + this.positions = positions; + } + + + public String getField() { + return field; + } + + public int getFrequency() { + return frequency; + } + + public TermVectorOffsetInfo[] getOffsets() { + return offsets; + } + + public int[] getPositions() { + return positions; + } + + public String getTerm() { + return term; + } + + //Keep package local + void setFrequency(int frequency) { + this.frequency = frequency; + } + + void setOffsets(TermVectorOffsetInfo[] offsets) { + this.offsets = offsets; + } + + void setPositions(int[] positions) { + this.positions = positions; + } + + + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + TermVectorEntry that = (TermVectorEntry) o; + + if (term != null ? !term.equals(that.term) : that.term != null) return false; + + return true; + } + + public int hashCode() { + return (term != null ? 
term.hashCode() : 0); + } + + public String toString() { + return "TermVectorEntry{" + + "field='" + field + '\'' + + ", term='" + term + '\'' + + ", frequency=" + frequency + + '}'; + } +} diff --git a/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java b/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java new file mode 100644 index 00000000000..2f2807162b8 --- /dev/null +++ b/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java @@ -0,0 +1,42 @@ +package org.apache.lucene.index; +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +import java.util.Comparator; + +/** + * Compares {@link org.apache.lucene.index.TermVectorEntry}s first by frequency and then by + * the term (case-sensitive) + * + **/ +public class TermVectorEntryFreqSortedComparator implements Comparator { + public int compare(Object object, Object object1) { + int result = 0; + TermVectorEntry entry = (TermVectorEntry) object; + TermVectorEntry entry1 = (TermVectorEntry) object1; + result = entry1.getFrequency() - entry.getFrequency(); + if (result == 0) + { + result = entry.getTerm().compareTo(entry1.getTerm()); + if (result == 0) + { + result = entry.getField().compareTo(entry1.getField()); + } + } + return result; + } +} diff --git a/src/java/org/apache/lucene/index/TermVectorMapper.java b/src/java/org/apache/lucene/index/TermVectorMapper.java new file mode 100644 index 00000000000..53f284bbc55 --- /dev/null +++ b/src/java/org/apache/lucene/index/TermVectorMapper.java @@ -0,0 +1,88 @@ +package org.apache.lucene.index; +/** + * Copyright 2007 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * The TermVectorMapper can be used to map Term Vectors into your own + * structure instead of the parallel array structure used by + * {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. + *

+ * It is up to the implementation to make sure it is thread-safe. + * + * + **/ +public abstract class TermVectorMapper { + + private boolean ignoringPositions; + private boolean ignoringOffsets; + + + protected TermVectorMapper() { + } + + /** + * + * @param ignoringPositions true if this mapper should tell Lucene to ignore positions even if they are stored + * @param ignoringOffsets similar to ignoringPositions + */ + protected TermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets) { + this.ignoringPositions = ignoringPositions; + this.ignoringOffsets = ignoringOffsets; + } + + /** + * Tell the mapper what to expect in regards to field, number of terms, offset and position storage. + * This method will be called once before retrieving the vector for a field. + * + * This method will be called before {@link #map(String,int,TermVectorOffsetInfo[],int[])}. + * @param field The field the vector is for + * @param numTerms The number of terms that need to be mapped + * @param storeOffsets true if the mapper should expect offset information + * @param storePositions true if the mapper should expect positions info + */ + public abstract void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions); + /** + * Map the Term Vector information into your own structure + * @param term The term to add to the vector + * @param frequency The frequency of the term in the document + * @param offsets null if the offset is not specified, otherwise the offset into the field of the term + * @param positions null if the position is not specified, otherwise the position in the field of the term + */ + public abstract void map(String term, int frequency, TermVectorOffsetInfo [] offsets, int [] positions); + + /** + * Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they + * can be skipped over. Derived classes should set this to true if they want to ignore positions. 
The default + * is false, meaning positions will be loaded if they are stored. + * @return false + */ + public boolean isIgnoringPositions() + { + return ignoringPositions; + } + + /** + * + * @see #isIgnoringPositions() Same principle as {@link #isIgnoringPositions()}, but applied to offsets. false by default. + * @return false + */ + public boolean isIgnoringOffsets() + { + return ignoringOffsets; + } + +} diff --git a/src/java/org/apache/lucene/index/TermVectorsReader.java b/src/java/org/apache/lucene/index/TermVectorsReader.java index a03a729802b..45e9f5a294e 100644 --- a/src/java/org/apache/lucene/index/TermVectorsReader.java +++ b/src/java/org/apache/lucene/index/TermVectorsReader.java @@ -17,9 +17,9 @@ package org.apache.lucene.index; * limitations under the License. */ +import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.BufferedIndexInput; import java.io.IOException; @@ -104,18 +104,9 @@ class TermVectorsReader implements Cloneable { return size; } - /** - * Retrieve the term vector for the given document and field - * @param docNum The document number to retrieve the vector for - * @param field The field within the document to retrieve - * @return The TermFreqVector for the document and field or null if there is no termVector for this field. 
- * @throws IOException if there is an error reading the term vector files - */ - TermFreqVector get(int docNum, String field) throws IOException { - // Check if no term vectors are available for this segment at all - int fieldNumber = fieldInfos.fieldNumber(field); - TermFreqVector result = null; + public void get(int docNum, String field, TermVectorMapper mapper) throws IOException { if (tvx != null) { + int fieldNumber = fieldInfos.fieldNumber(field); //We need to account for the FORMAT_SIZE at when seeking in the tvx //We don't need to do this in other seeks because we already have the // file pointer @@ -137,7 +128,7 @@ class TermVectorsReader implements Cloneable { number = tvd.readVInt(); else number += tvd.readVInt(); - + if (number == fieldNumber) found = i; } @@ -150,14 +141,30 @@ class TermVectorsReader implements Cloneable { for (int i = 0; i <= found; i++) position += tvd.readVLong(); - result = readTermVector(field, position); + readTermVector(field, position, mapper); } else { //System.out.println("Fieldable not found"); } } else { //System.out.println("No tvx file"); } - return result; + } + + + + /** + * Retrieve the term vector for the given document and field + * @param docNum The document number to retrieve the vector for + * @param field The field within the document to retrieve + * @return The TermFreqVector for the document and field or null if there is no termVector for this field. 
+ * @throws IOException if there is an error reading the term vector files + */ + TermFreqVector get(int docNum, String field) throws IOException { + // Check if no term vectors are available for this segment at all + ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper(); + get(docNum, field, mapper); + + return mapper.materializeVector(); } /** @@ -169,7 +176,6 @@ class TermVectorsReader implements Cloneable { */ TermFreqVector[] get(int docNum) throws IOException { TermFreqVector[] result = null; - // Check if no term vectors are available for this segment at all if (tvx != null) { //We need to offset by tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE); @@ -182,7 +188,7 @@ class TermVectorsReader implements Cloneable { if (fieldCount != 0) { int number = 0; String[] fields = new String[fieldCount]; - + for (int i = 0; i < fieldCount; i++) { if(tvdFormat == TermVectorsWriter.FORMAT_VERSION) number = tvd.readVInt(); @@ -208,24 +214,76 @@ class TermVectorsReader implements Cloneable { return result; } + public void get(int docNumber, TermVectorMapper mapper) throws IOException { + // Check if no term vectors are available for this segment at all + if (tvx != null) { + //We need to offset by + tvx.seek((docNumber * 8L) + TermVectorsWriter.FORMAT_SIZE); + long position = tvx.readLong(); + + tvd.seek(position); + int fieldCount = tvd.readVInt(); + + // No fields are vectorized for this document + if (fieldCount != 0) { + int number = 0; + String[] fields = new String[fieldCount]; + + for (int i = 0; i < fieldCount; i++) { + if(tvdFormat == TermVectorsWriter.FORMAT_VERSION) + number = tvd.readVInt(); + else + number += tvd.readVInt(); + + fields[i] = fieldInfos.fieldName(number); + } + + // Compute position in the tvf file + position = 0; + long[] tvfPointers = new long[fieldCount]; + for (int i = 0; i < fieldCount; i++) { + position += tvd.readVLong(); + tvfPointers[i] = position; + } + + readTermVectors(fields, 
tvfPointers, mapper); + } + } else { + //System.out.println("No tvx file"); + } + } + private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[]) throws IOException { SegmentTermVector res[] = new SegmentTermVector[fields.length]; for (int i = 0; i < fields.length; i++) { - res[i] = readTermVector(fields[i], tvfPointers[i]); + ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper(); + readTermVector(fields[i], tvfPointers[i], mapper); + res[i] = (SegmentTermVector) mapper.materializeVector(); } return res; } + private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper) + throws IOException { + for (int i = 0; i < fields.length; i++) { + readTermVector(fields[i], tvfPointers[i], mapper); + } + + } + + /** * * @param field The field to read in * @param tvfPointer The pointer within the tvf file where we should start reading + * @param mapper The mapper used to map the TermVector * @return The TermVector located at that position * @throws IOException + */ - private SegmentTermVector readTermVector(String field, long tvfPointer) + private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper) throws IOException { // Now read the data from specified position @@ -236,7 +294,7 @@ class TermVectorsReader implements Cloneable { //System.out.println("Num Terms: " + numTerms); // If no terms - return a constant empty termvector. However, this should never occur! 
if (numTerms == 0) - return new SegmentTermVector(field, null, null); + return; boolean storePositions; boolean storeOffsets; @@ -251,18 +309,7 @@ class TermVectorsReader implements Cloneable { storePositions = false; storeOffsets = false; } - - String terms[] = new String[numTerms]; - int termFreqs[] = new int[numTerms]; - - // we may not need these, but declare them - int positions[][] = null; - TermVectorOffsetInfo offsets[][] = null; - if(storePositions) - positions = new int[numTerms][]; - if(storeOffsets) - offsets = new TermVectorOffsetInfo[numTerms][]; - + mapper.setExpectations(field, numTerms, storeOffsets, storePositions); int start = 0; int deltaLength = 0; int totalLength = 0; @@ -282,45 +329,54 @@ class TermVectorsReader implements Cloneable { } tvf.readChars(buffer, start, deltaLength); - terms[i] = new String(buffer, 0, totalLength); + String term = new String(buffer, 0, totalLength); previousBuffer = buffer; int freq = tvf.readVInt(); - termFreqs[i] = freq; - + int [] positions = null; if (storePositions) { //read in the positions - int [] pos = new int[freq]; - positions[i] = pos; - int prevPosition = 0; - for (int j = 0; j < freq; j++) - { - pos[j] = prevPosition + tvf.readVInt(); - prevPosition = pos[j]; + //does the mapper even care about positions? + if (mapper.isIgnoringPositions() == false) { + positions = new int[freq]; + int prevPosition = 0; + for (int j = 0; j < freq; j++) + { + positions[j] = prevPosition + tvf.readVInt(); + prevPosition = positions[j]; + } + } else { + //we need to skip over the positions. 
Since these are VInts, I don't believe there is anyway to know for sure how far to skip + // + for (int j = 0; j < freq; j++) + { + tvf.readVInt(); + } } } - + TermVectorOffsetInfo[] offsets = null; if (storeOffsets) { - TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq]; - offsets[i] = offs; - int prevOffset = 0; - for (int j = 0; j < freq; j++) { - int startOffset = prevOffset + tvf.readVInt(); - int endOffset = startOffset + tvf.readVInt(); - offs[j] = new TermVectorOffsetInfo(startOffset, endOffset); - prevOffset = endOffset; + //does the mapper even care about offsets? + if (mapper.isIgnoringOffsets() == false) { + offsets = new TermVectorOffsetInfo[freq]; + int prevOffset = 0; + for (int j = 0; j < freq; j++) { + int startOffset = prevOffset + tvf.readVInt(); + int endOffset = startOffset + tvf.readVInt(); + offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset); + prevOffset = endOffset; + } + } else { + for (int j = 0; j < freq; j++){ + tvf.readVInt(); + tvf.readVInt(); + } } } + mapper.map(term, freq, offsets, positions); } - - SegmentTermVector tv; - if (storePositions || storeOffsets){ - tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets); - } - else { - tv = new SegmentTermVector(field, terms, termFreqs); - } - return tv; } + + protected Object clone() { if (tvx == null || tvd == null || tvf == null) @@ -337,4 +393,67 @@ class TermVectorsReader implements Cloneable { return clone; } + + + } + +/** + * Models the existing parallel array structure + */ +class ParallelArrayTermVectorMapper extends TermVectorMapper +{ + + private int numTerms; + private String[] terms; + private int[] termFreqs; + private int positions[][] = null; + private TermVectorOffsetInfo offsets[][] = null; + private int currentPosition; + private boolean storingOffsets; + private boolean storingPositions; + private String field; + + public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { + 
this.numTerms = numTerms; + this.field = field; + terms = new String[numTerms]; + termFreqs = new int[numTerms]; + this.storingOffsets = storeOffsets; + this.storingPositions = storePositions; + if(storePositions) + this.positions = new int[numTerms][]; + if(storeOffsets) + this.offsets = new TermVectorOffsetInfo[numTerms][]; + } + + public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { + terms[currentPosition] = term; + termFreqs[currentPosition] = frequency; + if (storingOffsets) + { + this.offsets[currentPosition] = offsets; + } + if (storingPositions) + { + this.positions[currentPosition] = positions; + } + currentPosition++; + } + + /** + * Construct the vector + * @return + */ + public TermFreqVector materializeVector() { + SegmentTermVector tv = null; + if (field != null && terms != null) { + if (storingPositions || storingOffsets) { + tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets); + } else { + tv = new SegmentTermVector(field, terms, termFreqs); + } + } + return tv; + } +} \ No newline at end of file diff --git a/src/test/org/apache/lucene/index/TestIndexReader.java b/src/test/org/apache/lucene/index/TestIndexReader.java index a3df3a2c89c..acd288a839e 100644 --- a/src/test/org/apache/lucene/index/TestIndexReader.java +++ b/src/test/org/apache/lucene/index/TestIndexReader.java @@ -21,29 +21,20 @@ package org.apache.lucene.index; import junit.framework.TestCase; import junit.framework.TestSuite; import junit.textui.TestRunner; - -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.LockObtainFailedException; -import org.apache.lucene.store.AlreadyClosedException; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; import 
org.apache.lucene.document.Document; import org.apache.lucene.document.Field; - -import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.*; import org.apache.lucene.util._TestUtil; -import java.util.Collection; -import java.util.Arrays; -import java.io.IOException; -import java.io.FileNotFoundException; import java.io.File; - -import org.apache.lucene.store.MockRAMDirectory; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; public class TestIndexReader extends TestCase { @@ -180,8 +171,43 @@ public class TestIndexReader extends TestCase d.close(); } + public void testTermVectors() throws Exception { + RAMDirectory d = new MockRAMDirectory(); + // set up writer + IndexWriter writer = new IndexWriter(d, new StandardAnalyzer(), true); + // want to get some more segments here + // new termvector fields + for (int i = 0; i < 5 * writer.getMergeFactor(); i++) { + Document doc = new Document(); + doc.add(new Field("tvnot","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO)); + doc.add(new Field("termvector","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES)); + doc.add(new Field("tvoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS)); + doc.add(new Field("tvposition","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS)); + doc.add(new Field("tvpositionoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); - private void assertTermDocsCount(String msg, + writer.addDocument(doc); + } + writer.close(); + IndexReader reader = IndexReader.open(d); + FieldSortedTermVectorMapper mapper = new FieldSortedTermVectorMapper(new 
TermVectorEntryFreqSortedComparator()); + reader.getTermFreqVector(0, mapper); + Map map = mapper.getFieldToTerms(); + assertTrue("map is null and it shouldn't be", map != null); + assertTrue("map Size: " + map.size() + " is not: " + 4, map.size() == 4); + Set set = (Set) map.get("termvector"); + for (Iterator iterator = set.iterator(); iterator.hasNext();) { + TermVectorEntry entry = (TermVectorEntry) iterator.next(); + assertTrue("entry is null and it shouldn't be", entry != null); + System.out.println("Entry: " + entry); + } + + + + + + } + + private void assertTermDocsCount(String msg, IndexReader reader, Term term, int expected) diff --git a/src/test/org/apache/lucene/index/TestTermVectorsReader.java b/src/test/org/apache/lucene/index/TestTermVectorsReader.java index 6aa9dd64288..4827f548313 100644 --- a/src/test/org/apache/lucene/index/TestTermVectorsReader.java +++ b/src/test/org/apache/lucene/index/TestTermVectorsReader.java @@ -22,16 +22,19 @@ import org.apache.lucene.store.RAMDirectory; import java.io.IOException; import java.util.Arrays; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedSet; public class TestTermVectorsReader extends TestCase { private TermVectorsWriter writer = null; //Must be lexicographically sorted, will do in setup, versus trying to maintain here - private String [] testFields = {"f1", "f2", "f3"}; - private boolean [] testFieldsStorePos = {true, false, true, false}; - private boolean [] testFieldsStoreOff = {true, false, false, true}; - private String [] testTerms = {"this", "is", "a", "test"}; - private int [][] positions = new int[testTerms.length][]; - private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][]; + private String[] testFields = {"f1", "f2", "f3", "f4"}; + private boolean[] testFieldsStorePos = {true, false, true, false}; + private boolean[] testFieldsStoreOff = {true, false, false, true}; + private String[] testTerms = {"this", "is", "a", "test"}; + private 
int[][] positions = new int[testTerms.length][]; + private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][]; private RAMDirectory dir = new RAMDirectory(); private String seg = "testSegment"; private FieldInfos fieldInfos = new FieldInfos(); @@ -44,35 +47,37 @@ public class TestTermVectorsReader extends TestCase { for (int i = 0; i < testFields.length; i++) { fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]); } - - for (int i = 0; i < testTerms.length; i++) - { + + for (int i = 0; i < testTerms.length; i++) { positions[i] = new int[3]; for (int j = 0; j < positions[i].length; j++) { // poditions are always sorted in increasing order - positions[i][j] = (int)(j * 10 + Math.random() * 10); + positions[i][j] = (int) (j * 10 + Math.random() * 10); } offsets[i] = new TermVectorOffsetInfo[3]; - for (int j = 0; j < offsets[i].length; j++){ + for (int j = 0; j < offsets[i].length; j++) { // ofsets are alway sorted in increasing order offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length()); - } + } } Arrays.sort(testTerms); + //Create 5 documents for testing, they all have the same terms + writer = new TermVectorsWriter(dir, seg, fieldInfos); for (int j = 0; j < 5; j++) { - writer = new TermVectorsWriter(dir, seg, fieldInfos); + writer.openDocument(); for (int k = 0; k < testFields.length; k++) { writer.openField(testFields[k]); for (int i = 0; i < testTerms.length; i++) { - writer.addTerm(testTerms[i], 3, positions[i], offsets[i]); + writer.addTerm(testTerms[i], 3, positions[i], offsets[i]); } writer.closeField(); } writer.closeDocument(); - writer.close(); + } + writer.close(); } protected void tearDown() { @@ -80,34 +85,38 @@ public class TestTermVectorsReader extends TestCase { } public void test() { - //Check to see the files were created properly in setup - assertTrue(writer.isDocumentOpen() == false); - assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION)); - 
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION)); + //Check to see the files were created properly in setup + assertTrue(writer.isDocumentOpen() == false); + assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION)); + assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION)); } - + public void testReader() throws IOException { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); assertTrue(reader != null); - TermFreqVector vector = reader.get(0, testFields[0]); - assertTrue(vector != null); - String [] terms = vector.getTerms(); - assertTrue(terms != null); - assertTrue(terms.length == testTerms.length); - for (int i = 0; i < terms.length; i++) { - String term = terms[i]; - //System.out.println("Term: " + term); - assertTrue(term.equals(testTerms[i])); + for (int j = 0; j < 5; j++) { + TermFreqVector vector = reader.get(j, testFields[0]); + assertTrue(vector != null); + String[] terms = vector.getTerms(); + assertTrue(terms != null); + assertTrue(terms.length == testTerms.length); + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + //System.out.println("Term: " + term); + assertTrue(term.equals(testTerms[i])); + } } - } - + + + } + public void testPositionReader() throws IOException { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); assertTrue(reader != null); TermPositionVector vector; - String [] terms; - vector = (TermPositionVector)reader.get(0, testFields[0]); - assertTrue(vector != null); + String[] terms; + vector = (TermPositionVector) reader.get(0, testFields[0]); + assertTrue(vector != null); terms = vector.getTerms(); assertTrue(terms != null); assertTrue(terms.length == testTerms.length); @@ -115,14 +124,14 @@ public class TestTermVectorsReader extends TestCase { String term = terms[i]; //System.out.println("Term: " + term); assertTrue(term.equals(testTerms[i])); - int [] positions = vector.getTermPositions(i); + int[] positions = vector.getTermPositions(i); 
assertTrue(positions != null); assertTrue(positions.length == this.positions[i].length); for (int j = 0; j < positions.length; j++) { int position = positions[j]; assertTrue(position == this.positions[i][j]); } - TermVectorOffsetInfo [] offset = vector.getOffsets(i); + TermVectorOffsetInfo[] offset = vector.getOffsets(i); assertTrue(offset != null); assertTrue(offset.length == this.offsets[i].length); for (int j = 0; j < offset.length; j++) { @@ -130,9 +139,9 @@ public class TestTermVectorsReader extends TestCase { assertTrue(termVectorOffsetInfo.equals(offsets[i][j])); } } - + TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset - assertTrue(freqVector != null); + assertTrue(freqVector != null); assertTrue(freqVector instanceof TermPositionVector == false); terms = freqVector.getTerms(); assertTrue(terms != null); @@ -140,30 +149,30 @@ public class TestTermVectorsReader extends TestCase { for (int i = 0; i < terms.length; i++) { String term = terms[i]; //System.out.println("Term: " + term); - assertTrue(term.equals(testTerms[i])); + assertTrue(term.equals(testTerms[i])); } } - + public void testOffsetReader() throws IOException { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); assertTrue(reader != null); - TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]); + TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]); assertTrue(vector != null); - String [] terms = vector.getTerms(); + String[] terms = vector.getTerms(); assertTrue(terms != null); assertTrue(terms.length == testTerms.length); for (int i = 0; i < terms.length; i++) { String term = terms[i]; //System.out.println("Term: " + term); assertTrue(term.equals(testTerms[i])); - int [] positions = vector.getTermPositions(i); + int[] positions = vector.getTermPositions(i); assertTrue(positions != null); assertTrue(positions.length == this.positions[i].length); for (int j = 0; j < positions.length; j++) { int position 
= positions[j]; assertTrue(position == this.positions[i][j]); } - TermVectorOffsetInfo [] offset = vector.getOffsets(i); + TermVectorOffsetInfo[] offset = vector.getOffsets(i); assertTrue(offset != null); assertTrue(offset.length == this.offsets[i].length); for (int j = 0; j < offset.length; j++) { @@ -172,18 +181,112 @@ public class TestTermVectorsReader extends TestCase { } } } - + + public void testMapper() throws IOException { + TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); + assertTrue(reader != null); + SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); + reader.get(0, mapper); + SortedSet set = mapper.getTermVectorEntrySet(); + assertTrue("set is null and it shouldn't be", set != null); + //three fields, 4 terms, all terms are the same + assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4); + //Check offsets and positions + for (Iterator iterator = set.iterator(); iterator.hasNext();) { + TermVectorEntry tve = (TermVectorEntry) iterator.next(); + assertTrue("tve is null and it shouldn't be", tve != null); + assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null); + assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null); + + } + + mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); + reader.get(1, mapper); + set = mapper.getTermVectorEntrySet(); + assertTrue("set is null and it shouldn't be", set != null); + //three fields, 4 terms, all terms are the same + assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4); + //Should have offsets and positions b/c we are munging all the fields together + for (Iterator iterator = set.iterator(); iterator.hasNext();) { + TermVectorEntry tve = (TermVectorEntry) iterator.next(); + assertTrue("tve is null and it shouldn't be", tve != null); + assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != 
null); + assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null); + + } + + + FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); + reader.get(0, fsMapper); + Map map = fsMapper.getFieldToTerms(); + assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length); + for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) { + Map.Entry entry = (Map.Entry) iterator.next(); + SortedSet sortedSet = (SortedSet) entry.getValue(); + assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4); + for (Iterator inner = sortedSet.iterator(); inner.hasNext();) { + TermVectorEntry tve = (TermVectorEntry) inner.next(); + assertTrue("tve is null and it shouldn't be", tve != null); + //Check offsets and positions. + assertTrue("tve is null and it shouldn't be", tve != null); + String field = tve.getField(); + if (field.equals(testFields[0])) { + //should have offsets + + assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null); + assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null); + } + else if (field.equals(testFields[1])) { + //should not have offsets + + assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null); + assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null); + } + } + } + //Try mapper that ignores offs and positions + fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator()); + reader.get(0, fsMapper); + map = fsMapper.getFieldToTerms(); + assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length); + for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) { + Map.Entry entry = (Map.Entry) iterator.next(); + SortedSet sortedSet = (SortedSet) 
entry.getValue(); + assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4); + for (Iterator inner = sortedSet.iterator(); inner.hasNext();) { + TermVectorEntry tve = (TermVectorEntry) inner.next(); + assertTrue("tve is null and it shouldn't be", tve != null); + //Check offsets and positions. + assertTrue("tve is null and it shouldn't be", tve != null); + String field = tve.getField(); + if (field.equals(testFields[0])) { + //should have offsets + + assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() == null); + assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() == null); + } + else if (field.equals(testFields[1])) { + //should not have offsets + + assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null); + assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null); + } + } + } + + } + /** * Make sure exceptions and bad params are handled appropriately - */ + */ public void testBadParams() { try { TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos); assertTrue(reader != null); //Bad document number, good field number reader.get(50, testFields[0]); - fail(); + fail(); } catch (IOException e) { // expected exception } @@ -192,7 +295,7 @@ public class TestTermVectorsReader extends TestCase { assertTrue(reader != null); //Bad document number, no field reader.get(50); - fail(); + fail(); } catch (IOException e) { // expected exception } @@ -201,9 +304,9 @@ public class TestTermVectorsReader extends TestCase { assertTrue(reader != null); //good document number, bad field number TermFreqVector vector = reader.get(0, "f50"); - assertTrue(vector == null); + assertTrue(vector == null); } catch (IOException e) { fail(); } - } + } } diff --git a/src/test/org/apache/lucene/search/TestTermVectors.java b/src/test/org/apache/lucene/search/TestTermVectors.java index 67c260d1f77..c31671b8f53 100644 --- 
a/src/test/org/apache/lucene/search/TestTermVectors.java +++ b/src/test/org/apache/lucene/search/TestTermVectors.java @@ -28,7 +28,9 @@ import org.apache.lucene.util.English; import java.io.IOException; import java.util.HashMap; +import java.util.Iterator; import java.util.Map; +import java.util.SortedSet; public class TestTermVectors extends TestCase { private IndexSearcher searcher; @@ -171,7 +173,7 @@ public class TestTermVectors extends TestCase { assertTrue(false); } } - + public void testKnownSetOfDocuments() { String test1 = "eating chocolate in a computer lab"; //6 terms String test2 = "computer in a computer lab"; //5 terms @@ -275,20 +277,45 @@ public class TestTermVectors extends TestCase { Integer freqInt = (Integer)test4Map.get(term); assertTrue(freqInt != null); assertTrue(freqInt.intValue() == freq); - } + } + SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); + knownSearcher.reader.getTermFreqVector(hits.id(1), mapper); + SortedSet vectorEntrySet = mapper.getTermVectorEntrySet(); + assertTrue("mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10); + TermVectorEntry last = null; + for (Iterator iterator = vectorEntrySet.iterator(); iterator.hasNext();) { + TermVectorEntry tve = (TermVectorEntry) iterator.next(); + if (tve != null && last != null) + { + assertTrue("terms are not properly sorted", last.getFrequency() >= tve.getFrequency()); + Integer expectedFreq = (Integer) test4Map.get(tve.getTerm()); + //we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields + assertTrue("Frequency is not correct:", tve.getFrequency() == 2*expectedFreq.intValue()); + } + last = tve; + + } + + FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator()); + knownSearcher.reader.getTermFreqVector(hits.id(1), fieldMapper); + Map map = 
fieldMapper.getFieldToTerms(); + assertTrue("map Size: " + map.size() + " is not: " + 2, map.size() == 2); + vectorEntrySet = (SortedSet) map.get("field"); + assertTrue("vectorEntrySet is null and it shouldn't be", vectorEntrySet != null); + assertTrue("vectorEntrySet Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10); knownSearcher.close(); } catch (IOException e) { e.printStackTrace(); assertTrue(false); } - - } private void setupDoc(Document doc, String text) { doc.add(new Field("field", text, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES)); + doc.add(new Field("field2", text, Field.Store.YES, + Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); //System.out.println("Document: " + doc); }