LUCENE-868: New Term Vector access mechanism. Allows applications to define how they access term vector information instead of having to unpack the parallel-array representation returned by the old API.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@558592 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2007-07-23 03:17:25 +00:00
parent 86432275f6
commit e97d5830ce
16 changed files with 986 additions and 149 deletions

View File: CHANGES.txt

@ -54,6 +54,10 @@ New features
2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll)
3. LUCENE-868: Added new Term Vector access features. A new callback mechanism allows an application to define how and where to read Term Vectors from disk.
This implementation contains several extensions of the new abstract TermVectorMapper class. The new API should be back-compatible. No changes in the
actual storage of Term Vectors have taken place.
Optimizations
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the

View File: contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java

@ -17,6 +17,16 @@ package org.apache.lucene.index.memory;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
@ -30,22 +40,13 @@ import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
/**
* High-performance single-document main memory Apache Lucene fulltext search index.
*
@ -935,8 +936,47 @@ public class MemoryIndex {
}
return vectors;
}
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException
{
if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors");
// if (vectors.length == 0) return null;
for (Iterator iterator = fields.keySet().iterator(); iterator.hasNext();)
{
String fieldName = (String) iterator.next();
getTermFreqVector(docNumber, fieldName, mapper);
}
}
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException
{
if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
final Info info = getInfo(field);
if (info == null){
return;
}
info.sortTerms();
mapper.setExpectations(field, info.sortedTerms.length, stride != 1, true);
for (int i = info.sortedTerms.length; --i >=0;){
ArrayIntList positions = (ArrayIntList) info.sortedTerms[i].getValue();
int size = positions.size();
org.apache.lucene.index.TermVectorOffsetInfo[] offsets =
new org.apache.lucene.index.TermVectorOffsetInfo[size / stride];
for (int k=0, j=1; j < size; k++, j += stride) {
int start = positions.get(j);
int end = positions.get(j+1);
offsets[k] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end);
}
mapper.map((String)info.sortedTerms[i].getKey(),
numPositions((ArrayIntList) info.sortedTerms[i].getValue()),
offsets, ((ArrayIntList) info.sortedTerms[i].getValue()).toArray(stride));
}
}
public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
final Info info = getInfo(fieldName);
if (info == null) return null; // TODO: or return empty vector impl???

View File: src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java

@ -0,0 +1,70 @@
package org.apache.lucene.index;
import java.util.*;
/**
* Copyright 2007 The Apache Software Foundation
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* For each Field, store a sorted collection of {@link TermVectorEntry}s
* <p/>
* This is not thread-safe.
*/
public class FieldSortedTermVectorMapper extends TermVectorMapper{
private Map fieldToTerms = new HashMap();
private SortedSet currentSet;
private String currentField;
private Comparator comparator;
/**
*
* @param comparator A Comparator for sorting {@link TermVectorEntry}s
*/
public FieldSortedTermVectorMapper(Comparator comparator) {
this(false, false, comparator);
}
public FieldSortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) {
super(ignoringPositions, ignoringOffsets);
this.comparator = comparator;
}
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
TermVectorEntry entry = new TermVectorEntry(currentField, term, frequency, offsets, positions);
currentSet.add(entry);
}
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
currentSet = new TreeSet(comparator);
currentField = field;
fieldToTerms.put(field, currentSet);
}
/**
* Get the mapping between fields and terms, sorted by the comparator
*
* @return A map between field names and {@link java.util.SortedSet}s per field. SortedSet entries are {@link TermVectorEntry}
*/
public Map getFieldToTerms() {
return fieldToTerms;
}
public Comparator getComparator() {
return comparator;
}
}

View File: src/java/org/apache/lucene/index/FilterIndexReader.java

@ -115,6 +115,18 @@ public class FilterIndexReader extends IndexReader {
return in.getTermFreqVector(docNumber, field);
}
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
ensureOpen();
in.getTermFreqVector(docNumber, field, mapper);
}
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
ensureOpen();
in.getTermFreqVector(docNumber, mapper);
}
public int numDocs() {
// Don't call ensureOpen() here (it could affect performance)
return in.numDocs();

View File: src/java/org/apache/lucene/index/IndexReader.java

@ -20,12 +20,7 @@ package org.apache.lucene.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.*;
import java.io.File;
import java.io.FileOutputStream;
@ -385,6 +380,25 @@ public abstract class IndexReader {
abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
throws IOException;
/**
* Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
* the {@link TermFreqVector}.
* @param docNumber The number of the document to load the vector for
* @param field The name of the field to load
* @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
* @throws IOException if term vectors cannot be accessed or if they do not exist for the specified field and document.
*
*/
abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException;
/**
* Map all the term vectors for all fields in a Document
* @param docNumber The number of the document to load the vector for
* @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
* @throws IOException if term vectors cannot be accessed or if they do not exist for the specified field and document.
*/
abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException;
/**
* Returns <code>true</code> if an index exists at the specified directory.
* If the directory does not exist or if there is no index in it.

View File: src/java/org/apache/lucene/index/MultiReader.java

@ -85,6 +85,19 @@ public class MultiReader extends IndexReader {
return subReaders[i].getTermFreqVector(n - starts[i], field);
}
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
ensureOpen();
int i = readerIndex(docNumber); // find segment num
subReaders[i].getTermFreqVector(docNumber - starts[i], field, mapper);
}
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
ensureOpen();
int i = readerIndex(docNumber); // find segment num
subReaders[i].getTermFreqVector(docNumber - starts[i], mapper);
}
public synchronized int numDocs() {
// Don't call ensureOpen() here (it could affect performance)
if (numDocs == -1) { // check cache

View File: src/java/org/apache/lucene/index/ParallelReader.java

@ -194,6 +194,29 @@ public class ParallelReader extends IndexReader {
return reader==null ? null : reader.getTermFreqVector(n, field);
}
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
ensureOpen();
IndexReader reader = ((IndexReader)fieldToReader.get(field));
if (reader != null) {
reader.getTermFreqVector(docNumber, field, mapper);
}
}
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
ensureOpen();
Iterator i = fieldToReader.entrySet().iterator();
while (i.hasNext()) {
Map.Entry e = (Map.Entry)i.next();
String field = (String)e.getKey();
IndexReader reader = (IndexReader)e.getValue();
reader.getTermFreqVector(docNumber, field, mapper);
}
}
public boolean hasNorms(String field) throws IOException {
ensureOpen();
IndexReader reader = ((IndexReader)fieldToReader.get(field));

View File: src/java/org/apache/lucene/index/SegmentReader.java

@ -20,10 +20,10 @@ package org.apache.lucene.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BitVector;
import java.io.IOException;
@ -643,6 +643,35 @@ class SegmentReader extends IndexReader {
}
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
ensureOpen();
FieldInfo fi = fieldInfos.fieldInfo(field);
if (fi == null || !fi.storeTermVector || termVectorsReaderOrig == null)
throw new IOException("field does not contain term vectors");
TermVectorsReader termVectorsReader = getTermVectorsReader();
if (termVectorsReader == null)
{
throw new IOException("Cannot open a reader for the term vectors");
}
termVectorsReader.get(docNumber, field, mapper);
}
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
ensureOpen();
if (termVectorsReaderOrig == null)
return;
TermVectorsReader termVectorsReader = getTermVectorsReader();
if (termVectorsReader == null)
return;
termVectorsReader.get(docNumber, mapper);
}
/** Return an array of term frequency vectors for the specified document.
* The array contains a vector for each vectorized field in the document.
* Each vector contains term numbers and frequencies for all terms

View File: src/java/org/apache/lucene/index/SortedTermVectorMapper.java

@ -0,0 +1,129 @@
package org.apache.lucene.index;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.*;
/**
* Store a sorted collection of {@link org.apache.lucene.index.TermVectorEntry}s. Collects all term information
* into a single SortedSet.
* <br/>
* NOTE: This Mapper ignores all Field information for the Document. This means that if you are using offset/positions you will not
* know what Fields they correlate with.
* <br/>
* This is not thread-safe
*/
public class SortedTermVectorMapper extends TermVectorMapper{
private SortedSet currentSet;
private Map termToTVE = new HashMap();
private boolean storeOffsets;
private boolean storePositions;
/**
* Stand-in name for the field in {@link TermVectorEntry}.
*/
public static final String ALL = "_ALL_";
/**
*
* @param comparator A Comparator for sorting {@link TermVectorEntry}s
*/
public SortedTermVectorMapper(Comparator comparator) {
this(false, false, comparator);
}
public SortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) {
super(ignoringPositions, ignoringOffsets);
currentSet = new TreeSet(comparator);
}
/**
*
* @param term The term to map
* @param frequency The frequency of the term
* @param offsets Offset information, may be null
* @param positions Position information, may be null
*/
//We need to combine any previous mentions of the term
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
TermVectorEntry entry = (TermVectorEntry) termToTVE.get(term);
if (entry == null) {
entry = new TermVectorEntry(ALL, term, frequency,
storeOffsets == true ? offsets : null,
storePositions == true ? positions : null);
termToTVE.put(term, entry);
currentSet.add(entry);
} else {
entry.setFrequency(entry.getFrequency() + frequency);
if (storeOffsets)
{
TermVectorOffsetInfo [] existingOffsets = entry.getOffsets();
//A few diff. cases here: offsets is null, existing offsets is null, both are null, same for positions
if (existingOffsets != null && offsets != null && offsets.length > 0)
{
//copy over the existing offsets
TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[existingOffsets.length + offsets.length];
System.arraycopy(existingOffsets, 0, newOffsets, 0, existingOffsets.length);
System.arraycopy(offsets, 0, newOffsets, existingOffsets.length, offsets.length);
entry.setOffsets(newOffsets);
}
else if (existingOffsets == null && offsets != null && offsets.length > 0)
{
entry.setOffsets(offsets);
}
//else leave it alone
}
if (storePositions)
{
int [] existingPositions = entry.getPositions();
if (existingPositions != null && positions != null && positions.length > 0)
{
int [] newPositions = new int[existingPositions.length + positions.length];
System.arraycopy(existingPositions, 0, newPositions, 0, existingPositions.length);
System.arraycopy(positions, 0, newPositions, existingPositions.length, positions.length);
entry.setPositions(newPositions);
}
else if (existingPositions == null && positions != null && positions.length > 0)
{
entry.setPositions(positions);
}
}
}
}
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
this.storeOffsets = storeOffsets;
this.storePositions = storePositions;
}
/**
* The TermVectorEntrySet. A SortedSet of {@link TermVectorEntry} objects. Sort is by the comparator passed into the constructor.
*<br/>
* This set will be empty until after the mapping process takes place.
*
* @return The SortedSet of {@link TermVectorEntry}.
*/
public SortedSet getTermVectorEntrySet()
{
return currentSet;
}
}

View File: src/java/org/apache/lucene/index/TermVectorEntry.java

@ -0,0 +1,98 @@
package org.apache.lucene.index;
/**
* Copyright 2007 The Apache Software Foundation
* <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Convenience class for holding TermVector information.
*/
public class TermVectorEntry {
private String field;
private String term;
private int frequency;
private TermVectorOffsetInfo [] offsets;
int [] positions;
public TermVectorEntry() {
}
public TermVectorEntry(String field, String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
this.field = field;
this.term = term;
this.frequency = frequency;
this.offsets = offsets;
this.positions = positions;
}
public String getField() {
return field;
}
public int getFrequency() {
return frequency;
}
public TermVectorOffsetInfo[] getOffsets() {
return offsets;
}
public int[] getPositions() {
return positions;
}
public String getTerm() {
return term;
}
//Keep package local
void setFrequency(int frequency) {
this.frequency = frequency;
}
void setOffsets(TermVectorOffsetInfo[] offsets) {
this.offsets = offsets;
}
void setPositions(int[] positions) {
this.positions = positions;
}
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TermVectorEntry that = (TermVectorEntry) o;
if (term != null ? !term.equals(that.term) : that.term != null) return false;
return true;
}
public int hashCode() {
return (term != null ? term.hashCode() : 0);
}
public String toString() {
return "TermVectorEntry{" +
"field='" + field + '\'' +
", term='" + term + '\'' +
", frequency=" + frequency +
'}';
}
}

View File: src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java

@ -0,0 +1,42 @@
package org.apache.lucene.index;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Comparator;
/**
* Compares {@link org.apache.lucene.index.TermVectorEntry}s first by frequency (highest first), then by
* the term (case-sensitive), and finally by the field
*
**/
public class TermVectorEntryFreqSortedComparator implements Comparator {
public int compare(Object object, Object object1) {
int result = 0;
TermVectorEntry entry = (TermVectorEntry) object;
TermVectorEntry entry1 = (TermVectorEntry) object1;
result = entry1.getFrequency() - entry.getFrequency();
if (result == 0)
{
result = entry.getTerm().compareTo(entry1.getTerm());
if (result == 0)
{
result = entry.getField().compareTo(entry1.getField());
}
}
return result;
}
}

View File: src/java/org/apache/lucene/index/TermVectorMapper.java

@ -0,0 +1,88 @@
package org.apache.lucene.index;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* The TermVectorMapper can be used to map Term Vectors into your own
* structure instead of the parallel array structure used by
* {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
* <p/>
* It is up to the implementation to make sure it is thread-safe.
*
*
**/
public abstract class TermVectorMapper {
private boolean ignoringPositions;
private boolean ignoringOffsets;
protected TermVectorMapper() {
}
/**
*
* @param ignoringPositions true if this mapper should tell Lucene to ignore positions even if they are stored
* @param ignoringOffsets true if this mapper should tell Lucene to ignore offsets even if they are stored
*/
protected TermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets) {
this.ignoringPositions = ignoringPositions;
this.ignoringOffsets = ignoringOffsets;
}
/**
* Tell the mapper what to expect with regard to field, number of terms, offset and position storage.
* This method will be called once before retrieving the vector for a field.
*
* This method will be called before {@link #map(String,int,TermVectorOffsetInfo[],int[])}.
* @param field The field the vector is for
* @param numTerms The number of terms that need to be mapped
* @param storeOffsets true if the mapper should expect offset information
* @param storePositions true if the mapper should expect positions info
*/
public abstract void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions);
/**
* Map the Term Vector information into your own structure
* @param term The term to add to the vector
* @param frequency The frequency of the term in the document
* @param offsets null if the offset is not specified, otherwise the offset into the field of the term
* @param positions null if the position is not specified, otherwise the position in the field of the term
*/
public abstract void map(String term, int frequency, TermVectorOffsetInfo [] offsets, int [] positions);
/**
* Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they
* can be skipped over. Derived classes should set this to true if they want to ignore positions. The default
* is false, meaning positions will be loaded if they are stored.
* @return true if positions should be ignored; false by default
*/
public boolean isIgnoringPositions()
{
return ignoringPositions;
}
/**
*
* @see #isIgnoringPositions() Same principle as {@link #isIgnoringPositions()}, but applied to offsets. false by default.
* @return true if offsets should be ignored; false by default
*/
public boolean isIgnoringOffsets()
{
return ignoringOffsets;
}
}

View File: src/java/org/apache/lucene/index/TermVectorsReader.java

@ -17,9 +17,9 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import java.io.IOException;
@ -104,18 +104,9 @@ class TermVectorsReader implements Cloneable {
return size;
}
/**
* Retrieve the term vector for the given document and field
* @param docNum The document number to retrieve the vector for
* @param field The field within the document to retrieve
* @return The TermFreqVector for the document and field or null if there is no termVector for this field.
* @throws IOException if there is an error reading the term vector files
*/
TermFreqVector get(int docNum, String field) throws IOException {
// Check if no term vectors are available for this segment at all
int fieldNumber = fieldInfos.fieldNumber(field);
TermFreqVector result = null;
public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
if (tvx != null) {
int fieldNumber = fieldInfos.fieldNumber(field);
//We need to account for the FORMAT_SIZE when seeking in the tvx
//We don't need to do this in other seeks because we already have the
// file pointer
@ -137,7 +128,7 @@ class TermVectorsReader implements Cloneable {
number = tvd.readVInt();
else
number += tvd.readVInt();
if (number == fieldNumber)
found = i;
}
@ -150,14 +141,30 @@ class TermVectorsReader implements Cloneable {
for (int i = 0; i <= found; i++)
position += tvd.readVLong();
result = readTermVector(field, position);
readTermVector(field, position, mapper);
} else {
//System.out.println("Fieldable not found");
}
} else {
//System.out.println("No tvx file");
}
return result;
}
/**
* Retrieve the term vector for the given document and field
* @param docNum The document number to retrieve the vector for
* @param field The field within the document to retrieve
* @return The TermFreqVector for the document and field or null if there is no termVector for this field.
* @throws IOException if there is an error reading the term vector files
*/
TermFreqVector get(int docNum, String field) throws IOException {
// Check if no term vectors are available for this segment at all
ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
get(docNum, field, mapper);
return mapper.materializeVector();
}
/**
@ -169,7 +176,6 @@ class TermVectorsReader implements Cloneable {
*/
TermFreqVector[] get(int docNum) throws IOException {
TermFreqVector[] result = null;
// Check if no term vectors are available for this segment at all
if (tvx != null) {
//We need to offset by the FORMAT_SIZE header at the start of the tvx file
tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
@ -182,7 +188,7 @@ class TermVectorsReader implements Cloneable {
if (fieldCount != 0) {
int number = 0;
String[] fields = new String[fieldCount];
for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
number = tvd.readVInt();
@ -208,24 +214,76 @@ class TermVectorsReader implements Cloneable {
return result;
}
public void get(int docNumber, TermVectorMapper mapper) throws IOException {
// Check if no term vectors are available for this segment at all
if (tvx != null) {
//We need to offset by the FORMAT_SIZE header at the start of the tvx file
tvx.seek(((docNumber + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
long position = tvx.readLong();
tvd.seek(position);
int fieldCount = tvd.readVInt();
// No fields are vectorized for this document
if (fieldCount != 0) {
int number = 0;
String[] fields = new String[fieldCount];
for (int i = 0; i < fieldCount; i++) {
if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
number = tvd.readVInt();
else
number += tvd.readVInt();
fields[i] = fieldInfos.fieldName(number);
}
// Compute position in the tvf file
position = 0;
long[] tvfPointers = new long[fieldCount];
for (int i = 0; i < fieldCount; i++) {
position += tvd.readVLong();
tvfPointers[i] = position;
}
readTermVectors(fields, tvfPointers, mapper);
}
} else {
//System.out.println("No tvx file");
}
}
private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
throws IOException {
SegmentTermVector res[] = new SegmentTermVector[fields.length];
for (int i = 0; i < fields.length; i++) {
res[i] = readTermVector(fields[i], tvfPointers[i]);
ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
readTermVector(fields[i], tvfPointers[i], mapper);
res[i] = (SegmentTermVector) mapper.materializeVector();
}
return res;
}
private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
throws IOException {
for (int i = 0; i < fields.length; i++) {
readTermVector(fields[i], tvfPointers[i], mapper);
}
}
/**
*
* @param field The field to read in
* @param tvfPointer The pointer within the tvf file where we should start reading
* @param mapper The mapper used to map the TermVector
* @throws IOException if there is an error reading the term vector files
*/
private SegmentTermVector readTermVector(String field, long tvfPointer)
private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
throws IOException {
// Now read the data from specified position
@ -236,7 +294,7 @@ class TermVectorsReader implements Cloneable {
//System.out.println("Num Terms: " + numTerms);
// If no terms - return a constant empty termvector. However, this should never occur!
if (numTerms == 0)
return new SegmentTermVector(field, null, null);
return;
boolean storePositions;
boolean storeOffsets;
@ -251,18 +309,7 @@ class TermVectorsReader implements Cloneable {
storePositions = false;
storeOffsets = false;
}
String terms[] = new String[numTerms];
int termFreqs[] = new int[numTerms];
// we may not need these, but declare them
int positions[][] = null;
TermVectorOffsetInfo offsets[][] = null;
if(storePositions)
positions = new int[numTerms][];
if(storeOffsets)
offsets = new TermVectorOffsetInfo[numTerms][];
mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
int start = 0;
int deltaLength = 0;
int totalLength = 0;
@ -282,45 +329,54 @@ class TermVectorsReader implements Cloneable {
}
tvf.readChars(buffer, start, deltaLength);
terms[i] = new String(buffer, 0, totalLength);
String term = new String(buffer, 0, totalLength);
previousBuffer = buffer;
int freq = tvf.readVInt();
termFreqs[i] = freq;
int [] positions = null;
if (storePositions) { //read in the positions
int [] pos = new int[freq];
positions[i] = pos;
int prevPosition = 0;
for (int j = 0; j < freq; j++)
{
pos[j] = prevPosition + tvf.readVInt();
prevPosition = pos[j];
//does the mapper even care about positions?
if (mapper.isIgnoringPositions() == false) {
positions = new int[freq];
int prevPosition = 0;
for (int j = 0; j < freq; j++)
{
positions[j] = prevPosition + tvf.readVInt();
prevPosition = positions[j];
}
} else {
//we need to skip over the positions. Since these are VInts, there is no way
//to know how far to skip without reading them one by one.
for (int j = 0; j < freq; j++)
{
tvf.readVInt();
}
}
}
TermVectorOffsetInfo[] offsets = null;
if (storeOffsets) {
TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
offsets[i] = offs;
int prevOffset = 0;
for (int j = 0; j < freq; j++) {
int startOffset = prevOffset + tvf.readVInt();
int endOffset = startOffset + tvf.readVInt();
offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
prevOffset = endOffset;
//does the mapper even care about offsets?
if (mapper.isIgnoringOffsets() == false) {
offsets = new TermVectorOffsetInfo[freq];
int prevOffset = 0;
for (int j = 0; j < freq; j++) {
int startOffset = prevOffset + tvf.readVInt();
int endOffset = startOffset + tvf.readVInt();
offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
prevOffset = endOffset;
}
} else {
for (int j = 0; j < freq; j++){
tvf.readVInt();
tvf.readVInt();
}
}
}
mapper.map(term, freq, offsets, positions);
}
SegmentTermVector tv;
if (storePositions || storeOffsets){
tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
}
else {
tv = new SegmentTermVector(field, terms, termFreqs);
}
return tv;
}
protected Object clone() {
if (tvx == null || tvd == null || tvf == null)
@ -337,4 +393,67 @@ class TermVectorsReader implements Cloneable {
return clone;
}
}
/**
* Models the existing parallel array structure
*/
class ParallelArrayTermVectorMapper extends TermVectorMapper
{
private int numTerms;
private String[] terms;
private int[] termFreqs;
private int positions[][] = null;
private TermVectorOffsetInfo offsets[][] = null;
private int currentPosition;
private boolean storingOffsets;
private boolean storingPositions;
private String field;
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
this.numTerms = numTerms;
this.field = field;
terms = new String[numTerms];
termFreqs = new int[numTerms];
this.storingOffsets = storeOffsets;
this.storingPositions = storePositions;
if(storePositions)
this.positions = new int[numTerms][];
if(storeOffsets)
this.offsets = new TermVectorOffsetInfo[numTerms][];
}
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
terms[currentPosition] = term;
termFreqs[currentPosition] = frequency;
if (storingOffsets)
{
this.offsets[currentPosition] = offsets;
}
if (storingPositions)
{
this.positions[currentPosition] = positions;
}
currentPosition++;
}
/**
* Construct the vector
* @return The {@link TermFreqVector} built from the mapped data, or null if no field was mapped
*/
public TermFreqVector materializeVector() {
SegmentTermVector tv = null;
if (field != null && terms != null) {
if (storingPositions || storingOffsets) {
tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
} else {
tv = new SegmentTermVector(field, terms, termFreqs);
}
}
return tv;
}
}

View File: src/test/org/apache/lucene/index/TestIndexReader.java

@ -21,29 +21,20 @@ package org.apache.lucene.index;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.*;
import org.apache.lucene.util._TestUtil;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
public class TestIndexReader extends TestCase
{
@ -180,8 +171,43 @@ public class TestIndexReader extends TestCase
d.close();
}
public void testTermVectors() throws Exception {
RAMDirectory d = new MockRAMDirectory();
// set up writer
IndexWriter writer = new IndexWriter(d, new StandardAnalyzer(), true);
// want to get some more segments here
// new termvector fields
for (int i = 0; i < 5 * writer.getMergeFactor(); i++) {
Document doc = new Document();
doc.add(new Field("tvnot","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO));
doc.add(new Field("termvector","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
doc.add(new Field("tvoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
doc.add(new Field("tvposition","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
doc.add(new Field("tvpositionoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc);
}
writer.close();
IndexReader reader = IndexReader.open(d);
FieldSortedTermVectorMapper mapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
reader.getTermFreqVector(0, mapper);
Map map = mapper.getFieldToTerms();
assertTrue("map is null and it shouldn't be", map != null);
assertTrue("map Size: " + map.size() + " is not: " + 4, map.size() == 4);
Set set = (Set) map.get("termvector");
for (Iterator iterator = set.iterator(); iterator.hasNext();) {
TermVectorEntry entry = (TermVectorEntry) iterator.next();
assertTrue("entry is null and it shouldn't be", entry != null);
System.out.println("Entry: " + entry);
}
}
private void assertTermDocsCount(String msg,
IndexReader reader,
Term term,
int expected)

View File: src/test/org/apache/lucene/index/TestTermVectorsReader.java

@ -22,16 +22,19 @@ import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
public class TestTermVectorsReader extends TestCase {
private TermVectorsWriter writer = null;
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
private String [] testFields = {"f1", "f2", "f3"};
private boolean [] testFieldsStorePos = {true, false, true, false};
private boolean [] testFieldsStoreOff = {true, false, false, true};
private String [] testTerms = {"this", "is", "a", "test"};
private int [][] positions = new int[testTerms.length][];
private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][];
private String[] testFields = {"f1", "f2", "f3", "f4"};
private boolean[] testFieldsStorePos = {true, false, true, false};
private boolean[] testFieldsStoreOff = {true, false, false, true};
private String[] testTerms = {"this", "is", "a", "test"};
private int[][] positions = new int[testTerms.length][];
private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
private RAMDirectory dir = new RAMDirectory();
private String seg = "testSegment";
private FieldInfos fieldInfos = new FieldInfos();
@ -44,35 +47,37 @@ public class TestTermVectorsReader extends TestCase {
for (int i = 0; i < testFields.length; i++) {
fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
}
for (int i = 0; i < testTerms.length; i++) {
positions[i] = new int[3];
for (int j = 0; j < positions[i].length; j++) {
// positions are always sorted in increasing order
positions[i][j] = (int) (j * 10 + Math.random() * 10);
}
offsets[i] = new TermVectorOffsetInfo[3];
for (int j = 0; j < offsets[i].length; j++) {
// offsets are always sorted in increasing order
offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
}
}
Arrays.sort(testTerms);
//Create 5 documents for testing, they all have the same terms
writer = new TermVectorsWriter(dir, seg, fieldInfos);
for (int j = 0; j < 5; j++) {
writer = new TermVectorsWriter(dir, seg, fieldInfos);
writer.openDocument();
for (int k = 0; k < testFields.length; k++) {
writer.openField(testFields[k]);
for (int i = 0; i < testTerms.length; i++) {
writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
}
writer.closeField();
}
writer.closeDocument();
writer.close();
}
writer.close();
}
protected void tearDown() {
@ -80,34 +85,38 @@ public class TestTermVectorsReader extends TestCase {
}
public void test() {
//Check to see the files were created properly in setup
assertTrue(writer.isDocumentOpen() == false);
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
}
public void testReader() throws IOException {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
TermFreqVector vector = reader.get(0, testFields[0]);
assertTrue(vector != null);
String [] terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
for (int j = 0; j < 5; j++) {
TermFreqVector vector = reader.get(j, testFields[0]);
assertTrue(vector != null);
String[] terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
}
}
}
}
public void testPositionReader() throws IOException {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
TermPositionVector vector;
String[] terms;
vector = (TermPositionVector) reader.get(0, testFields[0]);
assertTrue(vector != null);
terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
@ -115,14 +124,14 @@ public class TestTermVectorsReader extends TestCase {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
int [] positions = vector.getTermPositions(i);
int[] positions = vector.getTermPositions(i);
assertTrue(positions != null);
assertTrue(positions.length == this.positions[i].length);
for (int j = 0; j < positions.length; j++) {
int position = positions[j];
assertTrue(position == this.positions[i][j]);
}
TermVectorOffsetInfo [] offset = vector.getOffsets(i);
TermVectorOffsetInfo[] offset = vector.getOffsets(i);
assertTrue(offset != null);
assertTrue(offset.length == this.offsets[i].length);
for (int j = 0; j < offset.length; j++) {
@ -130,9 +139,9 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
}
}
TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
assertTrue(freqVector != null);
assertTrue(freqVector instanceof TermPositionVector == false);
terms = freqVector.getTerms();
assertTrue(terms != null);
@ -140,30 +149,30 @@ public class TestTermVectorsReader extends TestCase {
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
}
}
public void testOffsetReader() throws IOException {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]);
TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]);
assertTrue(vector != null);
String [] terms = vector.getTerms();
String[] terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
int [] positions = vector.getTermPositions(i);
int[] positions = vector.getTermPositions(i);
assertTrue(positions != null);
assertTrue(positions.length == this.positions[i].length);
for (int j = 0; j < positions.length; j++) {
int position = positions[j];
assertTrue(position == this.positions[i][j]);
}
TermVectorOffsetInfo [] offset = vector.getOffsets(i);
TermVectorOffsetInfo[] offset = vector.getOffsets(i);
assertTrue(offset != null);
assertTrue(offset.length == this.offsets[i].length);
for (int j = 0; j < offset.length; j++) {
@ -172,18 +181,112 @@ public class TestTermVectorsReader extends TestCase {
}
}
}
public void testMapper() throws IOException {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
reader.get(0, mapper);
SortedSet set = mapper.getTermVectorEntrySet();
assertTrue("set is null and it shouldn't be", set != null);
//four fields, 4 unique terms; all terms are the same in every field
assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
//Check offsets and positions
for (Iterator iterator = set.iterator(); iterator.hasNext();) {
TermVectorEntry tve = (TermVectorEntry) iterator.next();
assertTrue("tve is null and it shouldn't be", tve != null);
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
}
mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
reader.get(1, mapper);
set = mapper.getTermVectorEntrySet();
assertTrue("set is null and it shouldn't be", set != null);
//four fields, 4 unique terms; all terms are the same in every field
assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
//Should have offsets and positions b/c we are munging all the fields together
for (Iterator iterator = set.iterator(); iterator.hasNext();) {
TermVectorEntry tve = (TermVectorEntry) iterator.next();
assertTrue("tve is null and it shouldn't be", tve != null);
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
}
FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
reader.get(0, fsMapper);
Map map = fsMapper.getFieldToTerms();
assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
Map.Entry entry = (Map.Entry) iterator.next();
SortedSet sortedSet = (SortedSet) entry.getValue();
assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
TermVectorEntry tve = (TermVectorEntry) inner.next();
assertTrue("tve is null and it shouldn't be", tve != null);
//Check offsets and positions.
assertTrue("tve is null and it shouldn't be", tve != null);
String field = tve.getField();
if (field.equals(testFields[0])) {
//should have offsets
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
}
else if (field.equals(testFields[1])) {
//should not have offsets
assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
}
}
}
//Try mapper that ignores offs and positions
fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
reader.get(0, fsMapper);
map = fsMapper.getFieldToTerms();
assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
Map.Entry entry = (Map.Entry) iterator.next();
SortedSet sortedSet = (SortedSet) entry.getValue();
assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
TermVectorEntry tve = (TermVectorEntry) inner.next();
assertTrue("tve is null and it shouldn't be", tve != null);
//Check offsets and positions.
assertTrue("tve is null and it shouldn't be", tve != null);
String field = tve.getField();
if (field.equals(testFields[0])) {
//offsets and positions should have been ignored
assertTrue("tve.getOffsets() is not null and it should be", tve.getOffsets() == null);
assertTrue("tve.getPositions() is not null and it should be", tve.getPositions() == null);
}
else if (field.equals(testFields[1])) {
//should not have offsets
assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
}
}
}
}
/**
* Make sure exceptions and bad params are handled appropriately
*/
public void testBadParams() {
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
//Bad document number, good field number
reader.get(50, testFields[0]);
fail();
} catch (IOException e) {
// expected exception
}
@ -192,7 +295,7 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(reader != null);
//Bad document number, no field
reader.get(50);
fail();
} catch (IOException e) {
// expected exception
}
@ -201,9 +304,9 @@ public class TestTermVectorsReader extends TestCase {
assertTrue(reader != null);
//good document number, bad field number
TermFreqVector vector = reader.get(0, "f50");
assertTrue(vector == null);
} catch (IOException e) {
fail();
}
}
}
}

View File: src/test/org/apache/lucene/index/TestTermVectors.java

@ -28,7 +28,9 @@ import org.apache.lucene.util.English;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedSet;
public class TestTermVectors extends TestCase {
private IndexSearcher searcher;
@ -171,7 +173,7 @@ public class TestTermVectors extends TestCase {
assertTrue(false);
}
}
public void testKnownSetOfDocuments() {
String test1 = "eating chocolate in a computer lab"; //6 terms
String test2 = "computer in a computer lab"; //5 terms
@ -275,20 +277,45 @@ public class TestTermVectors extends TestCase {
Integer freqInt = (Integer)test4Map.get(term);
assertTrue(freqInt != null);
assertTrue(freqInt.intValue() == freq);
}
}
SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
knownSearcher.reader.getTermFreqVector(hits.id(1), mapper);
SortedSet vectorEntrySet = mapper.getTermVectorEntrySet();
assertTrue("mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10);
TermVectorEntry last = null;
for (Iterator iterator = vectorEntrySet.iterator(); iterator.hasNext();) {
TermVectorEntry tve = (TermVectorEntry) iterator.next();
if (tve != null && last != null)
{
assertTrue("terms are not properly sorted", last.getFrequency() >= tve.getFrequency());
Integer expectedFreq = (Integer) test4Map.get(tve.getTerm());
//we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
assertTrue("Frequency is not correct:", tve.getFrequency() == 2*expectedFreq.intValue());
}
last = tve;
}
FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
knownSearcher.reader.getTermFreqVector(hits.id(1), fieldMapper);
Map map = fieldMapper.getFieldToTerms();
assertTrue("map Size: " + map.size() + " is not: " + 2, map.size() == 2);
vectorEntrySet = (SortedSet) map.get("field");
assertTrue("vectorEntrySet is null and it shouldn't be", vectorEntrySet != null);
assertTrue("vectorEntrySet Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10);
knownSearcher.close();
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
private void setupDoc(Document doc, String text)
{
doc.add(new Field("field", text, Field.Store.YES,
Field.Index.TOKENIZED, Field.TermVector.YES));
doc.add(new Field("field2", text, Field.Store.YES,
Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
//System.out.println("Document: " + doc);
}