mirror of
https://github.com/apache/lucene.git
synced 2025-02-06 18:18:38 +00:00
LUCENE-868: New Term Vector access mechanism. Allows for applications to define how they access term vector information instead of having to pack/unpack the TV info returned by the old way.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@558592 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
86432275f6
commit
e97d5830ce
@ -54,6 +54,10 @@ New features
|
|||||||
|
|
||||||
2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll)
|
2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll)
|
||||||
|
|
||||||
|
3. LUCENE-868: Added new Term Vector access features. New callback mechanism allows application to define how and where to read Term Vectors from disk.
|
||||||
|
This implementation contains several extensions of the new abstract TermVectorMapper class. The new API should be back-compatible. No changes in the
|
||||||
|
actual storage of Term Vectors have taken place.
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
|
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
|
||||||
|
@ -17,6 +17,16 @@ package org.apache.lucene.index.memory;
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
@ -30,22 +40,13 @@ import org.apache.lucene.index.TermEnum;
|
|||||||
import org.apache.lucene.index.TermFreqVector;
|
import org.apache.lucene.index.TermFreqVector;
|
||||||
import org.apache.lucene.index.TermPositionVector;
|
import org.apache.lucene.index.TermPositionVector;
|
||||||
import org.apache.lucene.index.TermPositions;
|
import org.apache.lucene.index.TermPositions;
|
||||||
|
import org.apache.lucene.index.TermVectorMapper;
|
||||||
import org.apache.lucene.search.HitCollector;
|
import org.apache.lucene.search.HitCollector;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.Searcher;
|
import org.apache.lucene.search.Searcher;
|
||||||
import org.apache.lucene.search.Similarity;
|
import org.apache.lucene.search.Similarity;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* High-performance single-document main memory Apache Lucene fulltext search index.
|
* High-performance single-document main memory Apache Lucene fulltext search index.
|
||||||
*
|
*
|
||||||
@ -935,8 +936,47 @@ public class MemoryIndex {
|
|||||||
}
|
}
|
||||||
return vectors;
|
return vectors;
|
||||||
}
|
}
|
||||||
|
|
||||||
public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
|
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException
|
||||||
|
{
|
||||||
|
if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors");
|
||||||
|
|
||||||
|
// if (vectors.length == 0) return null;
|
||||||
|
for (Iterator iterator = fields.keySet().iterator(); iterator.hasNext();)
|
||||||
|
{
|
||||||
|
String fieldName = (String) iterator.next();
|
||||||
|
getTermFreqVector(docNumber, fieldName, mapper);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException
|
||||||
|
{
|
||||||
|
if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
|
||||||
|
final Info info = getInfo(field);
|
||||||
|
if (info == null){
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
info.sortTerms();
|
||||||
|
mapper.setExpectations(field, info.sortedTerms.length, stride != 1, true);
|
||||||
|
for (int i = info.sortedTerms.length; --i >=0;){
|
||||||
|
|
||||||
|
ArrayIntList positions = (ArrayIntList) info.sortedTerms[i].getValue();
|
||||||
|
int size = positions.size();
|
||||||
|
org.apache.lucene.index.TermVectorOffsetInfo[] offsets =
|
||||||
|
new org.apache.lucene.index.TermVectorOffsetInfo[size / stride];
|
||||||
|
|
||||||
|
for (int k=0, j=1; j < size; k++, j += stride) {
|
||||||
|
int start = positions.get(j);
|
||||||
|
int end = positions.get(j+1);
|
||||||
|
offsets[k] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end);
|
||||||
|
}
|
||||||
|
mapper.map((String)info.sortedTerms[i].getKey(),
|
||||||
|
numPositions((ArrayIntList) info.sortedTerms[i].getValue()),
|
||||||
|
offsets, ((ArrayIntList) info.sortedTerms[i].getValue()).toArray(stride));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) {
|
||||||
if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
|
if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector");
|
||||||
final Info info = getInfo(fieldName);
|
final Info info = getInfo(fieldName);
|
||||||
if (info == null) return null; // TODO: or return empty vector impl???
|
if (info == null) return null; // TODO: or return empty vector impl???
|
||||||
|
@ -0,0 +1,70 @@
|
|||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2007 The Apache Software Foundation
|
||||||
|
* <p/>
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p/>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p/>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* For each Field, store a sorted collection of {@link TermVectorEntry}s
|
||||||
|
* <p/>
|
||||||
|
* This is not thread-safe.
|
||||||
|
*/
|
||||||
|
public class FieldSortedTermVectorMapper extends TermVectorMapper{
|
||||||
|
private Map fieldToTerms = new HashMap();
|
||||||
|
private SortedSet currentSet;
|
||||||
|
private String currentField;
|
||||||
|
private Comparator comparator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param comparator A Comparator for sorting {@link TermVectorEntry}s
|
||||||
|
*/
|
||||||
|
public FieldSortedTermVectorMapper(Comparator comparator) {
|
||||||
|
this(false, false, comparator);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public FieldSortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) {
|
||||||
|
super(ignoringPositions, ignoringOffsets);
|
||||||
|
this.comparator = comparator;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
|
||||||
|
TermVectorEntry entry = new TermVectorEntry(currentField, term, frequency, offsets, positions);
|
||||||
|
currentSet.add(entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
|
||||||
|
currentSet = new TreeSet(comparator);
|
||||||
|
currentField = field;
|
||||||
|
fieldToTerms.put(field, currentSet);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the mapping between fields and terms, sorted by the comparator
|
||||||
|
*
|
||||||
|
* @return A map between field names and {@link java.util.SortedSet}s per field. SortedSet entries are {@link TermVectorEntry}
|
||||||
|
*/
|
||||||
|
public Map getFieldToTerms() {
|
||||||
|
return fieldToTerms;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Comparator getComparator() {
|
||||||
|
return comparator;
|
||||||
|
}
|
||||||
|
}
|
@ -115,6 +115,18 @@ public class FilterIndexReader extends IndexReader {
|
|||||||
return in.getTermFreqVector(docNumber, field);
|
return in.getTermFreqVector(docNumber, field);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
in.getTermFreqVector(docNumber, field, mapper);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
in.getTermFreqVector(docNumber, mapper);
|
||||||
|
}
|
||||||
|
|
||||||
public int numDocs() {
|
public int numDocs() {
|
||||||
// Don't call ensureOpen() here (it could affect performance)
|
// Don't call ensureOpen() here (it could affect performance)
|
||||||
return in.numDocs();
|
return in.numDocs();
|
||||||
|
@ -20,12 +20,7 @@ package org.apache.lucene.index;
|
|||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.FieldSelector;
|
import org.apache.lucene.document.FieldSelector;
|
||||||
import org.apache.lucene.search.Similarity;
|
import org.apache.lucene.search.Similarity;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.*;
|
||||||
import org.apache.lucene.store.FSDirectory;
|
|
||||||
import org.apache.lucene.store.IndexInput;
|
|
||||||
import org.apache.lucene.store.Lock;
|
|
||||||
import org.apache.lucene.store.LockObtainFailedException;
|
|
||||||
import org.apache.lucene.store.AlreadyClosedException;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
@ -385,6 +380,25 @@ public abstract class IndexReader {
|
|||||||
abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
|
abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
|
||||||
throws IOException;
|
throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
|
||||||
|
* the {@link TermFreqVector}.
|
||||||
|
* @param docNumber The number of the document to load the vector for
|
||||||
|
* @param field The name of the field to load
|
||||||
|
* @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
|
||||||
|
* @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map all the term vectors for all fields in a Document
|
||||||
|
* @param docNumber The number of the document to load the vector for
|
||||||
|
* @param mapper The {@link TermVectorMapper} to process the vector. Must not be null
|
||||||
|
* @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
|
||||||
|
*/
|
||||||
|
abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns <code>true</code> if an index exists at the specified directory.
|
* Returns <code>true</code> if an index exists at the specified directory.
|
||||||
* If the directory does not exist or if there is no index in it.
|
* If the directory does not exist or if there is no index in it.
|
||||||
|
@ -85,6 +85,19 @@ public class MultiReader extends IndexReader {
|
|||||||
return subReaders[i].getTermFreqVector(n - starts[i], field);
|
return subReaders[i].getTermFreqVector(n - starts[i], field);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
int i = readerIndex(docNumber); // find segment num
|
||||||
|
subReaders[i].getTermFreqVector(docNumber - starts[i], field, mapper);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
int i = readerIndex(docNumber); // find segment num
|
||||||
|
subReaders[i].getTermFreqVector(docNumber - starts[i], mapper);
|
||||||
|
}
|
||||||
|
|
||||||
public synchronized int numDocs() {
|
public synchronized int numDocs() {
|
||||||
// Don't call ensureOpen() here (it could affect performance)
|
// Don't call ensureOpen() here (it could affect performance)
|
||||||
if (numDocs == -1) { // check cache
|
if (numDocs == -1) { // check cache
|
||||||
|
@ -194,6 +194,29 @@ public class ParallelReader extends IndexReader {
|
|||||||
return reader==null ? null : reader.getTermFreqVector(n, field);
|
return reader==null ? null : reader.getTermFreqVector(n, field);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
IndexReader reader = ((IndexReader)fieldToReader.get(field));
|
||||||
|
if (reader != null) {
|
||||||
|
reader.getTermFreqVector(docNumber, field, mapper);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
ensureOpen();
|
||||||
|
|
||||||
|
Iterator i = fieldToReader.entrySet().iterator();
|
||||||
|
while (i.hasNext()) {
|
||||||
|
Map.Entry e = (Map.Entry)i.next();
|
||||||
|
String field = (String)e.getKey();
|
||||||
|
IndexReader reader = (IndexReader)e.getValue();
|
||||||
|
reader.getTermFreqVector(docNumber, field, mapper);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public boolean hasNorms(String field) throws IOException {
|
public boolean hasNorms(String field) throws IOException {
|
||||||
ensureOpen();
|
ensureOpen();
|
||||||
IndexReader reader = ((IndexReader)fieldToReader.get(field));
|
IndexReader reader = ((IndexReader)fieldToReader.get(field));
|
||||||
|
@ -20,10 +20,10 @@ package org.apache.lucene.index;
|
|||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.FieldSelector;
|
import org.apache.lucene.document.FieldSelector;
|
||||||
import org.apache.lucene.search.DefaultSimilarity;
|
import org.apache.lucene.search.DefaultSimilarity;
|
||||||
|
import org.apache.lucene.store.BufferedIndexInput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.store.BufferedIndexInput;
|
|
||||||
import org.apache.lucene.util.BitVector;
|
import org.apache.lucene.util.BitVector;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -643,6 +643,35 @@ class SegmentReader extends IndexReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
FieldInfo fi = fieldInfos.fieldInfo(field);
|
||||||
|
if (fi == null || !fi.storeTermVector || termVectorsReaderOrig == null)
|
||||||
|
throw new IOException("field does not contain term vectors");
|
||||||
|
|
||||||
|
TermVectorsReader termVectorsReader = getTermVectorsReader();
|
||||||
|
if (termVectorsReader == null)
|
||||||
|
{
|
||||||
|
throw new IOException("Cannot open a reader for the term vectors");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
termVectorsReader.get(docNumber, field, mapper);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException {
|
||||||
|
ensureOpen();
|
||||||
|
if (termVectorsReaderOrig == null)
|
||||||
|
return;
|
||||||
|
|
||||||
|
TermVectorsReader termVectorsReader = getTermVectorsReader();
|
||||||
|
if (termVectorsReader == null)
|
||||||
|
return;
|
||||||
|
|
||||||
|
termVectorsReader.get(docNumber, mapper);
|
||||||
|
}
|
||||||
|
|
||||||
/** Return an array of term frequency vectors for the specified document.
|
/** Return an array of term frequency vectors for the specified document.
|
||||||
* The array contains a vector for each vectorized field in the document.
|
* The array contains a vector for each vectorized field in the document.
|
||||||
* Each vector vector contains term numbers and frequencies for all terms
|
* Each vector vector contains term numbers and frequencies for all terms
|
||||||
|
129
src/java/org/apache/lucene/index/SortedTermVectorMapper.java
Normal file
129
src/java/org/apache/lucene/index/SortedTermVectorMapper.java
Normal file
@ -0,0 +1,129 @@
|
|||||||
|
package org.apache.lucene.index;
|
||||||
|
/**
|
||||||
|
* Copyright 2007 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store a sorted collection of {@link org.apache.lucene.index.TermVectorEntry}s. Collects all term information
|
||||||
|
* into a single, SortedSet.
|
||||||
|
* <br/>
|
||||||
|
* NOTE: This Mapper ignores all Field information for the Document. This means that if you are using offset/positions you will not
|
||||||
|
* know what Fields they correlate with.
|
||||||
|
* <br/>
|
||||||
|
* This is not thread-safe
|
||||||
|
*/
|
||||||
|
public class SortedTermVectorMapper extends TermVectorMapper{
|
||||||
|
|
||||||
|
|
||||||
|
private SortedSet currentSet;
|
||||||
|
private Map termToTVE = new HashMap();
|
||||||
|
private boolean storeOffsets;
|
||||||
|
private boolean storePositions;
|
||||||
|
/**
|
||||||
|
* Stand-in name for the field in {@link TermVectorEntry}.
|
||||||
|
*/
|
||||||
|
public static final String ALL = "_ALL_";
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param comparator A Comparator for sorting {@link TermVectorEntry}s
|
||||||
|
*/
|
||||||
|
public SortedTermVectorMapper(Comparator comparator) {
|
||||||
|
this(false, false, comparator);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public SortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) {
|
||||||
|
super(ignoringPositions, ignoringOffsets);
|
||||||
|
currentSet = new TreeSet(comparator);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param term The term to map
|
||||||
|
* @param frequency The frequency of the term
|
||||||
|
* @param offsets Offset information, may be null
|
||||||
|
* @param positions Position information, may be null
|
||||||
|
*/
|
||||||
|
//We need to combine any previous mentions of the term
|
||||||
|
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
|
||||||
|
TermVectorEntry entry = (TermVectorEntry) termToTVE.get(term);
|
||||||
|
if (entry == null) {
|
||||||
|
entry = new TermVectorEntry(ALL, term, frequency,
|
||||||
|
storeOffsets == true ? offsets : null,
|
||||||
|
storePositions == true ? positions : null);
|
||||||
|
termToTVE.put(term, entry);
|
||||||
|
currentSet.add(entry);
|
||||||
|
} else {
|
||||||
|
entry.setFrequency(entry.getFrequency() + frequency);
|
||||||
|
if (storeOffsets)
|
||||||
|
{
|
||||||
|
TermVectorOffsetInfo [] existingOffsets = entry.getOffsets();
|
||||||
|
//A few diff. cases here: offsets is null, existing offsets is null, both are null, same for positions
|
||||||
|
if (existingOffsets != null && offsets != null && offsets.length > 0)
|
||||||
|
{
|
||||||
|
//copy over the existing offsets
|
||||||
|
TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[existingOffsets.length + offsets.length];
|
||||||
|
System.arraycopy(existingOffsets, 0, newOffsets, 0, existingOffsets.length);
|
||||||
|
System.arraycopy(offsets, 0, newOffsets, existingOffsets.length, offsets.length);
|
||||||
|
entry.setOffsets(newOffsets);
|
||||||
|
}
|
||||||
|
else if (existingOffsets == null && offsets != null && offsets.length > 0)
|
||||||
|
{
|
||||||
|
entry.setOffsets(offsets);
|
||||||
|
}
|
||||||
|
//else leave it alone
|
||||||
|
}
|
||||||
|
if (storePositions)
|
||||||
|
{
|
||||||
|
int [] existingPositions = entry.getPositions();
|
||||||
|
if (existingPositions != null && positions != null && positions.length > 0)
|
||||||
|
{
|
||||||
|
int [] newPositions = new int[existingPositions.length + positions.length];
|
||||||
|
System.arraycopy(existingPositions, 0, newPositions, 0, existingPositions.length);
|
||||||
|
System.arraycopy(positions, 0, newPositions, existingPositions.length, positions.length);
|
||||||
|
entry.setPositions(newPositions);
|
||||||
|
}
|
||||||
|
else if (existingPositions == null && positions != null && positions.length > 0)
|
||||||
|
{
|
||||||
|
entry.setPositions(positions);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
|
||||||
|
|
||||||
|
this.storeOffsets = storeOffsets;
|
||||||
|
this.storePositions = storePositions;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The TermVectorEntrySet. A SortedSet of {@link TermVectorEntry} objects. Sort is by the comparator passed into the constructor.
|
||||||
|
*<br/>
|
||||||
|
* This set will be empty until after the mapping process takes place.
|
||||||
|
*
|
||||||
|
* @return The SortedSet of {@link TermVectorEntry}.
|
||||||
|
*/
|
||||||
|
public SortedSet getTermVectorEntrySet()
|
||||||
|
{
|
||||||
|
return currentSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
98
src/java/org/apache/lucene/index/TermVectorEntry.java
Normal file
98
src/java/org/apache/lucene/index/TermVectorEntry.java
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
package org.apache.lucene.index;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2007 The Apache Software Foundation
|
||||||
|
* <p/>
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
* <p/>
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* <p/>
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convenience class for holding TermVector information.
|
||||||
|
*/
|
||||||
|
public class TermVectorEntry {
|
||||||
|
private String field;
|
||||||
|
private String term;
|
||||||
|
private int frequency;
|
||||||
|
private TermVectorOffsetInfo [] offsets;
|
||||||
|
int [] positions;
|
||||||
|
|
||||||
|
|
||||||
|
public TermVectorEntry() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public TermVectorEntry(String field, String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
|
||||||
|
this.field = field;
|
||||||
|
this.term = term;
|
||||||
|
this.frequency = frequency;
|
||||||
|
this.offsets = offsets;
|
||||||
|
this.positions = positions;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String getField() {
|
||||||
|
return field;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getFrequency() {
|
||||||
|
return frequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TermVectorOffsetInfo[] getOffsets() {
|
||||||
|
return offsets;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int[] getPositions() {
|
||||||
|
return positions;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getTerm() {
|
||||||
|
return term;
|
||||||
|
}
|
||||||
|
|
||||||
|
//Keep package local
|
||||||
|
void setFrequency(int frequency) {
|
||||||
|
this.frequency = frequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
void setOffsets(TermVectorOffsetInfo[] offsets) {
|
||||||
|
this.offsets = offsets;
|
||||||
|
}
|
||||||
|
|
||||||
|
void setPositions(int[] positions) {
|
||||||
|
this.positions = positions;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o) return true;
|
||||||
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
|
||||||
|
TermVectorEntry that = (TermVectorEntry) o;
|
||||||
|
|
||||||
|
if (term != null ? !term.equals(that.term) : that.term != null) return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return (term != null ? term.hashCode() : 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "TermVectorEntry{" +
|
||||||
|
"field='" + field + '\'' +
|
||||||
|
", term='" + term + '\'' +
|
||||||
|
", frequency=" + frequency +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,42 @@
|
|||||||
|
package org.apache.lucene.index;
|
||||||
|
/**
|
||||||
|
* Copyright 2007 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares {@link org.apache.lucene.index.TermVectorEntry}s first by frequency and then by
|
||||||
|
* the term (case-sensitive)
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
public class TermVectorEntryFreqSortedComparator implements Comparator {
|
||||||
|
public int compare(Object object, Object object1) {
|
||||||
|
int result = 0;
|
||||||
|
TermVectorEntry entry = (TermVectorEntry) object;
|
||||||
|
TermVectorEntry entry1 = (TermVectorEntry) object1;
|
||||||
|
result = entry1.getFrequency() - entry.getFrequency();
|
||||||
|
if (result == 0)
|
||||||
|
{
|
||||||
|
result = entry.getTerm().compareTo(entry1.getTerm());
|
||||||
|
if (result == 0)
|
||||||
|
{
|
||||||
|
result = entry.getField().compareTo(entry1.getField());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
88
src/java/org/apache/lucene/index/TermVectorMapper.java
Normal file
88
src/java/org/apache/lucene/index/TermVectorMapper.java
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
package org.apache.lucene.index;
|
||||||
|
/**
|
||||||
|
* Copyright 2007 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The TermVectorMapper can be used to map Term Vectors into your own
|
||||||
|
* structure instead of the parallel array structure used by
|
||||||
|
* {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}.
|
||||||
|
* <p/>
|
||||||
|
* It is up to the implementation to make sure it is thread-safe.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
**/
|
||||||
|
public abstract class TermVectorMapper {
|
||||||
|
|
||||||
|
private boolean ignoringPositions;
|
||||||
|
private boolean ignoringOffsets;
|
||||||
|
|
||||||
|
|
||||||
|
protected TermVectorMapper() {
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param ignoringPositions true if this mapper should tell Lucene to ignore positions even if they are stored
|
||||||
|
* @param ignoringOffsets similar to ignoringPositions
|
||||||
|
*/
|
||||||
|
protected TermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets) {
|
||||||
|
this.ignoringPositions = ignoringPositions;
|
||||||
|
this.ignoringOffsets = ignoringOffsets;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tell the mapper what to expect in regards to field, number of terms, offset and position storage.
|
||||||
|
* This method will be called once before retrieving the vector for a field.
|
||||||
|
*
|
||||||
|
* This method will be called before {@link #map(String,int,TermVectorOffsetInfo[],int[])}.
|
||||||
|
* @param field The field the vector is for
|
||||||
|
* @param numTerms The number of terms that need to be mapped
|
||||||
|
* @param storeOffsets true if the mapper should expect offset information
|
||||||
|
* @param storePositions true if the mapper should expect positions info
|
||||||
|
*/
|
||||||
|
public abstract void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions);
|
||||||
|
/**
|
||||||
|
* Map the Term Vector information into your own structure
|
||||||
|
* @param term The term to add to the vector
|
||||||
|
* @param frequency The frequency of the term in the document
|
||||||
|
* @param offsets null if the offset is not specified, otherwise the offset into the field of the term
|
||||||
|
* @param positions null if the position is not specified, otherwise the position in the field of the term
|
||||||
|
*/
|
||||||
|
public abstract void map(String term, int frequency, TermVectorOffsetInfo [] offsets, int [] positions);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they
|
||||||
|
* can be skipped over. Derived classes should set this to true if they want to ignore positions. The default
|
||||||
|
* is false, meaning positions will be loaded if they are stored.
|
||||||
|
* @return false
|
||||||
|
*/
|
||||||
|
public boolean isIgnoringPositions()
|
||||||
|
{
|
||||||
|
return ignoringPositions;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @see #isIgnoringPositions() Same principal as {@link #isIgnoringPositions()}, but applied to offsets. false by default.
|
||||||
|
* @return false
|
||||||
|
*/
|
||||||
|
public boolean isIgnoringOffsets()
|
||||||
|
{
|
||||||
|
return ignoringOffsets;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -17,9 +17,9 @@ package org.apache.lucene.index;
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.store.BufferedIndexInput;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.store.IndexInput;
|
import org.apache.lucene.store.IndexInput;
|
||||||
import org.apache.lucene.store.BufferedIndexInput;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
@ -104,18 +104,9 @@ class TermVectorsReader implements Cloneable {
|
|||||||
return size;
|
return size;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
public void get(int docNum, String field, TermVectorMapper mapper) throws IOException {
|
||||||
* Retrieve the term vector for the given document and field
|
|
||||||
* @param docNum The document number to retrieve the vector for
|
|
||||||
* @param field The field within the document to retrieve
|
|
||||||
* @return The TermFreqVector for the document and field or null if there is no termVector for this field.
|
|
||||||
* @throws IOException if there is an error reading the term vector files
|
|
||||||
*/
|
|
||||||
TermFreqVector get(int docNum, String field) throws IOException {
|
|
||||||
// Check if no term vectors are available for this segment at all
|
|
||||||
int fieldNumber = fieldInfos.fieldNumber(field);
|
|
||||||
TermFreqVector result = null;
|
|
||||||
if (tvx != null) {
|
if (tvx != null) {
|
||||||
|
int fieldNumber = fieldInfos.fieldNumber(field);
|
||||||
//We need to account for the FORMAT_SIZE at when seeking in the tvx
|
//We need to account for the FORMAT_SIZE at when seeking in the tvx
|
||||||
//We don't need to do this in other seeks because we already have the
|
//We don't need to do this in other seeks because we already have the
|
||||||
// file pointer
|
// file pointer
|
||||||
@ -137,7 +128,7 @@ class TermVectorsReader implements Cloneable {
|
|||||||
number = tvd.readVInt();
|
number = tvd.readVInt();
|
||||||
else
|
else
|
||||||
number += tvd.readVInt();
|
number += tvd.readVInt();
|
||||||
|
|
||||||
if (number == fieldNumber)
|
if (number == fieldNumber)
|
||||||
found = i;
|
found = i;
|
||||||
}
|
}
|
||||||
@ -150,14 +141,30 @@ class TermVectorsReader implements Cloneable {
|
|||||||
for (int i = 0; i <= found; i++)
|
for (int i = 0; i <= found; i++)
|
||||||
position += tvd.readVLong();
|
position += tvd.readVLong();
|
||||||
|
|
||||||
result = readTermVector(field, position);
|
readTermVector(field, position, mapper);
|
||||||
} else {
|
} else {
|
||||||
//System.out.println("Fieldable not found");
|
//System.out.println("Fieldable not found");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//System.out.println("No tvx file");
|
//System.out.println("No tvx file");
|
||||||
}
|
}
|
||||||
return result;
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieve the term vector for the given document and field
|
||||||
|
* @param docNum The document number to retrieve the vector for
|
||||||
|
* @param field The field within the document to retrieve
|
||||||
|
* @return The TermFreqVector for the document and field or null if there is no termVector for this field.
|
||||||
|
* @throws IOException if there is an error reading the term vector files
|
||||||
|
*/
|
||||||
|
TermFreqVector get(int docNum, String field) throws IOException {
|
||||||
|
// Check if no term vectors are available for this segment at all
|
||||||
|
ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
|
||||||
|
get(docNum, field, mapper);
|
||||||
|
|
||||||
|
return mapper.materializeVector();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -169,7 +176,6 @@ class TermVectorsReader implements Cloneable {
|
|||||||
*/
|
*/
|
||||||
TermFreqVector[] get(int docNum) throws IOException {
|
TermFreqVector[] get(int docNum) throws IOException {
|
||||||
TermFreqVector[] result = null;
|
TermFreqVector[] result = null;
|
||||||
// Check if no term vectors are available for this segment at all
|
|
||||||
if (tvx != null) {
|
if (tvx != null) {
|
||||||
//We need to offset by
|
//We need to offset by
|
||||||
tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
|
tvx.seek(((docNum + docStoreOffset) * 8L) + TermVectorsWriter.FORMAT_SIZE);
|
||||||
@ -182,7 +188,7 @@ class TermVectorsReader implements Cloneable {
|
|||||||
if (fieldCount != 0) {
|
if (fieldCount != 0) {
|
||||||
int number = 0;
|
int number = 0;
|
||||||
String[] fields = new String[fieldCount];
|
String[] fields = new String[fieldCount];
|
||||||
|
|
||||||
for (int i = 0; i < fieldCount; i++) {
|
for (int i = 0; i < fieldCount; i++) {
|
||||||
if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
|
if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
|
||||||
number = tvd.readVInt();
|
number = tvd.readVInt();
|
||||||
@ -208,24 +214,76 @@ class TermVectorsReader implements Cloneable {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void get(int docNumber, TermVectorMapper mapper) throws IOException {
|
||||||
|
// Check if no term vectors are available for this segment at all
|
||||||
|
if (tvx != null) {
|
||||||
|
//We need to offset by
|
||||||
|
tvx.seek((docNumber * 8L) + TermVectorsWriter.FORMAT_SIZE);
|
||||||
|
long position = tvx.readLong();
|
||||||
|
|
||||||
|
tvd.seek(position);
|
||||||
|
int fieldCount = tvd.readVInt();
|
||||||
|
|
||||||
|
// No fields are vectorized for this document
|
||||||
|
if (fieldCount != 0) {
|
||||||
|
int number = 0;
|
||||||
|
String[] fields = new String[fieldCount];
|
||||||
|
|
||||||
|
for (int i = 0; i < fieldCount; i++) {
|
||||||
|
if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
|
||||||
|
number = tvd.readVInt();
|
||||||
|
else
|
||||||
|
number += tvd.readVInt();
|
||||||
|
|
||||||
|
fields[i] = fieldInfos.fieldName(number);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute position in the tvf file
|
||||||
|
position = 0;
|
||||||
|
long[] tvfPointers = new long[fieldCount];
|
||||||
|
for (int i = 0; i < fieldCount; i++) {
|
||||||
|
position += tvd.readVLong();
|
||||||
|
tvfPointers[i] = position;
|
||||||
|
}
|
||||||
|
|
||||||
|
readTermVectors(fields, tvfPointers, mapper);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//System.out.println("No tvx file");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
|
private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
|
||||||
throws IOException {
|
throws IOException {
|
||||||
SegmentTermVector res[] = new SegmentTermVector[fields.length];
|
SegmentTermVector res[] = new SegmentTermVector[fields.length];
|
||||||
for (int i = 0; i < fields.length; i++) {
|
for (int i = 0; i < fields.length; i++) {
|
||||||
res[i] = readTermVector(fields[i], tvfPointers[i]);
|
ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper();
|
||||||
|
readTermVector(fields[i], tvfPointers[i], mapper);
|
||||||
|
res[i] = (SegmentTermVector) mapper.materializeVector();
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper)
|
||||||
|
throws IOException {
|
||||||
|
for (int i = 0; i < fields.length; i++) {
|
||||||
|
readTermVector(fields[i], tvfPointers[i], mapper);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param field The field to read in
|
* @param field The field to read in
|
||||||
* @param tvfPointer The pointer within the tvf file where we should start reading
|
* @param tvfPointer The pointer within the tvf file where we should start reading
|
||||||
|
* @param mapper The mapper used to map the TermVector
|
||||||
* @return The TermVector located at that position
|
* @return The TermVector located at that position
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
|
|
||||||
*/
|
*/
|
||||||
private SegmentTermVector readTermVector(String field, long tvfPointer)
|
private void readTermVector(String field, long tvfPointer, TermVectorMapper mapper)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
// Now read the data from specified position
|
// Now read the data from specified position
|
||||||
@ -236,7 +294,7 @@ class TermVectorsReader implements Cloneable {
|
|||||||
//System.out.println("Num Terms: " + numTerms);
|
//System.out.println("Num Terms: " + numTerms);
|
||||||
// If no terms - return a constant empty termvector. However, this should never occur!
|
// If no terms - return a constant empty termvector. However, this should never occur!
|
||||||
if (numTerms == 0)
|
if (numTerms == 0)
|
||||||
return new SegmentTermVector(field, null, null);
|
return;
|
||||||
|
|
||||||
boolean storePositions;
|
boolean storePositions;
|
||||||
boolean storeOffsets;
|
boolean storeOffsets;
|
||||||
@ -251,18 +309,7 @@ class TermVectorsReader implements Cloneable {
|
|||||||
storePositions = false;
|
storePositions = false;
|
||||||
storeOffsets = false;
|
storeOffsets = false;
|
||||||
}
|
}
|
||||||
|
mapper.setExpectations(field, numTerms, storeOffsets, storePositions);
|
||||||
String terms[] = new String[numTerms];
|
|
||||||
int termFreqs[] = new int[numTerms];
|
|
||||||
|
|
||||||
// we may not need these, but declare them
|
|
||||||
int positions[][] = null;
|
|
||||||
TermVectorOffsetInfo offsets[][] = null;
|
|
||||||
if(storePositions)
|
|
||||||
positions = new int[numTerms][];
|
|
||||||
if(storeOffsets)
|
|
||||||
offsets = new TermVectorOffsetInfo[numTerms][];
|
|
||||||
|
|
||||||
int start = 0;
|
int start = 0;
|
||||||
int deltaLength = 0;
|
int deltaLength = 0;
|
||||||
int totalLength = 0;
|
int totalLength = 0;
|
||||||
@ -282,45 +329,54 @@ class TermVectorsReader implements Cloneable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
tvf.readChars(buffer, start, deltaLength);
|
tvf.readChars(buffer, start, deltaLength);
|
||||||
terms[i] = new String(buffer, 0, totalLength);
|
String term = new String(buffer, 0, totalLength);
|
||||||
previousBuffer = buffer;
|
previousBuffer = buffer;
|
||||||
int freq = tvf.readVInt();
|
int freq = tvf.readVInt();
|
||||||
termFreqs[i] = freq;
|
int [] positions = null;
|
||||||
|
|
||||||
if (storePositions) { //read in the positions
|
if (storePositions) { //read in the positions
|
||||||
int [] pos = new int[freq];
|
//does the mapper even care about positions?
|
||||||
positions[i] = pos;
|
if (mapper.isIgnoringPositions() == false) {
|
||||||
int prevPosition = 0;
|
positions = new int[freq];
|
||||||
for (int j = 0; j < freq; j++)
|
int prevPosition = 0;
|
||||||
{
|
for (int j = 0; j < freq; j++)
|
||||||
pos[j] = prevPosition + tvf.readVInt();
|
{
|
||||||
prevPosition = pos[j];
|
positions[j] = prevPosition + tvf.readVInt();
|
||||||
|
prevPosition = positions[j];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
//we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip
|
||||||
|
//
|
||||||
|
for (int j = 0; j < freq; j++)
|
||||||
|
{
|
||||||
|
tvf.readVInt();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
TermVectorOffsetInfo[] offsets = null;
|
||||||
if (storeOffsets) {
|
if (storeOffsets) {
|
||||||
TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
|
//does the mapper even care about offsets?
|
||||||
offsets[i] = offs;
|
if (mapper.isIgnoringOffsets() == false) {
|
||||||
int prevOffset = 0;
|
offsets = new TermVectorOffsetInfo[freq];
|
||||||
for (int j = 0; j < freq; j++) {
|
int prevOffset = 0;
|
||||||
int startOffset = prevOffset + tvf.readVInt();
|
for (int j = 0; j < freq; j++) {
|
||||||
int endOffset = startOffset + tvf.readVInt();
|
int startOffset = prevOffset + tvf.readVInt();
|
||||||
offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
|
int endOffset = startOffset + tvf.readVInt();
|
||||||
prevOffset = endOffset;
|
offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
|
||||||
|
prevOffset = endOffset;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int j = 0; j < freq; j++){
|
||||||
|
tvf.readVInt();
|
||||||
|
tvf.readVInt();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
mapper.map(term, freq, offsets, positions);
|
||||||
}
|
}
|
||||||
|
|
||||||
SegmentTermVector tv;
|
|
||||||
if (storePositions || storeOffsets){
|
|
||||||
tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
tv = new SegmentTermVector(field, terms, termFreqs);
|
|
||||||
}
|
|
||||||
return tv;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
protected Object clone() {
|
protected Object clone() {
|
||||||
|
|
||||||
if (tvx == null || tvd == null || tvf == null)
|
if (tvx == null || tvd == null || tvf == null)
|
||||||
@ -337,4 +393,67 @@ class TermVectorsReader implements Cloneable {
|
|||||||
|
|
||||||
return clone;
|
return clone;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Models the existing parallel array structure
|
||||||
|
*/
|
||||||
|
class ParallelArrayTermVectorMapper extends TermVectorMapper
|
||||||
|
{
|
||||||
|
|
||||||
|
private int numTerms;
|
||||||
|
private String[] terms;
|
||||||
|
private int[] termFreqs;
|
||||||
|
private int positions[][] = null;
|
||||||
|
private TermVectorOffsetInfo offsets[][] = null;
|
||||||
|
private int currentPosition;
|
||||||
|
private boolean storingOffsets;
|
||||||
|
private boolean storingPositions;
|
||||||
|
private String field;
|
||||||
|
|
||||||
|
public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
|
||||||
|
this.numTerms = numTerms;
|
||||||
|
this.field = field;
|
||||||
|
terms = new String[numTerms];
|
||||||
|
termFreqs = new int[numTerms];
|
||||||
|
this.storingOffsets = storeOffsets;
|
||||||
|
this.storingPositions = storePositions;
|
||||||
|
if(storePositions)
|
||||||
|
this.positions = new int[numTerms][];
|
||||||
|
if(storeOffsets)
|
||||||
|
this.offsets = new TermVectorOffsetInfo[numTerms][];
|
||||||
|
}
|
||||||
|
|
||||||
|
public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
|
||||||
|
terms[currentPosition] = term;
|
||||||
|
termFreqs[currentPosition] = frequency;
|
||||||
|
if (storingOffsets)
|
||||||
|
{
|
||||||
|
this.offsets[currentPosition] = offsets;
|
||||||
|
}
|
||||||
|
if (storingPositions)
|
||||||
|
{
|
||||||
|
this.positions[currentPosition] = positions;
|
||||||
|
}
|
||||||
|
currentPosition++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Construct the vector
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public TermFreqVector materializeVector() {
|
||||||
|
SegmentTermVector tv = null;
|
||||||
|
if (field != null && terms != null) {
|
||||||
|
if (storingPositions || storingOffsets) {
|
||||||
|
tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
|
||||||
|
} else {
|
||||||
|
tv = new SegmentTermVector(field, terms, termFreqs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tv;
|
||||||
|
}
|
||||||
|
}
|
@ -21,29 +21,20 @@ package org.apache.lucene.index;
|
|||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
import junit.framework.TestSuite;
|
import junit.framework.TestSuite;
|
||||||
import junit.textui.TestRunner;
|
import junit.textui.TestRunner;
|
||||||
|
|
||||||
import org.apache.lucene.store.Directory;
|
|
||||||
import org.apache.lucene.store.RAMDirectory;
|
|
||||||
import org.apache.lucene.store.FSDirectory;
|
|
||||||
import org.apache.lucene.store.LockObtainFailedException;
|
|
||||||
import org.apache.lucene.store.AlreadyClosedException;
|
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
|
||||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
|
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
|
||||||
import org.apache.lucene.search.Hits;
|
import org.apache.lucene.search.Hits;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.TermQuery;
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.store.*;
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.FileNotFoundException;
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
import org.apache.lucene.store.MockRAMDirectory;
|
import java.io.IOException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
public class TestIndexReader extends TestCase
|
public class TestIndexReader extends TestCase
|
||||||
{
|
{
|
||||||
@ -180,8 +171,43 @@ public class TestIndexReader extends TestCase
|
|||||||
d.close();
|
d.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testTermVectors() throws Exception {
|
||||||
|
RAMDirectory d = new MockRAMDirectory();
|
||||||
|
// set up writer
|
||||||
|
IndexWriter writer = new IndexWriter(d, new StandardAnalyzer(), true);
|
||||||
|
// want to get some more segments here
|
||||||
|
// new termvector fields
|
||||||
|
for (int i = 0; i < 5 * writer.getMergeFactor(); i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new Field("tvnot","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO));
|
||||||
|
doc.add(new Field("termvector","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
|
||||||
|
doc.add(new Field("tvoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_OFFSETS));
|
||||||
|
doc.add(new Field("tvposition","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
|
||||||
|
doc.add(new Field("tvpositionoffset","one two two three three three", Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||||
|
|
||||||
private void assertTermDocsCount(String msg,
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
writer.close();
|
||||||
|
IndexReader reader = IndexReader.open(d);
|
||||||
|
FieldSortedTermVectorMapper mapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
|
||||||
|
reader.getTermFreqVector(0, mapper);
|
||||||
|
Map map = mapper.getFieldToTerms();
|
||||||
|
assertTrue("map is null and it shouldn't be", map != null);
|
||||||
|
assertTrue("map Size: " + map.size() + " is not: " + 4, map.size() == 4);
|
||||||
|
Set set = (Set) map.get("termvector");
|
||||||
|
for (Iterator iterator = set.iterator(); iterator.hasNext();) {
|
||||||
|
TermVectorEntry entry = (TermVectorEntry) iterator.next();
|
||||||
|
assertTrue("entry is null and it shouldn't be", entry != null);
|
||||||
|
System.out.println("Entry: " + entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void assertTermDocsCount(String msg,
|
||||||
IndexReader reader,
|
IndexReader reader,
|
||||||
Term term,
|
Term term,
|
||||||
int expected)
|
int expected)
|
||||||
|
@ -22,16 +22,19 @@ import org.apache.lucene.store.RAMDirectory;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.SortedSet;
|
||||||
|
|
||||||
public class TestTermVectorsReader extends TestCase {
|
public class TestTermVectorsReader extends TestCase {
|
||||||
private TermVectorsWriter writer = null;
|
private TermVectorsWriter writer = null;
|
||||||
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
|
//Must be lexicographically sorted, will do in setup, versus trying to maintain here
|
||||||
private String [] testFields = {"f1", "f2", "f3"};
|
private String[] testFields = {"f1", "f2", "f3", "f4"};
|
||||||
private boolean [] testFieldsStorePos = {true, false, true, false};
|
private boolean[] testFieldsStorePos = {true, false, true, false};
|
||||||
private boolean [] testFieldsStoreOff = {true, false, false, true};
|
private boolean[] testFieldsStoreOff = {true, false, false, true};
|
||||||
private String [] testTerms = {"this", "is", "a", "test"};
|
private String[] testTerms = {"this", "is", "a", "test"};
|
||||||
private int [][] positions = new int[testTerms.length][];
|
private int[][] positions = new int[testTerms.length][];
|
||||||
private TermVectorOffsetInfo [][] offsets = new TermVectorOffsetInfo[testTerms.length][];
|
private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
|
||||||
private RAMDirectory dir = new RAMDirectory();
|
private RAMDirectory dir = new RAMDirectory();
|
||||||
private String seg = "testSegment";
|
private String seg = "testSegment";
|
||||||
private FieldInfos fieldInfos = new FieldInfos();
|
private FieldInfos fieldInfos = new FieldInfos();
|
||||||
@ -44,35 +47,37 @@ public class TestTermVectorsReader extends TestCase {
|
|||||||
for (int i = 0; i < testFields.length; i++) {
|
for (int i = 0; i < testFields.length; i++) {
|
||||||
fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
|
fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < testTerms.length; i++)
|
for (int i = 0; i < testTerms.length; i++) {
|
||||||
{
|
|
||||||
positions[i] = new int[3];
|
positions[i] = new int[3];
|
||||||
for (int j = 0; j < positions[i].length; j++) {
|
for (int j = 0; j < positions[i].length; j++) {
|
||||||
// poditions are always sorted in increasing order
|
// poditions are always sorted in increasing order
|
||||||
positions[i][j] = (int)(j * 10 + Math.random() * 10);
|
positions[i][j] = (int) (j * 10 + Math.random() * 10);
|
||||||
}
|
}
|
||||||
offsets[i] = new TermVectorOffsetInfo[3];
|
offsets[i] = new TermVectorOffsetInfo[3];
|
||||||
for (int j = 0; j < offsets[i].length; j++){
|
for (int j = 0; j < offsets[i].length; j++) {
|
||||||
// ofsets are alway sorted in increasing order
|
// ofsets are alway sorted in increasing order
|
||||||
offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
|
offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Arrays.sort(testTerms);
|
Arrays.sort(testTerms);
|
||||||
|
//Create 5 documents for testing, they all have the same terms
|
||||||
|
writer = new TermVectorsWriter(dir, seg, fieldInfos);
|
||||||
for (int j = 0; j < 5; j++) {
|
for (int j = 0; j < 5; j++) {
|
||||||
writer = new TermVectorsWriter(dir, seg, fieldInfos);
|
|
||||||
writer.openDocument();
|
writer.openDocument();
|
||||||
|
|
||||||
for (int k = 0; k < testFields.length; k++) {
|
for (int k = 0; k < testFields.length; k++) {
|
||||||
writer.openField(testFields[k]);
|
writer.openField(testFields[k]);
|
||||||
for (int i = 0; i < testTerms.length; i++) {
|
for (int i = 0; i < testTerms.length; i++) {
|
||||||
writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
|
writer.addTerm(testTerms[i], 3, positions[i], offsets[i]);
|
||||||
}
|
}
|
||||||
writer.closeField();
|
writer.closeField();
|
||||||
}
|
}
|
||||||
writer.closeDocument();
|
writer.closeDocument();
|
||||||
writer.close();
|
|
||||||
}
|
}
|
||||||
|
writer.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected void tearDown() {
|
protected void tearDown() {
|
||||||
@ -80,34 +85,38 @@ public class TestTermVectorsReader extends TestCase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void test() {
|
public void test() {
|
||||||
//Check to see the files were created properly in setup
|
//Check to see the files were created properly in setup
|
||||||
assertTrue(writer.isDocumentOpen() == false);
|
assertTrue(writer.isDocumentOpen() == false);
|
||||||
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
|
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
|
||||||
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
|
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReader() throws IOException {
|
public void testReader() throws IOException {
|
||||||
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
||||||
assertTrue(reader != null);
|
assertTrue(reader != null);
|
||||||
TermFreqVector vector = reader.get(0, testFields[0]);
|
for (int j = 0; j < 5; j++) {
|
||||||
assertTrue(vector != null);
|
TermFreqVector vector = reader.get(j, testFields[0]);
|
||||||
String [] terms = vector.getTerms();
|
assertTrue(vector != null);
|
||||||
assertTrue(terms != null);
|
String[] terms = vector.getTerms();
|
||||||
assertTrue(terms.length == testTerms.length);
|
assertTrue(terms != null);
|
||||||
for (int i = 0; i < terms.length; i++) {
|
assertTrue(terms.length == testTerms.length);
|
||||||
String term = terms[i];
|
for (int i = 0; i < terms.length; i++) {
|
||||||
//System.out.println("Term: " + term);
|
String term = terms[i];
|
||||||
assertTrue(term.equals(testTerms[i]));
|
//System.out.println("Term: " + term);
|
||||||
|
assertTrue(term.equals(testTerms[i]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public void testPositionReader() throws IOException {
|
public void testPositionReader() throws IOException {
|
||||||
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
||||||
assertTrue(reader != null);
|
assertTrue(reader != null);
|
||||||
TermPositionVector vector;
|
TermPositionVector vector;
|
||||||
String [] terms;
|
String[] terms;
|
||||||
vector = (TermPositionVector)reader.get(0, testFields[0]);
|
vector = (TermPositionVector) reader.get(0, testFields[0]);
|
||||||
assertTrue(vector != null);
|
assertTrue(vector != null);
|
||||||
terms = vector.getTerms();
|
terms = vector.getTerms();
|
||||||
assertTrue(terms != null);
|
assertTrue(terms != null);
|
||||||
assertTrue(terms.length == testTerms.length);
|
assertTrue(terms.length == testTerms.length);
|
||||||
@ -115,14 +124,14 @@ public class TestTermVectorsReader extends TestCase {
|
|||||||
String term = terms[i];
|
String term = terms[i];
|
||||||
//System.out.println("Term: " + term);
|
//System.out.println("Term: " + term);
|
||||||
assertTrue(term.equals(testTerms[i]));
|
assertTrue(term.equals(testTerms[i]));
|
||||||
int [] positions = vector.getTermPositions(i);
|
int[] positions = vector.getTermPositions(i);
|
||||||
assertTrue(positions != null);
|
assertTrue(positions != null);
|
||||||
assertTrue(positions.length == this.positions[i].length);
|
assertTrue(positions.length == this.positions[i].length);
|
||||||
for (int j = 0; j < positions.length; j++) {
|
for (int j = 0; j < positions.length; j++) {
|
||||||
int position = positions[j];
|
int position = positions[j];
|
||||||
assertTrue(position == this.positions[i][j]);
|
assertTrue(position == this.positions[i][j]);
|
||||||
}
|
}
|
||||||
TermVectorOffsetInfo [] offset = vector.getOffsets(i);
|
TermVectorOffsetInfo[] offset = vector.getOffsets(i);
|
||||||
assertTrue(offset != null);
|
assertTrue(offset != null);
|
||||||
assertTrue(offset.length == this.offsets[i].length);
|
assertTrue(offset.length == this.offsets[i].length);
|
||||||
for (int j = 0; j < offset.length; j++) {
|
for (int j = 0; j < offset.length; j++) {
|
||||||
@ -130,9 +139,9 @@ public class TestTermVectorsReader extends TestCase {
|
|||||||
assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
|
assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
|
TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
|
||||||
assertTrue(freqVector != null);
|
assertTrue(freqVector != null);
|
||||||
assertTrue(freqVector instanceof TermPositionVector == false);
|
assertTrue(freqVector instanceof TermPositionVector == false);
|
||||||
terms = freqVector.getTerms();
|
terms = freqVector.getTerms();
|
||||||
assertTrue(terms != null);
|
assertTrue(terms != null);
|
||||||
@ -140,30 +149,30 @@ public class TestTermVectorsReader extends TestCase {
|
|||||||
for (int i = 0; i < terms.length; i++) {
|
for (int i = 0; i < terms.length; i++) {
|
||||||
String term = terms[i];
|
String term = terms[i];
|
||||||
//System.out.println("Term: " + term);
|
//System.out.println("Term: " + term);
|
||||||
assertTrue(term.equals(testTerms[i]));
|
assertTrue(term.equals(testTerms[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testOffsetReader() throws IOException {
|
public void testOffsetReader() throws IOException {
|
||||||
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
||||||
assertTrue(reader != null);
|
assertTrue(reader != null);
|
||||||
TermPositionVector vector = (TermPositionVector)reader.get(0, testFields[0]);
|
TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]);
|
||||||
assertTrue(vector != null);
|
assertTrue(vector != null);
|
||||||
String [] terms = vector.getTerms();
|
String[] terms = vector.getTerms();
|
||||||
assertTrue(terms != null);
|
assertTrue(terms != null);
|
||||||
assertTrue(terms.length == testTerms.length);
|
assertTrue(terms.length == testTerms.length);
|
||||||
for (int i = 0; i < terms.length; i++) {
|
for (int i = 0; i < terms.length; i++) {
|
||||||
String term = terms[i];
|
String term = terms[i];
|
||||||
//System.out.println("Term: " + term);
|
//System.out.println("Term: " + term);
|
||||||
assertTrue(term.equals(testTerms[i]));
|
assertTrue(term.equals(testTerms[i]));
|
||||||
int [] positions = vector.getTermPositions(i);
|
int[] positions = vector.getTermPositions(i);
|
||||||
assertTrue(positions != null);
|
assertTrue(positions != null);
|
||||||
assertTrue(positions.length == this.positions[i].length);
|
assertTrue(positions.length == this.positions[i].length);
|
||||||
for (int j = 0; j < positions.length; j++) {
|
for (int j = 0; j < positions.length; j++) {
|
||||||
int position = positions[j];
|
int position = positions[j];
|
||||||
assertTrue(position == this.positions[i][j]);
|
assertTrue(position == this.positions[i][j]);
|
||||||
}
|
}
|
||||||
TermVectorOffsetInfo [] offset = vector.getOffsets(i);
|
TermVectorOffsetInfo[] offset = vector.getOffsets(i);
|
||||||
assertTrue(offset != null);
|
assertTrue(offset != null);
|
||||||
assertTrue(offset.length == this.offsets[i].length);
|
assertTrue(offset.length == this.offsets[i].length);
|
||||||
for (int j = 0; j < offset.length; j++) {
|
for (int j = 0; j < offset.length; j++) {
|
||||||
@ -172,18 +181,112 @@ public class TestTermVectorsReader extends TestCase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testMapper() throws IOException {
|
||||||
|
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
||||||
|
assertTrue(reader != null);
|
||||||
|
SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
|
||||||
|
reader.get(0, mapper);
|
||||||
|
SortedSet set = mapper.getTermVectorEntrySet();
|
||||||
|
assertTrue("set is null and it shouldn't be", set != null);
|
||||||
|
//three fields, 4 terms, all terms are the same
|
||||||
|
assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
|
||||||
|
//Check offsets and positions
|
||||||
|
for (Iterator iterator = set.iterator(); iterator.hasNext();) {
|
||||||
|
TermVectorEntry tve = (TermVectorEntry) iterator.next();
|
||||||
|
assertTrue("tve is null and it shouldn't be", tve != null);
|
||||||
|
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
|
||||||
|
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
|
||||||
|
reader.get(1, mapper);
|
||||||
|
set = mapper.getTermVectorEntrySet();
|
||||||
|
assertTrue("set is null and it shouldn't be", set != null);
|
||||||
|
//three fields, 4 terms, all terms are the same
|
||||||
|
assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
|
||||||
|
//Should have offsets and positions b/c we are munging all the fields together
|
||||||
|
for (Iterator iterator = set.iterator(); iterator.hasNext();) {
|
||||||
|
TermVectorEntry tve = (TermVectorEntry) iterator.next();
|
||||||
|
assertTrue("tve is null and it shouldn't be", tve != null);
|
||||||
|
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
|
||||||
|
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
|
||||||
|
reader.get(0, fsMapper);
|
||||||
|
Map map = fsMapper.getFieldToTerms();
|
||||||
|
assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
|
||||||
|
for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
|
||||||
|
Map.Entry entry = (Map.Entry) iterator.next();
|
||||||
|
SortedSet sortedSet = (SortedSet) entry.getValue();
|
||||||
|
assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
|
||||||
|
for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
|
||||||
|
TermVectorEntry tve = (TermVectorEntry) inner.next();
|
||||||
|
assertTrue("tve is null and it shouldn't be", tve != null);
|
||||||
|
//Check offsets and positions.
|
||||||
|
assertTrue("tve is null and it shouldn't be", tve != null);
|
||||||
|
String field = tve.getField();
|
||||||
|
if (field.equals(testFields[0])) {
|
||||||
|
//should have offsets
|
||||||
|
|
||||||
|
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
|
||||||
|
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
|
||||||
|
}
|
||||||
|
else if (field.equals(testFields[1])) {
|
||||||
|
//should not have offsets
|
||||||
|
|
||||||
|
assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
|
||||||
|
assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//Try mapper that ignores offs and positions
|
||||||
|
fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
|
||||||
|
reader.get(0, fsMapper);
|
||||||
|
map = fsMapper.getFieldToTerms();
|
||||||
|
assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
|
||||||
|
for (Iterator iterator = map.entrySet().iterator(); iterator.hasNext();) {
|
||||||
|
Map.Entry entry = (Map.Entry) iterator.next();
|
||||||
|
SortedSet sortedSet = (SortedSet) entry.getValue();
|
||||||
|
assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
|
||||||
|
for (Iterator inner = sortedSet.iterator(); inner.hasNext();) {
|
||||||
|
TermVectorEntry tve = (TermVectorEntry) inner.next();
|
||||||
|
assertTrue("tve is null and it shouldn't be", tve != null);
|
||||||
|
//Check offsets and positions.
|
||||||
|
assertTrue("tve is null and it shouldn't be", tve != null);
|
||||||
|
String field = tve.getField();
|
||||||
|
if (field.equals(testFields[0])) {
|
||||||
|
//should have offsets
|
||||||
|
|
||||||
|
assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() == null);
|
||||||
|
assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() == null);
|
||||||
|
}
|
||||||
|
else if (field.equals(testFields[1])) {
|
||||||
|
//should not have offsets
|
||||||
|
|
||||||
|
assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
|
||||||
|
assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Make sure exceptions and bad params are handled appropriately
|
* Make sure exceptions and bad params are handled appropriately
|
||||||
*/
|
*/
|
||||||
public void testBadParams() {
|
public void testBadParams() {
|
||||||
try {
|
try {
|
||||||
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
|
||||||
assertTrue(reader != null);
|
assertTrue(reader != null);
|
||||||
//Bad document number, good field number
|
//Bad document number, good field number
|
||||||
reader.get(50, testFields[0]);
|
reader.get(50, testFields[0]);
|
||||||
fail();
|
fail();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
// expected exception
|
// expected exception
|
||||||
}
|
}
|
||||||
@ -192,7 +295,7 @@ public class TestTermVectorsReader extends TestCase {
|
|||||||
assertTrue(reader != null);
|
assertTrue(reader != null);
|
||||||
//Bad document number, no field
|
//Bad document number, no field
|
||||||
reader.get(50);
|
reader.get(50);
|
||||||
fail();
|
fail();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
// expected exception
|
// expected exception
|
||||||
}
|
}
|
||||||
@ -201,9 +304,9 @@ public class TestTermVectorsReader extends TestCase {
|
|||||||
assertTrue(reader != null);
|
assertTrue(reader != null);
|
||||||
//good document number, bad field number
|
//good document number, bad field number
|
||||||
TermFreqVector vector = reader.get(0, "f50");
|
TermFreqVector vector = reader.get(0, "f50");
|
||||||
assertTrue(vector == null);
|
assertTrue(vector == null);
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
fail();
|
fail();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -28,7 +28,9 @@ import org.apache.lucene.util.English;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.SortedSet;
|
||||||
|
|
||||||
public class TestTermVectors extends TestCase {
|
public class TestTermVectors extends TestCase {
|
||||||
private IndexSearcher searcher;
|
private IndexSearcher searcher;
|
||||||
@ -171,7 +173,7 @@ public class TestTermVectors extends TestCase {
|
|||||||
assertTrue(false);
|
assertTrue(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testKnownSetOfDocuments() {
|
public void testKnownSetOfDocuments() {
|
||||||
String test1 = "eating chocolate in a computer lab"; //6 terms
|
String test1 = "eating chocolate in a computer lab"; //6 terms
|
||||||
String test2 = "computer in a computer lab"; //5 terms
|
String test2 = "computer in a computer lab"; //5 terms
|
||||||
@ -275,20 +277,45 @@ public class TestTermVectors extends TestCase {
|
|||||||
Integer freqInt = (Integer)test4Map.get(term);
|
Integer freqInt = (Integer)test4Map.get(term);
|
||||||
assertTrue(freqInt != null);
|
assertTrue(freqInt != null);
|
||||||
assertTrue(freqInt.intValue() == freq);
|
assertTrue(freqInt.intValue() == freq);
|
||||||
}
|
}
|
||||||
|
SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
|
||||||
|
knownSearcher.reader.getTermFreqVector(hits.id(1), mapper);
|
||||||
|
SortedSet vectorEntrySet = mapper.getTermVectorEntrySet();
|
||||||
|
assertTrue("mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10);
|
||||||
|
TermVectorEntry last = null;
|
||||||
|
for (Iterator iterator = vectorEntrySet.iterator(); iterator.hasNext();) {
|
||||||
|
TermVectorEntry tve = (TermVectorEntry) iterator.next();
|
||||||
|
if (tve != null && last != null)
|
||||||
|
{
|
||||||
|
assertTrue("terms are not properly sorted", last.getFrequency() >= tve.getFrequency());
|
||||||
|
Integer expectedFreq = (Integer) test4Map.get(tve.getTerm());
|
||||||
|
//we expect double the expectedFreq, since there are two fields with the exact same text and we are collapsing all fields
|
||||||
|
assertTrue("Frequency is not correct:", tve.getFrequency() == 2*expectedFreq.intValue());
|
||||||
|
}
|
||||||
|
last = tve;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
FieldSortedTermVectorMapper fieldMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
|
||||||
|
knownSearcher.reader.getTermFreqVector(hits.id(1), fieldMapper);
|
||||||
|
Map map = fieldMapper.getFieldToTerms();
|
||||||
|
assertTrue("map Size: " + map.size() + " is not: " + 2, map.size() == 2);
|
||||||
|
vectorEntrySet = (SortedSet) map.get("field");
|
||||||
|
assertTrue("vectorEntrySet is null and it shouldn't be", vectorEntrySet != null);
|
||||||
|
assertTrue("vectorEntrySet Size: " + vectorEntrySet.size() + " is not: " + 10, vectorEntrySet.size() == 10);
|
||||||
knownSearcher.close();
|
knownSearcher.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
assertTrue(false);
|
assertTrue(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setupDoc(Document doc, String text)
|
private void setupDoc(Document doc, String text)
|
||||||
{
|
{
|
||||||
doc.add(new Field("field", text, Field.Store.YES,
|
doc.add(new Field("field", text, Field.Store.YES,
|
||||||
Field.Index.TOKENIZED, Field.TermVector.YES));
|
Field.Index.TOKENIZED, Field.TermVector.YES));
|
||||||
|
doc.add(new Field("field2", text, Field.Store.YES,
|
||||||
|
Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
|
||||||
//System.out.println("Document: " + doc);
|
//System.out.println("Document: " + doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user