Added term vector support.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150206 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 2004-02-20 20:14:56 +00:00
parent abb62bda9c
commit 12eee6df5a
41 changed files with 3734 additions and 328 deletions

View File: CHANGES.txt

@ -54,6 +54,10 @@ $Id$
9. Added MultiReader, an IndexReader that combines multiple other
IndexReaders. (Cutting)
10. Added support for term vectors. See Field#isTermVectorStored().
(Grant Ingersoll, Cutting & Dmitry)
1.3 final
1. Added catch of BooleanQuery$TooManyClauses in QueryParser to

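For illustration, a minimal usage sketch of the feature this entry describes, using only the factory and accessor introduced in this commit (the index-writing boilerplate is omitted):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class TermVectorExample {
  public static void main(String[] args) {
    Document doc = new Document();
    // the new boolean argument requests that a term vector be stored
    doc.add(Field.Text("subject", "lucene term vectors", true));
    // the two-argument factories keep the old behavior (no vector)
    doc.add(Field.Text("body", "no term vector is stored here"));
    System.out.println(doc.getField("subject").isTermVectorStored()); // true
  }
}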
View File: Field.java

@ -71,6 +71,7 @@ import org.apache.lucene.search.Hits; // for javadoc
public final class Field implements java.io.Serializable {
private String name = "body";
private String stringValue = null;
private boolean storeTermVector = false;
private Reader readerValue = null;
private boolean isStored = false;
private boolean isIndexed = true;
@ -114,7 +115,8 @@ public final class Field implements java.io.Serializable {
}
/** Constructs a String-valued Field that is not tokenized, but is indexed
and stored. Useful for non-text fields, e.g. date or url. */
and stored. Useful for non-text fields, e.g. date or url.
*/
public static final Field Keyword(String name, String value) {
return new Field(name, value, true, true, false);
}
@ -127,9 +129,9 @@ public final class Field implements java.io.Serializable {
/** Constructs a String-valued Field that is tokenized and indexed,
and is stored in the index, for return with hits. Useful for short text
fields, like "title" or "subject". */
fields, like "title" or "subject". Term vector will not be stored for this field. */
public static final Field Text(String name, String value) {
return new Field(name, value, true, true, true);
return Text(name, value, false);
}
/** Constructs a Date-valued Field that is not tokenized and is indexed,
@ -139,16 +141,38 @@ public final class Field implements java.io.Serializable {
}
/** Constructs a String-valued Field that is tokenized and indexed,
but that is not stored in the index. */
and is stored in the index, for return with hits. Useful for short text
fields, like "title" or "subject". */
public static final Field Text(String name, String value, boolean storeTermVector) {
return new Field(name, value, true, true, true, storeTermVector);
}
/** Constructs a String-valued Field that is tokenized and indexed,
but that is not stored in the index. Term vector will not be stored for this field. */
public static final Field UnStored(String name, String value) {
return new Field(name, value, false, true, true);
return UnStored(name, value, false);
}
/** Constructs a String-valued Field that is tokenized and indexed,
but that is not stored in the index. */
public static final Field UnStored(String name, String value, boolean storeTermVector) {
return new Field(name, value, false, true, true, storeTermVector);
}
/** Constructs a Reader-valued Field that is tokenized and indexed, but is
not stored in the index verbatim. Useful for longer text fields, like
"body". Term vector will not be stored for this field. */
public static final Field Text(String name, Reader value) {
return Text(name, value, false);
}
/** Constructs a Reader-valued Field that is tokenized and indexed, but is
not stored in the index verbatim. Useful for longer text fields, like
"body". */
public static final Field Text(String name, Reader value) {
return new Field(name, value);
public static final Field Text(String name, Reader value, boolean storeTermVector) {
Field f = new Field(name, value);
f.storeTermVector = storeTermVector;
return f;
}
/** The name of the field (e.g., "date", "subject", "title", or "body")
@ -162,19 +186,41 @@ public final class Field implements java.io.Serializable {
is used. Exactly one of stringValue() and readerValue() must be set. */
public Reader readerValue() { return readerValue; }
/** Create a field by specifying all parameters except for <code>storeTermVector</code>,
* which is set to <code>false</code>.
*/
public Field(String name, String string,
boolean store, boolean index, boolean token) {
this(name, string, store, index, token, false);
}
/**
*
* @param name The name of the field
* @param string The string to process
* @param store true if the field should store the string
* @param index true if the field should be indexed
* @param token true if the field should be tokenized
* @param storeTermVector true if we should store the Term Vector info
*/
public Field(String name, String string,
boolean store, boolean index, boolean token, boolean storeTermVector) {
if (name == null)
throw new IllegalArgumentException("name cannot be null");
if (string == null)
throw new IllegalArgumentException("value cannot be null");
if (!index && storeTermVector)
throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed.");
this.name = name.intern(); // field names are interned
this.stringValue = string;
this.isStored = store;
this.isIndexed = index;
this.isTokenized = token;
this.storeTermVector = storeTermVector;
}
Field(String name, Reader reader) {
if (name == null)
throw new IllegalArgumentException("name cannot be null");
@ -199,6 +245,16 @@ public final class Field implements java.io.Serializable {
Reader-valued. */
public final boolean isTokenized() { return isTokenized; }
/** True iff the term or terms used to index this field are stored as a term
* vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
* These methods do not provide access to the original content of the field,
* only to terms used to index it. If the original content must be
* preserved, use the <code>stored</code> attribute instead.
*
* @see IndexReader#getTermFreqVector(int, String)
*/
public final boolean isTermVectorStored() { return storeTermVector; }
/** Prints a Field for human consumption. */
public final String toString() {
if (isStored && isIndexed && !isTokenized)
@ -209,8 +265,14 @@ public final class Field implements java.io.Serializable {
return "Text<" + name + ":" + stringValue + ">";
else if (!isStored && isIndexed && isTokenized && readerValue!=null)
return "Text<" + name + ":" + readerValue + ">";
else if (!isStored && isIndexed && isTokenized)
{
return "UnStored<" + name + ">";
}
else
{
return super.toString();
}
}
}
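As the six-argument constructor above shows, requesting a term vector for a field that is not indexed is rejected. A small self-contained sketch of that check in use:

import org.apache.lucene.document.Field;

public class FieldValidationExample {
  public static void main(String[] args) {
    try {
      // store=true, index=false, token=false, storeTermVector=true
      new Field("id", "42", true, false, false, true);
    } catch (IllegalArgumentException e) {
      // "cannot store a term vector for fields that are not indexed."
      System.out.println(e.getMessage());
    }
  }
}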

View File: CompoundFileReader.java

@ -72,7 +72,7 @@ import java.io.IOException;
* @author Dmitry Serebrennikov
* @version $Id$
*/
public class CompoundFileReader extends Directory {
class CompoundFileReader extends Directory {
private static final class FileEntry {
long offset;

View File: DocumentWriter.java

@ -77,6 +77,13 @@ final class DocumentWriter {
private FieldInfos fieldInfos;
private int maxFieldLength;
/**
*
* @param directory The directory to write the document information to
* @param analyzer The analyzer to use for the document
* @param similarity The Similarity function
* @param maxFieldLength The maximum number of tokens a field may have
*/
DocumentWriter(Directory directory, Analyzer analyzer,
Similarity similarity, int maxFieldLength) {
this.directory = directory;
@ -86,7 +93,7 @@ final class DocumentWriter {
}
final void addDocument(String segment, Document doc)
throws IOException {
throws IOException {
// write field names
fieldInfos = new FieldInfos();
fieldInfos.add(doc);
@ -94,7 +101,7 @@ final class DocumentWriter {
// write field values
FieldsWriter fieldsWriter =
new FieldsWriter(directory, segment, fieldInfos);
new FieldsWriter(directory, segment, fieldInfos);
try {
fieldsWriter.addDocument(doc);
} finally {
@ -144,7 +151,7 @@ final class DocumentWriter {
// Tokenizes the fields of a document into Postings.
private final void invertDocument(Document doc)
throws IOException {
throws IOException {
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement();
@ -166,7 +173,7 @@ final class DocumentWriter {
reader = new StringReader(field.stringValue());
else
throw new IllegalArgumentException
("field must have either String or Reader value");
("field must have either String or Reader value");
// Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader);
@ -277,15 +284,17 @@ final class DocumentWriter {
}
private final void writePostings(Posting[] postings, String segment)
throws IOException {
throws IOException {
OutputStream freq = null, prox = null;
TermInfosWriter tis = null;
TermVectorsWriter termVectorWriter = null;
try {
//open files for inverse index storage
freq = directory.createFile(segment + ".frq");
prox = directory.createFile(segment + ".prx");
tis = new TermInfosWriter(directory, segment, fieldInfos);
TermInfo ti = new TermInfo();
String currentField = null;
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
@ -295,38 +304,65 @@ final class DocumentWriter {
tis.add(posting.term, ti);
// add an entry to the freq file
int f = posting.freq;
if (f == 1) // optimize freq=1
int postingFreq = posting.freq;
if (postingFreq == 1) // optimize freq=1
freq.writeVInt(1); // set low bit of doc num.
else {
freq.writeVInt(0); // the document number
freq.writeVInt(f); // frequency in doc
freq.writeVInt(postingFreq); // frequency in doc
}
int lastPosition = 0; // write positions
int[] positions = posting.positions;
for (int j = 0; j < f; j++) { // use delta-encoding
for (int j = 0; j < postingFreq; j++) { // use delta-encoding
int position = positions[j];
prox.writeVInt(position - lastPosition);
lastPosition = position;
}
// check to see if we switched to a new field
String termField = posting.term.field();
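// field names are interned, so reference inequality detects a field change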
if (currentField != termField) {
// changing field - see if there is something to save
currentField = termField;
FieldInfo fi = fieldInfos.fieldInfo(currentField);
if (fi.storeTermVector) {
if (termVectorWriter == null) {
termVectorWriter =
new TermVectorsWriter(directory, segment, fieldInfos);
termVectorWriter.openDocument();
}
termVectorWriter.openField(currentField);
} else if (termVectorWriter != null) {
termVectorWriter.closeField();
}
}
if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
termVectorWriter.addTerm(posting.term.text(), postingFreq);
}
}
if (termVectorWriter != null)
termVectorWriter.closeDocument();
} finally {
if (freq != null) freq.close();
if (prox != null) prox.close();
if (tis != null) tis.close();
// make an effort to close all streams we can but remember and re-throw
// the first exception encountered in this process
IOException keep = null;
if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (keep != null) throw (IOException) keep.fillInStackTrace();
}
}
private final void writeNorms(Document doc, String segment)
throws IOException {
throws IOException {
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement();
if (field.isIndexed()) {
int n = fieldInfos.fieldNumber(field.name());
float norm =
fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]);
fieldBoosts[n] * similarity.lengthNorm(field.name(), fieldLengths[n]);
OutputStream norms = directory.createFile(segment + ".f" + n);
try {
norms.writeByte(similarity.encodeNorm(norm));

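The term vector writing in writePostings above follows a small open/close protocol on TermVectorsWriter (a package-private helper; the call names below are taken directly from that method, so this is only a sketch of the sequence, not a runnable snippet on its own):

TermVectorsWriter writer = new TermVectorsWriter(directory, segment, fieldInfos);
writer.openDocument();          // once per document
writer.openField("subject");    // once per field that stores vectors
writer.addTerm("lucene", 2);    // term text and its frequency in the field
writer.addTerm("vectors", 1);
writer.closeField();
writer.closeDocument();         // flushes this document's vectors
writer.close();                 // closes the underlying .tvx/.tvd/.tvf files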
View File: FieldInfo.java

@ -59,9 +59,13 @@ final class FieldInfo {
boolean isIndexed;
int number;
FieldInfo(String na, boolean tk, int nu) {
// true if term vector for this field should be stored
boolean storeTermVector;
FieldInfo(String na, boolean tk, int nu, boolean storeTermVector) {
name = na;
isIndexed = tk;
number = nu;
this.storeTermVector = storeTermVector;
}
}

View File: FieldInfos.java

@ -54,11 +54,7 @@ package org.apache.lucene.index;
* <http://www.apache.org/>.
*/
import java.util.Hashtable;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Collection;
import java.util.Iterator;
import java.util.*;
import java.io.IOException;
import org.apache.lucene.document.Document;
@ -68,6 +64,12 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.InputStream;
/** Access to the Field Info file that describes document fields and whether or
* not they are indexed. Each segment has a separate Field Info file. Objects
* of this class are thread-safe for multiple readers, but only one thread can
* be adding documents at a time, with no other reader or writer threads
* accessing this object.
*/
final class FieldInfos {
private Vector byNumber = new Vector();
private Hashtable byName = new Hashtable();
@ -76,6 +78,15 @@ final class FieldInfos {
add("", false);
}
/**
* Construct a FieldInfos object by reading it (via an InputStream) from the
* named file in the given Directory.
* @param d The directory to open the InputStream from
* @param name The name of the file to open the InputStream from in the Directory
* @throws IOException
*
* @see #read
*/
FieldInfos(Directory d, String name) throws IOException {
InputStream input = d.openFile(name);
try {
@ -86,36 +97,83 @@ final class FieldInfos {
}
/** Adds field info for a Document. */
final void add(Document doc) {
Enumeration fields = doc.fields();
public void add(Document doc) {
Enumeration fields = doc.fields();
while (fields.hasMoreElements()) {
Field field = (Field)fields.nextElement();
add(field.name(), field.isIndexed());
Field field = (Field) fields.nextElement();
add(field.name(), field.isIndexed(), field.isTermVectorStored());
}
}
final void add(Collection names, boolean isIndexed) {
/**
* Adds fields to this FieldInfos, marking them all as indexed.
* @param names The names of the fields
* @param storeTermVectors Whether the fields store term vectors or not
*/
public void addIndexed(Collection names, boolean storeTermVectors) {
Iterator i = names.iterator();
while (i.hasNext()) {
add((String)i.next(), true, storeTermVectors);
}
}
/**
* Assumes the field is not storing term vectors
* @param names The names of the fields
* @param isIndexed Whether the fields are indexed or not
*
* @see #add(String, boolean)
*/
public void add(Collection names, boolean isIndexed) {
Iterator i = names.iterator();
while (i.hasNext()) {
add((String)i.next(), isIndexed);
}
}
final void add(String name, boolean isIndexed) {
FieldInfo fi = fieldInfo(name);
if (fi == null)
addInternal(name, isIndexed);
else if (fi.isIndexed != isIndexed)
fi.isIndexed = true;
}
/**
* Calls the three-parameter add with false for the storeTermVector parameter
* @param name The name of the Field
* @param isIndexed true if the field is indexed
* @see #add(String, boolean, boolean)
*/
public void add(String name, boolean isIndexed) {
add(name, isIndexed, false);
}
private final void addInternal(String name, boolean isIndexed) {
FieldInfo fi = new FieldInfo(name, isIndexed, byNumber.size());
/** If the field is not yet known, adds it. If it is known, checks to make
* sure that the isIndexed flag is the same as was given previously for this
* field. If not, marks it as indexed. The same applies to storeTermVector.
*
* @param name The name of the field
* @param isIndexed true if the field is indexed
* @param storeTermVector true if the term vector should be stored
*/
public void add(String name, boolean isIndexed, boolean storeTermVector) {
FieldInfo fi = fieldInfo(name);
if (fi == null) {
addInternal(name, isIndexed, storeTermVector);
} else {
if (fi.isIndexed != isIndexed) {
fi.isIndexed = true; // once indexed, always index
}
if (fi.storeTermVector != storeTermVector) {
fi.storeTermVector = true; // once vector, always vector
}
}
}
private void addInternal(String name, boolean isIndexed,
boolean storeTermVector) {
FieldInfo fi =
new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector);
byNumber.addElement(fi);
byName.put(name, fi);
}
final int fieldNumber(String fieldName) {
public int fieldNumber(String fieldName) {
FieldInfo fi = fieldInfo(fieldName);
if (fi != null)
return fi.number;
@ -123,23 +181,32 @@ final class FieldInfos {
return -1;
}
final FieldInfo fieldInfo(String fieldName) {
return (FieldInfo)byName.get(fieldName);
public FieldInfo fieldInfo(String fieldName) {
return (FieldInfo) byName.get(fieldName);
}
final String fieldName(int fieldNumber) {
public String fieldName(int fieldNumber) {
return fieldInfo(fieldNumber).name;
}
final FieldInfo fieldInfo(int fieldNumber) {
return (FieldInfo)byNumber.elementAt(fieldNumber);
public FieldInfo fieldInfo(int fieldNumber) {
return (FieldInfo) byNumber.elementAt(fieldNumber);
}
final int size() {
public int size() {
return byNumber.size();
}
final void write(Directory d, String name) throws IOException {
public boolean hasVectors() {
boolean hasVectors = false;
for (int i = 0; i < size(); i++) {
if (fieldInfo(i).storeTermVector)
hasVectors = true;
}
return hasVectors;
}
public void write(Directory d, String name) throws IOException {
OutputStream output = d.createFile(name);
try {
write(output);
@ -148,19 +215,29 @@ final class FieldInfos {
}
}
final void write(OutputStream output) throws IOException {
public void write(OutputStream output) throws IOException {
output.writeVInt(size());
for (int i = 0; i < size(); i++) {
FieldInfo fi = fieldInfo(i);
byte bits = 0x0;
if (fi.isIndexed) bits |= 0x1;
if (fi.storeTermVector) bits |= 0x2;
output.writeString(fi.name);
output.writeByte((byte)(fi.isIndexed ? 1 : 0));
output.writeByte(bits);
}
}
private final void read(InputStream input) throws IOException {
int size = input.readVInt();
for (int i = 0; i < size; i++)
addInternal(input.readString().intern(),
input.readByte() != 0);
private void read(InputStream input) throws IOException {
int size = input.readVInt(); // read in the size
for (int i = 0; i < size; i++) {
String name = input.readString().intern();
byte bits = input.readByte();
boolean isIndexed = (bits & 0x1) != 0;
boolean storeTermVector = (bits & 0x2) != 0;
addInternal(name, isIndexed, storeTermVector);
}
}
}
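The .fnm format change above packs the two per-field booleans into one flag byte. A self-contained sketch of the encoding scheme used by write() and read():

public class FieldFlagBits {
  static byte encode(boolean isIndexed, boolean storeTermVector) {
    byte bits = 0x0;
    if (isIndexed) bits |= 0x1;        // low bit: field is indexed
    if (storeTermVector) bits |= 0x2;  // second bit: term vector is stored
    return bits;
  }

  public static void main(String[] args) {
    byte bits = encode(true, true);               // 0x3
    boolean isIndexed = (bits & 0x1) != 0;        // true
    boolean storeTermVector = (bits & 0x2) != 0;  // true
    System.out.println(isIndexed + " " + storeTermVector);
  }
}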

View File: FieldsReader.java

@ -63,6 +63,7 @@ import org.apache.lucene.document.Field;
/**
* Class responsible for access to stored document fields.
*
* It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
*
* @version $Id$
@ -108,7 +109,7 @@ final class FieldsReader {
fieldsStream.readString(), // read value
true, // stored
fi.isIndexed, // indexed
(bits & 1) != 0)); // tokenized
(bits & 1) != 0, fi.storeTermVector)); // vector
}
return doc;

View File: FilterIndexReader.java

@ -66,7 +66,7 @@ import org.apache.lucene.document.Document;
* contained index reader. Subclasses of <code>FilterIndexReader</code> may
* further override some of these methods and may also provide additional
* methods and fields.
*/
*/
public class FilterIndexReader extends IndexReader {
/** Base class for filtering {@link TermDocs} implementations. */
@ -89,7 +89,7 @@ public class FilterIndexReader extends IndexReader {
/** Base class for filtering {@link TermPositions} implementations. */
public static class FilterTermPositions
extends FilterTermDocs implements TermPositions {
extends FilterTermDocs implements TermPositions {
public FilterTermPositions(TermPositions in) { super(in); }
@ -118,10 +118,20 @@ public class FilterIndexReader extends IndexReader {
this.in = in;
}
public TermFreqVector[] getTermFreqVectors(int docNumber)
throws IOException {
return in.getTermFreqVectors(docNumber);
}
public TermFreqVector getTermFreqVector(int docNumber, String field)
throws IOException {
return in.getTermFreqVector(docNumber, field);
}
public int numDocs() { return in.numDocs(); }
public int maxDoc() { return in.maxDoc(); }
public Document document(int n) throws IOException {return in.document(n);}
public Document document(int n) throws IOException { return in.document(n); }
public boolean isDeleted(int n) { return in.isDeleted(n); }
public boolean hasDeletions() { return in.hasDeletions(); }
@ -132,7 +142,7 @@ public class FilterIndexReader extends IndexReader {
in.norms(f, bytes, offset);
}
public void setNorm(int d, String f, byte b) throws IOException {
in.setNorm(d,f,b);
in.setNorm(d, f, b);
}
public TermEnum terms() throws IOException { return in.terms(); }
@ -141,6 +151,7 @@ public class FilterIndexReader extends IndexReader {
public int docFreq(Term t) throws IOException { return in.docFreq(t); }
public TermDocs termDocs() throws IOException { return in.termDocs(); }
public TermPositions termPositions() throws IOException {
return in.termPositions();
}
@ -151,7 +162,18 @@ public class FilterIndexReader extends IndexReader {
public Collection getFieldNames() throws IOException {
return in.getFieldNames();
}
public Collection getFieldNames(boolean indexed) throws IOException {
return in.getFieldNames(indexed);
}
/**
* Returns the names of indexed fields, filtered by term vector storage.
* @param storedTermVector if true, returns only indexed fields that have term vector info,
* else only indexed fields without term vector info
* @return Collection of Strings indicating the names of the fields
*/
public Collection getIndexedFieldNames(boolean storedTermVector) {
return in.getIndexedFieldNames(storedTermVector);
}
}

View File: IndexReader.java

@ -66,20 +66,20 @@ import org.apache.lucene.document.Field; // for javadoc
import org.apache.lucene.search.Similarity;
/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
so that any subclass which implements it is searchable.
index. Search of an index is done entirely through this abstract interface,
so that any subclass which implements it is searchable.
<p> Concrete subclasses of IndexReader are usually constructed with a call to
the static method {@link #open}.
<p> Concrete subclasses of IndexReader are usually constructed with a call to
the static method {@link #open}.
<p> For efficiency, in this API documents are often referred to via
<i>document numbers</i>, non-negative integers which each name a unique
document in the index. These document numbers are ephemeral--they may change
as documents are added to and deleted from an index. Clients should thus not
rely on a given document having the same number between sessions.
<p> For efficiency, in this API documents are often referred to via
<i>document numbers</i>, non-negative integers which each name a unique
document in the index. These document numbers are ephemeral--they may change
as documents are added to and deleted from an index. Clients should thus not
rely on a given document having the same number between sessions.
@author Doug Cutting
@version $Id$
@author Doug Cutting
@version $Id$
*/
public abstract class IndexReader {
protected IndexReader(Directory directory) {
@ -92,21 +92,21 @@ public abstract class IndexReader {
private Lock writeLock;
SegmentInfos segmentInfos = null;
private boolean stale = false;
/** Returns an IndexReader reading the index in an FSDirectory in the named
path. */
path. */
public static IndexReader open(String path) throws IOException {
return open(FSDirectory.getDirectory(path, false));
}
/** Returns an IndexReader reading the index in an FSDirectory in the named
path. */
path. */
public static IndexReader open(File path) throws IOException {
return open(FSDirectory.getDirectory(path, false));
}
/** Returns an IndexReader reading the index in the given Directory. */
public static IndexReader open(final Directory directory) throws IOException{
public static IndexReader open(final Directory directory) throws IOException {
synchronized (directory) { // in- & inter-process sync
return (IndexReader)new Lock.With(
directory.makeLock(IndexWriter.COMMIT_LOCK_NAME),
@ -117,10 +117,10 @@ public abstract class IndexReader {
if (infos.size() == 1) { // index is optimized
return new SegmentReader(infos, infos.info(0), true);
} else {
IndexReader[] readers = new IndexReader[infos.size()];
for (int i = 0; i < infos.size(); i++)
readers[i] = new SegmentReader(infos, infos.info(i), i==infos.size()-1);
return new MultiReader(directory, readers);
IndexReader[] readers = new IndexReader[infos.size()];
for (int i = 0; i < infos.size(); i++)
readers[i] = new SegmentReader(infos, infos.info(i), i==infos.size()-1);
return new MultiReader(directory, readers);
}
}
}.run();
@ -174,7 +174,7 @@ public abstract class IndexReader {
public static long lastModified(Directory directory) throws IOException {
return directory.fileModified("segments");
}
/**
* Reads version number from segments files. The version number counts the
* number of changes of the index.
@ -186,7 +186,7 @@ public abstract class IndexReader {
public static long getCurrentVersion(String directory) throws IOException {
return getCurrentVersion(new File(directory));
}
/**
* Reads version number from segments files. The version number counts the
* number of changes of the index.
@ -201,7 +201,7 @@ public abstract class IndexReader {
dir.close();
return version;
}
/**
* Reads version number from segments files. The version number counts the
* number of changes of the index.
@ -214,6 +214,27 @@ public abstract class IndexReader {
return SegmentInfos.readCurrentVersion(directory);
}
/** Return an array of term frequency vectors for the specified document.
* The array contains a vector for each vectorized field in the document.
* Each vector contains term numbers and frequencies for all terms
* in a given vectorized field.
* If no such fields existed, the method returns null.
*
* @see Field#isTermVectorStored()
*/
abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
throws IOException;
/** Return a term frequency vector for the specified document and field. The
* vector returned contains term numbers and frequencies for all terms in
* the specified field of this document, if the field had storeTermVector
* flag set. If the flag was not set, the method returns null.
*
* @see Field#isTermVectorStored()
*/
abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
throws IOException;
/**
* Returns <code>true</code> if an index exists at the specified directory,
* and <code>false</code> if the directory does not exist or contains no index.
@ -250,13 +271,13 @@ public abstract class IndexReader {
public abstract int numDocs();
/** Returns one greater than the largest possible document number.
This may be used to, e.g., determine how big to allocate an array which
will have an element for every document number in an index.
This may be used to, e.g., determine how big to allocate an array which
will have an element for every document number in an index.
*/
public abstract int maxDoc();
/** Returns the stored fields of the <code>n</code><sup>th</sup>
<code>Document</code> in this index. */
<code>Document</code> in this index. */
public abstract Document document(int n) throws IOException;
/** Returns true if document <i>n</i> has been deleted */
@ -264,7 +285,7 @@ public abstract class IndexReader {
/** Returns true if any documents have been deleted */
public abstract boolean hasDeletions();
/** Returns the byte-encoded normalization factor for the named field of
* every document. This is used by the search code to score documents.
*
@ -283,14 +304,14 @@ public abstract class IndexReader {
/** Expert: Resets the normalization factor for the named field of the named
* document. The norm represents the product of the field's {@link
* Field#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
* int) length normalization}. Thus, to preserve the length normalization
* int) length normalization}. Thus, to preserve the length normalization
* values when resetting this, one should base the new value upon the old.
*
* @see #norms(String)
* @see Similarity#decodeNorm(byte)
*/
public abstract void setNorm(int doc, String field, byte value)
throws IOException;
throws IOException;
/** Expert: Resets the normalization factor for the named field of the named
* document.
@ -299,20 +320,20 @@ public abstract class IndexReader {
* @see Similarity#decodeNorm(byte)
*/
public void setNorm(int doc, String field, float value)
throws IOException {
throws IOException {
setNorm(doc, field, Similarity.encodeNorm(value));
}
/** Returns an enumeration of all the terms in the index.
The enumeration is ordered by Term.compareTo(). Each term
is greater than all that precede it in the enumeration.
The enumeration is ordered by Term.compareTo(). Each term
is greater than all that precede it in the enumeration.
*/
public abstract TermEnum terms() throws IOException;
/** Returns an enumeration of all terms after a given term.
The enumeration is ordered by Term.compareTo(). Each term
is greater than all that precede it in the enumeration.
The enumeration is ordered by Term.compareTo(). Each term
is greater than all that precede it in the enumeration.
*/
public abstract TermEnum terms(Term t) throws IOException;
@ -320,15 +341,15 @@ public abstract class IndexReader {
public abstract int docFreq(Term t) throws IOException;
/** Returns an enumeration of all the documents which contain
<code>term</code>. For each document, the document number, the frequency of
the term in that document is also provided, for use in search scoring.
Thus, this method implements the mapping:
<p><ul>
Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup>
</ul>
<p>The enumeration is ordered by document number. Each document number
is greater than all that precede it in the enumeration.
*/
<code>term</code>. For each document, the document number, the frequency of
the term in that document is also provided, for use in search scoring.
Thus, this method implements the mapping:
<p><ul>
Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup>
</ul>
<p>The enumeration is ordered by document number. Each document number
is greater than all that precede it in the enumeration.
*/
public TermDocs termDocs(Term term) throws IOException {
TermDocs termDocs = termDocs();
termDocs.seek(term);
@ -339,21 +360,21 @@ public abstract class IndexReader {
public abstract TermDocs termDocs() throws IOException;
/** Returns an enumeration of all the documents which contain
<code>term</code>. For each document, in addition to the document number
and frequency of the term in that document, a list of all of the ordinal
positions of the term in the document is available. Thus, this method
implements the mapping:
<code>term</code>. For each document, in addition to the document number
and frequency of the term in that document, a list of all of the ordinal
positions of the term in the document is available. Thus, this method
implements the mapping:
<p><ul>
Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq,
&lt;pos<sub>1</sub>, pos<sub>2</sub>, ...
pos<sub>freq-1</sub>&gt;
&gt;<sup>*</sup>
</ul>
<p> This positional information faciliates phrase and proximity searching.
<p>The enumeration is ordered by document number. Each document number is
greater than all that precede it in the enumeration.
*/
<p><ul>
Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq,
&lt;pos<sub>1</sub>, pos<sub>2</sub>, ...
pos<sub>freq-1</sub>&gt;
&gt;<sup>*</sup>
</ul>
<p> This positional information facilitates phrase and proximity searching.
<p>The enumeration is ordered by document number. Each document number is
greater than all that precede it in the enumeration.
*/
public TermPositions termPositions(Term term) throws IOException {
TermPositions termPositions = termPositions();
termPositions.seek(term);
@ -364,16 +385,16 @@ public abstract class IndexReader {
public abstract TermPositions termPositions() throws IOException;
/** Deletes the document numbered <code>docNum</code>. Once a document is
deleted it will not appear in TermDocs or TermPositions enumerations.
Attempts to read its field with the {@link #document}
method will result in an error. The presence of this document may still be
reflected in the {@link #docFreq} statistic, though
this will be corrected eventually as the index is further modified.
*/
deleted it will not appear in TermDocs or TermPositions enumerations.
Attempts to read its field with the {@link #document}
method will result in an error. The presence of this document may still be
reflected in the {@link #docFreq} statistic, though
this will be corrected eventually as the index is further modified.
*/
public final synchronized void delete(int docNum) throws IOException {
if(stale)
if (stale)
throw new IOException("IndexReader out of date and no longer valid for deletion");
if (writeLock == null) {
Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
if (!writeLock.obtain(IndexWriter.WRITE_LOCK_TIMEOUT)) // obtain write lock
@ -382,11 +403,11 @@ public abstract class IndexReader {
// we have to check whether index has changed since this reader was opened.
// if so, this reader is no longer valid for deletion
if(segmentInfos != null && SegmentInfos.readCurrentVersion(directory) > segmentInfos.getVersion()){
stale = true;
this.writeLock.release();
this.writeLock = null;
throw new IOException("IndexReader out of date and no longer valid for deletion");
if (segmentInfos != null && SegmentInfos.readCurrentVersion(directory) > segmentInfos.getVersion()) {
stale = true;
this.writeLock.release();
this.writeLock = null;
throw new IOException("IndexReader out of date and no longer valid for deletion");
}
}
doDelete(docNum);
@ -398,14 +419,14 @@ public abstract class IndexReader {
protected abstract void doDelete(int docNum) throws IOException;
/** Deletes all documents containing <code>term</code>.
This is useful if one uses a document field to hold a unique ID string for
the document. Then to delete such a document, one merely constructs a
term with the appropriate field and the unique ID string as its text and
passes it to this method. Returns the number of documents deleted.
*/
This is useful if one uses a document field to hold a unique ID string for
the document. Then to delete such a document, one merely constructs a
term with the appropriate field and the unique ID string as its text and
passes it to this method. Returns the number of documents deleted.
*/
public final int delete(Term term) throws IOException {
TermDocs docs = termDocs(term);
if ( docs == null ) return 0;
if (docs == null) return 0;
int n = 0;
try {
while (docs.next()) {
@ -444,25 +465,33 @@ public abstract class IndexReader {
writeLock = null;
}
}
/**
* Returns a list of all unique field names that exist in the index pointed to by
* this IndexReader.
* Returns a list of all unique field names that exist in the index pointed
* to by this IndexReader.
* @return Collection of Strings indicating the names of the fields
* @throws IOException if there is a problem with accessing the index
*/
public abstract Collection getFieldNames() throws IOException;
/**
* Returns a list of all unique field names that exist in the index pointed to by
* this IndexReader. The boolean argument specifies whether the fields returned
* are indexed or not.
* Returns a list of all unique field names that exist in the index pointed
* to by this IndexReader. The boolean argument specifies whether the fields
* returned are indexed or not.
* @param indexed <code>true</code> if only indexed fields should be returned;
* <code>false</code> if only unindexed fields should be returned.
* @return Collection of Strings indicating the names of the fields
* @throws IOException if there is a problem with accessing the index
*/
public abstract Collection getFieldNames(boolean indexed) throws IOException;
public abstract Collection getFieldNames(boolean indexed) throws IOException;
/**
* Returns the names of indexed fields, filtered by term vector storage.
* @param storedTermVector if true, returns only indexed fields that have term vector info,
* else only indexed fields without term vector info
* @return Collection of Strings indicating the names of the fields
*/
public abstract Collection getIndexedFieldNames(boolean storedTermVector);
/**
* Returns <code>true</code> iff the index in the named directory is
@ -470,12 +499,12 @@ public abstract class IndexReader {
* @param directory the directory to check for a lock
* @throws IOException if there is a problem with accessing the index
*/
public static boolean isLocked(Directory directory) throws IOException {
return
directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked() ||
directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).isLocked();
public static boolean isLocked(Directory directory) throws IOException {
return
directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked() ||
directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).isLocked();
}
}
/**
* Returns <code>true</code> iff the index in the named directory is
@ -483,19 +512,19 @@ public abstract class IndexReader {
* @param directory the directory to check for a lock
* @throws IOException if there is a problem with accessing the index
*/
public static boolean isLocked(String directory) throws IOException {
return isLocked(FSDirectory.getDirectory(directory, false));
}
public static boolean isLocked(String directory) throws IOException {
return isLocked(FSDirectory.getDirectory(directory, false));
}
/**
* Forcibly unlocks the index in the named directory.
* <P>
* Caution: this should only be used by failure recovery code,
* when it is known that no other process nor thread is in fact
* currently accessing this index.
*/
public static void unlock(Directory directory) throws IOException {
directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release();
directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).release();
}
/**
* Forcibly unlocks the index in the named directory.
* <P>
* Caution: this should only be used by failure recovery code,
* when it is known that no other process nor thread is in fact
* currently accessing this index.
*/
public static void unlock(Directory directory) throws IOException {
directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release();
directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).release();
}
}
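A hedged usage sketch for the two new accessors declared above (the index path and field name are illustrative; it assumes the "subject" field was indexed with storeTermVector set):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;

public class ReadVectorsExample {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open("/path/to/index");
    TermFreqVector vector = reader.getTermFreqVector(0, "subject");
    if (vector != null) {               // null when no vector was stored
      String[] terms = vector.getTerms();
      int[] freqs = vector.getTermFrequencies();
      for (int i = 0; i < terms.length; i++)
        System.out.println(terms[i] + ": " + freqs[i]);
    }
    reader.close();
  }
}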

View File: MultiReader.java

@ -75,7 +75,7 @@ public class MultiReader extends IndexReader {
private int maxDoc = 0;
private int numDocs = -1;
private boolean hasDeletions = false;
/** Construct reading the named set of readers. */
public MultiReader(IndexReader[] readers) throws IOException {
this(readers.length == 0 ? null : readers[0].directory(), readers);
@ -97,6 +97,25 @@ public class MultiReader extends IndexReader {
starts[readers.length] = maxDoc;
}
/** Return an array of term frequency vectors for the specified document.
* The array contains a vector for each vectorized field in the document.
* Each vector contains term numbers and frequencies for all terms
* in a given vectorized field.
* If no such fields existed, the method returns null.
*/
public TermFreqVector[] getTermFreqVectors(int n)
throws IOException {
int i = readerIndex(n); // find segment num
return readers[i].getTermFreqVectors(n - starts[i]); // dispatch to segment
}
public TermFreqVector getTermFreqVector(int n, String field)
throws IOException {
int i = readerIndex(n); // find segment num
return readers[i].getTermFreqVector(n - starts[i], field);
}
public synchronized int numDocs() {
if (numDocs == -1) { // check cache
int n = 0; // cache miss--recompute
@ -245,6 +264,18 @@ public class MultiReader extends IndexReader {
}
return fieldSet;
}
public Collection getIndexedFieldNames(boolean storedTermVector) {
// maintain a unique set of field names
Set fieldSet = new HashSet();
for (int i = 0; i < readers.length; i++) {
IndexReader reader = readers[i];
Collection names = reader.getIndexedFieldNames(storedTermVector);
fieldSet.addAll(names);
}
return fieldSet;
}
}
class MultiTermEnum extends TermEnum {

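The dispatch in getTermFreqVectors above translates a global document number into a sub-reader index plus a local document number via the starts[] array. A stand-alone worked example of that translation (readerIndex is assumed to perform this search; n is always below maxDoc):

public class DocRemapExample {
  public static void main(String[] args) {
    int[] starts = {0, 5, 12};   // two sub-readers with maxDoc 5 and 7
    int n = 8;                   // global document number
    int i = 0;
    while (i + 1 < starts.length && starts[i + 1] <= n)
      i++;                       // find the segment containing doc n
    int local = n - starts[i];   // 3: global doc 8 is doc 3 of reader 1
    System.out.println("reader " + i + ", local doc " + local);
  }
}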
View File: SegmentMerger.java

@ -61,10 +61,19 @@ import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BitVector;
/**
* The SegmentMerger class combines two or more Segments, each represented by
* an IndexReader ({@link #add}), into a single Segment. After adding the
* appropriate readers, call the merge method to combine the segments.
*<P>
* If the compoundFile flag is set, then the segments will be merged into a compound file.
*
*
* @see #merge
* @see #add
*/
final class SegmentMerger {
private boolean useCompoundFile;
private Directory directory;
@ -77,51 +86,78 @@ final class SegmentMerger {
private static final String COMPOUND_EXTENSIONS[] = new String[] {
"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
};
private static final String VECTOR_EXTENSIONS[] = new String[] {
"tvx", "tvd", "tvf"
};
/**
*
* @param dir The Directory to merge the other segments into
* @param name The name of the new segment
* @param compoundFile true if the new segment should use a compoundFile
*/
SegmentMerger(Directory dir, String name, boolean compoundFile) {
directory = dir;
segment = name;
useCompoundFile = compoundFile;
}
/**
* Add an IndexReader to the collection of readers that are to be merged
* @param reader The reader to add
*/
final void add(IndexReader reader) {
readers.addElement(reader);
}
/**
*
* @param i The index of the reader to return
* @return The ith reader to be merged
*/
final IndexReader segmentReader(int i) {
return (IndexReader)readers.elementAt(i);
return (IndexReader) readers.elementAt(i);
}
/**
* Merges the readers specified by the {@link #add} method into the directory passed to the constructor
* @return The number of documents that were merged
* @throws IOException
*/
final int merge() throws IOException {
int value;
try {
value = mergeFields();
mergeTerms();
mergeNorms();
if (fieldInfos.hasVectors())
mergeVectors();
} finally {
for (int i = 0; i < readers.size(); i++) { // close readers
IndexReader reader = (IndexReader)readers.elementAt(i);
reader.close();
IndexReader reader = (IndexReader) readers.elementAt(i);
reader.close();
}
}
if (useCompoundFile)
createCompoundFile();
createCompoundFile();
return value;
}
private final void createCompoundFile()
throws IOException {
CompoundFileWriter cfsWriter =
new CompoundFileWriter(directory, segment + ".cfs");
ArrayList files =
new ArrayList(COMPOUND_EXTENSIONS.length + fieldInfos.size());
private final void createCompoundFile()
throws IOException {
CompoundFileWriter cfsWriter =
new CompoundFileWriter(directory, segment + ".cfs");
ArrayList files =
new ArrayList(COMPOUND_EXTENSIONS.length + fieldInfos.size());
// Basic files
for (int i=0; i<COMPOUND_EXTENSIONS.length; i++) {
files.add(segment + "." + COMPOUND_EXTENSIONS[i]);
for (int i = 0; i < COMPOUND_EXTENSIONS.length; i++) {
files.add(segment + "." + COMPOUND_EXTENSIONS[i]);
}
// Field norm files
@ -132,9 +168,16 @@ final class SegmentMerger {
}
}
// Vector files
if (fieldInfos.hasVectors()) {
for (int i = 0; i < VECTOR_EXTENSIONS.length; i++) {
files.add(segment + "." + VECTOR_EXTENSIONS[i]);
}
}
// Now merge all added files
Iterator it = files.iterator();
while(it.hasNext()) {
while (it.hasNext()) {
cfsWriter.addFile((String) it.next());
}
@ -143,33 +186,38 @@ final class SegmentMerger {
// Now delete the source files
it = files.iterator();
while(it.hasNext()) {
while (it.hasNext()) {
directory.deleteFile((String) it.next());
}
}
/**
*
* @return The number of documents in all of the readers
* @throws IOException
*/
private final int mergeFields() throws IOException {
fieldInfos = new FieldInfos(); // merge field names
int docCount = 0;
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader)readers.elementAt(i);
fieldInfos.add(reader.getFieldNames(true), true);
IndexReader reader = (IndexReader) readers.elementAt(i);
fieldInfos.addIndexed(reader.getIndexedFieldNames(true), true);
fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false);
fieldInfos.add(reader.getFieldNames(false), false);
}
fieldInfos.write(directory, segment + ".fnm");
FieldsWriter fieldsWriter = // merge field values
new FieldsWriter(directory, segment, fieldInfos);
FieldsWriter fieldsWriter = // merge field values
new FieldsWriter(directory, segment, fieldInfos);
try {
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader)readers.elementAt(i);
int maxDoc = reader.maxDoc();
for (int j = 0; j < maxDoc; j++)
if (!reader.isDeleted(j)){ // skip deleted docs
IndexReader reader = (IndexReader) readers.elementAt(i);
int maxDoc = reader.maxDoc();
for (int j = 0; j < maxDoc; j++)
if (!reader.isDeleted(j)) { // skip deleted docs
fieldsWriter.addDocument(reader.document(j));
docCount++;
}
}
}
} finally {
fieldsWriter.close();
@ -177,6 +225,50 @@ final class SegmentMerger {
return docCount;
}
/**
* Merge the TermVectors from each of the segments into the new one.
* @throws IOException
*/
private final void mergeVectors() throws IOException {
TermVectorsWriter termVectorsWriter =
new TermVectorsWriter(directory, segment, fieldInfos);
try {
for (int r = 0; r < readers.size(); r++) {
IndexReader reader = (IndexReader) readers.elementAt(r);
int maxDoc = reader.maxDoc();
for (int docNum = 0; docNum < maxDoc; docNum++) {
// skip deleted docs
if (reader.isDeleted(docNum)) {
continue;
}
termVectorsWriter.openDocument();
// get all term vectors
TermFreqVector[] sourceTermVector =
reader.getTermFreqVectors(docNum);
if (sourceTermVector != null) {
for (int f = 0; f < sourceTermVector.length; f++) {
// translate field numbers
TermFreqVector termVector = sourceTermVector[f];
termVectorsWriter.openField(termVector.getField());
String [] terms = termVector.getTerms();
int [] freqs = termVector.getTermFrequencies();
for (int t = 0; t < terms.length; t++) {
termVectorsWriter.addTerm(terms[t], freqs[t]);
}
}
termVectorsWriter.closeDocument();
}
}
}
} finally {
termVectorsWriter.close();
}
}
private OutputStream freqOutput = null;
private OutputStream proxOutput = null;
private TermInfosWriter termInfosWriter = null;
@ -187,15 +279,15 @@ final class SegmentMerger {
freqOutput = directory.createFile(segment + ".frq");
proxOutput = directory.createFile(segment + ".prx");
termInfosWriter =
new TermInfosWriter(directory, segment, fieldInfos);
new TermInfosWriter(directory, segment, fieldInfos);
mergeTermInfos();
} finally {
if (freqOutput != null) freqOutput.close();
if (proxOutput != null) proxOutput.close();
if (termInfosWriter != null) termInfosWriter.close();
if (queue != null) queue.close();
if (freqOutput != null) freqOutput.close();
if (proxOutput != null) proxOutput.close();
if (termInfosWriter != null) termInfosWriter.close();
if (queue != null) queue.close();
}
}
@ -203,7 +295,7 @@ final class SegmentMerger {
queue = new SegmentMergeQueue(readers.size());
int base = 0;
for (int i = 0; i < readers.size(); i++) {
IndexReader reader = (IndexReader)readers.elementAt(i);
IndexReader reader = (IndexReader) readers.elementAt(i);
TermEnum termEnum = reader.terms();
SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
base += reader.numDocs();
@ -214,20 +306,20 @@ final class SegmentMerger {
}
SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
while (queue.size() > 0) {
int matchSize = 0; // pop matching terms
match[matchSize++] = (SegmentMergeInfo)queue.pop();
match[matchSize++] = (SegmentMergeInfo) queue.pop();
Term term = match[0].term;
SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
SegmentMergeInfo top = (SegmentMergeInfo) queue.top();
while (top != null && term.compareTo(top.term) == 0) {
match[matchSize++] = (SegmentMergeInfo)queue.pop();
top = (SegmentMergeInfo)queue.top();
match[matchSize++] = (SegmentMergeInfo) queue.pop();
top = (SegmentMergeInfo) queue.top();
}
mergeTermInfo(match, matchSize); // add new TermInfo
while (matchSize > 0) {
SegmentMergeInfo smi = match[--matchSize];
if (smi.next())
@ -240,8 +332,15 @@ final class SegmentMerger {
private final TermInfo termInfo = new TermInfo(); // minimize consing
/** Merge one term found in one or more segments. The array <code>smis</code>
* contains segments that are positioned at the same term. <code>N</code>
* is the number of cells in the array actually occupied.
*
* @param smis array of segments
* @param n number of cells in the array actually occupied
*/
private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
throws IOException {
throws IOException {
long freqPointer = freqOutput.getFilePointer();
long proxPointer = proxOutput.getFilePointer();
@ -251,13 +350,21 @@ final class SegmentMerger {
if (df > 0) {
// add an entry to the dictionary with pointers to prox and freq files
termInfo.set(df, freqPointer, proxPointer, (int)(skipPointer-freqPointer));
termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
termInfosWriter.add(smis[0].term, termInfo);
}
}
/** Process postings from multiple segments all positioned on the
* same term. Writes out merged entries into freqOutput and
* the proxOutput streams.
*
* @param smis array of segments
* @param n number of cells in the array actually occupied
* @return number of documents across all segments where this term was found
*/
private final int appendPostings(SegmentMergeInfo[] smis, int n)
throws IOException {
throws IOException {
final int skipInterval = termInfosWriter.skipInterval;
int lastDoc = 0;
int df = 0; // number of docs w/ term
@ -285,7 +392,7 @@ final class SegmentMerger {
int docCode = (doc - lastDoc) << 1; // use low bit to flag freq=1
lastDoc = doc;
int freq = postings.freq();
if (freq == 1) {
freqOutput.writeVInt(docCode | 1); // write doc & freq=1
@ -293,10 +400,10 @@ final class SegmentMerger {
freqOutput.writeVInt(docCode); // write doc
freqOutput.writeVInt(freq); // write frequency in doc
}
int lastPosition = 0; // write position deltas
for (int j = 0; j < freq; j++) {
int position = postings.nextPosition();
for (int j = 0; j < freq; j++) {
int position = postings.nextPosition();
proxOutput.writeVInt(position - lastPosition);
lastPosition = position;
}
@ -321,9 +428,9 @@ final class SegmentMerger {
long freqPointer = freqOutput.getFilePointer();
long proxPointer = proxOutput.getFilePointer();
skipBuffer.writeVInt(doc - lastSkipDoc);
skipBuffer.writeVInt((int)(freqPointer - lastSkipFreqPointer));
skipBuffer.writeVInt((int)(proxPointer - lastSkipProxPointer));
skipBuffer.writeVInt(doc - lastSkipDoc);
skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));
lastSkipDoc = doc;
lastSkipFreqPointer = freqPointer;
@ -340,22 +447,22 @@ final class SegmentMerger {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed) {
OutputStream output = directory.createFile(segment + ".f" + i);
try {
for (int j = 0; j < readers.size(); j++) {
IndexReader reader = (IndexReader)readers.elementAt(j);
byte[] input = reader.norms(fi.name);
OutputStream output = directory.createFile(segment + ".f" + i);
try {
for (int j = 0; j < readers.size(); j++) {
IndexReader reader = (IndexReader) readers.elementAt(j);
byte[] input = reader.norms(fi.name);
int maxDoc = reader.maxDoc();
for (int k = 0; k < maxDoc; k++) {
byte norm = input != null ? input[k] : (byte)0;
byte norm = input != null ? input[k] : (byte) 0;
if (!reader.isDeleted(k)) {
output.writeByte(norm);
}
}
}
} finally {
output.close();
}
}
}
} finally {
output.close();
}
}
}
}
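For orientation, a hedged sketch of how the (package-private) merger is driven, based only on the constructor and methods shown above; IndexWriter is the actual caller, and readerA/readerB stand in for real segment readers:

SegmentMerger merger = new SegmentMerger(directory, "newSegment", useCompoundFile);
merger.add(readerA);            // queue each segment's IndexReader
merger.add(readerB);
int docCount = merger.merge();  // merges fields, terms, norms, and,
                                // when fieldInfos.hasVectors(), vectors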

View File: SegmentReader.java

@ -82,6 +82,7 @@ final class SegmentReader extends IndexReader {
private FieldsReader fieldsReader;
TermInfosReader tis;
TermVectorsReader termVectorsReader;
BitVector deletedDocs = null;
private boolean deletedDocsDirty = false;
@ -109,21 +110,22 @@ final class SegmentReader extends IndexReader {
out.close();
}
String fileName = segment + ".f" + fieldInfos.fieldNumber(name);
directory().renameFile(segment + ".tmp", fileName);
directory().renameFile(segment + ".tmp", fileName);
this.dirty = false;
}
}
private Hashtable norms = new Hashtable();
SegmentReader(SegmentInfos sis, SegmentInfo si, boolean closeDir)
throws IOException {
throws IOException {
this(si);
closeDirectory = closeDir;
segmentInfos = sis;
}
SegmentReader(SegmentInfo si)
throws IOException {
throws IOException {
super(si.dir);
segment = si.name;
@ -149,13 +151,17 @@ final class SegmentReader extends IndexReader {
freqStream = cfsDir.openFile(segment + ".frq");
proxStream = cfsDir.openFile(segment + ".prx");
openNorms(cfsDir);
if (fieldInfos.hasVectors()) { // open term vector files only as needed
termVectorsReader = new TermVectorsReader(cfsDir, segment, fieldInfos);
}
}
protected final synchronized void doClose() throws IOException {
if (deletedDocsDirty || normsDirty) {
synchronized (directory()) { // in- & inter-process sync
new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
IndexWriter.COMMIT_LOCK_TIMEOUT) {
IndexWriter.COMMIT_LOCK_TIMEOUT) {
public Object doBody() throws IOException {
if (deletedDocsDirty) { // re-write deleted
@ -164,18 +170,18 @@ final class SegmentReader extends IndexReader {
}
if (normsDirty) { // re-write norms
Enumeration keys = norms.keys();
Enumeration values = norms.elements();
Enumeration keys = norms.keys();
Enumeration values = norms.elements();
while (values.hasMoreElements()) {
String field = (String)keys.nextElement();
Norm norm = (Norm)values.nextElement();
String field = (String) keys.nextElement();
Norm norm = (Norm) values.nextElement();
if (norm.dirty) {
norm.reWrite(field);
}
}
}
if(segmentInfos != null)
if (segmentInfos != null)
segmentInfos.write(directory());
else
directory().touchFile("segments");
@ -196,6 +202,7 @@ final class SegmentReader extends IndexReader {
proxStream.close();
closeNorms();
if (termVectorsReader != null) termVectorsReader.close();
if (cfsReader != null)
cfsReader.close();
@ -212,6 +219,7 @@ final class SegmentReader extends IndexReader {
return deletedDocs != null;
}
static final boolean usesCompoundFile(SegmentInfo si) throws IOException {
return si.dir.fileExists(si.name + ".cfs");
}
@ -226,7 +234,7 @@ final class SegmentReader extends IndexReader {
public synchronized void undeleteAll() throws IOException {
synchronized (directory()) { // in- & inter-process sync
new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
IndexWriter.COMMIT_LOCK_TIMEOUT) {
IndexWriter.COMMIT_LOCK_TIMEOUT) {
public Object doBody() throws IOException {
if (directory().fileExists(segment + ".del")) {
directory().deleteFile(segment + ".del");
@ -242,11 +250,11 @@ final class SegmentReader extends IndexReader {
final Vector files() throws IOException {
Vector files = new Vector(16);
final String ext[] = new String[] {
"cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del"
};
final String ext[] = new String[]{
"cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
"tvx", "tvd", "tvf", "tvp" };
for (int i=0; i<ext.length; i++) {
for (int i = 0; i < ext.length; i++) {
String name = segment + "." + ext[i];
if (directory().fileExists(name))
files.addElement(name);
@ -271,7 +279,7 @@ final class SegmentReader extends IndexReader {
public final synchronized Document document(int n) throws IOException {
if (isDeleted(n))
throw new IllegalArgumentException
("attempt to access a deleted document");
("attempt to access a deleted document");
return fieldsReader.doc(n);
}
@ -329,12 +337,31 @@ final class SegmentReader extends IndexReader {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed == indexed)
fieldSet.add(fi.name);
}
return fieldSet;
}
return fieldSet;
}
/**
* Returns the names of indexed fields, filtered by term vector storage.
* @param storedTermVector if true, returns only indexed fields that have term vector info,
* else only indexed fields without term vector info
* @return Collection of Strings indicating the names of the fields
*/
public Collection getIndexedFieldNames(boolean storedTermVector) {
// maintain a unique set of field names
Set fieldSet = new HashSet();
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
if (fi.isIndexed && fi.storeTermVector == storedTermVector) {
fieldSet.add(fi.name);
}
}
return fieldSet;
}
public synchronized byte[] norms(String field) throws IOException {
Norm norm = (Norm)norms.get(field);
Norm norm = (Norm) norms.get(field);
if (norm == null) // not an indexed field
return null;
if (norm.bytes == null) { // value not yet read
@ -346,8 +373,8 @@ final class SegmentReader extends IndexReader {
}
public synchronized void setNorm(int doc, String field, byte value)
throws IOException {
Norm norm = (Norm)norms.get(field);
throws IOException {
Norm norm = (Norm) norms.get(field);
if (norm == null) // not an indexed field
return;
norm.dirty = true; // mark it dirty
@ -360,7 +387,7 @@ final class SegmentReader extends IndexReader {
public synchronized void norms(String field, byte[] bytes, int offset)
throws IOException {
Norm norm = (Norm)norms.get(field);
Norm norm = (Norm) norms.get(field);
if (norm == null)
return; // use zeros in array
@ -369,7 +396,7 @@ final class SegmentReader extends IndexReader {
return;
}
InputStream normStream = (InputStream)norm.in.clone();
InputStream normStream = (InputStream) norm.in.clone();
try { // read from disk
normStream.seek(0);
normStream.readBytes(bytes, offset, maxDoc());
@ -392,11 +419,40 @@ final class SegmentReader extends IndexReader {
private final void closeNorms() throws IOException {
synchronized (norms) {
Enumeration enumerator = norms.elements();
while (enumerator.hasMoreElements()) {
Norm norm = (Norm) enumerator.nextElement();
norm.in.close();
}
}
}
/** Return a term frequency vector for the specified document and field. The
* vector returned contains terms and frequencies for all terms in
* the specified field of this document, if the field had the storeTermVector
* flag set. If the flag was not set, the method returns null.
*/
public TermFreqVector getTermFreqVector(int docNumber, String field)
throws IOException {
// Check if this field is invalid or has no stored term vector
FieldInfo fi = fieldInfos.fieldInfo(field);
if (fi == null || !fi.storeTermVector) return null;
return termVectorsReader.get(docNumber, field);
}
/** Return an array of term frequency vectors for the specified document.
* The array contains a vector for each vectorized field in the document.
* Each vector contains terms and frequencies for all terms
* in a given vectorized field.
* If no such fields existed, the method returns null.
*/
public TermFreqVector[] getTermFreqVectors(int docNumber)
throws IOException {
if (termVectorsReader == null)
return null;
return termVectorsReader.get(docNumber);
}
}
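A minimal usage sketch of the two accessors above, assuming a reader that exposes them as SegmentReader implements them here; the index path and the "textField2" field name are hypothetical, and the field is assumed to have been indexed with the storeTermVector flag set:

import java.util.Collection;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;

// Sketch only: list vectorized fields, then dump one stored vector.
public class TermVectorDump {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open("/tmp/testindex");  // hypothetical path
    // Indexed fields that were written with the storeTermVector flag set.
    Collection vectorized = reader.getIndexedFieldNames(true);
    for (Iterator it = vectorized.iterator(); it.hasNext();)
      System.out.println("vectorized field: " + it.next());
    // Null if document 0 stored no vector for this field.
    TermFreqVector vector = reader.getTermFreqVector(0, "textField2");
    if (vector != null) {
      String[] terms = vector.getTerms();
      int[] freqs = vector.getTermFrequencies();
      for (int i = 0; i < terms.length; i++)
        System.out.println(terms[i] + "/" + freqs[i]);
    }
    reader.close();
  }
}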

View File

@ -77,27 +77,27 @@ class SegmentTermDocs implements TermDocs {
private boolean haveSkipped;
SegmentTermDocs(SegmentReader parent)
throws IOException {
this.parent = parent;
this.freqStream = (InputStream) parent.freqStream.clone();
this.deletedDocs = parent.deletedDocs;
this.skipInterval = parent.tis.getSkipInterval();
}
public void seek(Term term) throws IOException {
TermInfo ti = parent.tis.get(term);
seek(ti);
}
public void seek(TermEnum enum) throws IOException {
TermInfo ti;
if (enum instanceof SegmentTermEnum) // optimized case
ti = ((SegmentTermEnum) enum).termInfo();
else // punt case
ti = parent.tis.get(enum.term());
seek(ti);
}
void seek(TermInfo ti) throws IOException {
count = 0;
if (ti == null) {
@ -114,7 +114,7 @@ class SegmentTermDocs implements TermDocs {
haveSkipped = false;
}
}
public void close() throws IOException {
freqStream.close();
}
@ -128,19 +128,19 @@ class SegmentTermDocs implements TermDocs {
public boolean next() throws IOException {
while (true) {
if (count == df)
return false;
int docCode = freqStream.readVInt();
doc += docCode >>> 1; // shift off low bit
if ((docCode & 1) != 0) // if low bit is set
freq = 1; // freq is one
else
freq = freqStream.readVInt(); // else read freq
count++;
if (deletedDocs == null || !deletedDocs.get(doc))
break;
skippingDoc();
}
return true;
@ -148,7 +148,7 @@ class SegmentTermDocs implements TermDocs {
/** Optimized implementation. */
public int read(final int[] docs, final int[] freqs)
throws IOException {
final int length = docs.length;
int i = 0;
while (i < length && count < df) {
@ -157,17 +157,17 @@ class SegmentTermDocs implements TermDocs {
final int docCode = freqStream.readVInt();
doc += docCode >>> 1; // shift off low bit
if ((docCode & 1) != 0) // if low bit is set
freq = 1; // freq is one
else
freq = freqStream.readVInt(); // else read freq
count++;
if (deletedDocs == null || !deletedDocs.get(doc)) {
docs[i] = doc;
freqs[i] = freq;
++i;
}
}
return i;
}
@ -179,7 +179,7 @@ class SegmentTermDocs implements TermDocs {
if (df > skipInterval) { // optimized case
if (skipStream == null)
skipStream = (InputStream) freqStream.clone(); // lazily clone
if (!haveSkipped) { // lazily seek skip stream
skipStream.seek(skipPointer);
@ -190,8 +190,8 @@ class SegmentTermDocs implements TermDocs {
int lastSkipDoc = skipDoc;
long lastFreqPointer = freqStream.getFilePointer();
long lastProxPointer = -1;
int numSkipped = -1 - (count % skipInterval);
while (target > skipDoc) {
lastSkipDoc = skipDoc;
lastFreqPointer = freqPointer;
@ -205,7 +205,7 @@ class SegmentTermDocs implements TermDocs {
skipDoc += skipStream.readVInt();
freqPointer += skipStream.readVInt();
proxPointer += skipStream.readVInt();
skipCount++;
}
@ -213,7 +213,7 @@ class SegmentTermDocs implements TermDocs {
if (lastFreqPointer > freqStream.getFilePointer()) {
freqStream.seek(lastFreqPointer);
skipProx(lastProxPointer);
doc = lastSkipDoc;
count += numSkipped;
}
@ -223,7 +223,7 @@ class SegmentTermDocs implements TermDocs {
// done skipping, now just scan
do {
if (!next())
return false;
return false;
} while (target > doc);
return true;
}
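A short sketch of what the skip machinery above buys a caller, assuming TermDocs exposes skipTo(int) as SegmentTermDocs implements it here; the index path, term, and target document are hypothetical:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

// Sketch only: jump straight to the first matching document >= 42 rather
// than calling next() repeatedly; the skip table keeps this cheap when the
// document frequency is large.
public class SkipToDoc {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open("/tmp/testindex");  // hypothetical path
    TermDocs termDocs = reader.termDocs(new Term("body", "lucene"));
    if (termDocs.skipTo(42))
      System.out.println("doc " + termDocs.doc() + " freq " + termDocs.freq());
    termDocs.close();
    reader.close();
  }
}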

View File

@ -76,9 +76,9 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
private char[] buffer = {};
SegmentTermEnum(InputStream i, FieldInfos fis, boolean isi)
throws IOException {
input = i;
fieldInfos = fis;
isIndex = isi;
int firstInt = input.readInt();
@ -98,24 +98,24 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
// check that it is a format we can understand
if (format < TermInfosWriter.FORMAT)
throw new IOException("Unknown format version:" + format);
size = input.readLong(); // read the size
if (!isIndex) {
indexInterval = input.readInt();
skipInterval = input.readInt();
}
}
}
protected Object clone() {
SegmentTermEnum clone = null;
try {
clone = (SegmentTermEnum) super.clone();
} catch (CloneNotSupportedException e) {}
clone.input = (InputStream) input.clone();
clone.termInfo = new TermInfo(termInfo);
if (term != null) clone.growBuffer(term.text.length());
@ -123,7 +123,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
}
final void seek(long pointer, int p, Term t, TermInfo ti)
throws IOException {
input.seek(pointer);
position = p;
term = t;
@ -134,7 +134,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
/** Increments the enumeration to the next element. True if one exists.*/
public final boolean next() throws IOException {
if (position++ >= size - 1) {
term = null;
return false;
}
@ -145,7 +145,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
termInfo.docFreq = input.readVInt(); // read doc freq
termInfo.freqPointer += input.readVLong(); // read freq pointer
termInfo.proxPointer += input.readVLong(); // read prox pointer
if (!isIndex) {
if (termInfo.docFreq > skipInterval) {
termInfo.skipOffset = input.readVInt();
@ -164,10 +164,10 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
int totalLength = start + length;
if (buffer.length < totalLength)
growBuffer(totalLength);
input.readChars(buffer, start, length);
return new Term(fieldInfos.fieldName(input.readVInt()),
new String(buffer, 0, totalLength), false);
}
private final void growBuffer(int length) {
@ -177,25 +177,25 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
}
/** Returns the current Term in the enumeration.
Initially invalid, valid after next() called for the first time.*/
public final Term term() {
return term;
}
/** Returns the current TermInfo in the enumeration.
Initially invalid, valid after next() called for the first time.*/
final TermInfo termInfo() {
return new TermInfo(termInfo);
}
/** Sets the argument to the current TermInfo in the enumeration.
Initially invalid, valid after next() called for the first time.*/
final void termInfo(TermInfo ti) {
ti.set(termInfo);
}
/** Returns the docFreq from the current TermInfo in the enumeration.
Initially invalid, valid after next() called for the first time.*/
public final int docFreq() {
return termInfo.docFreq;
}

View File

@ -106,7 +106,7 @@ extends SegmentTermDocs implements TermPositions {
public final int read(final int[] docs, final int[] freqs)
throws IOException {
throw new UnsupportedOperationException();
throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead.");
}

View File

@ -0,0 +1,117 @@
package org.apache.lucene.index;
import java.io.IOException;
import java.util.*;
/** A TermFreqVector over the term vector stored for a single document field
* of a segment.
*/
class SegmentTermVector implements TermFreqVector {
private String field;
private String terms[];
private int termFreqs[];
SegmentTermVector(String field, String terms[], int termFreqs[]) {
this.field = field;
this.terms = terms;
this.termFreqs = termFreqs;
}
/**
*
* @return The name of the field this vector is associated with
*/
public String getField() {
return field;
}
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append('{');
sb.append(field).append(": ");
for (int i=0; i<terms.length; i++) {
if (i>0) sb.append(", ");
sb.append(terms[i]).append('/').append(termFreqs[i]);
}
sb.append('}');
return sb.toString();
}
public String toString(IndexReader ir)
throws IOException
{
return toString();
/*StringBuffer sb = new StringBuffer();
//TODO: Reimplement
sb.append('{');
sb.append(field).append(": ");
for (int i=0; i<terms.length; i++) {
if (i>0) sb.append(", ");
Term t = ir.getTerm(terms[i]);
String text = t == null ? "UNKNOWN(" + i + ")" : t.text;
sb.append(text).append('/').append(termFreqs[i]);
if (termProx != null) appendTermProx(sb.append('/'), termProx[i]);
}
sb.append('}');
return sb.toString();*/
}
/** Number of terms in the term vector. If there are no terms in the
* vector, returns 0.
*/
public int size() {
return terms == null ? 0 : terms.length;
}
/** Array of term texts in ascending order. If there are no terms in
* the vector, returns null.
*/
public String [] getTerms() {
return terms;
}
/** Array of term frequencies. Locations of the array correspond one to one
* to the terms in the array obtained from the <code>getTerms</code>
* method. Each location in the array contains the number of times this
* term occurs in the document or the document field. If there are no terms in
* the vector, returns null.
*/
public int[] getTermFrequencies() {
return termFreqs;
}
/** Return the index in the terms array returned from <code>getTerms</code>
* at which the term with the specified <code>termText</code> appears. If this
* term does not appear in the array, return -1.
*/
public int indexOf(String termText) {
int res = Arrays.binarySearch(terms, termText);
return res >= 0 ? res : -1;
}
/** Just like <code>indexOf(String)</code> but searches for a number of terms
* at the same time. Returns an array that has the same size as the number
* of terms searched for, each slot containing the result of searching for
* that term. The array of terms must be sorted in ascending order.
*
* @param termNumbers array containing the term texts to look for
* @param start index in the array where the list of termNumbers starts
* @param len the number of termNumbers in the list
*/
public int[] indexesOf(String [] termNumbers, int start, int len) {
// TODO: there must be a more efficient way of doing this.
// At least, we could advance the lower bound of the terms array
// as we find valid indexes. Also, it might be possible to leverage
// this even more by starting in the middle of the termNumbers array
// and thus dividing the terms array maybe in half with each found index.
int res[] = new int[len];
for (int i=0; i < len; i++) {
res[i] = indexOf(termNumbers[i]);
}
return res;
}
}
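The TODO above hints at a cheaper indexesOf: because both the vector's terms and the sought terms are sorted in ascending order, every successful lookup can raise the lower bound of the next binary search. A sketch of that idea, with hypothetical names:

// Sketch only: indexesOf() that narrows the search range as it goes.
// Both terms[] (the vector's terms) and queries[] must be sorted ascending.
class NarrowingLookup {
  static int[] indexesOf(String[] terms, String[] queries, int start, int len) {
    int[] res = new int[len];
    int lo = 0;                          // lower bound, raised on each hit
    for (int i = 0; i < len; i++) {
      res[i] = search(terms, queries[start + i], lo);
      if (res[i] >= 0) lo = res[i] + 1;  // later queries must sort after this hit
    }
    return res;
  }

  // Plain binary search over terms[from..terms.length-1]; -1 if absent.
  static int search(String[] terms, String key, int from) {
    int low = from, high = terms.length - 1;
    while (low <= high) {
      int mid = (low + high) / 2;
      int cmp = terms[mid].compareTo(key);
      if (cmp < 0) low = mid + 1;
      else if (cmp > 0) high = mid - 1;
      else return mid;
    }
    return -1;
  }
}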

View File

@ -57,13 +57,13 @@ package org.apache.lucene.index;
import java.io.IOException;
/** TermDocs provides an interface for enumerating &lt;document, frequency&gt;
pairs for a term. <p> The document portion names each document containing
the term. Documents are indicated by number. The frequency portion gives
the number of times the term occurred in each document. <p> The pairs are
ordered by document number.
@see IndexReader#termDocs
*/
public interface TermDocs {
/** Sets this to the data for a term.
@ -77,15 +77,15 @@ public interface TermDocs {
void seek(TermEnum termEnum) throws IOException;
/** Returns the current document number. <p> This is invalid until {@link
#next()} is called for the first time.*/
int doc();
/** Returns the frequency of the term within the current document. <p> This
is invalid until {@link #next()} is called for the first time.*/
int freq();
/** Moves to the next pair in the enumeration. <p> Returns true iff there is
such a next pair in the enumeration. */
boolean next() throws IOException;
/** Attempts to read multiple entries from the enumeration, up to length of
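To make the contract concrete, a sketch of the basic enumeration loop; the index path and term are hypothetical:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

// Sketch only: walk every <document, frequency> pair for one term, in
// document-number order as the interface guarantees.
public class WalkTermDocs {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open("/tmp/testindex");  // hypothetical path
    TermDocs termDocs = reader.termDocs(new Term("body", "lucene"));
    while (termDocs.next())
      System.out.println("doc " + termDocs.doc() + " freq " + termDocs.freq());
    termDocs.close();
    reader.close();
  }
}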

View File

@ -73,4 +73,27 @@ public abstract class TermEnum {
/** Closes the enumeration to further activity, freeing resources. */
public abstract void close() throws IOException;
// Term Vector support
/** Skips terms to the first beyond the current whose value is
* greater or equal to <i>target</i>. <p>Returns true iff there is such
* an entry. <p>Behaves as if written: <pre>
* public boolean skipTo(Term target) {
* do {
* if (!next())
* return false;
* } while (target > term());
* return true;
* }
* </pre>
* Some implementations are considerably more efficient than that.
*/
public boolean skipTo(Term target) throws IOException {
do {
if (!next())
return false;
} while (target.compareTo(term()) > 0);
return true;
}
}
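A usage sketch for the new method; the index path and target term are hypothetical:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

// Sketch only: position the enumeration at the first term >= the target.
public class SkipToTerm {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open("/tmp/testindex");  // hypothetical path
    TermEnum terms = reader.terms();
    if (terms.skipTo(new Term("body", "m")))
      System.out.println("first term >= target: " + terms.term());
    terms.close();
    reader.close();
  }
}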

View File

@ -0,0 +1,64 @@
package org.apache.lucene.index;
import java.io.IOException;
/** Provides access to stored term vector of
* a document field.
*/
public interface TermFreqVector {
/**
*
* @return The field this vector is associated with.
*
*/
public String getField();
/**
* @return The number of terms in the term vector.
*/
public int size();
/**
* @return An Array of term texts in ascending order.
*/
public String[] getTerms();
/** Array of term frequencies. Locations of the array correspond one to one
* to the terms in the array obtained from the <code>getTerms</code>
* method. Each location in the array contains the number of times this
* term occurs in the document or the document field.
*/
public int[] getTermFrequencies();
/** Return a string representation of the vector.
*/
public String toString();
/** Return a string representation of the vector, but use the provided IndexReader
* to obtain text for each term and include the text instead of term numbers.
*/
public String toString(IndexReader ir) throws IOException;
/** Return the index in the terms array returned from <code>getTerms</code>
* at which the specified <code>term</code> appears. If this
* term does not appear in the array, return -1.
*/
public int indexOf(String term);
/** Just like <code>indexOf(String)</code> but searches for a number of terms
* at the same time. Returns an array that has the same size as the number
* of terms searched for, each slot containing the result of searching for
* that term.
*
* @param terms array containing terms to look for
* @param start index in the array where the list of terms starts
* @param len the number of terms in the list
*/
public int[] indexesOf(String[] terms, int start, int len);
}
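For concreteness, a consumer sketch of the lookup methods; the document number, field, and term are hypothetical:

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;

// Sketch only: fetch the frequency of a single term via indexOf().
public class LookupFreq {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open("/tmp/testindex");  // hypothetical path
    TermFreqVector vector = reader.getTermFreqVector(0, "textField2");
    if (vector != null) {
      int idx = vector.indexOf("field");
      if (idx >= 0)
        System.out.println("freq: " + vector.getTermFrequencies()[idx]);
    }
    reader.close();
  }
}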

View File

@ -57,6 +57,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.StringHelper;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
Directory. A TermInfos can be written once, in order. */
@ -156,10 +157,10 @@ final class TermInfosWriter {
lastTi.set(ti);
size++;
}
private final void writeTerm(Term term)
throws IOException {
int start = stringDifference(lastTerm.text, term.text);
int start = StringHelper.stringDifference(lastTerm.text, term.text);
int length = term.text.length() - start;
output.writeVInt(start); // write shared prefix length
@ -171,15 +172,7 @@ final class TermInfosWriter {
lastTerm = term;
}
private static final int stringDifference(String s1, String s2) {
int len1 = s1.length();
int len2 = s2.length();
int len = len1 < len2 ? len1 : len2;
for (int i = 0; i < len; i++)
if (s1.charAt(i) != s2.charAt(i))
return i;
return len;
}
/** Called to complete TermInfos creation. */
final void close() throws IOException {
@ -190,4 +183,5 @@ final class TermInfosWriter {
if (!isIndex)
other.close();
}
}

View File

@ -0,0 +1,13 @@
package org.apache.lucene.index;
/** Extends <code>TermFreqVector</code> to provide additional information about
* positions in which each of the terms is found.
*/
public interface TermPositionVector extends TermFreqVector {
/** Returns an array of positions in which the term is found.
* Terms are identified by the index at which each term appears in the
* term array obtained from the <code>getTerms</code> method.
*/
public int[] getTermPositions(int index);
}
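A consumer sketch, assuming the stored vector actually implements this interface (plain term frequency vectors carry no positions):

import org.apache.lucene.index.TermPositionVector;

// Sketch only: print each term alongside the positions at which it occurs.
class DumpPositions {
  static void dump(TermPositionVector vector) {
    String[] terms = vector.getTerms();
    for (int i = 0; i < terms.length; i++) {
      int[] positions = vector.getTermPositions(i);
      StringBuffer line = new StringBuffer(terms[i]).append(':');
      for (int j = 0; positions != null && j < positions.length; j++)
        line.append(' ').append(positions[j]);
      System.out.println(line);
    }
  }
}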

View File

@ -0,0 +1,221 @@
package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.InputStream;
import java.io.IOException;
/** TODO: relax synchro!
*/
class TermVectorsReader {
private FieldInfos fieldInfos;
private InputStream tvx;
private InputStream tvd;
private InputStream tvf;
private int size;
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
throws IOException {
if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
tvx = d.openFile(segment + TermVectorsWriter.TVX_EXTENSION);
checkValidFormat(tvx);
tvd = d.openFile(segment + TermVectorsWriter.TVD_EXTENSION);
checkValidFormat(tvd);
tvf = d.openFile(segment + TermVectorsWriter.TVF_EXTENSION);
checkValidFormat(tvf);
size = (int) tvx.length() / 8;
}
this.fieldInfos = fieldInfos;
}
private void checkValidFormat(InputStream in) throws IOException
{
int format = in.readInt();
if (format > TermVectorsWriter.FORMAT_VERSION)
{
throw new IOException("Incompatible format version: " + format + " expected "
+ TermVectorsWriter.FORMAT_VERSION + " or less");
}
}
synchronized void close() throws IOException {
// why don't we trap the exception and at least make sure that
// all streams that we can close are closed?
if (tvx != null) tvx.close();
if (tvd != null) tvd.close();
if (tvf != null) tvf.close();
}
/**
*
* @return The number of documents in the reader
*/
int size() {
return size;
}
/**
* Retrieve the term vector for the given document and field
* @param docNum The document number to retrieve the vector for
* @param field The field within the document to retrieve
* @return The TermFreqVector for the document and field or null
*/
synchronized TermFreqVector get(int docNum, String field) {
// Check if no term vectors are available for this segment at all
int fieldNumber = fieldInfos.fieldNumber(field);
TermFreqVector result = null;
if (tvx != null) {
try {
//We need to account for the FORMAT_SIZE when seeking in the tvx
//We don't need to do this in other seeks because we already have the file pointer
//that was written in another file
tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
//System.out.println("TVX Pointer: " + tvx.getFilePointer());
long position = tvx.readLong();
tvd.seek(position);
int fieldCount = tvd.readVInt();
//System.out.println("Num Fields: " + fieldCount);
// There are only a few fields per document. We opt for a full scan
// rather than requiring that they be ordered. We need to read through
// all of the fields anyway to get to the tvf pointers.
int number = 0;
int found = -1;
for (int i = 0; i < fieldCount; i++) {
number += tvd.readVInt();
if (number == fieldNumber) found = i;
}
// found == -1 means this field, although valid in the segment, was not in this document
if (found != -1) {
// Compute position in the tvf file
position = 0;
for (int i = 0; i <= found; i++)
{
position += tvd.readVLong();
}
result = readTermVector(field, position);
}
else {
//System.out.println("Field not found");
}
} catch (Exception e) {
//e.printStackTrace();
}
}
else
{
System.out.println("No tvx file");
}
return result;
}
/** Return all term vectors stored for this document, or null if they could not be read in. */
synchronized TermFreqVector[] get(int docNum) {
TermFreqVector[] result = null;
// Check if no term vectors are available for this segment at all
if (tvx != null) {
try {
//We need to offset by the FORMAT_SIZE when seeking in the tvx
tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
long position = tvx.readLong();
tvd.seek(position);
int fieldCount = tvd.readVInt();
// No fields are vectorized for this document
if (fieldCount != 0) {
int number = 0;
String[] fields = new String[fieldCount];
for (int i = 0; i < fieldCount; i++) {
number += tvd.readVInt();
fields[i] = fieldInfos.fieldName(number);
}
// Compute position in the tvf file
position = 0;
long[] tvfPointers = new long[fieldCount];
for (int i = 0; i < fieldCount; i++) {
position += tvd.readVLong();
tvfPointers[i] = position;
}
result = readTermVectors(fields, tvfPointers);
}
} catch (IOException e) {
e.printStackTrace();
}
}
else
{
System.out.println("No tvx file");
}
return result;
}
private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
throws IOException {
SegmentTermVector res[] = new SegmentTermVector[fields.length];
for (int i = 0; i < fields.length; i++) {
res[i] = readTermVector(fields[i], tvfPointers[i]);
}
return res;
}
/**
*
* @param field The field to read in
* @param tvfPointer The pointer within the tvf file where we should start reading
* @return The TermVector located at that position
* @throws IOException
*/
private SegmentTermVector readTermVector(String field, long tvfPointer)
throws IOException {
// Now read the data from specified position
//We don't need to offset by the FORMAT here since the pointer already includes the offset
tvf.seek(tvfPointer);
int numTerms = tvf.readVInt();
//System.out.println("Num Terms: " + numTerms);
// If no terms - return a constant empty termvector
if (numTerms == 0) return new SegmentTermVector(field, null, null);
int length = numTerms + tvf.readVInt();
String terms[] = new String[numTerms];
int termFreqs[] = new int[numTerms];
int start = 0;
int deltaLength = 0;
int totalLength = 0;
char [] buffer = {};
String previousString = "";
for (int i = 0; i < numTerms; i++) {
start = tvf.readVInt();
deltaLength = tvf.readVInt();
totalLength = start + deltaLength;
if (buffer.length < totalLength)
{
buffer = new char[totalLength];
for (int j = 0; j < previousString.length(); j++) // copy contents
buffer[j] = previousString.charAt(j);
}
tvf.readChars(buffer, start, deltaLength);
terms[i] = new String(buffer, 0, totalLength);
previousString = terms[i];
termFreqs[i] = tvf.readVInt();
}
SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs);
return tv;
}
}
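To spell out the seek arithmetic used above: the .tvx file starts with the 4-byte format version, followed by one 8-byte pointer into .tvd per document, so document n's pointer lives at byte FORMAT_SIZE + 8n. A sketch (the helper class is hypothetical, and it must live in org.apache.lucene.index because TermVectorsWriter is package-private):

package org.apache.lucene.index;

class TvxMath {
  // FORMAT_SIZE (4 bytes) covers the leading format version; each document
  // then owns one 8-byte pointer. E.g. document 3's pointer starts at byte 28.
  static long tvxOffset(int docNum) {
    return TermVectorsWriter.FORMAT_SIZE + docNum * 8L;
  }
}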

View File

@ -0,0 +1,301 @@
package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.util.StringHelper;
import java.io.IOException;
import java.util.Vector;
/**
* The writer works by opening a document, then opening each field within the document, and then
* writing out the vectors for each field.
*
* Rough usage:
*
<CODE>
for each document
{
writer.openDocument();
for each field on the document
{
writer.openField(field);
for all of the terms
{
writer.addTerm(...)
}
writer.closeField()
}
writer.closeDocument()
}
</CODE>
*/
final class TermVectorsWriter {
public static final int FORMAT_VERSION = 1;
//The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
public static final int FORMAT_SIZE = 4;
//TODO: Figure out how to write with or w/o position information and read back in
public static final String TVX_EXTENSION = ".tvx";
public static final String TVD_EXTENSION = ".tvd";
public static final String TVF_EXTENSION = ".tvf";
private OutputStream tvx = null, tvd = null, tvf = null;
private Vector fields = null;
private Vector terms = null;
private FieldInfos fieldInfos;
private TVField currentField = null;
private long currentDocPointer = -1;
/** Create a term vectors writer for the specified segment in the specified
* directory. A new TermVectorsWriter should be created for each
* segment. The supplied <code>fieldInfos</code> indicates how many total
* fields a document may have; not all of these fields will necessarily
* have term vectors stored, so the number of calls to
* <code>openField</code> is less than or equal to this number.
*/
public TermVectorsWriter(Directory directory, String segment,
FieldInfos fieldInfos)
throws IOException {
// Open files for TermVector storage
tvx = directory.createFile(segment + TVX_EXTENSION);
tvx.writeInt(FORMAT_VERSION);
tvd = directory.createFile(segment + TVD_EXTENSION);
tvd.writeInt(FORMAT_VERSION);
tvf = directory.createFile(segment + TVF_EXTENSION);
tvf.writeInt(FORMAT_VERSION);
this.fieldInfos = fieldInfos;
fields = new Vector(fieldInfos.size());
terms = new Vector();
}
public final void openDocument()
throws IOException {
closeDocument();
currentDocPointer = tvd.getFilePointer();
}
public final void closeDocument()
throws IOException {
if (isDocumentOpen()) {
closeField();
writeDoc();
fields.clear();
currentDocPointer = -1;
}
}
public final boolean isDocumentOpen() {
return currentDocPointer != -1;
}
/** Start processing a field. This can be followed by a number of calls to
* addTerm, and a final call to closeField to indicate the end of
* processing of this field. If a field was previously open, it is
* closed automatically.
*/
public final void openField(String field)
throws IOException {
if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open.");
closeField();
currentField = new TVField(fieldInfos.fieldNumber(field));
}
/** Finished processing current field. This should be followed by a call to
* openField before future calls to addTerm.
*/
public final void closeField()
throws IOException {
if (isFieldOpen()) {
/* DEBUG */
//System.out.println("closeField()");
/* DEBUG */
// save field and terms
writeField();
fields.add(currentField);
terms.clear();
currentField = null;
}
}
/** Return true if a field is currently open. */
public final boolean isFieldOpen() {
return currentField != null;
}
/** Add a term to the field's term vector. The field must already be open
* or an IllegalStateException is thrown. Terms should be added in
* increasing order, one call per unique term text. Freq is the number of
* times this term appears in this field, in this document.
*/
public final void addTerm(String termText, int freq) {
if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open");
if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open");
addTermInternal(termText, freq);
}
private final void addTermInternal(String termText, int freq) {
currentField.length += freq;
TVTerm term = new TVTerm();
term.termText = termText;
term.freq = freq;
terms.add(term);
}
/** Add specified vectors to the document.
*/
public final void addVectors(TermFreqVector[] vectors)
throws IOException {
if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open");
if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open");
for (int i = 0; i < vectors.length; i++) {
addTermFreqVector(vectors[i]);
}
}
/** Add the specified vector to the document. The document must be open, but no
* field may be open, or an exception is thrown. The same document can mix
* <code>addTerm</code> and <code>addVectors</code> calls; however, a given field
* must be populated either with <code>addTerm</code> or with <code>addTermFreqVector</code>.
*/
public final void addTermFreqVector(TermFreqVector vector)
throws IOException {
if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open");
if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open");
addTermFreqVectorInternal(vector);
}
private final void addTermFreqVectorInternal(TermFreqVector vector)
throws IOException {
openField(vector.getField());
for (int i = 0; i < vector.size(); i++) {
addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]);
}
closeField();
}
/** Close all streams. */
final void close() throws IOException {
try {
closeDocument();
} finally {
// make an effort to close all streams we can but remember and re-throw
// the first exception encountered in this process
IOException keep = null;
if (tvx != null)
try {
tvx.close();
} catch (IOException e) {
if (keep == null) keep = e;
}
if (tvd != null)
try {
tvd.close();
} catch (IOException e) {
if (keep == null) keep = e;
}
if (tvf != null)
try {
tvf.close();
} catch (IOException e) {
if (keep == null) keep = e;
}
if (keep != null) throw (IOException) keep.fillInStackTrace();
}
}
private void writeField() throws IOException {
// remember where this field is written
currentField.tvfPointer = tvf.getFilePointer();
//System.out.println("Field Pointer: " + currentField.tvfPointer);
final int size;
tvf.writeVInt(size = terms.size());
tvf.writeVInt(currentField.length - size);
String lastTermText = "";
// write term ids and positions
for (int i = 0; i < size; i++) {
TVTerm term = (TVTerm) terms.elementAt(i);
//tvf.writeString(term.termText);
int start = StringHelper.stringDifference(lastTermText, term.termText);
int length = term.termText.length() - start;
tvf.writeVInt(start); // write shared prefix length
tvf.writeVInt(length); // write delta length
tvf.writeChars(term.termText, start, length); // write delta chars
tvf.writeVInt(term.freq);
lastTermText = term.termText;
}
}
private void writeDoc() throws IOException {
if (isFieldOpen()) throw new IllegalStateException("Field is still open while writing document");
//System.out.println("Writing doc pointer: " + currentDocPointer);
// write document index record
tvx.writeLong(currentDocPointer);
// write document data record
final int size;
// write the number of fields
tvd.writeVInt(size = fields.size());
// write field numbers
int lastFieldNumber = 0;
for (int i = 0; i < size; i++) {
TVField field = (TVField) fields.elementAt(i);
tvd.writeVInt(field.number - lastFieldNumber);
lastFieldNumber = field.number;
}
// write field pointers
long lastFieldPointer = 0;
for (int i = 0; i < size; i++) {
TVField field = (TVField) fields.elementAt(i);
tvd.writeVLong(field.tvfPointer - lastFieldPointer);
lastFieldPointer = field.tvfPointer;
}
//System.out.println("After writing doc pointer: " + tvx.getFilePointer());
}
private static class TVField {
int number;
long tvfPointer = 0;
int length = 0; // number of distinct term positions
TVField(int number) {
this.number = number;
}
}
private static class TVTerm {
String termText;
int freq = 0;
//int positions[] = null;
}
}
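A concrete version of the "rough usage" protocol from the class javadoc, as a sketch; the directory, segment name, field, and term counts are hypothetical (the counts mirror "field field field two text" from the tests), and the code must live in org.apache.lucene.index because the writer is package-private:

package org.apache.lucene.index;

import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

// Sketch only: the open/add/close protocol for one document with one field.
// Terms are added in increasing order, one call per unique term.
class TermVectorsWriterSketch {
  static void writeOne(FieldInfos fieldInfos) throws IOException {
    Directory dir = new RAMDirectory();
    TermVectorsWriter writer = new TermVectorsWriter(dir, "test", fieldInfos);
    try {
      writer.openDocument();
      writer.openField("textField2");
      writer.addTerm("field", 3);
      writer.addTerm("text", 1);
      writer.addTerm("two", 1);
      writer.closeField();
      writer.closeDocument();
    } finally {
      writer.close();
    }
  }
}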

View File

@ -0,0 +1,216 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import java.io.IOException;
import java.io.StringReader;
import java.util.*;
/**
* A <code>TermFreqVector</code> built from the terms of a query rather than
* from a stored document field.
**/
public class QueryTermVector implements TermFreqVector {
private String [] terms = new String[0];
private int [] termFreqs = new int[0];
public String getField() { return null; }
/**
*
* @param queryTerms The original list of terms from the query; may contain duplicates
*/
public QueryTermVector(String [] queryTerms) {
processTerms(queryTerms);
}
public QueryTermVector(String queryString, Analyzer analyzer) {
if (analyzer != null)
{
TokenStream stream = analyzer.tokenStream("", new StringReader(queryString));
if (stream != null)
{
Token next = null;
List terms = new ArrayList();
try {
while ((next = stream.next()) != null)
{
terms.add(next.termText());
}
processTerms((String[])terms.toArray(new String[terms.size()]));
} catch (IOException e) {
}
}
}
}
private void processTerms(String[] queryTerms) {
if (queryTerms != null) {
Arrays.sort(queryTerms);
Map tmpSet = new HashMap(queryTerms.length);
//filter out duplicates
List tmpList = new ArrayList(queryTerms.length);
List tmpFreqs = new ArrayList(queryTerms.length);
int j = 0;
for (int i = 0; i < queryTerms.length; i++) {
String term = queryTerms[i];
Integer position = (Integer)tmpSet.get(term);
if (position == null) {
tmpSet.put(term, new Integer(j++));
tmpList.add(term);
tmpFreqs.add(new Integer(1));
}
else {
Integer integer = (Integer)tmpFreqs.get(position.intValue());
tmpFreqs.set(position.intValue(), new Integer(integer.intValue() + 1));
}
}
terms = (String[])tmpList.toArray(terms);
//termFreqs = (int[])tmpFreqs.toArray(termFreqs);
termFreqs = new int[tmpFreqs.size()];
int i = 0;
for (Iterator iter = tmpFreqs.iterator(); iter.hasNext();) {
Integer integer = (Integer) iter.next();
termFreqs[i++] = integer.intValue();
}
}
}
public final String toString() {
StringBuffer sb = new StringBuffer();
sb.append('{');
for (int i=0; i<terms.length; i++) {
if (i>0) sb.append(", ");
sb.append(terms[i]).append('/').append(termFreqs[i]);
}
sb.append('}');
return sb.toString();
}
/**
* @return The number of terms in the term vector.
*/
public int size() {
return terms.length;
}
/** Returns an array of positions in which the term is found or null if no position information is
* available or positions are not implemented.
* Terms are identified by the index at which its number appears in the
* term array obtained from <code>getTerms</code> method.
*/
public int[] getTermPositions(int index) {
return null;
}
/**
* @return An Array of term texts in ascending order.
*/
public String[] getTerms() {
return terms;
}
/** Array of term frequencies. Locations of the array correspond one to one
* to the terms in the array obtained from the <code>getTerms</code>
* method. Each location in the array contains the number of times this
* term occurs in the document or the document field.
*/
public int[] getTermFrequencies() {
return termFreqs;
}
/** Return a string representation of the vector, but use the provided IndexReader
* to obtain text for each term and include the text instead of term numbers.
*/
public String toString(IndexReader ir) throws IOException {
return toString();
}
/** Return the index in the terms array returned from <code>getTerms</code>
* at which the specified <code>term</code> appears. If this
* term does not appear in the array, return -1.
*/
public int indexOf(String term) {
int res = Arrays.binarySearch(terms, term);
return res >= 0 ? res : -1;
}
/** Just like <code>indexOf(String)</code> but searches for a number of terms
* at the same time. Returns an array that has the same size as the number
* of terms searched for, each slot containing the result of searching for
* that term.
*
* @param terms array containing terms to look for
* @param start index in the array where the list of terms starts
* @param len the number of terms in the list
*/
public int[] indexesOf(String[] terms, int start, int len) {
int res[] = new int[len];
for (int i=0; i < len; i++) {
res[i] = indexOf(terms[i]);
}
return res;
}
}
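A usage sketch; the query string is hypothetical:

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.search.QueryTermVector;

// Sketch only: tokenize a query and fold duplicate terms into frequencies.
public class QueryVectorDemo {
  public static void main(String[] args) {
    QueryTermVector qtv =
        new QueryTermVector("hello world hello", new WhitespaceAnalyzer());
    System.out.println(qtv);  // prints {hello/2, world/1}
  }
}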

View File

@ -0,0 +1,32 @@
package org.apache.lucene.util;
/**
* Methods for manipulating strings
*
**/
public abstract class StringHelper {
/**
*
* @param s1 The first string to compare
* @param s2 The second string to compare
* @return The first position where the two strings differ.
*/
public static final int stringDifference(String s1, String s2) {
int len1 = s1.length();
int len2 = s2.length();
int len = len1 < len2 ? len1 : len2;
for (int i = 0; i < len; i++) {
if (s1.charAt(i) != s2.charAt(i)) {
return i;
}
}
return len;
}
private StringHelper() {
}
}
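A worked example of the prefix sharing this enables in TermInfosWriter.writeTerm() and TermVectorsWriter.writeField():

import org.apache.lucene.util.StringHelper;

// Sketch only: "apple" and "apply" share the 4-char prefix "appl", so the
// writers store the shared length (4), the delta length (1), and just "y".
public class PrefixDemo {
  public static void main(String[] args) {
    System.out.println(StringHelper.stringDifference("apple", "apply"));  // 4
  }
}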

View File

@ -0,0 +1,159 @@
package org.apache.lucene.index;
/**
* Created by IntelliJ IDEA.
* User: Grant Ingersoll
* Date: Feb 2, 2004
* Time: 6:16:12 PM
* $Id$
* Copyright 2004. Center For Natural Language Processing
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Enumeration;
/**
* Helper for index tests: builds documents containing stored, unstored,
* indexed, and term-vectorized fields.
**/
class DocHelper {
public static final String FIELD_1_TEXT = "field one text";
public static final String TEXT_FIELD_1_KEY = "textField1";
public static Field textField1 = Field.Text(TEXT_FIELD_1_KEY, FIELD_1_TEXT, false);
public static final String FIELD_2_TEXT = "field field field two text";
//Fields will be lexicographically sorted. So, the order is: field, text, two
public static final int [] FIELD_2_FREQS = {3, 1, 1};
public static final String TEXT_FIELD_2_KEY = "textField2";
public static Field textField2 = Field.Text(TEXT_FIELD_2_KEY, FIELD_2_TEXT, true);
public static final String KEYWORD_TEXT = "Keyword";
public static final String KEYWORD_FIELD_KEY = "keyField";
public static Field keyField = Field.Keyword(KEYWORD_FIELD_KEY, KEYWORD_TEXT);
public static final String UNINDEXED_FIELD_TEXT = "unindexed field text";
public static final String UNINDEXED_FIELD_KEY = "unIndField";
public static Field unIndField = Field.UnIndexed(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT);
public static final String UNSTORED_1_FIELD_TEXT = "unstored field text";
public static final String UNSTORED_FIELD_1_KEY = "unStoredField1";
public static Field unStoredField1 = Field.UnStored(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT, false);
public static final String UNSTORED_2_FIELD_TEXT = "unstored field text";
public static final String UNSTORED_FIELD_2_KEY = "unStoredField2";
public static Field unStoredField2 = Field.UnStored(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT, true);
// public static Set fieldNamesSet = null;
// public static Set fieldValuesSet = null;
public static Map nameValues = null;
static
{
nameValues = new HashMap();
nameValues.put(TEXT_FIELD_1_KEY, FIELD_1_TEXT);
nameValues.put(TEXT_FIELD_2_KEY, FIELD_2_TEXT);
nameValues.put(KEYWORD_FIELD_KEY, KEYWORD_TEXT);
nameValues.put(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT);
nameValues.put(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT);
nameValues.put(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT);
}
/**
* Adds the fields above to a document
* @param doc The document to write
*/
public static void setupDoc(Document doc) {
doc.add(textField1);
doc.add(textField2);
doc.add(keyField);
doc.add(unIndField);
doc.add(unStoredField1);
doc.add(unStoredField2);
}
/**
* Writes the document to the directory using a segment named "test"
* @param dir
* @param doc
*/
public static void writeDoc(Directory dir, Document doc)
{
writeDoc(dir, "test", doc);
}
/**
* Writes the document to the directory in the given segment
* @param dir
* @param segment
* @param doc
*/
public static void writeDoc(Directory dir, String segment, Document doc)
{
Analyzer analyzer = new WhitespaceAnalyzer();
Similarity similarity = Similarity.getDefault();
writeDoc(dir, analyzer, similarity, segment, doc);
}
/**
* Writes the document to the directory segment named "test" using the specified analyzer and similarity
* @param dir
* @param analyzer
* @param similarity
* @param doc
*/
public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc)
{
writeDoc(dir, analyzer, similarity, "test", doc);
}
/**
* Writes the document to the directory segment using the analyzer and the similarity score
* @param dir
* @param analyzer
* @param similarity
* @param segment
* @param doc
*/
public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc)
{
DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
try {
writer.addDocument(segment, doc);
} catch (IOException e) {
e.printStackTrace();
}
}
public static int numFields(Document doc) {
Enumeration fields = doc.fields();
int result = 0;
while (fields.hasMoreElements()) {
fields.nextElement();
result++;
}
return result;
}
}
/*
fieldNamesSet = new HashSet();
fieldNamesSet.add(TEXT_FIELD_1_KEY);
fieldNamesSet.add(TEXT_FIELD_2_KEY);
fieldNamesSet.add(KEYWORD_FIELD_KEY);
fieldNamesSet.add(UNINDEXED_FIELD_KEY);
fieldNamesSet.add(UNSTORED_FIELD_1_KEY);
fieldNamesSet.add(UNSTORED_FIELD_2_KEY);
fieldValuesSet = new HashSet();
fieldValuesSet.add(FIELD_1_TEXT);
fieldValuesSet.add(FIELD_2_TEXT);
fieldValuesSet.add(KEYWORD_TEXT);
fieldValuesSet.add(UNINDEXED_FIELD_TEXT);
fieldValuesSet.add(UNSTORED_1_FIELD_TEXT);
fieldValuesSet.add(UNSTORED_2_FIELD_TEXT);
*/

View File

@ -0,0 +1,121 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.IOException;
public class TestDocumentWriter extends TestCase {
private RAMDirectory dir = new RAMDirectory();
private Document testDoc = new Document();
public TestDocumentWriter(String s) {
super(s);
}
protected void setUp() {
DocHelper.setupDoc(testDoc);
}
protected void tearDown() {
}
public void test() {
assertTrue(dir != null);
}
public void testAddDocument() {
Analyzer analyzer = new WhitespaceAnalyzer();
Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
assertTrue(writer != null);
try {
writer.addDocument("test", testDoc);
//After adding the document, we should be able to read it back in
SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir));
assertTrue(reader != null);
Document doc = reader.document(0);
assertTrue(doc != null);
//System.out.println("Document: " + doc);
Field [] fields = doc.getFields("textField2");
assertTrue(fields != null && fields.length == 1);
assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
assertTrue(fields[0].isTermVectorStored() == true);
fields = doc.getFields("textField1");
assertTrue(fields != null && fields.length == 1);
assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
assertTrue(fields[0].isTermVectorStored() == false);
fields = doc.getFields("keyField");
assertTrue(fields != null && fields.length == 1);
assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
}

View File

@ -0,0 +1,65 @@
package org.apache.lucene.index;
import junit.framework.TestCase;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.OutputStream;
import java.io.IOException;
import java.util.Map;
//import org.cnlp.utils.properties.ResourceBundleHelper;
public class TestFieldInfos extends TestCase {
private Document testDoc = new Document();
public TestFieldInfos(String s) {
super(s);
}
protected void setUp() {
DocHelper.setupDoc(testDoc);
}
protected void tearDown() {
}
public void test() {
//Positive test of FieldInfos
assertTrue(testDoc != null);
FieldInfos fieldInfos = new FieldInfos();
fieldInfos.add(testDoc);
//Since the complement is stored as well in the fields map
assertTrue(fieldInfos.size() == 7); //this is 7 b/c we are using the no-arg constructor
RAMDirectory dir = new RAMDirectory();
String name = "testFile";
OutputStream output = dir.createFile(name);
assertTrue(output != null);
//Use a RAMOutputStream
try {
fieldInfos.write(output);
output.close();
assertTrue(output.length() > 0);
FieldInfos readIn = new FieldInfos(dir, name);
assertTrue(fieldInfos.size() == readIn.size());
FieldInfo info = readIn.fieldInfo("textField1");
assertTrue(info != null);
assertTrue(info.storeTermVector == false);
info = readIn.fieldInfo("textField2");
assertTrue(info != null);
assertTrue(info.storeTermVector == true);
dir.close();
} catch (IOException e) {
assertTrue(false);
}
}
}

View File

@ -0,0 +1,115 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.search.Similarity;
import java.util.Map;
import java.io.IOException;
public class TestFieldsReader extends TestCase {
private RAMDirectory dir = new RAMDirectory();
private Document testDoc = new Document();
private FieldInfos fieldInfos = null;
public TestFieldsReader(String s) {
super(s);
}
protected void setUp() {
fieldInfos = new FieldInfos();
DocHelper.setupDoc(testDoc);
fieldInfos.add(testDoc);
DocumentWriter writer = new DocumentWriter(dir, new WhitespaceAnalyzer(),
Similarity.getDefault(), 50);
assertTrue(writer != null);
try {
writer.addDocument("test", testDoc);
}
catch (IOException e)
{
}
}
protected void tearDown() {
}
public void test() {
assertTrue(dir != null);
assertTrue(fieldInfos != null);
try {
FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
assertTrue(reader != null);
assertTrue(reader.size() == 1);
Document doc = reader.doc(0);
assertTrue(doc != null);
assertTrue(doc.getField("textField1") != null);
Field field = doc.getField("textField2");
assertTrue(field != null);
assertTrue(field.isTermVectorStored() == true);
reader.close();
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
}

View File

@ -79,7 +79,7 @@ public class TestFilterIndexReader extends TestCase {
private static class TestReader extends FilterIndexReader {
/** Filter that only permits terms containing 'e'.*/
private static class TestTermEnum extends FilterTermEnum {
public TestTermEnum(TermEnum enum)
throws IOException {

View File

@ -0,0 +1,136 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
public class TestMultiReader extends TestCase {
private Directory dir = new RAMDirectory();
private Document doc1 = new Document();
private Document doc2 = new Document();
private SegmentReader reader1;
private SegmentReader reader2;
private SegmentReader [] readers = new SegmentReader[2];
private SegmentInfos sis = new SegmentInfos();
public TestMultiReader(String s) {
super(s);
}
protected void setUp() {
DocHelper.setupDoc(doc1);
DocHelper.setupDoc(doc2);
DocHelper.writeDoc(dir, "seg-1", doc1);
DocHelper.writeDoc(dir, "seg-2", doc2);
try {
sis.write(dir);
reader1 = new SegmentReader(new SegmentInfo("seg-1", 1, dir));
reader2 = new SegmentReader(new SegmentInfo("seg-2", 1, dir));
readers[0] = reader1;
readers[1] = reader2;
} catch (IOException e) {
e.printStackTrace();
}
}
/*IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
writer.addDocument(doc1);
writer.addDocument(doc2);
writer.close();*/
protected void tearDown() {
}
public void test() {
assertTrue(dir != null);
assertTrue(reader1 != null);
assertTrue(reader2 != null);
assertTrue(sis != null);
}
public void testDocument() {
try {
sis.read(dir);
MultiReader reader = new MultiReader(dir, readers);
assertTrue(reader != null);
Document newDoc1 = reader.document(0);
assertTrue(newDoc1 != null);
assertTrue(DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - 2);
Document newDoc2 = reader.document(1);
assertTrue(newDoc2 != null);
assertTrue(DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - 2);
TermFreqVector vector = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
assertTrue(vector != null);
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
public void testTermVectors() {
try {
MultiReader reader = new MultiReader(dir, readers);
assertTrue(reader != null);
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
}

View File

@ -0,0 +1,163 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import java.io.IOException;
import java.util.Collection;
public class TestSegmentMerger extends TestCase {
//The variables for the new merged segment
private Directory mergedDir = new RAMDirectory();
private String mergedSegment = "test";
//First segment to be merged
private Directory merge1Dir = new RAMDirectory();
private Document doc1 = new Document();
private String merge1Segment = "test-1";
private SegmentReader reader1 = null;
//Second Segment to be merged
private Directory merge2Dir = new RAMDirectory();
private Document doc2 = new Document();
private String merge2Segment = "test-2";
private SegmentReader reader2 = null;
public TestSegmentMerger(String s) {
super(s);
}
protected void setUp() {
DocHelper.setupDoc(doc1);
DocHelper.writeDoc(merge1Dir, merge1Segment, doc1);
DocHelper.setupDoc(doc2);
DocHelper.writeDoc(merge2Dir, merge2Segment, doc2);
try {
reader1 = new SegmentReader(new SegmentInfo(merge1Segment, 1, merge1Dir));
reader2 = new SegmentReader(new SegmentInfo(merge2Segment, 1, merge2Dir));
} catch (IOException e) {
e.printStackTrace();
}
}
protected void tearDown() {
}
public void test() {
assertTrue(mergedDir != null);
assertTrue(merge1Dir != null);
assertTrue(merge2Dir != null);
assertTrue(reader1 != null);
assertTrue(reader2 != null);
}
public void testMerge() {
//System.out.println("----------------TestMerge------------------");
SegmentMerger merger = new SegmentMerger(mergedDir, mergedSegment, false);
merger.add(reader1);
merger.add(reader2);
try {
int docsMerged = merger.merge();
assertTrue(docsMerged == 2);
//Should be able to open a new SegmentReader against the new directory
SegmentReader mergedReader = new SegmentReader(new SegmentInfo(mergedSegment, docsMerged, mergedDir));
assertTrue(mergedReader != null);
assertTrue(mergedReader.numDocs() == 2);
Document newDoc1 = mergedReader.document(0);
assertTrue(newDoc1 != null);
//There are 2 unstored fields on the document
assertTrue(DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - 2);
Document newDoc2 = mergedReader.document(1);
assertTrue(newDoc2 != null);
assertTrue(DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - 2);
TermDocs termDocs = mergedReader.termDocs(new Term(DocHelper.TEXT_FIELD_2_KEY, "field"));
assertTrue(termDocs != null);
assertTrue(termDocs.next() == true);
Collection stored = mergedReader.getIndexedFieldNames(true);
assertTrue(stored != null);
//System.out.println("stored size: " + stored.size());
assertTrue(stored.size() == 2);
TermFreqVector vector = mergedReader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
assertTrue(vector != null);
String [] terms = vector.getTerms();
assertTrue(terms != null);
//System.out.println("Terms size: " + terms.length);
assertTrue(terms.length == 3);
int [] freqs = vector.getTermFrequencies();
assertTrue(freqs != null);
//System.out.println("Freqs size: " + freqs.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
int freq = freqs[i];
//System.out.println("Term: " + term + " Freq: " + freq);
assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
assertTrue(DocHelper.FIELD_2_FREQS[i] == freq);
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
//System.out.println("---------------------end TestMerge-------------------");
}
}

View File

@ -0,0 +1,250 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.Enumeration;
public class TestSegmentReader extends TestCase {
private RAMDirectory dir = new RAMDirectory();
private Document testDoc = new Document();
private SegmentReader reader = null;
public TestSegmentReader(String s) {
super(s);
}
//TODO: Setup the reader w/ multiple documents
protected void setUp() {
try {
DocHelper.setupDoc(testDoc);
DocHelper.writeDoc(dir, testDoc);
reader = new SegmentReader(new SegmentInfo("test", 1, dir));
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
protected void tearDown() {
}
public void test() {
assertTrue(dir != null);
assertTrue(reader != null);
assertTrue(DocHelper.nameValues.size() > 0);
assertTrue(DocHelper.numFields(testDoc) == 6);
}
public void testDocument() {
try {
assertTrue(reader.numDocs() == 1);
assertTrue(reader.maxDoc() >= 1);
Document result = reader.document(0);
assertTrue(result != null);
//There are 2 unstored fields on the document that are not preserved across writing
assertTrue(DocHelper.numFields(result) == DocHelper.numFields(testDoc) - 2);
Enumeration fields = result.fields();
while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement();
assertTrue(field != null);
assertTrue(DocHelper.nameValues.containsKey(field.name()));
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
public void testDelete() {
Document docToDelete = new Document();
DocHelper.setupDoc(docToDelete);
DocHelper.writeDoc(dir, "seg-to-delete", docToDelete);
try {
SegmentReader deleteReader = new SegmentReader(new SegmentInfo("seg-to-delete", 1, dir));
assertTrue(deleteReader != null);
assertTrue(deleteReader.numDocs() == 1);
deleteReader.delete(0);
assertTrue(deleteReader.isDeleted(0) == true);
assertTrue(deleteReader.hasDeletions() == true);
assertTrue(deleteReader.numDocs() == 0);
try {
Document test = deleteReader.document(0);
assertTrue(false);
} catch (IllegalArgumentException e) {
assertTrue(true);
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
public void testGetFieldNameVariations() {
try {
Collection result = reader.getFieldNames();
assertTrue(result != null);
assertTrue(result.size() == 7);
for (Iterator iter = result.iterator(); iter.hasNext();) {
String s = (String) iter.next();
//System.out.println("Name: " + s);
assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals(""));
}
result = reader.getFieldNames(true);
assertTrue(result != null);
// System.out.println("Size: " + result.size());
assertTrue(result.size() == 5);
for (Iterator iter = result.iterator(); iter.hasNext();) {
String s = (String) iter.next();
assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals(""));
}
result = reader.getFieldNames(false);
assertTrue(result != null);
assertTrue(result.size() == 2);
//Get all indexed fields that are storing term vectors
result = reader.getIndexedFieldNames(true);
assertTrue(result != null);
assertTrue(result.size() == 2);
result = reader.getIndexedFieldNames(false);
assertTrue(result != null);
assertTrue(result.size() == 3);
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
public void testTerms() {
try {
TermEnum terms = reader.terms();
assertTrue(terms != null);
while (terms.next() == true)
{
Term term = terms.term();
assertTrue(term != null);
//System.out.println("Term: " + term);
String fieldValue = (String)DocHelper.nameValues.get(term.field());
assertTrue(fieldValue.indexOf(term.text()) != -1);
}
TermDocs termDocs = reader.termDocs();
assertTrue(termDocs != null);
termDocs.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
assertTrue(termDocs.next() == true);
TermPositions positions = reader.termPositions();
positions.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
assertTrue(positions != null);
assertTrue(positions.doc() == 0);
assertTrue(positions.nextPosition() >= 0);
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
public void testNorms() {
//TODO: Not sure how these work/should be tested
/*
try {
byte [] norms = reader.norms(DocHelper.TEXT_FIELD_1_KEY);
System.out.println("Norms: " + norms);
assertTrue(norms != null);
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
*/
}
public void testTermVectors() {
try {
TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
assertTrue(result != null);
String [] terms = result.getTerms();
int [] freqs = result.getTermFrequencies();
assertTrue(terms != null && terms.length == 3 && freqs != null && freqs.length == 3);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
int freq = freqs[i];
assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
assertTrue(freq > 0);
}
TermFreqVector [] results = reader.getTermFreqVectors(0);
assertTrue(results != null);
assertTrue(results.length == 2);
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
}

View File

@ -0,0 +1,137 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.IOException;
public class TestSegmentTermDocs extends TestCase {
private Document testDoc = new Document();
private Directory dir = new RAMDirectory();
public TestSegmentTermDocs(String s) {
super(s);
}
protected void setUp() {
DocHelper.setupDoc(testDoc);
DocHelper.writeDoc(dir, testDoc);
}
protected void tearDown() {
}
public void test() {
assertTrue(dir != null);
}
public void testTermDocs() {
try {
//After adding the document, we should be able to read it back in
SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir));
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);
segTermDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field"));
if (segTermDocs.next() == true)
{
int docId = segTermDocs.doc();
assertTrue(docId == 0);
int freq = segTermDocs.freq();
assertTrue(freq == 3);
}
reader.close();
} catch (IOException e) {
assertTrue(false);
}
}
public void testBadSeek() {
try {
//After adding the document, we should be able to read it back in
SegmentReader reader = new SegmentReader(new SegmentInfo("test", 3, dir));
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);
segTermDocs.seek(new Term("textField2", "bad"));
assertTrue(segTermDocs.next() == false);
reader.close();
} catch (IOException e) {
assertTrue(false);
}
try {
//After adding the document, we should be able to read it back in
SegmentReader reader = new SegmentReader(new SegmentInfo("test", 3, dir));
assertTrue(reader != null);
SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
assertTrue(segTermDocs != null);
segTermDocs.seek(new Term("junk", "bad"));
assertTrue(segTermDocs.next() == false);
reader.close();
} catch (IOException e) {
assertTrue(false);
}
}
}

View File

@ -0,0 +1,106 @@
package org.apache.lucene.index;
import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
import java.util.Arrays;
public class TestTermVectorsReader extends TestCase {
private TermVectorsWriter writer = null;
//testTerms must be lexicographically sorted; they are sorted in setUp rather than maintained in order here
private String [] testFields = {"f1", "f2", "f3"};
private String [] testTerms = {"this", "is", "a", "test"};
private RAMDirectory dir = new RAMDirectory();
private String seg = "testSegment";
private FieldInfos fieldInfos = new FieldInfos();
public TestTermVectorsReader(String s) {
super(s);
}
protected void setUp() {
for (int i = 0; i < testFields.length; i++) {
fieldInfos.add(testFields[i], true, true);
}
try {
Arrays.sort(testTerms);
for (int j = 0; j < 5; j++) {
writer = new TermVectorsWriter(dir, seg, fieldInfos);
writer.openDocument();
for (int k = 0; k < testFields.length; k++) {
writer.openField(testFields[k]);
for (int i = 0; i < testTerms.length; i++) {
writer.addTerm(testTerms[i], i);
}
writer.closeField();
}
writer.closeDocument();
writer.close();
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
protected void tearDown() {
}
public void test() {
//Check to see the files were created properly in setup
assertTrue(writer.isDocumentOpen() == false);
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
}
public void testReader() {
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
TermFreqVector vector = reader.get(0, testFields[0]);
assertTrue(vector != null);
String [] terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
assertTrue(term.equals(testTerms[i]));
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
/**
* Make sure exceptions and bad params are handled appropriately
*/
public void testBadParams() {
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
//Bad document number, good field number
TermFreqVector vector = reader.get(50, testFields[0]);
assertTrue(vector == null);
} catch (Exception e) {
assertTrue(false);
}
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
//good document number, bad field number
TermFreqVector vector = reader.get(0, "f50");
assertTrue(vector == null);
} catch (Exception e) {
assertTrue(false);
}
}
}

View File

@ -0,0 +1,240 @@
package org.apache.lucene.index;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
public class TestTermVectorsWriter extends TestCase {
private String[] testTerms = {"this", "is", "a", "test"};
private String [] testFields = {"f1", "f2", "f3"};
private int[][] positions = new int[testTerms.length][];
private RAMDirectory dir = new RAMDirectory();
private String seg = "testSegment";
private FieldInfos fieldInfos = new FieldInfos();
public TestTermVectorsWriter(String s) {
super(s);
}
protected void setUp() {
for (int i = 0; i < testFields.length; i++) {
fieldInfos.add(testFields[i], true, true);
}
for (int i = 0; i < testTerms.length; i++) {
positions[i] = new int[5];
for (int j = 0; j < positions[i].length; j++) {
positions[i][j] = i * 100;
}
}
}
protected void tearDown() {
}
public void test() {
assertTrue(dir != null);
assertTrue(positions != null);
}
/*public void testWriteNoPositions() {
try {
TermVectorsWriter writer = new TermVectorsWriter(dir, seg, 50);
writer.openDocument();
assertTrue(writer.isDocumentOpen() == true);
writer.openField(0);
assertTrue(writer.isFieldOpen() == true);
for (int i = 0; i < testTerms.length; i++) {
writer.addTerm(testTerms[i], i);
}
writer.closeField();
writer.closeDocument();
writer.close();
assertTrue(writer.isDocumentOpen() == false);
//Check to see the files were created
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
//Now read it back in
TermVectorsReader reader = new TermVectorsReader(dir, seg);
assertTrue(reader != null);
checkTermVector(reader, 0, 0);
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
} */
public void testWriter() {
try {
TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
writer.openDocument();
assertTrue(writer.isDocumentOpen() == true);
writeField(writer, testFields[0]);
writer.closeDocument();
writer.close();
assertTrue(writer.isDocumentOpen() == false);
//Check to see the files were created
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
//Now read it back in
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
checkTermVector(reader, 0, testFields[0]);
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
private void checkTermVector(TermVectorsReader reader, int docNum, String field) throws IOException {
TermFreqVector vector = reader.get(docNum, field);
assertTrue(vector != null);
String[] terms = vector.getTerms();
assertTrue(terms != null);
assertTrue(terms.length == testTerms.length);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
assertTrue(term.equals(testTerms[i]));
}
}
/**
* Test one document, multiple fields
*/
public void testMultipleFields() {
try {
TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
writeDocument(writer, testFields.length);
writer.close();
assertTrue(writer.isDocumentOpen() == false);
//Check to see the files were created
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
//Now read it back in
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
assertTrue(reader != null);
for (int j = 0; j < testFields.length; j++) {
checkTermVector(reader, 0, testFields[j]);
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
private void writeDocument(TermVectorsWriter writer, int numFields) throws IOException {
writer.openDocument();
assertTrue(writer.isDocumentOpen() == true);
for (int j = 0; j < numFields; j++) {
writeField(writer, testFields[j]);
}
writer.closeDocument();
assertTrue(writer.isDocumentOpen() == false);
}
/**
* Writes the standard set of test terms to the given field.
* @param writer The writer to write to
* @param f The name of the field to write
* @throws IOException
*/
private void writeField(TermVectorsWriter writer, String f) throws IOException {
writer.openField(f);
assertTrue(writer.isFieldOpen() == true);
for (int i = 0; i < testTerms.length; i++) {
writer.addTerm(testTerms[i], i);
}
writer.closeField();
}
public void testMultipleDocuments() {
try {
TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
assertTrue(writer != null);
for (int i = 0; i < 10; i++) {
writeDocument(writer, testFields.length);
}
writer.close();
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
//Do some arbitrary tests
try {
TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
for (int i = 0; i < 10; i++) {
assertTrue(reader != null);
checkTermVector(reader, 5, testFields[0]);
checkTermVector(reader, 2, testFields[2]);
}
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
}

View File

@ -103,7 +103,7 @@ public class TestBasics extends TestCase {
searcher = new IndexSearcher(directory);
}
public void testTerm() throws Exception {
Query query = new TermQuery(new Term("field", "seventy"));
checkHits(query, new int[]

View File

@ -0,0 +1,104 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
public class TestQueryTermVector extends TestCase {
public TestQueryTermVector(String s) {
super(s);
}
protected void setUp() {
}
protected void tearDown() {
}
public void testConstructor() {
String [] queryTerm = {"foo", "bar", "foo", "again", "foo", "bar", "go", "go", "go"};
//Items are sorted lexicographically
String [] gold = {"again", "bar", "foo", "go"};
int [] goldFreqs = {1, 2, 3, 3};
QueryTermVector result = new QueryTermVector(queryTerm);
assertTrue(result != null);
String [] terms = result.getTerms();
assertTrue(terms.length == 4);
int [] freq = result.getTermFrequencies();
assertTrue(freq.length == 4);
checkGold(terms, gold, freq, goldFreqs);
result = new QueryTermVector(null);
assertTrue(result.getTerms().length == 0);
result = new QueryTermVector("foo bar foo again foo bar go go go", new WhitespaceAnalyzer());
assertTrue(result != null);
terms = result.getTerms();
assertTrue(terms.length == 4);
freq = result.getTermFrequencies();
assertTrue(freq.length == 4);
checkGold(terms, gold, freq, goldFreqs);
}
private void checkGold(String[] terms, String[] gold, int[] freq, int[] goldFreqs) {
for (int i = 0; i < terms.length; i++) {
assertTrue(terms[i].equals(gold[i]));
assertTrue(freq[i] == goldFreqs[i]);
}
}
}

View File

@ -0,0 +1,261 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.English;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class TestTermVectors extends TestCase {
private IndexSearcher searcher;
private RAMDirectory directory = new RAMDirectory();
public TestTermVectors(String s) {
super(s);
}
public void setUp() throws Exception {
IndexWriter writer
= new IndexWriter(directory, new SimpleAnalyzer(), true);
//writer.setUseCompoundFile(true);
//writer.infoStream = System.out;
for (int i = 0; i < 1000; i++) {
Document doc = new Document();
doc.add(Field.Text("field", English.intToEnglish(i), true));
writer.addDocument(doc);
}
writer.close();
searcher = new IndexSearcher(directory);
}
protected void tearDown() {
}
public void test() {
assertTrue(searcher != null);
}
public void testTermVectors() {
Query query = new TermQuery(new Term("field", "seventy"));
try {
Hits hits = searcher.search(query);
assertEquals(100, hits.length());
for (int i = 0; i < hits.length(); i++)
{
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
assertTrue(vector != null);
assertTrue(vector.length == 1);
//assertTrue();
}
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(50));
//System.out.println("Explain: " + searcher.explain(query, hits.id(50)));
//System.out.println("Vector: " + vector[0].toString());
} catch (IOException e) {
assertTrue(false);
}
}
public void testTermPositionVectors() {
Query query = new TermQuery(new Term("field", "fifty"));
try {
Hits hits = searcher.search(query);
assertEquals(100, hits.length());
for (int i = 0; i < hits.length(); i++)
{
TermFreqVector [] vector = searcher.reader.getTermFreqVectors(hits.id(i));
assertTrue(vector != null);
assertTrue(vector.length == 1);
//assertTrue();
}
} catch (IOException e) {
assertTrue(false);
}
}
public void testKnownSetOfDocuments() {
String [] termArray = {"eating", "chocolate", "in", "a", "computer", "lab", "grows", "old", "colored",
"with", "an"};
String test1 = "eating chocolate in a computer lab"; //6 terms
String test2 = "computer in a computer lab"; //5 terms
String test3 = "a chocolate lab grows old"; //5 terms
String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
Map test4Map = new HashMap();
test4Map.put("chocolate", new Integer(3));
test4Map.put("lab", new Integer(2));
test4Map.put("eating", new Integer(1));
test4Map.put("computer", new Integer(1));
test4Map.put("with", new Integer(1));
test4Map.put("a", new Integer(1));
test4Map.put("colored", new Integer(1));
test4Map.put("in", new Integer(1));
test4Map.put("an", new Integer(1));
test4Map.put("computer", new Integer(1));
test4Map.put("old", new Integer(1));
Document testDoc1 = new Document();
setupDoc(testDoc1, test1);
Document testDoc2 = new Document();
setupDoc(testDoc2, test2);
Document testDoc3 = new Document();
setupDoc(testDoc3, test3);
Document testDoc4 = new Document();
setupDoc(testDoc4, test4);
Directory dir = new RAMDirectory();
try {
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
assertTrue(writer != null);
writer.addDocument(testDoc1);
writer.addDocument(testDoc2);
writer.addDocument(testDoc3);
writer.addDocument(testDoc4);
writer.close();
IndexSearcher knownSearcher = new IndexSearcher(dir);
TermEnum termEnum = knownSearcher.reader.terms();
TermDocs termDocs = knownSearcher.reader.termDocs();
//System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);
Similarity sim = knownSearcher.getSimilarity();
while (termEnum.next() == true)
{
Term term = termEnum.term();
//System.out.println("Term: " + term);
termDocs.seek(term);
while (termDocs.next())
{
int docId = termDocs.doc();
int freq = termDocs.freq();
//System.out.println("Doc Id: " + docId + " freq " + freq);
TermFreqVector vector = knownSearcher.reader.getTermFreqVector(docId, "field");
float tf = sim.tf(freq);
float idf = sim.idf(term, knownSearcher);
//float qNorm = sim.queryNorm()
//This is fine since we don't have stop words
float lNorm = sim.lengthNorm("field", vector.getTerms().length);
//float coord = sim.coord()
//System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
assertTrue(vector != null);
String[] vTerms = vector.getTerms();
int [] freqs = vector.getTermFrequencies();
for (int i = 0; i < vTerms.length; i++)
{
if (term.text().equals(vTerms[i]) == true)
{
assertTrue(freqs[i] == freq);
}
}
}
//System.out.println("--------");
}
Query query = new TermQuery(new Term("field", "chocolate"));
Hits hits = knownSearcher.search(query);
//doc 3 should be the first hit b/c it is the shortest match
assertTrue(hits.length() == 3);
float score = hits.score(0);
/*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
assertTrue(testDoc3.toString().equals(hits.doc(0).toString()));
assertTrue(testDoc4.toString().equals(hits.doc(1).toString()));
assertTrue(testDoc1.toString().equals(hits.doc(2).toString()));
TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field");
assertTrue(vector != null);
//System.out.println("Vector: " + vector);
String[] terms = vector.getTerms();
int [] freqs = vector.getTermFrequencies();
assertTrue(terms != null && terms.length == 10);
for (int i = 0; i < terms.length; i++) {
String term = terms[i];
//System.out.println("Term: " + term);
int freq = freqs[i];
assertTrue(test4.indexOf(term) != -1);
Integer freqInt = (Integer)test4Map.get(term);
assertTrue(freqInt != null);
assertTrue(freqInt.intValue() == freq);
}
knownSearcher.close();
} catch (IOException e) {
e.printStackTrace();
assertTrue(false);
}
}
private void setupDoc(Document doc, String text)
{
doc.add(Field.Text("field", text, true));
//System.out.println("Document: " + doc);
}
}

View File

@ -0,0 +1,88 @@
package org.apache.lucene.util;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
public class StringHelperTest extends TestCase {
public StringHelperTest(String s) {
super(s);
}
protected void setUp() {
}
protected void tearDown() {
}
public void testStringDifference() {
String test1 = "test";
String test2 = "testing";
int result = StringHelper.stringDifference(test1, test2);
assertTrue(result == 4);
test2 = "foo";
result = StringHelper.stringDifference(test1, test2);
assertTrue(result == 0);
test2 = "test";
result = StringHelper.stringDifference(test1, test2);
assertTrue(result == 4);
}
}

View File

@ -14,7 +14,7 @@
<p>
This document defines the index file formats used
in Lucene version 1.3.
in Lucene version 1.4.
</p>
<p>
@ -224,7 +224,11 @@
multiplied into the score for hits on that field.
</p>
</li>
<li><p>Term Vectors. For each field in each document, the term vector
(sometimes called document vector) is stored. A term vector consists
of the term text, term frequency and term position.
</p>
</li>
<li><p>Deleted documents.
An optional file indicating which documents are deleted.
</p>
@ -804,9 +808,10 @@
</p>
<p>
Currently only the low-order bit is used of FieldBits is used. It is
one for
indexed fields, and zero for non-indexed fields.
The low-order bit is one for
indexed fields, and zero for non-indexed fields. The second lowest-order
bit is one for fields that have term vectors stored, and zero for fields
without term vectors.
</p>
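To make the bit layout concrete, here is a minimal sketch of decoding a FieldBits byte; the helper names are illustrative and this is not the Lucene source:
static boolean isIndexed(byte fieldBits) {
  return (fieldBits & 0x01) != 0;   // low-order bit: field is indexed
}
static boolean storesTermVector(byte fieldBits) {
  return (fieldBits & 0x02) != 0;   // second lowest-order bit: term vector stored
}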
<p>
@ -1113,6 +1118,52 @@
</ol>
</subsection>
<subsection name="Term Vectors">
Term Vector support is optional on a field-by-field basis. It consists of three
files; a sketch of how the files fit together follows the list below.
<ol>
<li>
<p>The Document Index or .tvx file.</p>
<p>This contains, for each document, a pointer to the document data in the Document
(.tvd) file.
</p>
<p>DocumentIndex (.tvx) --&gt; FormatVersion&lt;DocumentPosition&gt;<sup>NumDocs</sup></p>
<p>FormatVersion --&gt; Int</p>
<p>DocumentPosition --&gt; UInt64</p>
<p>This is used to find the position of the Document in the .tvd file.</p>
</li>
<li>
<p>The Document or .tvd file.</p>
<p>This contains, for each document, the number of fields, a list of the fields with
term vector info and finally a list of pointers to the field information in the .tvf
(Term Vector Fields) file.</p>
<p>
Document (.tvd) --&gt; FormatVersion&lt;NumFields, FieldNums, FieldPositions&gt;<sup>NumDocs</sup>
</p>
<p>FormatVersion --&gt; Int</p>
<p>NumFields --&gt; VInt</p>
<p>FieldNums --&gt; &lt;FieldNumDelta&gt;<sup>NumFields</sup></p>
<p>FieldNumDelta --&gt; VInt</p>
<p>FieldPositions --&gt; &lt;FieldPosition&gt;<sup>NumFields</sup></p>
<p>FieldPosition --&gt; VLong</p>
<p>The .tvd file is used to map out the fields that have term vectors stored and
where the field information is in the .tvf file.</p>
</li>
<li>
<p>The Field or .tvf file.</p>
<p>This file contains, for each field that has a term vector stored, a list of
the terms and their frequencies.</p>
<p>Field (.tvf) --&gt; FormatVersion&lt;NumTerms, NumDistinct, TermFreqs&gt;<sup>NumFields</sup></p>
<p>FormatVersion --&gt; Int</p>
<p>NumTerms --&gt; VInt</p>
<p>NumDistinct --&gt; VInt -- Future Use</p>
<p>TermFreqs --&gt; &lt;TermText, TermFreq&gt;<sup>NumTerms</sup></p>
<p>TermText --&gt; String</p>
<p>TermFreq --&gt; VInt</p>
</li>
</ol>
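As a rough illustration of how these files fit together, the sketch below resolves a document's entry in the .tvd file using plain java.io. It is not the Lucene implementation (which reads through its own Directory abstraction), and it omits the VInt/VLong decoding needed for the .tvd and .tvf payloads; Lucene writes Int and UInt64 values high byte first, which matches RandomAccessFile's big-endian reads.
import java.io.IOException;
import java.io.RandomAccessFile;
public class TvxLookup {
  //Follow the .tvx layout above: a 4-byte FormatVersion followed by one
  //8-byte DocumentPosition per document.
  static long tvdPosition(String segment, int docNum) throws IOException {
    RandomAccessFile tvx = new RandomAccessFile(segment + ".tvx", "r");
    try {
      tvx.seek(4L + 8L * docNum);   //skip FormatVersion, index into positions
      return tvx.readLong();        //pointer into this document's .tvd entry
    } finally {
      tvx.close();
    }
  }
}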
</subsection>
<subsection name="Deleted Documents">