mirror of https://github.com/apache/lucene.git
Added term vector support.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150206 13f79535-47bb-0310-9956-ffa450edef68
parent abb62bda9c · commit 12eee6df5a
CHANGES.txt
@@ -54,6 +54,10 @@ $Id$
  9. Added MultiReader, an IndexReader that combines multiple other
     IndexReaders. (Cutting)
 
+10. Added support for term vectors.  See Field#isTermVectorStored().
+    (Grant Ingersoll, Cutting & Dmitry)
+
+
 1.3 final
 
  1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
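Taken together, the Field and IndexReader changes below give the following end-to-end usage. This is a minimal sketch against this revision's API; the RAMDirectory/StandardAnalyzer setup and the IndexWriter constructor arguments are assumptions from the surrounding 1.3-era codebase, not part of this diff:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.store.RAMDirectory;

public class TermVectorSketch {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);

    Document doc = new Document();
    // New three-argument factory: the trailing boolean requests a term vector.
    doc.add(Field.Text("body", "the quick brown fox jumps", true));
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    // Returns null when no vector was stored for this document/field.
    TermFreqVector v = reader.getTermFreqVector(0, "body");
    if (v != null) {
      String[] terms = v.getTerms();
      int[] freqs = v.getTermFrequencies();
      for (int i = 0; i < terms.length; i++)
        System.out.println(terms[i] + ": " + freqs[i]);
    }
    reader.close();
  }
}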
org/apache/lucene/document/Field.java
@@ -71,6 +71,7 @@ import org.apache.lucene.search.Hits; // for javadoc
 public final class Field implements java.io.Serializable {
   private String name = "body";
   private String stringValue = null;
+  private boolean storeTermVector = false;
   private Reader readerValue = null;
   private boolean isStored = false;
   private boolean isIndexed = true;
@@ -114,7 +115,8 @@ public final class Field implements java.io.Serializable {
   }
 
   /** Constructs a String-valued Field that is not tokenized, but is indexed
-    and stored.  Useful for non-text fields, e.g. date or url. */
+    and stored.  Useful for non-text fields, e.g. date or url.
+    */
   public static final Field Keyword(String name, String value) {
     return new Field(name, value, true, true, false);
   }
@@ -127,9 +129,9 @@ public final class Field implements java.io.Serializable {
 
   /** Constructs a String-valued Field that is tokenized and indexed,
     and is stored in the index, for return with hits.  Useful for short text
-    fields, like "title" or "subject". */
+    fields, like "title" or "subject". Term vector will not be stored for this field. */
   public static final Field Text(String name, String value) {
-    return new Field(name, value, true, true, true);
+    return Text(name, value, false);
   }
 
   /** Constructs a Date-valued Field that is not tokenized and is indexed,
@@ -139,16 +141,38 @@ public final class Field implements java.io.Serializable {
   }
 
   /** Constructs a String-valued Field that is tokenized and indexed,
-    but that is not stored in the index. */
+    and is stored in the index, for return with hits.  Useful for short text
+    fields, like "title" or "subject". */
+  public static final Field Text(String name, String value, boolean storeTermVector) {
+    return new Field(name, value, true, true, true, storeTermVector);
+  }
+
+  /** Constructs a String-valued Field that is tokenized and indexed,
+    but that is not stored in the index.  Term vector will not be stored for this field. */
   public static final Field UnStored(String name, String value) {
-    return new Field(name, value, false, true, true);
+    return UnStored(name, value, false);
   }
 
+  /** Constructs a String-valued Field that is tokenized and indexed,
+    but that is not stored in the index. */
+  public static final Field UnStored(String name, String value, boolean storeTermVector) {
+    return new Field(name, value, false, true, true, storeTermVector);
+  }
+
   /** Constructs a Reader-valued Field that is tokenized and indexed, but is
     not stored in the index verbatim.  Useful for longer text fields, like
+    "body". Term vector will not be stored for this field. */
+  public static final Field Text(String name, Reader value) {
+    return Text(name, value, false);
+  }
+
+  /** Constructs a Reader-valued Field that is tokenized and indexed, but is
+    not stored in the index verbatim.  Useful for longer text fields, like
     "body". */
-  public static final Field Text(String name, Reader value) {
-    return new Field(name, value);
+  public static final Field Text(String name, Reader value, boolean storeTermVector) {
+    Field f = new Field(name, value);
+    f.storeTermVector = storeTermVector;
+    return f;
   }
 
   /** The name of the field (e.g., "date", "subject", "title", or "body")
@@ -162,19 +186,41 @@ public final class Field implements java.io.Serializable {
     is used.  Exactly one of stringValue() and readerValue() must be set. */
   public Reader readerValue() { return readerValue; }
 
+
+  /** Create a field by specifying all parameters except for <code>storeTermVector</code>,
+   *  which is set to <code>false</code>.
+   */
   public Field(String name, String string,
                boolean store, boolean index, boolean token) {
+    this(name, string, store, index, token, false);
+  }
+
+  /**
+   *
+   * @param name The name of the field
+   * @param string The string to process
+   * @param store true if the field should store the string
+   * @param index true if the field should be indexed
+   * @param token true if the field should be tokenized
+   * @param storeTermVector true if we should store the Term Vector info
+   */
+  public Field(String name, String string,
+               boolean store, boolean index, boolean token, boolean storeTermVector) {
     if (name == null)
       throw new IllegalArgumentException("name cannot be null");
     if (string == null)
       throw new IllegalArgumentException("value cannot be null");
+    if (!index && storeTermVector)
+      throw new IllegalArgumentException("cannot store a term vector for fields that are not indexed.");
 
     this.name = name.intern();                    // field names are interned
     this.stringValue = string;
     this.isStored = store;
     this.isIndexed = index;
     this.isTokenized = token;
+    this.storeTermVector = storeTermVector;
   }
 
   Field(String name, Reader reader) {
     if (name == null)
       throw new IllegalArgumentException("name cannot be null");
@@ -199,6 +245,16 @@ public final class Field implements java.io.Serializable {
     Reader-valued. */
   public final boolean isTokenized() { return isTokenized; }
 
+  /** True iff the term or terms used to index this field are stored as a term
+   *  vector, available from {@link IndexReader#getTermFreqVector(int,String)}.
+   *  These methods do not provide access to the original content of the field,
+   *  only to terms used to index it. If the original content must be
+   *  preserved, use the <code>stored</code> attribute instead.
+   *
+   * @see IndexReader#getTermFreqVector(int, String)
+   */
+  public final boolean isTermVectorStored() { return storeTermVector; }
+
   /** Prints a Field for human consumption. */
   public final String toString() {
     if (isStored && isIndexed && !isTokenized)
@@ -209,8 +265,14 @@ public final class Field implements java.io.Serializable {
       return "Text<" + name + ":" + stringValue + ">";
+    else if (!isStored && isIndexed && isTokenized && readerValue!=null)
+      return "Text<" + name + ":" + readerValue + ">";
     else if (!isStored && isIndexed && isTokenized)
+    {
       return "UnStored<" + name + ">";
+    }
     else
+    {
       return super.toString();
+    }
   }
 
 }
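One behavioral consequence of the new six-argument constructor above: a term vector can only be requested for an indexed field, since vectors are built from the terms produced at indexing time. A small sketch of that invariant (the field names and values here are illustrative):

import org.apache.lucene.document.Field;

public class FieldInvariantSketch {
  public static void main(String[] args) {
    // Indexed and vectorized: accepted.
    new Field("body", "some text", true, true, true, true);
    try {
      // Stored-only field asking for a term vector: rejected up front.
      new Field("date", "20040101", true, false, false, true);
    } catch (IllegalArgumentException e) {
      // "cannot store a term vector for fields that are not indexed."
      System.out.println(e.getMessage());
    }
  }
}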
org/apache/lucene/index/CompoundFileReader.java
@@ -72,7 +72,7 @@ import java.io.IOException;
  * @author Dmitry Serebrennikov
  * @version $Id$
  */
-public class CompoundFileReader extends Directory {
+class CompoundFileReader extends Directory {
 
   private static final class FileEntry {
     long offset;
org/apache/lucene/index/DocumentWriter.java
@@ -77,6 +77,13 @@ final class DocumentWriter {
   private FieldInfos fieldInfos;
   private int maxFieldLength;
 
+  /**
+   *
+   * @param directory The directory to write the document information to
+   * @param analyzer The analyzer to use for the document
+   * @param similarity The Similarity function
+   * @param maxFieldLength The maximum number of tokens a field may have
+   */
   DocumentWriter(Directory directory, Analyzer analyzer,
                  Similarity similarity, int maxFieldLength) {
     this.directory = directory;
@@ -86,7 +93,7 @@ final class DocumentWriter {
   }
 
   final void addDocument(String segment, Document doc)
       throws IOException {
     // write field names
     fieldInfos = new FieldInfos();
     fieldInfos.add(doc);
@@ -94,7 +101,7 @@ final class DocumentWriter {
 
     // write field values
     FieldsWriter fieldsWriter =
       new FieldsWriter(directory, segment, fieldInfos);
     try {
       fieldsWriter.addDocument(doc);
     } finally {
@@ -144,7 +151,7 @@ final class DocumentWriter {
 
   // Tokenizes the fields of a document into Postings.
   private final void invertDocument(Document doc)
       throws IOException {
     Enumeration fields = doc.fields();
     while (fields.hasMoreElements()) {
       Field field = (Field) fields.nextElement();
@@ -166,7 +173,7 @@ final class DocumentWriter {
         reader = new StringReader(field.stringValue());
       else
         throw new IllegalArgumentException
           ("field must have either String or Reader value");
 
       // Tokenize field and add to postingTable
       TokenStream stream = analyzer.tokenStream(fieldName, reader);
@@ -277,15 +284,17 @@ final class DocumentWriter {
   }
 
   private final void writePostings(Posting[] postings, String segment)
       throws IOException {
     OutputStream freq = null, prox = null;
     TermInfosWriter tis = null;
 
+    TermVectorsWriter termVectorWriter = null;
     try {
       //open files for inverse index storage
       freq = directory.createFile(segment + ".frq");
       prox = directory.createFile(segment + ".prx");
       tis = new TermInfosWriter(directory, segment, fieldInfos);
       TermInfo ti = new TermInfo();
+      String currentField = null;
 
       for (int i = 0; i < postings.length; i++) {
         Posting posting = postings[i];
@@ -295,38 +304,65 @@ final class DocumentWriter {
         tis.add(posting.term, ti);
 
         // add an entry to the freq file
-        int f = posting.freq;
-        if (f == 1)                               // optimize freq=1
+        int postingFreq = posting.freq;
+        if (postingFreq == 1)                     // optimize freq=1
           freq.writeVInt(1);                      // set low bit of doc num.
         else {
           freq.writeVInt(0);                      // the document number
-          freq.writeVInt(f);                      // frequency in doc
+          freq.writeVInt(postingFreq);            // frequency in doc
         }
 
         int lastPosition = 0;                     // write positions
         int[] positions = posting.positions;
-        for (int j = 0; j < f; j++) {             // use delta-encoding
+        for (int j = 0; j < postingFreq; j++) {   // use delta-encoding
           int position = positions[j];
           prox.writeVInt(position - lastPosition);
           lastPosition = position;
         }
+        // check to see if we switched to a new field
+        String termField = posting.term.field();
+        if (currentField != termField) {
+          // changing field - see if there is something to save
+          currentField = termField;
+          FieldInfo fi = fieldInfos.fieldInfo(currentField);
+          if (fi.storeTermVector) {
+            if (termVectorWriter == null) {
+              termVectorWriter =
+                new TermVectorsWriter(directory, segment, fieldInfos);
+              termVectorWriter.openDocument();
+            }
+            termVectorWriter.openField(currentField);
+          } else if (termVectorWriter != null) {
+            termVectorWriter.closeField();
+          }
+        }
+        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
+          termVectorWriter.addTerm(posting.term.text(), postingFreq);
+        }
       }
+      if (termVectorWriter != null)
+        termVectorWriter.closeDocument();
     } finally {
-      if (freq != null) freq.close();
-      if (prox != null) prox.close();
-      if (tis != null) tis.close();
+      // make an effort to close all streams we can but remember and re-throw
+      // the first exception encountered in this process
+      IOException keep = null;
+      if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
+      if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
+      if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
+      if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
+      if (keep != null) throw (IOException) keep.fillInStackTrace();
     }
   }
 
   private final void writeNorms(Document doc, String segment)
       throws IOException {
     Enumeration fields = doc.fields();
     while (fields.hasMoreElements()) {
       Field field = (Field) fields.nextElement();
       if (field.isIndexed()) {
         int n = fieldInfos.fieldNumber(field.name());
         float norm =
-          fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]);
+          fieldBoosts[n] * similarity.lengthNorm(field.name(), fieldLengths[n]);
         OutputStream norms = directory.createFile(segment + ".f" + n);
         try {
           norms.writeByte(similarity.encodeNorm(norm));
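The reworked finally block in writePostings applies a close-everything, remember-the-first-failure idiom, so one failing close no longer skips the remaining streams. Extracted as a standalone sketch (the helper name and array parameter are illustrative, not part of the diff):

import java.io.IOException;
import org.apache.lucene.store.OutputStream;

final class CloseAllSketch {
  /** Attempt to close every non-null stream; rethrow only the first failure. */
  static void closeAll(OutputStream[] streams) throws IOException {
    IOException keep = null;
    for (int i = 0; i < streams.length; i++) {
      if (streams[i] == null) continue;
      try {
        streams[i].close();
      } catch (IOException e) {
        if (keep == null) keep = e;   // remember the first, keep closing the rest
      }
    }
    if (keep != null) throw (IOException) keep.fillInStackTrace();
  }
}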
org/apache/lucene/index/FieldInfo.java
@@ -59,9 +59,13 @@ final class FieldInfo {
   boolean isIndexed;
   int number;
 
-  FieldInfo(String na, boolean tk, int nu) {
+  // true if term vector for this field should be stored
+  boolean storeTermVector;
+
+  FieldInfo(String na, boolean tk, int nu, boolean storeTermVector) {
     name = na;
     isIndexed = tk;
     number = nu;
+    this.storeTermVector = storeTermVector;
   }
 }
org/apache/lucene/index/FieldInfos.java
@@ -54,11 +54,7 @@ package org.apache.lucene.index;
  * <http://www.apache.org/>.
  */
 
-import java.util.Hashtable;
-import java.util.Vector;
-import java.util.Enumeration;
-import java.util.Collection;
-import java.util.Iterator;
+import java.util.*;
 import java.io.IOException;
 
 import org.apache.lucene.document.Document;
@@ -68,6 +64,12 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.OutputStream;
 import org.apache.lucene.store.InputStream;
 
+/** Access to the Field Info file that describes document fields and whether or
+ *  not they are indexed. Each segment has a separate Field Info file. Objects
+ *  of this class are thread-safe for multiple readers, but only one thread can
+ *  be adding documents at a time, with no other reader or writer threads
+ *  accessing this object.
+ */
 final class FieldInfos {
   private Vector byNumber = new Vector();
   private Hashtable byName = new Hashtable();
@@ -76,6 +78,15 @@ final class FieldInfos {
     add("", false);
   }
 
+  /**
+   * Construct a FieldInfos object using the directory and the name of the file
+   * InputStream
+   * @param d The directory to open the InputStream from
+   * @param name The name of the file to open the InputStream from in the Directory
+   * @throws IOException
+   *
+   * @see #read
+   */
   FieldInfos(Directory d, String name) throws IOException {
     InputStream input = d.openFile(name);
     try {
@@ -86,36 +97,83 @@ final class FieldInfos {
   }
 
   /** Adds field info for a Document. */
-  final void add(Document doc) {
-    Enumeration fields  = doc.fields();
+  public void add(Document doc) {
+    Enumeration fields = doc.fields();
     while (fields.hasMoreElements()) {
-      Field field = (Field)fields.nextElement();
-      add(field.name(), field.isIndexed());
+      Field field = (Field) fields.nextElement();
+      add(field.name(), field.isIndexed(), field.isTermVectorStored());
     }
   }
 
-  final void add(Collection names, boolean isIndexed) {
+  /**
+   * @param names The names of the fields
+   * @param storeTermVectors Whether the fields store term vectors or not
+   */
+  public void addIndexed(Collection names, boolean storeTermVectors) {
     Iterator i = names.iterator();
     int j = 0;
     while (i.hasNext()) {
-      add((String)i.next(), isIndexed);
+      add((String)i.next(), true, storeTermVectors);
     }
   }
 
+  /**
+   * Assumes the field is not storing term vectors
+   * @param names The names of the fields
+   * @param isIndexed Whether the fields are indexed or not
+   *
+   * @see #add(String, boolean)
+   */
+  public void add(Collection names, boolean isIndexed) {
+    Iterator i = names.iterator();
+    int j = 0;
+    while (i.hasNext()) {
+      add((String)i.next(), isIndexed);
+    }
+  }
+
-  final void add(String name, boolean isIndexed) {
-    FieldInfo fi = fieldInfo(name);
-    if (fi == null)
-      addInternal(name, isIndexed);
-    else if (fi.isIndexed != isIndexed)
-      fi.isIndexed = true;
-  }
+  /**
+   * Calls three parameter add with false for the storeTermVector parameter
+   * @param name The name of the Field
+   * @param isIndexed true if the field is indexed
+   * @see #add(String, boolean, boolean)
+   */
+  public void add(String name, boolean isIndexed) {
+    add(name, isIndexed, false);
+  }
 
-  private final void addInternal(String name, boolean isIndexed) {
-    FieldInfo fi = new FieldInfo(name, isIndexed, byNumber.size());
+  /** If the field is not yet known, adds it. If it is known, checks to make
+   *  sure that the isIndexed flag is the same as was given previously for this
+   *  field. If not - marks it as being indexed. Same goes for storeTermVector
+   *
+   * @param name The name of the field
+   * @param isIndexed true if the field is indexed
+   * @param storeTermVector true if the term vector should be stored
+   */
+  public void add(String name, boolean isIndexed, boolean storeTermVector) {
+    FieldInfo fi = fieldInfo(name);
+    if (fi == null) {
+      addInternal(name, isIndexed, storeTermVector);
+    } else {
+      if (fi.isIndexed != isIndexed) {
+        fi.isIndexed = true;                      // once indexed, always index
+      }
+      if (fi.storeTermVector != storeTermVector) {
+        fi.storeTermVector = true;                // once vector, always vector
+      }
+    }
+  }
+
+  private void addInternal(String name, boolean isIndexed,
+                           boolean storeTermVector) {
+    FieldInfo fi =
+      new FieldInfo(name, isIndexed, byNumber.size(), storeTermVector);
     byNumber.addElement(fi);
     byName.put(name, fi);
   }
 
-  final int fieldNumber(String fieldName) {
+  public int fieldNumber(String fieldName) {
     FieldInfo fi = fieldInfo(fieldName);
     if (fi != null)
       return fi.number;
@@ -123,23 +181,32 @@ final class FieldInfos {
     return -1;
   }
 
-  final FieldInfo fieldInfo(String fieldName) {
-    return (FieldInfo)byName.get(fieldName);
+  public FieldInfo fieldInfo(String fieldName) {
+    return (FieldInfo) byName.get(fieldName);
   }
 
-  final String fieldName(int fieldNumber) {
+  public String fieldName(int fieldNumber) {
     return fieldInfo(fieldNumber).name;
   }
 
-  final FieldInfo fieldInfo(int fieldNumber) {
-    return (FieldInfo)byNumber.elementAt(fieldNumber);
+  public FieldInfo fieldInfo(int fieldNumber) {
+    return (FieldInfo) byNumber.elementAt(fieldNumber);
  }
 
-  final int size() {
+  public int size() {
     return byNumber.size();
   }
 
-  final void write(Directory d, String name) throws IOException {
+  public boolean hasVectors() {
+    boolean hasVectors = false;
+    for (int i = 0; i < size(); i++) {
+      if (fieldInfo(i).storeTermVector)
+        hasVectors = true;
+    }
+    return hasVectors;
+  }
+
+  public void write(Directory d, String name) throws IOException {
     OutputStream output = d.createFile(name);
     try {
       write(output);
@@ -148,19 +215,29 @@ final class FieldInfos {
     }
   }
 
-  final void write(OutputStream output) throws IOException {
+  public void write(OutputStream output) throws IOException {
     output.writeVInt(size());
     for (int i = 0; i < size(); i++) {
       FieldInfo fi = fieldInfo(i);
+      byte bits = 0x0;
+      if (fi.isIndexed) bits |= 0x1;
+      if (fi.storeTermVector) bits |= 0x2;
       output.writeString(fi.name);
-      output.writeByte((byte)(fi.isIndexed ? 1 : 0));
+      //Was REMOVE
+      //output.writeByte((byte)(fi.isIndexed ? 1 : 0));
+      output.writeByte(bits);
     }
   }
 
-  private final void read(InputStream input) throws IOException {
-    int size = input.readVInt();
-    for (int i = 0; i < size; i++)
-      addInternal(input.readString().intern(),
-                  input.readByte() != 0);
+  private void read(InputStream input) throws IOException {
+    int size = input.readVInt();  // read in the size
+    for (int i = 0; i < size; i++) {
+      String name = input.readString().intern();
+      byte bits = input.readByte();
+      boolean isIndexed = (bits & 0x1) != 0;
+      boolean storeTermVector = (bits & 0x2) != 0;
+      addInternal(name, isIndexed, storeTermVector);
+    }
   }
 
 }
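The change above alters the .fnm entry format from a single 0/1 isIndexed byte to a flag byte: bit 0x1 marks an indexed field, bit 0x2 a stored term vector. A sketch of the round trip; note that a byte written in the old format (0 or 1) decodes identically under the new scheme, so pre-existing indexes read back as having no vectors:

final class FnmBitsSketch {
  // Pack the per-field flags exactly as FieldInfos.write does above.
  static byte pack(boolean isIndexed, boolean storeTermVector) {
    byte bits = 0x0;
    if (isIndexed) bits |= 0x1;
    if (storeTermVector) bits |= 0x2;
    return bits;
  }

  // Unpack as FieldInfos.read does.
  static boolean isIndexed(byte bits) { return (bits & 0x1) != 0; }
  static boolean storeTermVector(byte bits) { return (bits & 0x2) != 0; }
}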
org/apache/lucene/index/FieldsReader.java
@@ -63,6 +63,7 @@ import org.apache.lucene.document.Field;
 
 /**
  * Class responsible for access to stored document fields.
  *
+ * It uses <segment>.fdt and <segment>.fdx files.
  *
  * @version $Id$
@@ -108,7 +109,7 @@ final class FieldsReader {
                           fieldsStream.readString(), // read value
                           true, // stored
                           fi.isIndexed, // indexed
-                          (bits & 1) != 0)); // tokenized
+                          (bits & 1) != 0, fi.storeTermVector)); // vector
       }
 
     return doc;
org/apache/lucene/index/FilterIndexReader.java
@@ -66,7 +66,7 @@ import org.apache.lucene.document.Document;
 * contained index reader. Subclasses of <code>FilterIndexReader</code> may
 * further override some of these methods and may also provide additional
 * methods and fields.
 */
 public class FilterIndexReader extends IndexReader {
 
   /** Base class for filtering {@link TermDocs} implementations. */
@@ -89,7 +89,7 @@ public class FilterIndexReader extends IndexReader {
 
   /** Base class for filtering {@link TermPositions} implementations. */
   public static class FilterTermPositions
       extends FilterTermDocs implements TermPositions {
 
     public FilterTermPositions(TermPositions in) { super(in); }
 
@@ -118,10 +118,20 @@ public class FilterIndexReader extends IndexReader {
     this.in = in;
   }
 
+  public TermFreqVector[] getTermFreqVectors(int docNumber)
+          throws IOException {
+    return in.getTermFreqVectors(docNumber);
+  }
+
+  public TermFreqVector getTermFreqVector(int docNumber, String field)
+          throws IOException {
+    return in.getTermFreqVector(docNumber, field);
+  }
+
   public int numDocs() { return in.numDocs(); }
   public int maxDoc() { return in.maxDoc(); }
 
-  public Document document(int n) throws IOException {return in.document(n);}
+  public Document document(int n) throws IOException { return in.document(n); }
 
   public boolean isDeleted(int n) { return in.isDeleted(n); }
   public boolean hasDeletions() { return in.hasDeletions(); }
@@ -132,7 +142,7 @@ public class FilterIndexReader extends IndexReader {
     in.norms(f, bytes, offset);
   }
   public void setNorm(int d, String f, byte b) throws IOException {
-    in.setNorm(d,f,b);
+    in.setNorm(d, f, b);
   }
 
   public TermEnum terms() throws IOException { return in.terms(); }
@@ -141,6 +151,7 @@ public class FilterIndexReader extends IndexReader {
   public int docFreq(Term t) throws IOException { return in.docFreq(t); }
 
   public TermDocs termDocs() throws IOException { return in.termDocs(); }
+
   public TermPositions termPositions() throws IOException {
     return in.termPositions();
   }
@@ -151,7 +162,18 @@ public class FilterIndexReader extends IndexReader {
   public Collection getFieldNames() throws IOException {
     return in.getFieldNames();
   }
 
   public Collection getFieldNames(boolean indexed) throws IOException {
     return in.getFieldNames(indexed);
   }
+
+  /**
+   *
+   * @param storedTermVector if true, returns only Indexed fields that have term vector info,
+   *                         else only indexed fields without term vector info
+   * @return Collection of Strings indicating the names of the fields
+   */
+  public Collection getIndexedFieldNames(boolean storedTermVector) {
+    return in.getIndexedFieldNames(storedTermVector);
+  }
 }
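Because FilterIndexReader now forwards the two vector accessors, a subclass can intercept just those calls and inherit delegation for everything else. A hypothetical example (the class name and its behavior are illustrative only, not part of this commit):

import java.io.IOException;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;

/** A filter that hides stored term vectors from its callers while
 *  delegating every other IndexReader operation to the wrapped reader. */
class NoVectorsReader extends FilterIndexReader {
  public NoVectorsReader(IndexReader in) { super(in); }

  public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException {
    return null;                       // pretend no vectorized fields exist
  }

  public TermFreqVector getTermFreqVector(int docNumber, String field)
      throws IOException {
    return null;
  }
}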
org/apache/lucene/index/IndexReader.java
@@ -66,20 +66,20 @@ import org.apache.lucene.document.Field; // for javadoc
 import org.apache.lucene.search.Similarity;
 
 /** IndexReader is an abstract class, providing an interface for accessing an
  index.  Search of an index is done entirely through this abstract interface,
  so that any subclass which implements it is searchable.
 
  <p> Concrete subclasses of IndexReader are usually constructed with a call to
  the static method {@link #open}.
 
  <p> For efficiency, in this API documents are often referred to via
  <i>document numbers</i>, non-negative integers which each name a unique
  document in the index.  These document numbers are ephemeral--they may change
  as documents are added to and deleted from an index.  Clients should thus not
  rely on a given document having the same number between sessions.
 
  @author Doug Cutting
  @version $Id$
  */
 public abstract class IndexReader {
   protected IndexReader(Directory directory) {
@@ -92,21 +92,21 @@ public abstract class IndexReader {
   private Lock writeLock;
   SegmentInfos segmentInfos = null;
   private boolean stale = false;
 
 
   /** Returns an IndexReader reading the index in an FSDirectory in the named
    path. */
   public static IndexReader open(String path) throws IOException {
     return open(FSDirectory.getDirectory(path, false));
   }
 
   /** Returns an IndexReader reading the index in an FSDirectory in the named
    path. */
   public static IndexReader open(File path) throws IOException {
     return open(FSDirectory.getDirectory(path, false));
   }
 
   /** Returns an IndexReader reading the index in the given Directory. */
-  public static IndexReader open(final Directory directory) throws IOException{
+  public static IndexReader open(final Directory directory) throws IOException {
     synchronized (directory) {                    // in- & inter-process sync
       return (IndexReader)new Lock.With(
           directory.makeLock(IndexWriter.COMMIT_LOCK_NAME),
@@ -117,10 +117,10 @@ public abstract class IndexReader {
           if (infos.size() == 1) {                // index is optimized
             return new SegmentReader(infos, infos.info(0), true);
           } else {
             IndexReader[] readers = new IndexReader[infos.size()];
             for (int i = 0; i < infos.size(); i++)
               readers[i] = new SegmentReader(infos, infos.info(i), i==infos.size()-1);
             return new MultiReader(directory, readers);
           }
         }
       }.run();
@@ -174,7 +174,7 @@ public abstract class IndexReader {
   public static long lastModified(Directory directory) throws IOException {
     return directory.fileModified("segments");
   }
 
   /**
    * Reads version number from segments files. The version number counts the
    * number of changes of the index.
@@ -186,7 +186,7 @@ public abstract class IndexReader {
   public static long getCurrentVersion(String directory) throws IOException {
     return getCurrentVersion(new File(directory));
   }
 
   /**
    * Reads version number from segments files. The version number counts the
    * number of changes of the index.
@@ -201,7 +201,7 @@ public abstract class IndexReader {
     dir.close();
     return version;
   }
 
   /**
    * Reads version number from segments files. The version number counts the
    * number of changes of the index.
@@ -214,6 +214,27 @@ public abstract class IndexReader {
     return SegmentInfos.readCurrentVersion(directory);
   }
 
+  /** Return an array of term frequency vectors for the specified document.
+   *  The array contains a vector for each vectorized field in the document.
+   *  Each vector contains term numbers and frequencies for all terms
+   *  in a given vectorized field.
+   *  If no such fields existed, the method returns null.
+   *
+   * @see Field#isTermVectorStored()
+   */
+  abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
+          throws IOException;
+
+  /** Return a term frequency vector for the specified document and field. The
+   *  vector returned contains term numbers and frequencies for all terms in
+   *  the specified field of this document, if the field had storeTermVector
+   *  flag set.  If the flag was not set, the method returns null.
+   *
+   * @see Field#isTermVectorStored()
+   */
+  abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
+          throws IOException;
+
   /**
    * Returns <code>true</code> if an index exists at the specified directory.
    * If the directory does not exist or if there is no index in it.
@@ -250,13 +271,13 @@ public abstract class IndexReader {
   public abstract int numDocs();
 
   /** Returns one greater than the largest possible document number.
    This may be used to, e.g., determine how big to allocate an array which
    will have an element for every document number in an index.
    */
   public abstract int maxDoc();
 
   /** Returns the stored fields of the <code>n</code><sup>th</sup>
      <code>Document</code> in this index. */
   public abstract Document document(int n) throws IOException;
 
   /** Returns true if document <i>n</i> has been deleted */
@@ -264,7 +285,7 @@ public abstract class IndexReader {
 
   /** Returns true if any documents have been deleted */
   public abstract boolean hasDeletions();
 
   /** Returns the byte-encoded normalization factor for the named field of
    * every document.  This is used by the search code to score documents.
    *
@@ -283,14 +304,14 @@ public abstract class IndexReader {
   /** Expert: Resets the normalization factor for the named field of the named
    * document.  The norm represents the product of the field's {@link
    * Field#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
    * int) length normalization}.  Thus, to preserve the length normalization
    * values when resetting this, one should base the new value upon the old.
    *
    * @see #norms(String)
    * @see Similarity#decodeNorm(byte)
    */
   public abstract void setNorm(int doc, String field, byte value)
       throws IOException;
 
   /** Expert: Resets the normalization factor for the named field of the named
    * document.
@@ -299,20 +320,20 @@ public abstract class IndexReader {
    * @see Similarity#decodeNorm(byte)
    */
   public void setNorm(int doc, String field, float value)
       throws IOException {
     setNorm(doc, field, Similarity.encodeNorm(value));
   }
 
 
   /** Returns an enumeration of all the terms in the index.
    The enumeration is ordered by Term.compareTo().  Each term
    is greater than all that precede it in the enumeration.
    */
   public abstract TermEnum terms() throws IOException;
 
   /** Returns an enumeration of all terms after a given term.
    The enumeration is ordered by Term.compareTo().  Each term
    is greater than all that precede it in the enumeration.
    */
   public abstract TermEnum terms(Term t) throws IOException;
 
@@ -320,15 +341,15 @@ public abstract class IndexReader {
   public abstract int docFreq(Term t) throws IOException;
 
   /** Returns an enumeration of all the documents which contain
    <code>term</code>. For each document, the document number, the frequency of
    the term in that document is also provided, for use in search scoring.
    Thus, this method implements the mapping:
    <p><ul>
    Term => <docNum, freq><sup>*</sup>
    </ul>
    <p>The enumeration is ordered by document number.  Each document number
    is greater than all that precede it in the enumeration.
    */
   public TermDocs termDocs(Term term) throws IOException {
     TermDocs termDocs = termDocs();
     termDocs.seek(term);
@@ -339,21 +360,21 @@ public abstract class IndexReader {
   public abstract TermDocs termDocs() throws IOException;
 
   /** Returns an enumeration of all the documents which contain
    <code>term</code>.  For each document, in addition to the document number
    and frequency of the term in that document, a list of all of the ordinal
    positions of the term in the document is available.  Thus, this method
    implements the mapping:
 
    <p><ul>
    Term => <docNum, freq,
    <pos<sub>1</sub>, pos<sub>2</sub>, ...
    pos<sub>freq-1</sub>>
    ><sup>*</sup>
    </ul>
    <p> This positional information facilitates phrase and proximity searching.
    <p>The enumeration is ordered by document number.  Each document number is
    greater than all that precede it in the enumeration.
    */
   public TermPositions termPositions(Term term) throws IOException {
     TermPositions termPositions = termPositions();
     termPositions.seek(term);
@@ -364,16 +385,16 @@ public abstract class IndexReader {
   public abstract TermPositions termPositions() throws IOException;
 
   /** Deletes the document numbered <code>docNum</code>.  Once a document is
    deleted it will not appear in TermDocs or TermPositions enumerations.
    Attempts to read its field with the {@link #document}
    method will result in an error.  The presence of this document may still be
    reflected in the {@link #docFreq} statistic, though
    this will be corrected eventually as the index is further modified.
    */
   public final synchronized void delete(int docNum) throws IOException {
-    if(stale)
+    if (stale)
       throw new IOException("IndexReader out of date and no longer valid for deletion");
 
     if (writeLock == null) {
       Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
       if (!writeLock.obtain(IndexWriter.WRITE_LOCK_TIMEOUT)) // obtain write lock
@@ -382,11 +403,11 @@ public abstract class IndexReader {
 
     // we have to check whether index has changed since this reader was opened.
     // if so, this reader is no longer valid for deletion
-    if(segmentInfos != null && SegmentInfos.readCurrentVersion(directory) > segmentInfos.getVersion()){
+    if (segmentInfos != null && SegmentInfos.readCurrentVersion(directory) > segmentInfos.getVersion()) {
       stale = true;
       this.writeLock.release();
       this.writeLock = null;
       throw new IOException("IndexReader out of date and no longer valid for deletion");
      }
     }
     doDelete(docNum);
@@ -398,14 +419,14 @@ public abstract class IndexReader {
   protected abstract void doDelete(int docNum) throws IOException;
 
   /** Deletes all documents containing <code>term</code>.
    This is useful if one uses a document field to hold a unique ID string for
    the document.  Then to delete such a document, one merely constructs a
    term with the appropriate field and the unique ID string as its text and
    passes it to this method.  Returns the number of documents deleted.
    */
   public final int delete(Term term) throws IOException {
     TermDocs docs = termDocs(term);
-    if ( docs == null ) return 0;
+    if (docs == null) return 0;
     int n = 0;
     try {
       while (docs.next()) {
@@ -444,25 +465,33 @@ public abstract class IndexReader {
       writeLock = null;
     }
   }
 
 
   /**
-   * Returns a list of all unique field names that exist in the index pointed to by
-   * this IndexReader.
+   * Returns a list of all unique field names that exist in the index pointed
+   * to by this IndexReader.
    * @return Collection of Strings indicating the names of the fields
    * @throws IOException if there is a problem with accessing the index
    */
   public abstract Collection getFieldNames() throws IOException;
 
   /**
-   * Returns a list of all unique field names that exist in the index pointed to by
-   * this IndexReader. The boolean argument specifies whether the fields returned
-   * are indexed or not.
+   * Returns a list of all unique field names that exist in the index pointed
+   * to by this IndexReader.  The boolean argument specifies whether the fields
+   * returned are indexed or not.
    * @param indexed <code>true</code> if only indexed fields should be returned;
    *                <code>false</code> if only unindexed fields should be returned.
    * @return Collection of Strings indicating the names of the fields
   * @throws IOException if there is a problem with accessing the index
    */
   public abstract Collection getFieldNames(boolean indexed) throws IOException;
+
+  /**
+   *
+   * @param storedTermVector if true, returns only Indexed fields that have term vector info,
+   *                         else only indexed fields without term vector info
+   * @return Collection of Strings indicating the names of the fields
+   */
+  public abstract Collection getIndexedFieldNames(boolean storedTermVector);
 
   /**
    * Returns <code>true</code> iff the index in the named directory is
@@ -470,12 +499,12 @@ public abstract class IndexReader {
    * @param directory the directory to check for a lock
    * @throws IOException if there is a problem with accessing the index
    */
   public static boolean isLocked(Directory directory) throws IOException {
     return
       directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked() ||
       directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).isLocked();
 
   }
 
   /**
    * Returns <code>true</code> iff the index in the named directory is
@@ -483,19 +512,19 @@ public abstract class IndexReader {
    * @param directory the directory to check for a lock
    * @throws IOException if there is a problem with accessing the index
    */
   public static boolean isLocked(String directory) throws IOException {
     return isLocked(FSDirectory.getDirectory(directory, false));
   }
 
   /**
    * Forcibly unlocks the index in the named directory.
    * <P>
    * Caution: this should only be used by failure recovery code,
    * when it is known that no other process nor thread is in fact
    * currently accessing this index.
    */
   public static void unlock(Directory directory) throws IOException {
     directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release();
     directory.makeLock(IndexWriter.COMMIT_LOCK_NAME).release();
   }
 }
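The new getIndexedFieldNames(boolean) partitions the indexed fields by whether they carry term vectors, so calling it with true and then false enumerates every indexed field exactly once — exactly how SegmentMerger.mergeFields below rebuilds its FieldInfos. A usage sketch (the index path is a placeholder):

import java.util.Collection;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;

public class FieldNamesSketch {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open("/path/to/index");  // placeholder path
    Collection withVectors = reader.getIndexedFieldNames(true);
    Collection withoutVectors = reader.getIndexedFieldNames(false);
    for (Iterator it = withVectors.iterator(); it.hasNext(); )
      System.out.println(it.next() + " (term vector stored)");
    for (Iterator it = withoutVectors.iterator(); it.hasNext(); )
      System.out.println(it.next());
    reader.close();
  }
}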
org/apache/lucene/index/MultiReader.java
@@ -75,7 +75,7 @@ public class MultiReader extends IndexReader {
   private int maxDoc = 0;
   private int numDocs = -1;
   private boolean hasDeletions = false;
 
   /** Construct reading the named set of readers. */
   public MultiReader(IndexReader[] readers) throws IOException {
     this(readers.length == 0 ? null : readers[0].directory(), readers);
@@ -97,6 +97,25 @@ public class MultiReader extends IndexReader {
     starts[readers.length] = maxDoc;
   }
 
+
+  /** Return an array of term frequency vectors for the specified document.
+   *  The array contains a vector for each vectorized field in the document.
+   *  Each vector contains term numbers and frequencies for all terms
+   *  in a given vectorized field.
+   *  If no such fields existed, the method returns null.
+   */
+  public TermFreqVector[] getTermFreqVectors(int n)
+          throws IOException {
+    int i = readerIndex(n);                       // find segment num
+    return readers[i].getTermFreqVectors(n - starts[i]); // dispatch to segment
+  }
+
+  public TermFreqVector getTermFreqVector(int n, String field)
+          throws IOException {
+    int i = readerIndex(n);                       // find segment num
+    return readers[i].getTermFreqVector(n - starts[i], field);
+  }
+
   public synchronized int numDocs() {
     if (numDocs == -1) {                          // check cache
       int n = 0;                                  // cache miss--recompute
@@ -245,6 +264,18 @@ public class MultiReader extends IndexReader {
     }
     return fieldSet;
   }
+
+  public Collection getIndexedFieldNames(boolean storedTermVector) {
+    // maintain a unique set of field names
+    Set fieldSet = new HashSet();
+    for (int i = 0; i < readers.length; i++) {
+      IndexReader reader = readers[i];
+      Collection names = reader.getIndexedFieldNames(storedTermVector);
+      fieldSet.addAll(names);
+    }
+    return fieldSet;
+  }
+
 }
 
 class MultiTermEnum extends TermEnum {
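MultiReader's new vector methods reuse its composite-to-segment mapping: starts[i] is the first composite document number owned by readers[i], starts[readers.length] equals maxDoc, so document n belongs to the last segment whose start is at or before n and maps to local number n - starts[i]. A sketch of that arithmetic (readerIndex itself is private and not shown in this diff; the linear scan here is for clarity and may differ from the actual lookup):

final class ReaderIndexSketch {
  /** Find which sub-reader owns composite doc n, given the starts[] table. */
  static int readerIndex(int n, int[] starts, int numReaders) {
    for (int i = numReaders - 1; i >= 0; i--) {
      if (n >= starts[i]) return i;   // last segment starting at or before n
    }
    throw new IllegalArgumentException("doc number out of range: " + n);
  }
  // Dispatch then looks like: readers[i].getTermFreqVectors(n - starts[i])
}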
@ -61,10 +61,19 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.OutputStream;
|
||||
import org.apache.lucene.store.InputStream;
|
||||
import org.apache.lucene.store.RAMOutputStream;
|
||||
import org.apache.lucene.util.BitVector;
|
||||
|
||||
/**
|
||||
* The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add},
|
||||
* into a single Segment. After adding the appropriate readers, call the merge method to combine the
|
||||
* segments.
|
||||
*<P>
|
||||
* If the compoundFile flag is set, then the segments will be merged into a compound file.
|
||||
*
|
||||
*
|
||||
* @see #merge
|
||||
* @see #add
|
||||
*/
|
||||
final class SegmentMerger {
|
||||
private boolean useCompoundFile;
|
||||
private Directory directory;
|
||||
|
@ -77,51 +86,78 @@ final class SegmentMerger {
|
|||
private static final String COMPOUND_EXTENSIONS[] = new String[] {
|
||||
"fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
|
||||
};
|
||||
|
||||
private static final String VECTOR_EXTENSIONS[] = new String[] {
|
||||
"tvx", "tvd", "tvf"
|
||||
};
|
||||
|
||||
/**
|
||||
*
|
||||
* @param dir The Directory to merge the other segments into
|
||||
* @param name The name of the new segment
|
||||
* @param compoundFile true if the new segment should use a compoundFile
|
||||
*/
|
||||
SegmentMerger(Directory dir, String name, boolean compoundFile) {
|
||||
directory = dir;
|
||||
segment = name;
|
||||
useCompoundFile = compoundFile;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an IndexReader to the collection of readers that are to be merged
|
||||
* @param reader
|
||||
*/
|
||||
final void add(IndexReader reader) {
|
||||
readers.addElement(reader);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param i The index of the reader to return
|
||||
* @return The ith reader to be merged
|
||||
*/
|
||||
final IndexReader segmentReader(int i) {
|
||||
return (IndexReader)readers.elementAt(i);
|
||||
return (IndexReader) readers.elementAt(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges the readers specified by the {@link #add} method into the directory passed to the constructor
|
||||
* @return The number of documents that were merged
|
||||
* @throws IOException
|
||||
*/
|
||||
final int merge() throws IOException {
|
||||
int value;
|
||||
try {
|
||||
value = mergeFields();
|
||||
mergeTerms();
|
||||
mergeNorms();
|
||||
|
||||
if (fieldInfos.hasVectors())
|
||||
mergeVectors();
|
||||
|
||||
} finally {
|
||||
for (int i = 0; i < readers.size(); i++) { // close readers
|
||||
IndexReader reader = (IndexReader)readers.elementAt(i);
|
||||
reader.close();
|
||||
IndexReader reader = (IndexReader) readers.elementAt(i);
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (useCompoundFile)
|
||||
createCompoundFile();
|
||||
createCompoundFile();
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
private final void createCompoundFile()
|
||||
throws IOException {
|
||||
CompoundFileWriter cfsWriter =
|
||||
new CompoundFileWriter(directory, segment + ".cfs");
|
||||
|
||||
ArrayList files =
|
||||
new ArrayList(COMPOUND_EXTENSIONS.length + fieldInfos.size());
|
||||
private final void createCompoundFile()
|
||||
throws IOException {
|
||||
CompoundFileWriter cfsWriter =
|
||||
new CompoundFileWriter(directory, segment + ".cfs");
|
||||
|
||||
ArrayList files =
|
||||
new ArrayList(COMPOUND_EXTENSIONS.length + fieldInfos.size());
|
||||
|
||||
// Basic files
|
||||
for (int i=0; i<COMPOUND_EXTENSIONS.length; i++) {
|
||||
files.add(segment + "." + COMPOUND_EXTENSIONS[i]);
|
||||
for (int i = 0; i < COMPOUND_EXTENSIONS.length; i++) {
|
||||
files.add(segment + "." + COMPOUND_EXTENSIONS[i]);
|
||||
}
|
||||
|
||||
// Field norm files
|
||||
|
@ -132,9 +168,16 @@ final class SegmentMerger {
|
|||
}
|
||||
}
|
||||
|
||||
// Vector files
|
||||
if (fieldInfos.hasVectors()) {
|
||||
for (int i = 0; i < VECTOR_EXTENSIONS.length; i++) {
|
||||
files.add(segment + "." + VECTOR_EXTENSIONS[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Now merge all added files
|
||||
Iterator it = files.iterator();
|
||||
while(it.hasNext()) {
|
||||
while (it.hasNext()) {
|
||||
cfsWriter.addFile((String) it.next());
|
||||
}
|
||||
|
||||
|
@ -143,33 +186,38 @@ final class SegmentMerger {
|
|||
|
||||
// Now delete the source files
|
||||
it = files.iterator();
|
||||
while(it.hasNext()) {
|
||||
while (it.hasNext()) {
|
||||
directory.deleteFile((String) it.next());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @return The number of documents in all of the readers
|
||||
* @throws IOException
|
||||
*/
|
||||
private final int mergeFields() throws IOException {
|
||||
fieldInfos = new FieldInfos(); // merge field names
|
||||
int docCount = 0;
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
IndexReader reader = (IndexReader)readers.elementAt(i);
|
||||
fieldInfos.add(reader.getFieldNames(true), true);
|
||||
IndexReader reader = (IndexReader) readers.elementAt(i);
|
||||
fieldInfos.addIndexed(reader.getIndexedFieldNames(true), true);
|
||||
fieldInfos.addIndexed(reader.getIndexedFieldNames(false), false);
|
||||
fieldInfos.add(reader.getFieldNames(false), false);
|
||||
}
|
||||
fieldInfos.write(directory, segment + ".fnm");
|
||||
|
||||
FieldsWriter fieldsWriter = // merge field values
|
||||
new FieldsWriter(directory, segment, fieldInfos);
|
||||
|
||||
FieldsWriter fieldsWriter = // merge field values
|
||||
new FieldsWriter(directory, segment, fieldInfos);
|
||||
try {
|
||||
for (int i = 0; i < readers.size(); i++) {
|
||||
IndexReader reader = (IndexReader)readers.elementAt(i);
|
||||
int maxDoc = reader.maxDoc();
|
||||
for (int j = 0; j < maxDoc; j++)
|
||||
if (!reader.isDeleted(j)){ // skip deleted docs
|
||||
IndexReader reader = (IndexReader) readers.elementAt(i);
|
||||
int maxDoc = reader.maxDoc();
|
||||
for (int j = 0; j < maxDoc; j++)
|
||||
if (!reader.isDeleted(j)) { // skip deleted docs
|
||||
fieldsWriter.addDocument(reader.document(j));
|
||||
docCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
fieldsWriter.close();
|
||||
|
@ -177,6 +225,50 @@ final class SegmentMerger {
|
|||
return docCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge the TermVectors from each of the segments into the new one.
|
||||
* @throws IOException
|
||||
*/
|
||||
private final void mergeVectors() throws IOException {
|
||||
TermVectorsWriter termVectorsWriter =
|
||||
new TermVectorsWriter(directory, segment, fieldInfos);
|
||||
|
||||
try {
|
||||
for (int r = 0; r < readers.size(); r++) {
|
||||
IndexReader reader = (IndexReader) readers.elementAt(r);
|
||||
int maxDoc = reader.maxDoc();
|
||||
for (int docNum = 0; docNum < maxDoc; docNum++) {
|
||||
// skip deleted docs
|
||||
if (reader.isDeleted(docNum)) {
|
||||
continue;
|
||||
}
|
||||
termVectorsWriter.openDocument();
|
||||
|
||||
// get all term vectors
|
||||
TermFreqVector[] sourceTermVector =
|
||||
reader.getTermFreqVectors(docNum);
|
||||
|
||||
if (sourceTermVector != null) {
|
||||
for (int f = 0; f < sourceTermVector.length; f++) {
|
||||
// translate field numbers
|
||||
TermFreqVector termVector = sourceTermVector[f];
|
||||
termVectorsWriter.openField(termVector.getField());
|
||||
String [] terms = termVector.getTerms();
|
||||
int [] freqs = termVector.getTermFrequencies();
|
||||
|
||||
for (int t = 0; t < terms.length; t++) {
|
||||
termVectorsWriter.addTerm(terms[t], freqs[t]);
|
||||
}
|
||||
}
|
||||
termVectorsWriter.closeDocument();
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
termVectorsWriter.close();
|
||||
}
|
||||
}
|
||||
|
||||
private OutputStream freqOutput = null;
|
||||
private OutputStream proxOutput = null;
|
||||
private TermInfosWriter termInfosWriter = null;
|
||||
|
@@ -187,15 +279,15 @@ final class SegmentMerger {
      freqOutput = directory.createFile(segment + ".frq");
      proxOutput = directory.createFile(segment + ".prx");
      termInfosWriter =
        new TermInfosWriter(directory, segment, fieldInfos);

      mergeTermInfos();

    } finally {
      if (freqOutput != null) freqOutput.close();
      if (proxOutput != null) proxOutput.close();
      if (termInfosWriter != null) termInfosWriter.close();
      if (queue != null) queue.close();
    }
  }

@@ -203,7 +295,7 @@ final class SegmentMerger {
    queue = new SegmentMergeQueue(readers.size());
    int base = 0;
    for (int i = 0; i < readers.size(); i++) {
      IndexReader reader = (IndexReader) readers.elementAt(i);
      TermEnum termEnum = reader.terms();
      SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
      base += reader.numDocs();

@@ -214,20 +306,20 @@ final class SegmentMerger {
    }

    SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];

    while (queue.size() > 0) {
      int matchSize = 0;                        // pop matching terms
      match[matchSize++] = (SegmentMergeInfo) queue.pop();
      Term term = match[0].term;
      SegmentMergeInfo top = (SegmentMergeInfo) queue.top();

      while (top != null && term.compareTo(top.term) == 0) {
        match[matchSize++] = (SegmentMergeInfo) queue.pop();
        top = (SegmentMergeInfo) queue.top();
      }

      mergeTermInfo(match, matchSize);          // add new TermInfo

      while (matchSize > 0) {
        SegmentMergeInfo smi = match[--matchSize];
        if (smi.next())

@@ -240,8 +332,15 @@ final class SegmentMerger {

  private final TermInfo termInfo = new TermInfo(); // minimize consing

  /** Merge one term found in one or more segments. The array <code>smis</code>
   *  contains segments that are positioned at the same term. <code>n</code>
   *  is the number of cells in the array actually occupied.
   *
   * @param smis array of segments
   * @param n number of cells in the array actually occupied
   */
  private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
      throws IOException {
    long freqPointer = freqOutput.getFilePointer();
    long proxPointer = proxOutput.getFilePointer();

@@ -251,13 +350,21 @@ final class SegmentMerger {

    if (df > 0) {
      // add an entry to the dictionary with pointers to prox and freq files
      termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
      termInfosWriter.add(smis[0].term, termInfo);
    }
  }

  /** Process postings from multiple segments all positioned on the
   *  same term. Writes out merged entries into the freqOutput and
   *  proxOutput streams.
   *
   * @param smis array of segments
   * @param n number of cells in the array actually occupied
   * @return number of documents across all segments where this term was found
   */
  private final int appendPostings(SegmentMergeInfo[] smis, int n)
      throws IOException {
    final int skipInterval = termInfosWriter.skipInterval;
    int lastDoc = 0;
    int df = 0;                                 // number of docs w/ term

@@ -285,7 +392,7 @@ final class SegmentMerger {

        int docCode = (doc - lastDoc) << 1;     // use low bit to flag freq=1
        lastDoc = doc;

        int freq = postings.freq();
        if (freq == 1) {
          freqOutput.writeVInt(docCode | 1);    // write doc & freq=1

@@ -293,10 +400,10 @@ final class SegmentMerger {
          freqOutput.writeVInt(docCode);        // write doc
          freqOutput.writeVInt(freq);           // write frequency in doc
        }

        int lastPosition = 0;                   // write position deltas
        for (int j = 0; j < freq; j++) {
          int position = postings.nextPosition();
          proxOutput.writeVInt(position - lastPosition);
          lastPosition = position;
        }

@@ -321,9 +428,9 @@ final class SegmentMerger {
    long freqPointer = freqOutput.getFilePointer();
    long proxPointer = proxOutput.getFilePointer();

    skipBuffer.writeVInt(doc - lastSkipDoc);
    skipBuffer.writeVInt((int) (freqPointer - lastSkipFreqPointer));
    skipBuffer.writeVInt((int) (proxPointer - lastSkipProxPointer));

    lastSkipDoc = doc;
    lastSkipFreqPointer = freqPointer;

@@ -340,22 +447,22 @@ final class SegmentMerger {
    for (int i = 0; i < fieldInfos.size(); i++) {
      FieldInfo fi = fieldInfos.fieldInfo(i);
      if (fi.isIndexed) {
        OutputStream output = directory.createFile(segment + ".f" + i);
        try {
          for (int j = 0; j < readers.size(); j++) {
            IndexReader reader = (IndexReader) readers.elementAt(j);
            byte[] input = reader.norms(fi.name);
            int maxDoc = reader.maxDoc();
            for (int k = 0; k < maxDoc; k++) {
              byte norm = input != null ? input[k] : (byte) 0;
              if (!reader.isDeleted(k)) {
                output.writeByte(norm);
              }
            }
          }
        } finally {
          output.close();
        }
      }
    }
  }
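A minimal sketch of the same per-document copy done through the addVectors() convenience on TermVectorsWriter (added later in this change), assuming an open reader and writer and a non-deleted docNum; mergeVectors() above does the equivalent term by term:

    termVectorsWriter.openDocument();
    TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
    if (vectors != null)
      termVectorsWriter.addVectors(vectors);  // opens and closes each field internally
    termVectorsWriter.closeDocument();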
@@ -82,6 +82,7 @@ final class SegmentReader extends IndexReader {
  private FieldsReader fieldsReader;

  TermInfosReader tis;
  TermVectorsReader termVectorsReader;

  BitVector deletedDocs = null;
  private boolean deletedDocsDirty = false;

@@ -109,21 +110,22 @@ final class SegmentReader extends IndexReader {
        out.close();
      }
      String fileName = segment + ".f" + fieldInfos.fieldNumber(name);
      directory().renameFile(segment + ".tmp", fileName);
      this.dirty = false;
    }
  }

  private Hashtable norms = new Hashtable();

  SegmentReader(SegmentInfos sis, SegmentInfo si, boolean closeDir)
      throws IOException {
    this(si);
    closeDirectory = closeDir;
    segmentInfos = sis;
  }

  SegmentReader(SegmentInfo si)
      throws IOException {
    super(si.dir);
    segment = si.name;

@@ -149,13 +151,17 @@ final class SegmentReader extends IndexReader {
    freqStream = cfsDir.openFile(segment + ".frq");
    proxStream = cfsDir.openFile(segment + ".prx");
    openNorms(cfsDir);

    if (fieldInfos.hasVectors()) { // open term vector files only as needed
      termVectorsReader = new TermVectorsReader(cfsDir, segment, fieldInfos);
    }
  }

  protected final synchronized void doClose() throws IOException {
    if (deletedDocsDirty || normsDirty) {
      synchronized (directory()) {              // in- & inter-process sync
        new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
                      IndexWriter.COMMIT_LOCK_TIMEOUT) {
          public Object doBody() throws IOException {

            if (deletedDocsDirty) {             // re-write deleted

@@ -164,18 +170,18 @@ final class SegmentReader extends IndexReader {
            }

            if (normsDirty) {                   // re-write norms
              Enumeration keys = norms.keys();
              Enumeration values = norms.elements();
              while (values.hasMoreElements()) {
                String field = (String) keys.nextElement();
                Norm norm = (Norm) values.nextElement();
                if (norm.dirty) {
                  norm.reWrite(field);
                }
              }
            }

            if (segmentInfos != null)
              segmentInfos.write(directory());
            else
              directory().touchFile("segments");

@@ -196,6 +202,7 @@ final class SegmentReader extends IndexReader {
    proxStream.close();

    closeNorms();
    if (termVectorsReader != null) termVectorsReader.close();

    if (cfsReader != null)
      cfsReader.close();

@@ -212,6 +219,7 @@ final class SegmentReader extends IndexReader {
    return deletedDocs != null;
  }


  static final boolean usesCompoundFile(SegmentInfo si) throws IOException {
    return si.dir.fileExists(si.name + ".cfs");
  }

@@ -226,7 +234,7 @@ final class SegmentReader extends IndexReader {
  public synchronized void undeleteAll() throws IOException {
    synchronized (directory()) {                // in- & inter-process sync
      new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
                    IndexWriter.COMMIT_LOCK_TIMEOUT) {
        public Object doBody() throws IOException {
          if (directory().fileExists(segment + ".del")) {
            directory().deleteFile(segment + ".del");

@@ -242,11 +250,11 @@ final class SegmentReader extends IndexReader {

  final Vector files() throws IOException {
    Vector files = new Vector(16);
    final String ext[] = new String[]{
      "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
      "tvx", "tvd", "tvf", "tvp" };

    for (int i = 0; i < ext.length; i++) {
      String name = segment + "." + ext[i];
      if (directory().fileExists(name))
        files.addElement(name);

@@ -271,7 +279,7 @@ final class SegmentReader extends IndexReader {
  public final synchronized Document document(int n) throws IOException {
    if (isDeleted(n))
      throw new IllegalArgumentException
              ("attempt to access a deleted document");
    return fieldsReader.doc(n);
  }

@@ -329,12 +337,31 @@ final class SegmentReader extends IndexReader {
      FieldInfo fi = fieldInfos.fieldInfo(i);
      if (fi.isIndexed == indexed)
        fieldSet.add(fi.name);
    }
    return fieldSet;
  }

  /**
   * @param storedTermVector if true, returns only indexed fields that also have
   *        term vector info; else, only indexed fields without term vector info
   * @return Collection of Strings indicating the names of the fields
   */
  public Collection getIndexedFieldNames(boolean storedTermVector) {
    // maintain a unique set of field names
    Set fieldSet = new HashSet();
    for (int i = 0; i < fieldInfos.size(); i++) {
      FieldInfo fi = fieldInfos.fieldInfo(i);
      if (fi.isIndexed == true && fi.storeTermVector == storedTermVector) {
        fieldSet.add(fi.name);
      }
    }
    return fieldSet;
  }

  public synchronized byte[] norms(String field) throws IOException {
    Norm norm = (Norm) norms.get(field);
    if (norm == null)                           // not an indexed field
      return null;
    if (norm.bytes == null) {                   // value not yet read

@@ -346,8 +373,8 @@ final class SegmentReader extends IndexReader {
  }

  public synchronized void setNorm(int doc, String field, byte value)
      throws IOException {
    Norm norm = (Norm) norms.get(field);
    if (norm == null)                           // not an indexed field
      return;
    norm.dirty = true;                          // mark it dirty

@@ -360,7 +387,7 @@ final class SegmentReader extends IndexReader {
  public synchronized void norms(String field, byte[] bytes, int offset)
      throws IOException {

    Norm norm = (Norm) norms.get(field);
    if (norm == null)
      return;                                   // use zeros in array

@@ -369,7 +396,7 @@ final class SegmentReader extends IndexReader {
      return;
    }

    InputStream normStream = (InputStream) norm.in.clone();
    try {                                       // read from disk
      normStream.seek(0);
      normStream.readBytes(bytes, offset, maxDoc());

@@ -392,11 +419,40 @@ final class SegmentReader extends IndexReader {

  private final void closeNorms() throws IOException {
    synchronized (norms) {
      Enumeration enumerator = norms.elements();
      while (enumerator.hasMoreElements()) {
        Norm norm = (Norm) enumerator.nextElement();
        norm.in.close();
      }
    }
  }

  /** Return a term frequency vector for the specified document and field. The
   *  vector returned contains terms and frequencies for all terms in
   *  the specified field of this document, if the field had the storeTermVector
   *  flag set. If the flag was not set, the method returns null.
   */
  public TermFreqVector getTermFreqVector(int docNumber, String field)
      throws IOException {
    // Check if this field is invalid or has no stored term vector
    FieldInfo fi = fieldInfos.fieldInfo(field);
    if (fi == null || !fi.storeTermVector) return null;

    return termVectorsReader.get(docNumber, field);
  }

  /** Return an array of term frequency vectors for the specified document.
   *  The array contains a vector for each vectorized field in the document.
   *  Each vector contains terms and frequencies for all terms
   *  in a given vectorized field.
   *  If no such fields existed, the method returns null.
   */
  public TermFreqVector[] getTermFreqVectors(int docNumber)
      throws IOException {
    if (termVectorsReader == null)
      return null;

    return termVectorsReader.get(docNumber);
  }
}
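A minimal sketch of reading a stored vector back through the reader API implemented above (assuming the accessor is also exposed on IndexReader); the index path and field name are hypothetical, and the field must have been indexed with storeTermVector set:

    IndexReader reader = IndexReader.open("/path/to/index");   // hypothetical path
    TermFreqVector vector = reader.getTermFreqVector(0, "body");
    if (vector != null) {                     // null when no vector was stored
      String[] terms = vector.getTerms();
      int[] freqs = vector.getTermFrequencies();
      for (int i = 0; i < terms.length; i++)
        System.out.println(terms[i] + "/" + freqs[i]);
    }
    reader.close();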
@@ -77,27 +77,27 @@ class SegmentTermDocs implements TermDocs {
  private boolean haveSkipped;

  SegmentTermDocs(SegmentReader parent)
      throws IOException {
    this.parent = parent;
    this.freqStream = (InputStream) parent.freqStream.clone();
    this.deletedDocs = parent.deletedDocs;
    this.skipInterval = parent.tis.getSkipInterval();
  }

  public void seek(Term term) throws IOException {
    TermInfo ti = parent.tis.get(term);
    seek(ti);
  }

  public void seek(TermEnum enum) throws IOException {
    TermInfo ti;
    if (enum instanceof SegmentTermEnum)        // optimized case
      ti = ((SegmentTermEnum) enum).termInfo();
    else                                        // punt case
      ti = parent.tis.get(enum.term());
    seek(ti);
  }

  void seek(TermInfo ti) throws IOException {
    count = 0;
    if (ti == null) {

@@ -114,7 +114,7 @@ class SegmentTermDocs implements TermDocs {
    haveSkipped = false;
  }

  public void close() throws IOException {
    freqStream.close();
  }

@@ -128,19 +128,19 @@ class SegmentTermDocs implements TermDocs {
  public boolean next() throws IOException {
    while (true) {
      if (count == df)
        return false;

      int docCode = freqStream.readVInt();
      doc += docCode >>> 1;                     // shift off low bit
      if ((docCode & 1) != 0)                   // if low bit is set
        freq = 1;                               // freq is one
      else
        freq = freqStream.readVInt();           // else read freq

      count++;

      if (deletedDocs == null || !deletedDocs.get(doc))
        break;
      skippingDoc();
    }
    return true;

@@ -148,7 +148,7 @@ class SegmentTermDocs implements TermDocs {

  /** Optimized implementation. */
  public int read(final int[] docs, final int[] freqs)
      throws IOException {
    final int length = docs.length;
    int i = 0;
    while (i < length && count < df) {

@@ -157,17 +157,17 @@ class SegmentTermDocs implements TermDocs {
      final int docCode = freqStream.readVInt();
      doc += docCode >>> 1;                     // shift off low bit
      if ((docCode & 1) != 0)                   // if low bit is set
        freq = 1;                               // freq is one
      else
        freq = freqStream.readVInt();           // else read freq
      count++;

      if (deletedDocs == null || !deletedDocs.get(doc)) {
        docs[i] = doc;
        freqs[i] = freq;
        ++i;
      }
    }
    return i;
  }

@@ -179,7 +179,7 @@ class SegmentTermDocs implements TermDocs {
    if (df > skipInterval) {                    // optimized case

      if (skipStream == null)
        skipStream = (InputStream) freqStream.clone(); // lazily clone

      if (!haveSkipped) {                       // lazily seek skip stream
        skipStream.seek(skipPointer);

@@ -190,8 +190,8 @@ class SegmentTermDocs implements TermDocs {
      int lastSkipDoc = skipDoc;
      long lastFreqPointer = freqStream.getFilePointer();
      long lastProxPointer = -1;
      int numSkipped = -1 - (count % skipInterval);

      while (target > skipDoc) {
        lastSkipDoc = skipDoc;
        lastFreqPointer = freqPointer;

@@ -205,7 +205,7 @@ class SegmentTermDocs implements TermDocs {
        skipDoc += skipStream.readVInt();
        freqPointer += skipStream.readVInt();
        proxPointer += skipStream.readVInt();

        skipCount++;
      }

@@ -213,7 +213,7 @@ class SegmentTermDocs implements TermDocs {
      if (lastFreqPointer > freqStream.getFilePointer()) {
        freqStream.seek(lastFreqPointer);
        skipProx(lastProxPointer);

        doc = lastSkipDoc;
        count += numSkipped;
      }

@@ -223,7 +223,7 @@ class SegmentTermDocs implements TermDocs {
    // done skipping, now just scan
    do {
      if (!next())
        return false;
    } while (target > doc);
    return true;
  }
@@ -76,9 +76,9 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
  private char[] buffer = {};

  SegmentTermEnum(InputStream i, FieldInfos fis, boolean isi)
      throws IOException {
    input = i;
    fieldInfos = fis;
    isIndex = isi;

    int firstInt = input.readInt();

@@ -98,24 +98,24 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
      // check that it is a format we can understand
      if (format < TermInfosWriter.FORMAT)
        throw new IOException("Unknown format version:" + format);

      size = input.readLong();                  // read the size

      if (!isIndex) {
        indexInterval = input.readInt();
        skipInterval = input.readInt();
      }
    }
  }

  protected Object clone() {
    SegmentTermEnum clone = null;
    try {
      clone = (SegmentTermEnum) super.clone();
    } catch (CloneNotSupportedException e) {}

    clone.input = (InputStream) input.clone();
    clone.termInfo = new TermInfo(termInfo);
    if (term != null) clone.growBuffer(term.text.length());

@@ -123,7 +123,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
  }

  final void seek(long pointer, int p, Term t, TermInfo ti)
      throws IOException {
    input.seek(pointer);
    position = p;
    term = t;

@@ -134,7 +134,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {

  /** Increments the enumeration to the next element.  True if one exists.*/
  public final boolean next() throws IOException {
    if (position++ >= size - 1) {
      term = null;
      return false;
    }

@@ -145,7 +145,7 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
    termInfo.docFreq = input.readVInt();        // read doc freq
    termInfo.freqPointer += input.readVLong();  // read freq pointer
    termInfo.proxPointer += input.readVLong();  // read prox pointer

    if (!isIndex) {
      if (termInfo.docFreq > skipInterval) {
        termInfo.skipOffset = input.readVInt();

@@ -164,10 +164,10 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
    int totalLength = start + length;
    if (buffer.length < totalLength)
      growBuffer(totalLength);

    input.readChars(buffer, start, length);
    return new Term(fieldInfos.fieldName(input.readVInt()),
                    new String(buffer, 0, totalLength), false);
  }

  private final void growBuffer(int length) {

@@ -177,25 +177,25 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
  }

  /** Returns the current Term in the enumeration.
      Initially invalid, valid after next() called for the first time.*/
  public final Term term() {
    return term;
  }

  /** Returns the current TermInfo in the enumeration.
      Initially invalid, valid after next() called for the first time.*/
  final TermInfo termInfo() {
    return new TermInfo(termInfo);
  }

  /** Sets the argument to the current TermInfo in the enumeration.
      Initially invalid, valid after next() called for the first time.*/
  final void termInfo(TermInfo ti) {
    ti.set(termInfo);
  }

  /** Returns the docFreq from the current TermInfo in the enumeration.
      Initially invalid, valid after next() called for the first time.*/
  public final int docFreq() {
    return termInfo.docFreq;
  }
@@ -106,7 +106,7 @@ extends SegmentTermDocs implements TermPositions {

  public final int read(final int[] docs, final int[] freqs)
      throws IOException {
    throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead.");
  }
@@ -0,0 +1,117 @@
package org.apache.lucene.index;

import java.io.IOException;
import java.util.*;

/**
 */
class SegmentTermVector implements TermFreqVector {
  private String field;
  private String terms[];
  private int termFreqs[];

  SegmentTermVector(String field, String terms[], int termFreqs[]) {
    this.field = field;
    this.terms = terms;
    this.termFreqs = termFreqs;
  }

  /**
   * @return The name of the field this vector is associated with
   */
  public String getField() {
    return field;
  }

  public String toString() {
    StringBuffer sb = new StringBuffer();
    sb.append('{');
    sb.append(field).append(": ");
    for (int i = 0; i < terms.length; i++) {
      if (i > 0) sb.append(", ");
      sb.append(terms[i]).append('/').append(termFreqs[i]);
    }
    sb.append('}');
    return sb.toString();
  }

  public String toString(IndexReader ir)
      throws IOException {
    return toString();
    /* TODO: reimplement
    StringBuffer sb = new StringBuffer();

    sb.append('{');
    sb.append(field).append(": ");
    for (int i = 0; i < terms.length; i++) {
      if (i > 0) sb.append(", ");
      Term t = ir.getTerm(terms[i]);
      String text = t == null ? "UNKNOWN(" + i + ")" : t.text;
      sb.append(text).append('/').append(termFreqs[i]);
      if (termProx != null) appendTermProx(sb.append('/'), termProx[i]);
    }
    sb.append('}');
    return sb.toString(); */
  }

  /** Number of terms in the term vector. If there are no terms in the
   *  vector, returns 0.
   */
  public int size() {
    return terms == null ? 0 : terms.length;
  }

  /** Array of term texts in ascending order. If there are no terms in
   *  the vector, returns null.
   */
  public String [] getTerms() {
    return terms;
  }

  /** Array of term frequencies. Locations of the array correspond one to one
   *  to the terms in the array obtained from the <code>getTerms</code>
   *  method. Each location in the array contains the number of times this
   *  term occurs in the document or the document field. If there are no terms in
   *  the vector, returns null.
   */
  public int[] getTermFrequencies() {
    return termFreqs;
  }

  /** Return the index in the array returned from <code>getTerms</code>
   *  at which the term with the specified text appears. If this
   *  term does not appear in the array, return -1.
   */
  public int indexOf(String termText) {
    int res = Arrays.binarySearch(terms, termText);
    return res >= 0 ? res : -1;
  }

  /** Just like <code>indexOf(String)</code> but searches for a number of terms
   *  at the same time. Returns an array that has the same size as the number
   *  of terms searched for, each slot containing the result of searching for
   *  that term. The array of terms must be sorted in ascending order.
   *
   * @param termNumbers array containing the terms to look for
   * @param start index in the array where the list of terms starts
   * @param len the number of terms in the list
   */
  public int[] indexesOf(String [] termNumbers, int start, int len) {
    // TODO: there must be a more efficient way of doing this.
    // At least, we could advance the lower bound of the terms array
    // as we find valid indexes. Also, it might be possible to leverage
    // this even more by starting in the middle of the termNumbers array
    // and thus dividing the terms array maybe in half with each found index.
    int res[] = new int[len];

    for (int i = 0; i < len; i++) {
      res[i] = indexOf(termNumbers[start + i]); // honor the start offset
    }
    return res;
  }
}
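Since indexOf() relies on Arrays.binarySearch, lookups assume the terms array is sorted; readTermVector guarantees this because terms are stored in increasing term order. A small illustration with hypothetical values (from within the package, since the class is package-private):

    String[] terms = {"apple", "banana", "cherry"};
    int[] freqs = {2, 1, 5};
    SegmentTermVector tv = new SegmentTermVector("body", terms, freqs);
    int at = tv.indexOf("banana");                                     // 1
    int[] hits = tv.indexesOf(new String[] {"apple", "durian"}, 0, 2); // {0, -1}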
@@ -57,13 +57,13 @@ package org.apache.lucene.index;
import java.io.IOException;

/** TermDocs provides an interface for enumerating <document, frequency>
  pairs for a term.  <p> The document portion names each document containing
  the term.  Documents are indicated by number.  The frequency portion gives
  the number of times the term occurred in each document.  <p> The pairs are
  ordered by document number.

  @see IndexReader#termDocs
  */

public interface TermDocs {
  /** Sets this to the data for a term.

@@ -77,15 +77,15 @@ public interface TermDocs {
  void seek(TermEnum termEnum) throws IOException;

  /** Returns the current document number.  <p> This is invalid until {@link
      #next()} is called for the first time.*/
  int doc();

  /** Returns the frequency of the term within the current document.  <p> This
      is invalid until {@link #next()} is called for the first time.*/
  int freq();

  /** Moves to the next pair in the enumeration.  <p> Returns true iff there is
      such a next pair in the enumeration. */
  boolean next() throws IOException;

  /** Attempts to read multiple entries from the enumeration, up to length of
@@ -73,4 +73,27 @@ public abstract class TermEnum {

  /** Closes the enumeration to further activity, freeing resources. */
  public abstract void close() throws IOException;

  // Term Vector support

  /** Skips terms to the first beyond the current whose value is
   *  greater or equal to <i>target</i>. <p>Returns true iff there is such
   *  an entry.  <p>Behaves as if written: <pre>
   *    public boolean skipTo(Term target) {
   *      do {
   *        if (!next())
   *          return false;
   *      } while (target > term());
   *      return true;
   *    }
   *  </pre>
   *  Some implementations are considerably more efficient than that.
   */
  public boolean skipTo(Term target) throws IOException {
    do {
      if (!next())
        return false;
    } while (target.compareTo(term()) > 0);
    return true;
  }
}
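A minimal sketch of skipTo() in use, assuming an open IndexReader; the field and target text are hypothetical:

    TermEnum terms = reader.terms();
    if (terms.skipTo(new Term("body", "luc"))) {
      do {                                    // scan from the first term >= target
        System.out.println(terms.term() + ", docFreq=" + terms.docFreq());
      } while (terms.next() && terms.term().field().equals("body"));
    }
    terms.close();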
@@ -0,0 +1,64 @@
package org.apache.lucene.index;

import java.io.IOException;

/** Provides access to the stored term vector of
 *  a document field.
 */
public interface TermFreqVector {
  /**
   * @return The field this vector is associated with.
   */
  public String getField();

  /**
   * @return The number of terms in the term vector.
   */
  public int size();

  /**
   * @return An Array of term texts in ascending order.
   */
  public String[] getTerms();

  /** Array of term frequencies. Locations of the array correspond one to one
   *  to the terms in the array obtained from the <code>getTerms</code>
   *  method. Each location in the array contains the number of times this
   *  term occurs in the document or the document field.
   */
  public int[] getTermFrequencies();

  /** Return a string representation of the vector.
   */
  public String toString();

  /** Return a string representation of the vector, but use the provided IndexReader
   *  to obtain text for each term and include the text instead of term numbers.
   */
  public String toString(IndexReader ir) throws IOException;

  /** Return the index in the array returned from <code>getTerms</code>
   *  at which the specified term appears. If the
   *  term does not appear in the array, return -1.
   */
  public int indexOf(String term);

  /** Just like <code>indexOf(String)</code> but searches for a number of terms
   *  at the same time. Returns an array that has the same size as the number
   *  of terms searched for, each slot containing the result of searching for
   *  that term.
   *
   * @param terms array containing terms to look for
   * @param start index in the array where the list of terms starts
   * @param len the number of terms in the list
   */
  public int[] indexesOf(String[] terms, int start, int len);
}
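Because the interface exposes parallel term and frequency arrays plus indexOf(), similarity measures can be computed directly against it. A minimal cosine-similarity sketch (a helper, not part of this commit), assuming both vectors are non-null:

    static double cosine(TermFreqVector a, TermFreqVector b) {
      long dot = 0, normA = 0, normB = 0;
      String[] ta = a.getTerms();
      int[] fa = a.getTermFrequencies();
      int[] fb = b.getTermFrequencies();
      for (int i = 0; i < ta.length; i++) {
        normA += (long) fa[i] * fa[i];
        int j = b.indexOf(ta[i]);             // -1 if the term is absent from b
        if (j >= 0) dot += (long) fa[i] * fb[j];
      }
      for (int j = 0; j < fb.length; j++)
        normB += (long) fb[j] * fb[j];
      return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }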
@@ -57,6 +57,7 @@ package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.StringHelper;

/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
  Directory.  A TermInfos can be written once, in order.  */

@@ -156,10 +157,10 @@ final class TermInfosWriter {
    lastTi.set(ti);
    size++;
  }

  private final void writeTerm(Term term)
      throws IOException {
    int start = StringHelper.stringDifference(lastTerm.text, term.text);
    int length = term.text.length() - start;

    output.writeVInt(start);                    // write shared prefix length

@@ -171,15 +172,7 @@ final class TermInfosWriter {
    lastTerm = term;
  }

  /** Called to complete TermInfos creation. */
  final void close() throws IOException {

@@ -190,4 +183,5 @@ final class TermInfosWriter {
    if (!isIndex)
      other.close();
  }
}
@@ -0,0 +1,13 @@
package org.apache.lucene.index;

/** Extends <code>TermFreqVector</code> to provide additional information about
 *  positions in which each of the terms is found.
 */
public interface TermPositionVector extends TermFreqVector {

  /** Returns an array of positions in which the term is found.
   *  Terms are identified by the index at which the term appears in the
   *  array obtained from the <code>getTerms</code> method.
   */
  public int[] getTermPositions(int index);
}
@@ -0,0 +1,221 @@
package org.apache.lucene.index;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.InputStream;

import java.io.IOException;

/** TODO: relax synchro!
 */
class TermVectorsReader {
  private FieldInfos fieldInfos;

  private InputStream tvx;
  private InputStream tvd;
  private InputStream tvf;
  private int size;

  TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)
      throws IOException {
    if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
      tvx = d.openFile(segment + TermVectorsWriter.TVX_EXTENSION);
      checkValidFormat(tvx);
      tvd = d.openFile(segment + TermVectorsWriter.TVD_EXTENSION);
      checkValidFormat(tvd);
      tvf = d.openFile(segment + TermVectorsWriter.TVF_EXTENSION);
      checkValidFormat(tvf);
      size = (int) tvx.length() / 8;
    }

    this.fieldInfos = fieldInfos;
  }

  private void checkValidFormat(InputStream in) throws IOException {
    int format = in.readInt();
    if (format > TermVectorsWriter.FORMAT_VERSION) {
      throw new IOException("Incompatible format version: " + format + " expected "
                            + TermVectorsWriter.FORMAT_VERSION + " or less");
    }
  }

  synchronized void close() throws IOException {
    // why don't we trap the exception and at least make sure that
    // all streams that we can close are closed?
    if (tvx != null) tvx.close();
    if (tvd != null) tvd.close();
    if (tvf != null) tvf.close();
  }

  /**
   * @return The number of documents in the reader
   */
  int size() {
    return size;
  }

  /**
   * Retrieve the term vector for the given document and field
   * @param docNum The document number to retrieve the vector for
   * @param field The field within the document to retrieve
   * @return The TermFreqVector for the document and field or null
   */
  synchronized TermFreqVector get(int docNum, String field) {
    // Check if no term vectors are available for this segment at all
    int fieldNumber = fieldInfos.fieldNumber(field);
    TermFreqVector result = null;
    if (tvx != null) {
      try {
        // We need to account for the FORMAT_SIZE when seeking in the tvx.
        // We don't need to do this in other seeks because we already have the
        // file pointer that was written in another file.
        tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
        //System.out.println("TVX Pointer: " + tvx.getFilePointer());
        long position = tvx.readLong();

        tvd.seek(position);
        int fieldCount = tvd.readVInt();
        //System.out.println("Num Fields: " + fieldCount);
        // There are only a few fields per document. We opt for a full scan
        // rather than requiring that they be ordered. We need to read through
        // all of the fields anyway to get to the tvf pointers.
        int number = 0;
        int found = -1;
        for (int i = 0; i < fieldCount; i++) {
          number += tvd.readVInt();
          if (number == fieldNumber) found = i;
        }

        // This field, although valid in the segment, was not found in this document
        if (found != -1) {
          // Compute position in the tvf file
          position = 0;
          for (int i = 0; i <= found; i++) {
            position += tvd.readVLong();
          }
          result = readTermVector(field, position);
        } else {
          //System.out.println("Field not found");
        }
      } catch (Exception e) {
        //e.printStackTrace();
      }
    } else {
      System.out.println("No tvx file");
    }
    return result;
  }

  /** Return all term vectors stored for this document, or null if they could not be read in. */
  synchronized TermFreqVector[] get(int docNum) {
    TermFreqVector[] result = null;
    // Check if no term vectors are available for this segment at all
    if (tvx != null) {
      try {
        // We need to offset by the FORMAT_SIZE here as well
        tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
        long position = tvx.readLong();

        tvd.seek(position);
        int fieldCount = tvd.readVInt();

        // No fields are vectorized for this document
        if (fieldCount != 0) {
          int number = 0;
          String[] fields = new String[fieldCount];

          for (int i = 0; i < fieldCount; i++) {
            number += tvd.readVInt();
            fields[i] = fieldInfos.fieldName(number);
          }

          // Compute position in the tvf file
          position = 0;
          long[] tvfPointers = new long[fieldCount];
          for (int i = 0; i < fieldCount; i++) {
            position += tvd.readVLong();
            tvfPointers[i] = position;
          }

          result = readTermVectors(fields, tvfPointers);
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    } else {
      System.out.println("No tvx file");
    }
    return result;
  }

  private SegmentTermVector[] readTermVectors(String fields[], long tvfPointers[])
      throws IOException {
    SegmentTermVector res[] = new SegmentTermVector[fields.length];
    for (int i = 0; i < fields.length; i++) {
      res[i] = readTermVector(fields[i], tvfPointers[i]);
    }
    return res;
  }

  /**
   * @param field The field to read in
   * @param tvfPointer The pointer within the tvf file where we should start reading
   * @return The TermVector located at that position
   * @throws IOException
   */
  private SegmentTermVector readTermVector(String field, long tvfPointer)
      throws IOException {

    // Now read the data from the specified position.
    // We don't need to offset by the FORMAT here since the pointer already includes the offset.
    tvf.seek(tvfPointer);

    int numTerms = tvf.readVInt();
    //System.out.println("Num Terms: " + numTerms);
    // If no terms - return a constant empty termvector
    if (numTerms == 0) return new SegmentTermVector(field, null, null);

    int length = numTerms + tvf.readVInt();

    String terms[] = new String[numTerms];

    int termFreqs[] = new int[numTerms];

    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    char [] buffer = {};
    String previousString = "";
    for (int i = 0; i < numTerms; i++) {
      start = tvf.readVInt();
      deltaLength = tvf.readVInt();
      totalLength = start + deltaLength;
      if (buffer.length < totalLength) {
        buffer = new char[totalLength];
        for (int j = 0; j < previousString.length(); j++)  // copy contents
          buffer[j] = previousString.charAt(j);
      }
      tvf.readChars(buffer, start, deltaLength);
      terms[i] = new String(buffer, 0, totalLength);
      previousString = terms[i];
      termFreqs[i] = tvf.readVInt();
    }
    SegmentTermVector tv = new SegmentTermVector(field, terms, termFreqs);
    return tv;
  }
}
@@ -0,0 +1,301 @@
package org.apache.lucene.index;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.util.StringHelper;

import java.io.IOException;
import java.util.Vector;

/**
 * The writer works by opening a document, then opening the fields within the
 * document, and then writing out the vectors for each field.
 *
 * Rough usage:
 *
 <CODE>
 for each document
 {
   writer.openDocument();
   for each field on the document
   {
     writer.openField(field);
     for all of the terms
     {
       writer.addTerm(...)
     }
     writer.closeField()
   }
   writer.closeDocument()
 }
 </CODE>
 */
final class TermVectorsWriter {
  public static final int FORMAT_VERSION = 1;
  // The size in bytes that the FORMAT_VERSION takes up at the beginning of each file
  public static final int FORMAT_SIZE = 4;

  //TODO: Figure out how to write with or w/o position information and read back in
  public static final String TVX_EXTENSION = ".tvx";
  public static final String TVD_EXTENSION = ".tvd";
  public static final String TVF_EXTENSION = ".tvf";

  private OutputStream tvx = null, tvd = null, tvf = null;
  private Vector fields = null;
  private Vector terms = null;
  private FieldInfos fieldInfos;

  private TVField currentField = null;
  private long currentDocPointer = -1;

  /** Create a term vectors writer for the specified segment in the specified
   *  directory. A new TermVectorsWriter should be created for each
   *  segment. The parameter <code>maxFields</code> indicates how many total
   *  fields are found in this document. Not all of these fields may require
   *  term vectors to be stored, so the number of calls to
   *  <code>openField</code> is less than or equal to this number.
   */
  public TermVectorsWriter(Directory directory, String segment,
                           FieldInfos fieldInfos)
      throws IOException {
    // Open files for TermVector storage
    tvx = directory.createFile(segment + TVX_EXTENSION);
    tvx.writeInt(FORMAT_VERSION);
    tvd = directory.createFile(segment + TVD_EXTENSION);
    tvd.writeInt(FORMAT_VERSION);
    tvf = directory.createFile(segment + TVF_EXTENSION);
    tvf.writeInt(FORMAT_VERSION);

    this.fieldInfos = fieldInfos;
    fields = new Vector(fieldInfos.size());
    terms = new Vector();
  }

  public final void openDocument()
      throws IOException {
    closeDocument();

    currentDocPointer = tvd.getFilePointer();
  }

  public final void closeDocument()
      throws IOException {
    if (isDocumentOpen()) {
      closeField();
      writeDoc();
      fields.clear();
      currentDocPointer = -1;
    }
  }

  public final boolean isDocumentOpen() {
    return currentDocPointer != -1;
  }

  /** Start processing a field. This can be followed by a number of calls to
   *  addTerm, and a final call to closeField to indicate the end of
   *  processing of this field. If a field was previously open, it is
   *  closed automatically.
   */
  public final void openField(String field)
      throws IOException {
    if (!isDocumentOpen()) throw new IllegalStateException("Cannot open field when no document is open.");

    closeField();
    currentField = new TVField(fieldInfos.fieldNumber(field));
  }

  /** Finished processing the current field. This should be followed by a call to
   *  openField before future calls to addTerm.
   */
  public final void closeField()
      throws IOException {
    if (isFieldOpen()) {
      /* DEBUG */
      //System.out.println("closeField()");
      /* DEBUG */

      // save field and terms
      writeField();
      fields.add(currentField);
      terms.clear();
      currentField = null;
    }
  }

  /** Return true if a field is currently open. */
  public final boolean isFieldOpen() {
    return currentField != null;
  }

  /** Add a term to the field's term vector. The field must already be open,
   *  or an IllegalStateException is thrown. Terms should be added in
   *  increasing order of terms, one call per unique term. Freq is the number of
   *  times this term appears in this field, in this document.
   */
  public final void addTerm(String termText, int freq) {
    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add terms when document is not open");
    if (!isFieldOpen()) throw new IllegalStateException("Cannot add terms when field is not open");

    addTermInternal(termText, freq);
  }

  private final void addTermInternal(String termText, int freq) {
    currentField.length += freq;
    TVTerm term = new TVTerm();
    term.termText = termText;
    term.freq = freq;
    terms.add(term);
  }

  /** Add the specified vectors to the document.
   */
  public final void addVectors(TermFreqVector[] vectors)
      throws IOException {
    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vectors when document is not open");
    if (isFieldOpen()) throw new IllegalStateException("Cannot add term vectors when field is open");

    for (int i = 0; i < vectors.length; i++) {
      addTermFreqVector(vectors[i]);
    }
  }

  /** Add the specified vector to the document. The document must be open but no field
   *  should be open or an exception is thrown. The same document can have <code>addTerm</code>
   *  and <code>addVectors</code> calls mixed, however a given field must either be
   *  populated with <code>addTerm</code> or with <code>addVectors</code>.
   */
  public final void addTermFreqVector(TermFreqVector vector)
      throws IOException {
    if (!isDocumentOpen()) throw new IllegalStateException("Cannot add term vector when document is not open");
    if (isFieldOpen()) throw new IllegalStateException("Cannot add term vector when field is open");
    addTermFreqVectorInternal(vector);
  }

  private final void addTermFreqVectorInternal(TermFreqVector vector)
      throws IOException {
    openField(vector.getField());
    for (int i = 0; i < vector.size(); i++) {
      addTermInternal(vector.getTerms()[i], vector.getTermFrequencies()[i]);
    }
    closeField();
  }

  /** Close all streams. */
  final void close() throws IOException {
    try {
      closeDocument();
    } finally {
      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process
      IOException keep = null;
      if (tvx != null)
        try {
          tvx.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      if (tvd != null)
        try {
          tvd.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      if (tvf != null)
        try {
          tvf.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      if (keep != null) throw (IOException) keep.fillInStackTrace();
    }
  }

  private void writeField() throws IOException {
    // remember where this field is written
    currentField.tvfPointer = tvf.getFilePointer();
    //System.out.println("Field Pointer: " + currentField.tvfPointer);
    final int size;

    tvf.writeVInt(size = terms.size());
    tvf.writeVInt(currentField.length - size);
    String lastTermText = "";
    // write term ids and positions
    for (int i = 0; i < size; i++) {
      TVTerm term = (TVTerm) terms.elementAt(i);
      //tvf.writeString(term.termText);
      int start = StringHelper.stringDifference(lastTermText, term.termText);
      int length = term.termText.length() - start;
      tvf.writeVInt(start);                     // write shared prefix length
      tvf.writeVInt(length);                    // write delta length
      tvf.writeChars(term.termText, start, length); // write delta chars
      tvf.writeVInt(term.freq);
      lastTermText = term.termText;
    }
  }

  private void writeDoc() throws IOException {
    if (isFieldOpen()) throw new IllegalStateException("Field is still open while writing document");
    //System.out.println("Writing doc pointer: " + currentDocPointer);
    // write document index record
    tvx.writeLong(currentDocPointer);

    // write document data record
    final int size;

    // write the number of fields
    tvd.writeVInt(size = fields.size());

    // write field numbers
    int lastFieldNumber = 0;
    for (int i = 0; i < size; i++) {
      TVField field = (TVField) fields.elementAt(i);
      tvd.writeVInt(field.number - lastFieldNumber);

      lastFieldNumber = field.number;
    }

    // write field pointers
    long lastFieldPointer = 0;
    for (int i = 0; i < size; i++) {
      TVField field = (TVField) fields.elementAt(i);
      tvd.writeVLong(field.tvfPointer - lastFieldPointer);

      lastFieldPointer = field.tvfPointer;
    }
    //System.out.println("After writing doc pointer: " + tvx.getFilePointer());
  }

  private static class TVField {
    int number;
    long tvfPointer = 0;
    int length = 0;   // number of distinct term positions

    TVField(int number) {
      this.number = number;
    }
  }

  private static class TVTerm {
    String termText;
    int freq = 0;
    //int positions[] = null;
  }
}
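A concrete version of the "rough usage" sketch from the class javadoc, assuming a Directory, segment name, and FieldInfos are at hand; the field and terms are hypothetical:

    TermVectorsWriter writer = new TermVectorsWriter(dir, "_1", fieldInfos);
    try {
      writer.openDocument();
      writer.openField("body");
      writer.addTerm("apache", 3);            // terms added in increasing order
      writer.addTerm("lucene", 5);
      writer.closeField();
      writer.closeDocument();
    } finally {
      writer.close();                         // also closes any open document
    }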
@ -0,0 +1,216 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;

import java.io.IOException;
import java.io.StringReader;
import java.util.*;

/**
 * A {@link TermFreqVector} built from the terms of a query rather than from a
 * stored document field. Terms are kept in ascending order, with their
 * frequencies in a parallel array.
 **/
public class QueryTermVector implements TermFreqVector {
  private String [] terms = new String[0];
  private int [] termFreqs = new int[0];

  public String getField() { return null; } // a query is not bound to any one field

  /**
   * @param queryTerms The original list of terms from the query, can contain duplicates
   */
  public QueryTermVector(String [] queryTerms) {

    processTerms(queryTerms);
  }

  public QueryTermVector(String queryString, Analyzer analyzer) {
    if (analyzer != null)
    {
      TokenStream stream = analyzer.tokenStream("", new StringReader(queryString));
      if (stream != null)
      {
        Token next = null;
        List terms = new ArrayList();
        try {
          while ((next = stream.next()) != null)
          {
            terms.add(next.termText());
          }
          processTerms((String[])terms.toArray(new String[terms.size()]));
        } catch (IOException e) {
          // ignore analysis failures; the vector is simply left empty
        }
      }
    }
  }

  private void processTerms(String[] queryTerms) {
    if (queryTerms != null) {
      Arrays.sort(queryTerms);
      Map tmpSet = new HashMap(queryTerms.length);
      //filter out duplicates
      List tmpList = new ArrayList(queryTerms.length);
      List tmpFreqs = new ArrayList(queryTerms.length);
      int j = 0;
      for (int i = 0; i < queryTerms.length; i++) {
        String term = queryTerms[i];
        Integer position = (Integer)tmpSet.get(term);
        if (position == null) {
          tmpSet.put(term, new Integer(j++));
          tmpList.add(term);
          tmpFreqs.add(new Integer(1));
        }
        else {
          Integer integer = (Integer)tmpFreqs.get(position.intValue());
          tmpFreqs.set(position.intValue(), new Integer(integer.intValue() + 1));
        }
      }
      terms = (String[])tmpList.toArray(terms);
      //termFreqs = (int[])tmpFreqs.toArray(termFreqs);
      termFreqs = new int[tmpFreqs.size()];
      int i = 0;
      for (Iterator iter = tmpFreqs.iterator(); iter.hasNext();) {
        Integer integer = (Integer) iter.next();
        termFreqs[i++] = integer.intValue();
      }
    }
  }

  public final String toString() {
    StringBuffer sb = new StringBuffer();
    sb.append('{');
    for (int i=0; i<terms.length; i++) {
      if (i>0) sb.append(", ");
      sb.append(terms[i]).append('/').append(termFreqs[i]);
    }
    sb.append('}');
    return sb.toString();
  }


  /**
   * @return The number of terms in the term vector.
   */
  public int size() {
    return terms.length;
  }

  /** Returns an array of positions in which the term is found, or null if no position information is
   * available or positions are not implemented.
   * Terms are identified by the index at which they appear in the
   * array obtained from the <code>getTerms</code> method.
   */
  public int[] getTermPositions(int index) {
    return null;
  }

  /**
   * @return An array of term texts in ascending order.
   */
  public String[] getTerms() {
    return terms;
  }

  /** Array of term frequencies. Locations of the array correspond one to one
   * to the terms in the array obtained from the <code>getTerms</code>
   * method. Each location in the array contains the number of times this
   * term occurs in the document or the document field.
   */
  public int[] getTermFrequencies() {
    return termFreqs;
  }

  /** Return a string representation of the vector. The terms here are already
   * stored as text, so the provided IndexReader is not consulted.
   */
  public String toString(IndexReader ir) throws IOException {
    return toString();
  }

  /** Return the index in the term array at which the specified <code>term</code>
   * appears, or -1 if it does not appear in the array.
   */
  public int indexOf(String term) {
    int res = Arrays.binarySearch(terms, term);
    return res >= 0 ? res : -1;
  }

  /** Just like <code>indexOf(String)</code> but searches for a number of terms
   * at the same time. Returns an array that has the same size as the number
   * of terms searched for, each slot containing the result of searching for
   * that term.
   *
   * @param terms array containing terms to look for
   * @param start index in the array where the list of terms starts
   * @param len the number of terms in the list
   */
  public int[] indexesOf(String[] terms, int start, int len) {
    int res[] = new int[len];

    for (int i=0; i < len; i++) {
      res[i] = indexOf(terms[start + i]); // honor the documented start offset
    }
    return res;
  }

}
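
As a usage sketch (not part of the commit): the class above tokenizes a query string, sorts and de-duplicates the terms, and exposes them through the TermFreqVector interface. The driver class name and the sample query text below are illustrative only, and since this hunk does not show QueryTermVector's package declaration, the example assumes it is on the same package path or imported accordingly.

    import org.apache.lucene.analysis.WhitespaceAnalyzer;

    public class QueryTermVectorExample {
      public static void main(String[] args) {
        // Whitespace tokenization; processTerms() then sorts and de-duplicates.
        QueryTermVector vector =
            new QueryTermVector("field field field two text", new WhitespaceAnalyzer());

        String[] terms = vector.getTerms();        // {"field", "text", "two"}, ascending order
        int[] freqs = vector.getTermFrequencies(); // {3, 1, 1}, parallel to terms
        for (int i = 0; i < terms.length; i++) {
          System.out.println(terms[i] + "/" + freqs[i]);
        }

        System.out.println(vector.indexOf("field")); // 0: binary search over the sorted terms
        System.out.println(vector.indexOf("junk"));  // -1: not present
        System.out.println(vector);                  // {field/3, text/1, two/1}
      }
    }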

@ -0,0 +1,32 @@

package org.apache.lucene.util;


/**
 * Methods for manipulating strings
 *
 **/
public abstract class StringHelper {

  /**
   * Compares the two strings character by character.
   * @param s1 The first string to compare
   * @param s2 The second string to compare
   * @return The first position where the two strings differ.
   */
  public static final int stringDifference(String s1, String s2) {
    int len1 = s1.length();
    int len2 = s2.length();
    int len = len1 < len2 ? len1 : len2;
    for (int i = 0; i < len; i++) {
      if (s1.charAt(i) != s2.charAt(i)) {
        return i;
      }
    }
    return len;
  }


  private StringHelper() {
  }
}
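
A quick illustration of the contract (illustrative, not part of the commit): the return value is the index of the first mismatching character, which is the length of the common prefix, and equals the shorter length when one string is a prefix of the other.

    import org.apache.lucene.util.StringHelper;

    public class StringDifferenceExample {
      public static void main(String[] args) {
        // Index of the first differing character == length of the common prefix.
        System.out.println(StringHelper.stringDifference("apple", "apricot"));  // 2 ("ap" is shared)
        // When one string is a prefix of the other, the shorter length is returned.
        System.out.println(StringHelper.stringDifference("segment", "segments")); // 7
        // Equal strings return their full length.
        System.out.println(StringHelper.stringDifference("term", "term"));        // 4
      }
    }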

@ -0,0 +1,159 @@

package org.apache.lucene.index;

/**
 * Created by IntelliJ IDEA.
 * User: Grant Ingersoll
 * Date: Feb 2, 2004
 * Time: 6:16:12 PM
 * $Id$
 * Copyright 2004. Center For Natural Language Processing
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Enumeration;

/**
 * Helper that builds test documents covering the different Field flavors
 * (stored, unstored, keyword, unindexed, with and without term vectors)
 * and writes them to a segment for the index tests.
 **/
class DocHelper {
  public static final String FIELD_1_TEXT = "field one text";
  public static final String TEXT_FIELD_1_KEY = "textField1";
  public static Field textField1 = Field.Text(TEXT_FIELD_1_KEY, FIELD_1_TEXT, false);

  public static final String FIELD_2_TEXT = "field field field two text";
  //Fields will be lexicographically sorted. So, the order is: field, text, two
  public static final int [] FIELD_2_FREQS = {3, 1, 1};
  public static final String TEXT_FIELD_2_KEY = "textField2";
  public static Field textField2 = Field.Text(TEXT_FIELD_2_KEY, FIELD_2_TEXT, true);

  public static final String KEYWORD_TEXT = "Keyword";
  public static final String KEYWORD_FIELD_KEY = "keyField";
  public static Field keyField = Field.Keyword(KEYWORD_FIELD_KEY, KEYWORD_TEXT);

  public static final String UNINDEXED_FIELD_TEXT = "unindexed field text";
  public static final String UNINDEXED_FIELD_KEY = "unIndField";
  public static Field unIndField = Field.UnIndexed(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT);

  public static final String UNSTORED_1_FIELD_TEXT = "unstored field text";
  public static final String UNSTORED_FIELD_1_KEY = "unStoredField1";
  public static Field unStoredField1 = Field.UnStored(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT, false);

  public static final String UNSTORED_2_FIELD_TEXT = "unstored field text";
  public static final String UNSTORED_FIELD_2_KEY = "unStoredField2";
  public static Field unStoredField2 = Field.UnStored(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT, true);

//  public static Set fieldNamesSet = null;
//  public static Set fieldValuesSet = null;
  public static Map nameValues = null;

  static
  {

    nameValues = new HashMap();
    nameValues.put(TEXT_FIELD_1_KEY, FIELD_1_TEXT);
    nameValues.put(TEXT_FIELD_2_KEY, FIELD_2_TEXT);
    nameValues.put(KEYWORD_FIELD_KEY, KEYWORD_TEXT);
    nameValues.put(UNINDEXED_FIELD_KEY, UNINDEXED_FIELD_TEXT);
    nameValues.put(UNSTORED_FIELD_1_KEY, UNSTORED_1_FIELD_TEXT);
    nameValues.put(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT);
  }

  /**
   * Adds the fields above to a document
   * @param doc The document to write
   */
  public static void setupDoc(Document doc) {
    doc.add(textField1);
    doc.add(textField2);
    doc.add(keyField);
    doc.add(unIndField);
    doc.add(unStoredField1);
    doc.add(unStoredField2);
  }

  /**
   * Writes the document to the directory using a segment named "test"
   * @param dir the directory to write to
   * @param doc the document to index
   */
  public static void writeDoc(Directory dir, Document doc)
  {

    writeDoc(dir, "test", doc);
  }

  /**
   * Writes the document to the directory in the given segment
   * @param dir the directory to write to
   * @param segment the name of the segment to create
   * @param doc the document to index
   */
  public static void writeDoc(Directory dir, String segment, Document doc)
  {
    Analyzer analyzer = new WhitespaceAnalyzer();
    Similarity similarity = Similarity.getDefault();
    writeDoc(dir, analyzer, similarity, segment, doc);
  }

  /**
   * Writes the document to the directory segment named "test" using the specified analyzer and similarity
   * @param dir the directory to write to
   * @param analyzer the analyzer to tokenize with
   * @param similarity the scoring implementation to use
   * @param doc the document to index
   */
  public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, Document doc)
  {
    writeDoc(dir, analyzer, similarity, "test", doc);
  }

  /**
   * Writes the document to the directory segment using the analyzer and the similarity score
   * @param dir the directory to write to
   * @param analyzer the analyzer to tokenize with
   * @param similarity the scoring implementation to use
   * @param segment the name of the segment to create
   * @param doc the document to index
   */
  public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc)
  {
    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
    try {
      writer.addDocument(segment, doc);
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  public static int numFields(Document doc) {
    Enumeration fields = doc.fields();
    int result = 0;
    while (fields.hasMoreElements()) {
      fields.nextElement();
      result++;
    }
    return result;
  }
}
/*
  fieldNamesSet = new HashSet();
  fieldNamesSet.add(TEXT_FIELD_1_KEY);
  fieldNamesSet.add(TEXT_FIELD_2_KEY);
  fieldNamesSet.add(KEYWORD_FIELD_KEY);
  fieldNamesSet.add(UNINDEXED_FIELD_KEY);
  fieldNamesSet.add(UNSTORED_FIELD_1_KEY);
  fieldNamesSet.add(UNSTORED_FIELD_2_KEY);
  fieldValuesSet = new HashSet();
  fieldValuesSet.add(FIELD_1_TEXT);
  fieldValuesSet.add(FIELD_2_TEXT);
  fieldValuesSet.add(KEYWORD_TEXT);
  fieldValuesSet.add(UNINDEXED_FIELD_TEXT);
  fieldValuesSet.add(UNSTORED_1_FIELD_TEXT);
  fieldValuesSet.add(UNSTORED_2_FIELD_TEXT);
*/
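
For orientation (a sketch, not part of the commit): the tests that follow all use DocHelper in the same pattern: populate a Document, write it to a single-document segment, then open a SegmentReader on that segment. The driver class below is hypothetical and must live in org.apache.lucene.index, since DocHelper, SegmentInfo and SegmentReader are all package-private.

    package org.apache.lucene.index;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.store.RAMDirectory;

    import java.io.IOException;

    public class DocHelperExample {
      public static void main(String[] args) throws IOException {
        RAMDirectory dir = new RAMDirectory();
        Document doc = new Document();

        DocHelper.setupDoc(doc);      // add the six standard test fields
        DocHelper.writeDoc(dir, doc); // index into a segment named "test"

        // Read the document back; unstored fields are not returned.
        SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir));
        System.out.println(reader.document(0));
        reader.close();
      }
    }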

@ -0,0 +1,121 @@

package org.apache.lucene.index;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (full text identical to the license header reproduced above)
 * ==================================================================== */

import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.IOException;

public class TestDocumentWriter extends TestCase {
  private RAMDirectory dir = new RAMDirectory();
  private Document testDoc = new Document();


  public TestDocumentWriter(String s) {
    super(s);
  }

  protected void setUp() {
    DocHelper.setupDoc(testDoc);
  }

  protected void tearDown() {

  }

  public void test() {
    assertTrue(dir != null);

  }

  public void testAddDocument() {
    Analyzer analyzer = new WhitespaceAnalyzer();
    Similarity similarity = Similarity.getDefault();
    DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
    assertTrue(writer != null);
    try {
      writer.addDocument("test", testDoc);
      //After adding the document, we should be able to read it back in
      SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir));
      assertTrue(reader != null);
      Document doc = reader.document(0);
      assertTrue(doc != null);

      //System.out.println("Document: " + doc);
      Field [] fields = doc.getFields("textField2");
      assertTrue(fields != null && fields.length == 1);
      assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
      assertTrue(fields[0].isTermVectorStored() == true);

      fields = doc.getFields("textField1");
      assertTrue(fields != null && fields.length == 1);
      assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
      assertTrue(fields[0].isTermVectorStored() == false);

      fields = doc.getFields("keyField");
      assertTrue(fields != null && fields.length == 1);
      assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }
}

@ -0,0 +1,65 @@

package org.apache.lucene.index;


import junit.framework.TestCase;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.OutputStream;

import java.io.IOException;
import java.util.Map;

//import org.cnlp.utils.properties.ResourceBundleHelper;

public class TestFieldInfos extends TestCase {

  private Document testDoc = new Document();

  public TestFieldInfos(String s) {
    super(s);
  }

  protected void setUp() {
    DocHelper.setupDoc(testDoc);
  }

  protected void tearDown() {
  }

  public void test() {
    //Positive test of FieldInfos
    assertTrue(testDoc != null);
    FieldInfos fieldInfos = new FieldInfos();
    fieldInfos.add(testDoc);
    //The document has six fields; the no-arg constructor adds a seventh,
    //unnamed entry to the fields map.
    assertTrue(fieldInfos.size() == 7);
    RAMDirectory dir = new RAMDirectory();
    String name = "testFile";
    OutputStream output = dir.createFile(name);
    assertTrue(output != null);
    //Use a RAMOutputStream

    try {
      fieldInfos.write(output);
      output.close();
      assertTrue(output.length() > 0);
      FieldInfos readIn = new FieldInfos(dir, name);
      assertTrue(fieldInfos.size() == readIn.size());
      FieldInfo info = readIn.fieldInfo("textField1");
      assertTrue(info != null);
      assertTrue(info.storeTermVector == false);

      info = readIn.fieldInfo("textField2");
      assertTrue(info != null);
      assertTrue(info.storeTermVector == true);

      dir.close();

    } catch (IOException e) {
      assertTrue(false);
    }

  }
}

@ -0,0 +1,115 @@

package org.apache.lucene.index;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (full text identical to the license header reproduced above)
 * ==================================================================== */

import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.search.Similarity;

import java.util.Map;
import java.io.IOException;

public class TestFieldsReader extends TestCase {
  private RAMDirectory dir = new RAMDirectory();
  private Document testDoc = new Document();
  private FieldInfos fieldInfos = null;

  public TestFieldsReader(String s) {
    super(s);
  }

  protected void setUp() {
    fieldInfos = new FieldInfos();
    DocHelper.setupDoc(testDoc);
    fieldInfos.add(testDoc);
    DocumentWriter writer = new DocumentWriter(dir, new WhitespaceAnalyzer(),
            Similarity.getDefault(), 50);
    assertTrue(writer != null);
    try {
      writer.addDocument("test", testDoc);
    }
    catch (IOException e)
    {
      // a failure here will surface as an empty index in the test below
    }
  }

  protected void tearDown() {

  }

  public void test() {
    assertTrue(dir != null);
    assertTrue(fieldInfos != null);
    try {
      FieldsReader reader = new FieldsReader(dir, "test", fieldInfos);
      assertTrue(reader != null);
      assertTrue(reader.size() == 1);
      Document doc = reader.doc(0);
      assertTrue(doc != null);
      assertTrue(doc.getField("textField1") != null);
      Field field = doc.getField("textField2");
      assertTrue(field != null);
      assertTrue(field.isTermVectorStored() == true);
      reader.close();
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }
}

@ -79,7 +79,7 @@ public class TestFilterIndexReader extends TestCase {

  private static class TestReader extends FilterIndexReader {

    /** Filter that only permits terms containing 'e'.*/
    private static class TestTermEnum extends FilterTermEnum {
      public TestTermEnum(TermEnum enum)
        throws IOException {

@ -0,0 +1,136 @@

package org.apache.lucene.index;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (full text identical to the license header reproduced above)
 * ==================================================================== */

import junit.framework.TestCase;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

public class TestMultiReader extends TestCase {
  private Directory dir = new RAMDirectory();
  private Document doc1 = new Document();
  private Document doc2 = new Document();
  private SegmentReader reader1;
  private SegmentReader reader2;
  private SegmentReader [] readers = new SegmentReader[2];
  private SegmentInfos sis = new SegmentInfos();

  public TestMultiReader(String s) {
    super(s);
  }

  protected void setUp() {
    DocHelper.setupDoc(doc1);
    DocHelper.setupDoc(doc2);
    DocHelper.writeDoc(dir, "seg-1", doc1);
    DocHelper.writeDoc(dir, "seg-2", doc2);

    try {
      sis.write(dir);
      reader1 = new SegmentReader(new SegmentInfo("seg-1", 1, dir));
      reader2 = new SegmentReader(new SegmentInfo("seg-2", 1, dir));
      readers[0] = reader1;
      readers[1] = reader2;
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  /*IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
  writer.addDocument(doc1);
  writer.addDocument(doc2);
  writer.close();*/
  protected void tearDown() {

  }

  public void test() {
    assertTrue(dir != null);
    assertTrue(reader1 != null);
    assertTrue(reader2 != null);
    assertTrue(sis != null);
  }

  public void testDocument() {
    try {
      sis.read(dir);
      MultiReader reader = new MultiReader(dir, readers);
      assertTrue(reader != null);
      Document newDoc1 = reader.document(0);
      assertTrue(newDoc1 != null);
      assertTrue(DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - 2);
      Document newDoc2 = reader.document(1);
      assertTrue(newDoc2 != null);
      assertTrue(DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - 2);
      TermFreqVector vector = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
      assertTrue(vector != null);
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  public void testTermVectors() {
    try {
      MultiReader reader = new MultiReader(dir, readers);
      assertTrue(reader != null);
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }
}

@ -0,0 +1,163 @@

package org.apache.lucene.index;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (full text identical to the license header reproduced above)
 * ==================================================================== */

import junit.framework.TestCase;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;

import java.io.IOException;
import java.util.Collection;

public class TestSegmentMerger extends TestCase {
  //The variables for the new merged segment
  private Directory mergedDir = new RAMDirectory();
  private String mergedSegment = "test";
  //First segment to be merged
  private Directory merge1Dir = new RAMDirectory();
  private Document doc1 = new Document();
  private String merge1Segment = "test-1";
  private SegmentReader reader1 = null;
  //Second Segment to be merged
  private Directory merge2Dir = new RAMDirectory();
  private Document doc2 = new Document();
  private String merge2Segment = "test-2";
  private SegmentReader reader2 = null;


  public TestSegmentMerger(String s) {
    super(s);
  }

  protected void setUp() {
    DocHelper.setupDoc(doc1);
    DocHelper.writeDoc(merge1Dir, merge1Segment, doc1);
    DocHelper.setupDoc(doc2);
    DocHelper.writeDoc(merge2Dir, merge2Segment, doc2);
    try {
      reader1 = new SegmentReader(new SegmentInfo(merge1Segment, 1, merge1Dir));
      reader2 = new SegmentReader(new SegmentInfo(merge2Segment, 1, merge2Dir));
    } catch (IOException e) {
      e.printStackTrace();
    }

  }

  protected void tearDown() {

  }

  public void test() {
    assertTrue(mergedDir != null);
    assertTrue(merge1Dir != null);
    assertTrue(merge2Dir != null);
    assertTrue(reader1 != null);
    assertTrue(reader2 != null);
  }

  public void testMerge() {
    //System.out.println("----------------TestMerge------------------");
    SegmentMerger merger = new SegmentMerger(mergedDir, mergedSegment, false);
    merger.add(reader1);
    merger.add(reader2);
    try {
      int docsMerged = merger.merge();
      assertTrue(docsMerged == 2);
      //Should be able to open a new SegmentReader against the new directory
      SegmentReader mergedReader = new SegmentReader(new SegmentInfo(mergedSegment, docsMerged, mergedDir));
      assertTrue(mergedReader != null);
      assertTrue(mergedReader.numDocs() == 2);
      Document newDoc1 = mergedReader.document(0);
      assertTrue(newDoc1 != null);
      //There are 2 unstored fields on the document
      assertTrue(DocHelper.numFields(newDoc1) == DocHelper.numFields(doc1) - 2);
      Document newDoc2 = mergedReader.document(1);
      assertTrue(newDoc2 != null);
      assertTrue(DocHelper.numFields(newDoc2) == DocHelper.numFields(doc2) - 2);

      TermDocs termDocs = mergedReader.termDocs(new Term(DocHelper.TEXT_FIELD_2_KEY, "field"));
      assertTrue(termDocs != null);
      assertTrue(termDocs.next() == true);

      Collection stored = mergedReader.getIndexedFieldNames(true);
      assertTrue(stored != null);
      //System.out.println("stored size: " + stored.size());
      assertTrue(stored.size() == 2);

      TermFreqVector vector = mergedReader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
      assertTrue(vector != null);
      String [] terms = vector.getTerms();
      assertTrue(terms != null);
      //System.out.println("Terms size: " + terms.length);
      assertTrue(terms.length == 3);
      int [] freqs = vector.getTermFrequencies();
      assertTrue(freqs != null);
      //System.out.println("Freqs size: " + freqs.length);

      for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        int freq = freqs[i];
        //System.out.println("Term: " + term + " Freq: " + freq);
        assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
        assertTrue(DocHelper.FIELD_2_FREQS[i] == freq);
      }
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
    //System.out.println("---------------------end TestMerge-------------------");
  }
}

@ -0,0 +1,250 @@

package org.apache.lucene.index;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (full text identical to the license header reproduced above)
 * ==================================================================== */

import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.Enumeration;

public class TestSegmentReader extends TestCase {
  private RAMDirectory dir = new RAMDirectory();
  private Document testDoc = new Document();
  private SegmentReader reader = null;

  public TestSegmentReader(String s) {
    super(s);
  }

  //TODO: Setup the reader w/ multiple documents
  protected void setUp() {

    try {
      DocHelper.setupDoc(testDoc);
      DocHelper.writeDoc(dir, testDoc);
      reader = new SegmentReader(new SegmentInfo("test", 1, dir));
    } catch (IOException e) {
      // a failure here will surface as a null reader in the tests below
    }
  }

  protected void tearDown() {

  }

  public void test() {
    assertTrue(dir != null);
    assertTrue(reader != null);
    assertTrue(DocHelper.nameValues.size() > 0);
    assertTrue(DocHelper.numFields(testDoc) == 6);
  }

  public void testDocument() {
    try {
      assertTrue(reader.numDocs() == 1);
      assertTrue(reader.maxDoc() >= 1);
      Document result = reader.document(0);
      assertTrue(result != null);
      //There are 2 unstored fields on the document that are not preserved across writing
      assertTrue(DocHelper.numFields(result) == DocHelper.numFields(testDoc) - 2);

      Enumeration fields = result.fields();
      while (fields.hasMoreElements()) {
        Field field = (Field) fields.nextElement();
        assertTrue(field != null);
        assertTrue(DocHelper.nameValues.containsKey(field.name()));
      }
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  public void testDelete() {
    Document docToDelete = new Document();
    DocHelper.setupDoc(docToDelete);
    DocHelper.writeDoc(dir, "seg-to-delete", docToDelete);
    try {
      SegmentReader deleteReader = new SegmentReader(new SegmentInfo("seg-to-delete", 1, dir));
      assertTrue(deleteReader != null);
      assertTrue(deleteReader.numDocs() == 1);
      deleteReader.delete(0);
      assertTrue(deleteReader.isDeleted(0) == true);
      assertTrue(deleteReader.hasDeletions() == true);
      assertTrue(deleteReader.numDocs() == 0);
      try {
        Document test = deleteReader.document(0);
        assertTrue(false);
      } catch (IllegalArgumentException e) {
        assertTrue(true);
      }
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  public void testGetFieldNameVariations() {
    try {
      Collection result = reader.getFieldNames();
      assertTrue(result != null);
      assertTrue(result.size() == 7);
      for (Iterator iter = result.iterator(); iter.hasNext();) {
        String s = (String) iter.next();
        //System.out.println("Name: " + s);
        assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals(""));
      }
      result = reader.getFieldNames(true);
      assertTrue(result != null);
      // System.out.println("Size: " + result.size());
      assertTrue(result.size() == 5);
      for (Iterator iter = result.iterator(); iter.hasNext();) {
        String s = (String) iter.next();
        assertTrue(DocHelper.nameValues.containsKey(s) == true || s.equals(""));
      }

      result = reader.getFieldNames(false);
      assertTrue(result != null);
      assertTrue(result.size() == 2);
      //Get all indexed fields that are storing term vectors
      result = reader.getIndexedFieldNames(true);
      assertTrue(result != null);
      assertTrue(result.size() == 2);

      result = reader.getIndexedFieldNames(false);
      assertTrue(result != null);
      assertTrue(result.size() == 3);

    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  public void testTerms() {
    try {
      TermEnum terms = reader.terms();
      assertTrue(terms != null);
      while (terms.next() == true)
      {
        Term term = terms.term();
        assertTrue(term != null);
        //System.out.println("Term: " + term);
        String fieldValue = (String)DocHelper.nameValues.get(term.field());
        assertTrue(fieldValue.indexOf(term.text()) != -1);
      }

      TermDocs termDocs = reader.termDocs();
      assertTrue(termDocs != null);
      termDocs.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
      assertTrue(termDocs.next() == true);

      TermPositions positions = reader.termPositions();
      positions.seek(new Term(DocHelper.TEXT_FIELD_1_KEY, "field"));
      assertTrue(positions != null);
      assertTrue(positions.doc() == 0);
      assertTrue(positions.nextPosition() >= 0);

    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  public void testNorms() {
    //TODO: Not sure how these work/should be tested
    /*
    try {
      byte [] norms = reader.norms(DocHelper.TEXT_FIELD_1_KEY);
      System.out.println("Norms: " + norms);
      assertTrue(norms != null);
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
    */

  }

  public void testTermVectors() {
    try {
      TermFreqVector result = reader.getTermFreqVector(0, DocHelper.TEXT_FIELD_2_KEY);
      assertTrue(result != null);
      String [] terms = result.getTerms();
      int [] freqs = result.getTermFrequencies();
      assertTrue(terms != null && terms.length == 3 && freqs != null && freqs.length == 3);
      for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        int freq = freqs[i];
        assertTrue(DocHelper.FIELD_2_TEXT.indexOf(term) != -1);
        assertTrue(freq > 0);
      }

      TermFreqVector [] results = reader.getTermFreqVectors(0);
      assertTrue(results != null);
      assertTrue(results.length == 2);

    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

}

@ -0,0 +1,137 @@

package org.apache.lucene.index;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (full text identical to the license header reproduced above)
 * ==================================================================== */

import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.IOException;

public class TestSegmentTermDocs extends TestCase {
  private Document testDoc = new Document();
  private Directory dir = new RAMDirectory();

  public TestSegmentTermDocs(String s) {
    super(s);
  }

  protected void setUp() {
    DocHelper.setupDoc(testDoc);
    DocHelper.writeDoc(dir, testDoc);
  }


  protected void tearDown() {

  }

  public void test() {
    assertTrue(dir != null);
  }

  public void testTermDocs() {
    try {
      //After adding the document, we should be able to read it back in
      SegmentReader reader = new SegmentReader(new SegmentInfo("test", 1, dir));
      assertTrue(reader != null);
      SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
      assertTrue(segTermDocs != null);
      segTermDocs.seek(new Term(DocHelper.TEXT_FIELD_2_KEY, "field"));
      if (segTermDocs.next() == true)
      {
        int docId = segTermDocs.doc();
        assertTrue(docId == 0);
        int freq = segTermDocs.freq();
        assertTrue(freq == 3);
      }
      reader.close();
    } catch (IOException e) {
      assertTrue(false);
    }
  }

  public void testBadSeek() {
    try {
      //After adding the document, we should be able to read it back in
      SegmentReader reader = new SegmentReader(new SegmentInfo("test", 3, dir));
      assertTrue(reader != null);
      SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
      assertTrue(segTermDocs != null);
      segTermDocs.seek(new Term("textField2", "bad"));
      assertTrue(segTermDocs.next() == false);
      reader.close();
    } catch (IOException e) {
      assertTrue(false);
    }
    try {
      //After adding the document, we should be able to read it back in
      SegmentReader reader = new SegmentReader(new SegmentInfo("test", 3, dir));
      assertTrue(reader != null);
      SegmentTermDocs segTermDocs = new SegmentTermDocs(reader);
      assertTrue(segTermDocs != null);
      segTermDocs.seek(new Term("junk", "bad"));
      assertTrue(segTermDocs.next() == false);
      reader.close();
    } catch (IOException e) {
      assertTrue(false);
    }
  }
}

@ -0,0 +1,106 @@

package org.apache.lucene.index;


import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;
import java.util.Arrays;

public class TestTermVectorsReader extends TestCase {
  private TermVectorsWriter writer = null;
  //testTerms must be lexicographically sorted; they are sorted in setUp()
  //rather than maintained in order by hand here
  private String [] testFields = {"f1", "f2", "f3"};
  private String [] testTerms = {"this", "is", "a", "test"};
  private RAMDirectory dir = new RAMDirectory();
  private String seg = "testSegment";
  private FieldInfos fieldInfos = new FieldInfos();

  public TestTermVectorsReader(String s) {
    super(s);
  }

  protected void setUp() {
    for (int i = 0; i < testFields.length; i++) {
      fieldInfos.add(testFields[i], true, true);
    }

    try {
      Arrays.sort(testTerms);
      for (int j = 0; j < 5; j++) {
        writer = new TermVectorsWriter(dir, seg, fieldInfos);
        writer.openDocument();

        for (int k = 0; k < testFields.length; k++) {
          writer.openField(testFields[k]);
          for (int i = 0; i < testTerms.length; i++) {
            writer.addTerm(testTerms[i], i);
          }
          writer.closeField();
        }
        writer.closeDocument();
        writer.close();
      }

    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  protected void tearDown() {

  }

  public void test() {
    //Check to see the files were created properly in setup
    assertTrue(writer.isDocumentOpen() == false);
    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
    assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
  }

  public void testReader() {
    try {
      TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
      assertTrue(reader != null);
      TermFreqVector vector = reader.get(0, testFields[0]);
      assertTrue(vector != null);
      String [] terms = vector.getTerms();
      assertTrue(terms != null);
      assertTrue(terms.length == testTerms.length);
      for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        //System.out.println("Term: " + term);
        assertTrue(term.equals(testTerms[i]));
      }

    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  /**
   * Make sure exceptions and bad params are handled appropriately
   */
  public void testBadParams() {
    try {
      TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
      assertTrue(reader != null);
      //Bad document number, good field number
      TermFreqVector vector = reader.get(50, testFields[0]);
      assertTrue(vector == null);
    } catch (Exception e) {
      assertTrue(false);
    }
    try {
      TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
      assertTrue(reader != null);
      //good document number, bad field number
      TermFreqVector vector = reader.get(0, "f50");
      assertTrue(vector == null);
    } catch (Exception e) {
      assertTrue(false);
    }
  }
}

@ -0,0 +1,240 @@

package org.apache.lucene.index;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/

import junit.framework.TestCase;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

public class TestTermVectorsWriter extends TestCase {

  private String[] testTerms = {"this", "is", "a", "test"};
  private String[] testFields = {"f1", "f2", "f3"};
  private int[][] positions = new int[testTerms.length][];
  private RAMDirectory dir = new RAMDirectory();
  private String seg = "testSegment";
  private FieldInfos fieldInfos = new FieldInfos();

  public TestTermVectorsWriter(String s) {
    super(s);
  }

  protected void setUp() {
    for (int i = 0; i < testFields.length; i++) {
      fieldInfos.add(testFields[i], true, true);
    }

    for (int i = 0; i < testTerms.length; i++) {
      positions[i] = new int[5];
      for (int j = 0; j < positions[i].length; j++) {
        positions[i][j] = i * 100;
      }
    }
  }

  protected void tearDown() {
  }

  public void test() {
    assertTrue(dir != null);
    assertTrue(positions != null);
  }

  /*public void testWriteNoPositions() {
    try {
      TermVectorsWriter writer = new TermVectorsWriter(dir, seg, 50);
      writer.openDocument();
      assertTrue(writer.isDocumentOpen() == true);
      writer.openField(0);
      assertTrue(writer.isFieldOpen() == true);
      for (int i = 0; i < testTerms.length; i++) {
        writer.addTerm(testTerms[i], i);
      }
      writer.closeField();

      writer.closeDocument();
      writer.close();
      assertTrue(writer.isDocumentOpen() == false);
      //Check to see the files were created
      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
      //Now read it back in
      TermVectorsReader reader = new TermVectorsReader(dir, seg);
      assertTrue(reader != null);
      checkTermVector(reader, 0, 0);
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  } */

  public void testWriter() {
    try {
      TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
      writer.openDocument();
      assertTrue(writer.isDocumentOpen() == true);
      writeField(writer, testFields[0]);
      writer.closeDocument();
      writer.close();
      assertTrue(writer.isDocumentOpen() == false);
      //Check to see the files were created
      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
      //Now read it back in
      TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
      assertTrue(reader != null);
      checkTermVector(reader, 0, testFields[0]);
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  private void checkTermVector(TermVectorsReader reader, int docNum, String field) throws IOException {
    TermFreqVector vector = reader.get(docNum, field);
    assertTrue(vector != null);
    String[] terms = vector.getTerms();
    assertTrue(terms != null);
    assertTrue(terms.length == testTerms.length);
    for (int i = 0; i < terms.length; i++) {
      String term = terms[i];
      assertTrue(term.equals(testTerms[i]));
    }
  }

  /**
   * Test one document, multiple fields
   */
  public void testMultipleFields() {
    try {
      TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
      writeDocument(writer, testFields.length);

      writer.close();

      assertTrue(writer.isDocumentOpen() == false);
      //Check to see the files were created
      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVD_EXTENSION));
      assertTrue(dir.fileExists(seg + TermVectorsWriter.TVX_EXTENSION));
      //Now read it back in
      TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
      assertTrue(reader != null);

      for (int j = 0; j < testFields.length; j++) {
        checkTermVector(reader, 0, testFields[j]);
      }
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

  private void writeDocument(TermVectorsWriter writer, int numFields) throws IOException {
    writer.openDocument();
    assertTrue(writer.isDocumentOpen() == true);

    for (int j = 0; j < numFields; j++) {
      writeField(writer, testFields[j]);
    }
    writer.closeDocument();
    assertTrue(writer.isDocumentOpen() == false);
  }

  /**
   * Opens the named field, adds each test term once, and closes the field.
   *
   * @param writer The writer to write to
   * @param f The name of the field to write
   * @throws IOException
   */
  private void writeField(TermVectorsWriter writer, String f) throws IOException {
    writer.openField(f);
    assertTrue(writer.isFieldOpen() == true);
    for (int i = 0; i < testTerms.length; i++) {
      writer.addTerm(testTerms[i], i);
    }
    writer.closeField();
  }


  public void testMultipleDocuments() {

    try {
      TermVectorsWriter writer = new TermVectorsWriter(dir, seg, fieldInfos);
      assertTrue(writer != null);
      for (int i = 0; i < 10; i++) {
        writeDocument(writer, testFields.length);
      }
      writer.close();
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
    //Do some arbitrary tests
    try {
      TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
      for (int i = 0; i < 10; i++) {
        assertTrue(reader != null);
        checkTermVector(reader, 5, testFields[0]);
        checkTermVector(reader, 2, testFields[2]);
      }
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }
  }

}

@@ -103,7 +103,7 @@ public class TestBasics extends TestCase {

    searcher = new IndexSearcher(directory);
  }


  public void testTerm() throws Exception {
    Query query = new TermQuery(new Term("field", "seventy"));
    checkHits(query, new int[]

@@ -0,0 +1,104 @@
package org.apache.lucene.search;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (license header identical to the one reproduced in full above)
 * ==================================================================== */

import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class TestQueryTermVector extends TestCase {


  public TestQueryTermVector(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {

  }

  public void testConstructor() {
    String[] queryTerm = {"foo", "bar", "foo", "again", "foo", "bar", "go", "go", "go"};
    //Items are sorted lexicographically
    String[] gold = {"again", "bar", "foo", "go"};
    int[] goldFreqs = {1, 2, 3, 3};
    QueryTermVector result = new QueryTermVector(queryTerm);
    assertTrue(result != null);
    String[] terms = result.getTerms();
    assertTrue(terms.length == 4);
    int[] freq = result.getTermFrequencies();
    assertTrue(freq.length == 4);
    checkGold(terms, gold, freq, goldFreqs);
    result = new QueryTermVector(null);
    assertTrue(result.getTerms().length == 0);

    result = new QueryTermVector("foo bar foo again foo bar go go go", new WhitespaceAnalyzer());
    assertTrue(result != null);
    terms = result.getTerms();
    assertTrue(terms.length == 4);
    freq = result.getTermFrequencies();
    assertTrue(freq.length == 4);
    checkGold(terms, gold, freq, goldFreqs);
  }

  private void checkGold(String[] terms, String[] gold, int[] freq, int[] goldFreqs) {
    for (int i = 0; i < terms.length; i++) {
      assertTrue(terms[i].equals(gold[i]));
      assertTrue(freq[i] == goldFreqs[i]);
    }
  }
}
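The behavior testConstructor pins down, as a minimal usage sketch (values copied from the test's gold data):

    QueryTermVector qtv = new QueryTermVector("foo bar foo again foo bar go go go", new WhitespaceAnalyzer());
    String[] terms = qtv.getTerms();         // {"again", "bar", "foo", "go"} -- sorted lexicographically
    int[] freqs = qtv.getTermFrequencies();  // {1, 2, 3, 3} -- counts aligned with terms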

@@ -0,0 +1,261 @@
package org.apache.lucene.search;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (license header identical to the one reproduced in full above)
 * ==================================================================== */

import junit.framework.TestCase;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.English;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class TestTermVectors extends TestCase {
  private IndexSearcher searcher;
  private RAMDirectory directory = new RAMDirectory();

  public TestTermVectors(String s) {
    super(s);
  }

  public void setUp() throws Exception {
    IndexWriter writer
            = new IndexWriter(directory, new SimpleAnalyzer(), true);
    //writer.setUseCompoundFile(true);
    //writer.infoStream = System.out;
    for (int i = 0; i < 1000; i++) {
      Document doc = new Document();
      doc.add(Field.Text("field", English.intToEnglish(i), true));
      writer.addDocument(doc);
    }
    writer.close();
    searcher = new IndexSearcher(directory);
  }

  protected void tearDown() {

  }

  public void test() {
    assertTrue(searcher != null);
  }

  public void testTermVectors() {
    Query query = new TermQuery(new Term("field", "seventy"));
    try {
      Hits hits = searcher.search(query);
      assertEquals(100, hits.length());

      for (int i = 0; i < hits.length(); i++)
      {
        TermFreqVector[] vector = searcher.reader.getTermFreqVectors(hits.id(i));
        assertTrue(vector != null);
        assertTrue(vector.length == 1);
      }
      TermFreqVector[] vector = searcher.reader.getTermFreqVectors(hits.id(50));
      //System.out.println("Explain: " + searcher.explain(query, hits.id(50)));
      //System.out.println("Vector: " + vector[0].toString());
    } catch (IOException e) {
      assertTrue(false);
    }
  }

  public void testTermPositionVectors() {
    Query query = new TermQuery(new Term("field", "fifty"));
    try {
      Hits hits = searcher.search(query);
      assertEquals(100, hits.length());

      for (int i = 0; i < hits.length(); i++)
      {
        TermFreqVector[] vector = searcher.reader.getTermFreqVectors(hits.id(i));
        assertTrue(vector != null);
        assertTrue(vector.length == 1);
      }
    } catch (IOException e) {
      assertTrue(false);
    }
  }

  public void testKnownSetOfDocuments() {
    String[] termArray = {"eating", "chocolate", "in", "a", "computer", "lab", "grows", "old", "colored",
                          "with", "an"};
    String test1 = "eating chocolate in a computer lab"; //6 terms
    String test2 = "computer in a computer lab"; //5 terms
    String test3 = "a chocolate lab grows old"; //5 terms
    String test4 = "eating chocolate with a chocolate lab in an old chocolate colored computer lab"; //13 terms
    Map test4Map = new HashMap();
    test4Map.put("chocolate", new Integer(3));
    test4Map.put("lab", new Integer(2));
    test4Map.put("eating", new Integer(1));
    test4Map.put("computer", new Integer(1));
    test4Map.put("with", new Integer(1));
    test4Map.put("a", new Integer(1));
    test4Map.put("colored", new Integer(1));
    test4Map.put("in", new Integer(1));
    test4Map.put("an", new Integer(1));
    test4Map.put("old", new Integer(1));

    Document testDoc1 = new Document();
    setupDoc(testDoc1, test1);
    Document testDoc2 = new Document();
    setupDoc(testDoc2, test2);
    Document testDoc3 = new Document();
    setupDoc(testDoc3, test3);
    Document testDoc4 = new Document();
    setupDoc(testDoc4, test4);

    Directory dir = new RAMDirectory();

    try {
      IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
      assertTrue(writer != null);
      writer.addDocument(testDoc1);
      writer.addDocument(testDoc2);
      writer.addDocument(testDoc3);
      writer.addDocument(testDoc4);
      writer.close();
      IndexSearcher knownSearcher = new IndexSearcher(dir);
      TermEnum termEnum = knownSearcher.reader.terms();
      TermDocs termDocs = knownSearcher.reader.termDocs();
      //System.out.println("Terms: " + termEnum.size() + " Orig Len: " + termArray.length);

      Similarity sim = knownSearcher.getSimilarity();
      while (termEnum.next() == true)
      {
        Term term = termEnum.term();
        //System.out.println("Term: " + term);
        termDocs.seek(term);
        while (termDocs.next())
        {
          int docId = termDocs.doc();
          int freq = termDocs.freq();
          //System.out.println("Doc Id: " + docId + " freq " + freq);
          TermFreqVector vector = knownSearcher.reader.getTermFreqVector(docId, "field");
          float tf = sim.tf(freq);
          float idf = sim.idf(term, knownSearcher);
          //float qNorm = sim.queryNorm()
          //This is fine since we don't have stop words
          float lNorm = sim.lengthNorm("field", vector.getTerms().length);
          //float coord = sim.coord()
          //System.out.println("TF: " + tf + " IDF: " + idf + " LenNorm: " + lNorm);
          assertTrue(vector != null);
          String[] vTerms = vector.getTerms();
          int[] freqs = vector.getTermFrequencies();
          for (int i = 0; i < vTerms.length; i++)
          {
            if (term.text().equals(vTerms[i]) == true)
            {
              assertTrue(freqs[i] == freq);
            }
          }

        }
        //System.out.println("--------");
      }
      Query query = new TermQuery(new Term("field", "chocolate"));
      Hits hits = knownSearcher.search(query);
      //doc 3 should be the first hit b/c it is the shortest match
      assertTrue(hits.length() == 3);
      float score = hits.score(0);
      /*System.out.println("Hit 0: " + hits.id(0) + " Score: " + hits.score(0) + " String: " + hits.doc(0).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(0)));
      System.out.println("Hit 1: " + hits.id(1) + " Score: " + hits.score(1) + " String: " + hits.doc(1).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(1)));
      System.out.println("Hit 2: " + hits.id(2) + " Score: " + hits.score(2) + " String: " + hits.doc(2).toString());
      System.out.println("Explain: " + knownSearcher.explain(query, hits.id(2)));*/
      assertTrue(testDoc3.toString().equals(hits.doc(0).toString()));
      assertTrue(testDoc4.toString().equals(hits.doc(1).toString()));
      assertTrue(testDoc1.toString().equals(hits.doc(2).toString()));
      TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field");
      assertTrue(vector != null);
      //System.out.println("Vector: " + vector);
      String[] terms = vector.getTerms();
      int[] freqs = vector.getTermFrequencies();
      assertTrue(terms != null && terms.length == 10);
      for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        //System.out.println("Term: " + term);
        int freq = freqs[i];
        assertTrue(test4.indexOf(term) != -1);
        Integer freqInt = (Integer) test4Map.get(term);
        assertTrue(freqInt != null);
        assertTrue(freqInt.intValue() == freq);
      }
      knownSearcher.close();
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue(false);
    }


  }

  private void setupDoc(Document doc, String text)
  {
    doc.add(Field.Text("field", text, true));
    //System.out.println("Document: " + doc);
  }

}
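One detail worth calling out from the test above: term vectors are switched on per field through the new boolean argument on the Field factory methods, as setupDoc does. A minimal sketch (the field name "contents" and the text are illustrative):

    Document doc = new Document();
    //third argument true = store a term vector for this field
    doc.add(Field.Text("contents", "eating chocolate in a computer lab", true));
    writer.addDocument(doc);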

@@ -0,0 +1,88 @@
package org.apache.lucene.util;

/* ====================================================================
 * The Apache Software License, Version 1.1
 * (license header identical to the one reproduced in full above)
 * ==================================================================== */

import junit.framework.TestCase;

public class StringHelperTest extends TestCase {


  public StringHelperTest(String s) {
    super(s);
  }

  protected void setUp() {
  }

  protected void tearDown() {

  }

  public void testStringDifference() {
    String test1 = "test";
    String test2 = "testing";

    int result = StringHelper.stringDifference(test1, test2);
    assertTrue(result == 4);

    test2 = "foo";
    result = StringHelper.stringDifference(test1, test2);
    assertTrue(result == 0);

    test2 = "test";
    result = StringHelper.stringDifference(test1, test2);
    assertTrue(result == 4);
  }
}

@@ -14,7 +14,7 @@

    <p>
    This document defines the index file formats used
    in Lucene version 1.3.
    in Lucene version 1.4.
    </p>

    <p>

@@ -224,7 +224,11 @@
        multiplied into the score for hits on that field.
        </p>
    </li>

    <li><p>Term Vectors. For each field in each document, the term vector
    (sometimes called document vector) is stored. A term vector consists
    of the term text, term frequency and term position.
    </p>
    </li>
    <li><p>Deleted documents.
    An optional file indicating which documents are deleted.
    </p>

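To make the new item concrete, here is what a stored term vector holds for one field of one document. A hedged illustration (the document text, the field name "body", and the variable names are invented; the accessors are the ones exercised by the new tests, and the terms are shown in the sorted order the indexer supplies for an analyzed field):

    // Field "body" indexed from: "the quick brown fox jumps over the lazy dog"
    TermFreqVector v = reader.getTermFreqVector(docId, "body");
    String[] terms = v.getTerms();         // {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}
    int[] freqs = v.getTermFrequencies();  // { 1, 1, 1, 1, 1, 1, 1, 2 } -- "the" occurs twice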
@@ -804,9 +808,10 @@
      </p>

      <p>
        Currently only the low-order bit of FieldBits is used. It is
        one for
        indexed fields, and zero for non-indexed fields.
        The low-order bit is one for
        indexed fields, and zero for non-indexed fields. The second lowest-order
        bit is one for fields that have term vectors stored, and zero for fields
        without term vectors.
      </p>
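A minimal sketch of what that layout means for code reading the field data (the stream name is hypothetical; only the two bit positions come from the paragraph above):

    byte bits = fieldsStream.readByte();           // the FieldBits byte
    boolean indexed         = (bits & 0x01) != 0;  // low-order bit
    boolean storeTermVector = (bits & 0x02) != 0;  // second lowest-order bit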

      <p>

@@ -1113,6 +1118,52 @@
            </ol>

        </subsection>
        <subsection name="Term Vectors">
          Term Vector support is optional on a field-by-field basis. It consists of three
          files.
          <ol>
            <li>
              <p>The Document Index or .tvx file.</p>
              <p>This contains, for each document, a pointer to the document data in the Document
              (.tvd) file.
              </p>
              <p>DocumentIndex (.tvx) --> FormatVersion<DocumentPosition><sup>NumDocs</sup></p>
              <p>FormatVersion --> Int</p>
              <p>DocumentPosition --> UInt64</p>
              <p>This is used to find the position of the Document in the .tvd file.</p>
            </li>
            <li>
              <p>The Document or .tvd file.</p>
              <p>This contains, for each document, the number of fields, a list of the fields with
              term vector info and finally a list of pointers to the field information in the .tvf
              (Term Vector Fields) file.</p>
              <p>
                Document (.tvd) --> FormatVersion<NumFields, FieldNums, FieldPositions><sup>NumDocs</sup>
              </p>
              <p>FormatVersion --> Int</p>
              <p>NumFields --> VInt</p>
              <p>FieldNums --> <FieldNumDelta><sup>NumFields</sup></p>
              <p>FieldNumDelta --> VInt</p>
              <p>FieldPositions --> <FieldPosition><sup>NumFields</sup></p>
              <p>FieldPosition --> VLong</p>
              <p>The .tvd file is used to map out the fields that have term vectors stored and
              where the field information is in the .tvf file.</p>
            </li>
            <li>
              <p>The Field or .tvf file.</p>
              <p>This file contains, for each field that has a term vector stored, a list of
              the terms and their frequencies.</p>
              <p>Field (.tvf) --> FormatVersion<NumTerms, NumDistinct, TermFreqs><sup>NumFields</sup></p>
              <p>FormatVersion --> Int</p>
              <p>NumTerms --> VInt</p>
              <p>NumDistinct --> VInt -- Future Use</p>
              <p>TermFreqs --> <TermText, TermFreq><sup>NumTerms</sup></p>
              <p>TermText --> String</p>
              <p>TermFreq --> VInt</p>
            </li>
          </ol>
        </subsection>
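Taken together, the three files make lookup mechanical: the .tvx entry sits at a fixed offset, and everything after that is pointer-chasing. A hedged sketch of a reader following that path, not code from this commit (the tvx and tvd handles are hypothetical stream objects; the offsets follow from the sizes above, Int = 4 bytes and UInt64 = 8 bytes, and this assumes DocumentPosition points directly at the document's .tvd entry):

    // 1. One DocumentPosition slot per document, after the 4-byte FormatVersion.
    tvx.seek(4L + 8L * docNum);
    long tvdPosition = tvx.readLong();

    // 2. The .tvd entry says how many fields stored vectors, which fields
    //    they are (FieldNumDelta VInts), and where each field's data starts
    //    in the .tvf file (FieldPosition VLongs).
    tvd.seek(tvdPosition);
    int numFields = tvd.readVInt();
    // ... decode FieldNums and FieldPositions ...

    // 3. Seek the .tvf handle to the chosen FieldPosition and read that
    //    field's <TermText, TermFreq> pairs.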

        <subsection name="Deleted Documents">
