diff --git a/CHANGES.txt b/CHANGES.txt
index 0bb9ffe099d..6100929aacd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -47,7 +47,16 @@ API Changes
(Chris Hostetter, Otis Gospodnetic)
8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory
- to enable extensibility of these classes.
+ to enable extensibility of these classes. (Michael Busch)
+
+ 9. LUCENE-580: Added the public method reset() to TokenStream. This method does
+ nothing by default, but may be overridden by subclasses to support consuming
+ the TokenStream more than once. (Michael Busch)
+
+10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as an
+ argument; the value is exposed via tokenStreamValue(). This avoids the need
+ for "dummy analyzers" for pre-analyzed fields. (Karl Wettin, Michael Busch)
+
Bug fixes
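
For reviewers, a minimal end-to-end sketch of how the two LUCENE-580 changes combine (the toy token stream, field name, and directory are illustrative, not part of the patch):

```java
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class PreAnalyzedFieldExample {
  public static void main(String[] args) throws Exception {
    // A pre-analyzed stream; no analyzer is consulted for this field.
    TokenStream stream = new CachingTokenFilter(new TokenStream() {
      private final String[] terms = {"term1", "term2"};
      private int index = 0;
      public Token next() {
        return index < terms.length ? new Token(terms[index++], 0, 0) : null;
      }
    });

    Document doc = new Document();
    doc.add(new Field("preanalyzed", stream)); // new Field(String, TokenStream)

    IndexWriter writer = new IndexWriter(new RAMDirectory(), new WhitespaceAnalyzer());
    writer.addDocument(doc); // DocumentWriter resets and consumes the stream
    writer.close();
  }
}
```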
diff --git a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
new file mode 100644
index 00000000000..c49729eca22
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
@@ -0,0 +1,68 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * This class can be used if the Tokens of a TokenStream
+ * are intended to be consumed more than once. It caches
+ * all Tokens locally in a List.
+ *
+ * CachingTokenFilter implements the optional method
+ * {@link TokenStream#reset()}, which repositions the
+ * stream to the first Token.
+ *
+ */
+public class CachingTokenFilter extends TokenFilter {
+ private List cache;
+ private int index;
+
+ public CachingTokenFilter(TokenStream input) {
+ super(input);
+ }
+
+ public Token next() throws IOException {
+ if (cache == null) {
+ // fill cache lazily
+ cache = new LinkedList();
+ fillCache();
+ }
+
+ if (index == cache.size()) {
+ // the cache is exhausted, return null
+ return null;
+ }
+
+ return (Token) cache.get(index++);
+ }
+
+ public void reset() throws IOException {
+ index = 0;
+ }
+
+ private void fillCache() throws IOException {
+ Token token;
+ while ( (token = input.next()) != null) {
+ cache.add(token);
+ }
+ }
+
+}
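
A usage sketch for the new filter (not part of the patch): the cache fills lazily on the first pass, and reset() replays the same Tokens without pulling from the wrapped input again.

```java
import java.io.StringReader;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class CachingTokenFilterExample {
  public static void main(String[] args) throws Exception {
    TokenStream stream = new CachingTokenFilter(
        new WhitespaceTokenizer(new StringReader("foo bar foo")));

    // First pass: Tokens are pulled from the tokenizer and cached.
    for (Token t = stream.next(); t != null; t = stream.next())
      System.out.println(t.termText());

    stream.reset(); // reposition to the first cached Token

    // Second pass: Tokens come from the cache, even though the
    // underlying tokenizer is already exhausted.
    for (Token t = stream.next(); t != null; t = stream.next())
      System.out.println(t.termText());
  }
}
```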
diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java
index a1f97b0ecbf..98ba85a1a4a 100644
--- a/src/java/org/apache/lucene/analysis/TokenStream.java
+++ b/src/java/org/apache/lucene/analysis/TokenStream.java
@@ -35,6 +35,15 @@ public abstract class TokenStream {
/** Returns the next token in the stream, or null at EOS. */
public abstract Token next() throws IOException;
+ /** Resets this stream to the beginning. This is an
+ * optional operation, so subclasses may or may not
+ * implement this method. reset() is not needed for
+ * the standard indexing process. However, if the Tokens
+ * of a TokenStream are intended to be consumed more than
+ * once, it is necessary to implement reset().
+ */
+ public void reset() throws IOException {}
+
/** Releases resources associated with this stream. */
public void close() throws IOException {}
}
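
A sketch of a subclass that supports the new contract (class name hypothetical, not part of the patch): streams backed by in-memory data can rewind cheaply, while Reader-backed streams typically keep the inherited no-op.

```java
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Hypothetical example: a stream over a fixed term array.
public class StringArrayTokenStream extends TokenStream {
  private final String[] terms;
  private int index = 0;

  public StringArrayTokenStream(String[] terms) {
    this.terms = terms;
  }

  public Token next() {
    return index < terms.length ? new Token(terms[index++], 0, 0) : null;
  }

  public void reset() {
    index = 0; // reposition to the first Token; enables repeated consumption
  }
}
```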
diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java
index 840d66e8b79..5a1b4c3e581 100644
--- a/src/java/org/apache/lucene/document/Field.java
+++ b/src/java/org/apache/lucene/document/Field.java
@@ -17,6 +17,7 @@ package org.apache.lucene.document;
* limitations under the License.
*/
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexWriter; // for javadoc
import org.apache.lucene.util.Parameter;
@@ -134,21 +135,26 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
}
- /** The value of the field as a String, or null. If null, the Reader value
- * or binary value is used. Exactly one of stringValue(), readerValue(), and
- * binaryValue() must be set. */
+ /** The value of the field as a String, or null. If null, the Reader value,
+ * binary value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
- /** The value of the field as a Reader, or null. If null, the String value
- * or binary value is used. Exactly one of stringValue(), readerValue(),
- * and binaryValue() must be set. */
+ /** The value of the field as a Reader, or null. If null, the String value,
+ * binary value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
- /** The value of the field in Binary, or null. If null, the Reader or
- * String value is used. Exactly one of stringValue(), readerValue() and
- * binaryValue() must be set. */
+ /** The value of the field in Binary, or null. If null, the Reader value,
+ * String value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public byte[] binaryValue() { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }
+ /** The value of the field as a TokenStream, or null. If null, the Reader value,
+ * String value, or binary value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
+
/**
* Create a field by specifying its name, value and how it will
* be saved in the index. Term vectors will not be stored in the index.
@@ -280,6 +286,54 @@ public final class Field extends AbstractField implements Fieldable, Serializabl
setStoreTermVector(termVector);
}
+
+ /**
+ * Create a tokenized and indexed field that is not stored. Term vectors will
+ * not be stored. This is useful for pre-analyzed fields.
+ * The TokenStream is only read when the Document is added to the index,
+ * i.e. do not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+ * has been called.
+ *
+ * @param name The name of the field
+ * @param tokenStream The TokenStream with the content
+ * @throws NullPointerException if name or tokenStream is null
+ */
+ public Field(String name, TokenStream tokenStream) {
+ this(name, tokenStream, TermVector.NO);
+ }
+
+ /**
+ * Create a tokenized and indexed field that is not stored, optionally with
+ * storing term vectors. This is useful for pre-analyzed fields.
+ * The TokenStream is only read when the Document is added to the index,
+ * i.e. do not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+ * has been called.
+ *
+ * @param name The name of the field
+ * @param tokenStream The TokenStream with the content
+ * @param termVector Whether term vector should be stored
+ * @throws NullPointerException if name or tokenStream is null
+ */
+ public Field(String name, TokenStream tokenStream, TermVector termVector) {
+ if (name == null)
+ throw new NullPointerException("name cannot be null");
+ if (tokenStream == null)
+ throw new NullPointerException("tokenStream cannot be null");
+
+ this.name = name.intern(); // field names are interned
+ this.fieldsData = tokenStream;
+
+ this.isStored = false;
+ this.isCompressed = false;
+
+ this.isIndexed = true;
+ this.isTokenized = true;
+
+ this.isBinary = false;
+
+ setStoreTermVector(termVector);
+ }
+
/**
* Create a stored field with binary value. Optionally the value may be compressed.
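
A sketch of how the new constructors are meant to be used (field name and inputs are illustrative): the TokenStream variant is always indexed and tokenized but never stored, so callers who also need the raw text must add a stored sibling field.

```java
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class PreAnalyzedDocumentFactory {
  // tokenStream and originalText are assumed inputs for this illustration.
  static Document build(TokenStream tokenStream, String originalText) {
    Document doc = new Document();

    // Pre-analyzed field: indexed and tokenized, never stored.
    doc.add(new Field("contents", tokenStream, Field.TermVector.WITH_POSITIONS));

    // Keep a stored, unindexed copy of the raw text under the same name.
    doc.add(new Field("contents", originalText, Field.Store.YES, Field.Index.NO));

    return doc;
  }
}
```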
diff --git a/src/java/org/apache/lucene/document/Fieldable.java b/src/java/org/apache/lucene/document/Fieldable.java
index 58494382b26..dba02262b69 100755
--- a/src/java/org/apache/lucene/document/Fieldable.java
+++ b/src/java/org/apache/lucene/document/Fieldable.java
@@ -19,6 +19,8 @@ package org.apache.lucene.document;
import java.io.Reader;
import java.io.Serializable;
+import org.apache.lucene.analysis.TokenStream;
+
/**
* Synonymous with {@link Field}.
*
@@ -60,20 +62,25 @@ public interface Fieldable extends Serializable {
*/
String name();
- /** The value of the field as a String, or null. If null, the Reader value
- * or binary value is used. Exactly one of stringValue(), readerValue(), and
- * binaryValue() must be set. */
- String stringValue();
-
- /** The value of the field as a Reader, or null. If null, the String value
- * or binary value is used. Exactly one of stringValue(), readerValue(),
- * and binaryValue() must be set. */
- Reader readerValue();
-
- /** The value of the field in Binary, or null. If null, the Reader or
- * String value is used. Exactly one of stringValue(), readerValue() and
- * binaryValue() must be set. */
- byte[] binaryValue();
+ /** The value of the field as a String, or null. If null, the Reader value,
+ * binary value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ String stringValue();
+
+ /** The value of the field as a Reader, or null. If null, the String value,
+ * binary value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ Reader readerValue();
+
+ /** The value of the field in Binary, or null. If null, the Reader value,
+ * String value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ byte[] binaryValue();
+
+ /** The value of the field as a TokenStream, or null. If null, the Reader value,
+ * String value, or binary value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ TokenStream tokenStreamValue();
/** True iff the value of the field is to be stored in the index for return
with search hits. It is an error for this to be true if a field is
diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java
index 6d482dc5c7c..45ed02f7170 100644
--- a/src/java/org/apache/lucene/index/DocumentWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentWriter.java
@@ -162,18 +162,28 @@ final class DocumentWriter {
offset += stringValue.length();
length++;
} else
- {
- Reader reader; // find or make Reader
- if (field.readerValue() != null)
- reader = field.readerValue();
- else if (field.stringValue() != null)
- reader = new StringReader(field.stringValue());
- else
- throw new IllegalArgumentException
- ("field must have either String or Reader value");
-
- // Tokenize field and add to postingTable
- TokenStream stream = analyzer.tokenStream(fieldName, reader);
+ { // tokenized field
+ TokenStream stream = field.tokenStreamValue();
+
+ // the field does not have a TokenStream,
+ // so we have to obtain one from the analyzer
+ if (stream == null) {
+ Reader reader; // find or make Reader
+ if (field.readerValue() != null)
+ reader = field.readerValue();
+ else if (field.stringValue() != null)
+ reader = new StringReader(field.stringValue());
+ else
+ throw new IllegalArgumentException
+ ("field must have either String, Reader or TokenStream value");
+
+ // Tokenize field and add to postingTable
+ stream = analyzer.tokenStream(fieldName, reader);
+ }
+
+ // reset the TokenStream to the first token
+ stream.reset();
+
try {
Token lastToken = null;
for (Token t = stream.next(); t != null; t = stream.next()) {
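
Condensed from the hunk above, the consumer-visible lookup order is now tokenStreamValue() → readerValue() → stringValue(), with an unconditional reset() before the first next(). A standalone sketch of that contract (the helper class and method are hypothetical, for illustration only):

```java
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Fieldable;

public class TokenStreamResolution {
  static TokenStream resolve(Fieldable field, String fieldName, Analyzer analyzer)
      throws IOException {
    TokenStream stream = field.tokenStreamValue(); // 1) pre-analyzed wins
    if (stream == null) {
      Reader reader; // 2) fall back to a Reader, 3) then to a String
      if (field.readerValue() != null)
        reader = field.readerValue();
      else if (field.stringValue() != null)
        reader = new StringReader(field.stringValue());
      else
        throw new IllegalArgumentException(
            "field must have either String, Reader or TokenStream value");
      stream = analyzer.tokenStream(fieldName, reader);
    }
    stream.reset(); // no-op by default; rewinds pre-consumed streams
    return stream;
  }
}
```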
diff --git a/src/java/org/apache/lucene/index/FieldsReader.java b/src/java/org/apache/lucene/index/FieldsReader.java
index 6a56883c38a..6cd9667b669 100644
--- a/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/src/java/org/apache/lucene/index/FieldsReader.java
@@ -17,6 +17,7 @@ package org.apache.lucene.index;
* limitations under the License.
*/
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -331,11 +332,9 @@ final class FieldsReader {
return localFieldsStream;
}
- /**
- * The value of the field in Binary, or null. If null, the Reader or
- * String value is used. Exactly one of stringValue(), readerValue() and
- * binaryValue() must be set.
- */
+ /** The value of the field in Binary, or null. If null, the Reader value,
+ * String value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public byte[] binaryValue() {
ensureOpen();
if (fieldsData == null) {
@@ -358,21 +357,26 @@ final class FieldsReader {
return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
}
- /**
- * The value of the field as a Reader, or null. If null, the String value
- * or binary value is used. Exactly one of stringValue(), readerValue(),
- * and binaryValue() must be set.
- */
+ /** The value of the field as a Reader, or null. If null, the String value,
+ * binary value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public Reader readerValue() {
ensureOpen();
return fieldsData instanceof Reader ? (Reader) fieldsData : null;
}
- /**
- * The value of the field as a String, or null. If null, the Reader value
- * or binary value is used. Exactly one of stringValue(), readerValue(), and
- * binaryValue() must be set.
- */
+ /** The value of the field as a TokenStream, or null. If null, the Reader value,
+ * String value, or binary value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+ public TokenStream tokenStreamValue() {
+ ensureOpen();
+ return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
+ }
+
+ /** The value of the field as a String, or null. If null, the Reader value,
+ * binary value, or TokenStream value is used. Exactly one of stringValue(),
+ * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public String stringValue() {
ensureOpen();
if (fieldsData == null) {
@@ -462,6 +466,11 @@ final class FieldsReader {
public byte[] binaryValue() {
return (byte[]) this.fieldsData;
}
+
+ public TokenStream tokenStreamValue() {
+ // not needed for merge
+ return null;
+ }
public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
this.isStored = true;
diff --git a/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
new file mode 100644
index 00000000000..94a17313343
--- /dev/null
+++ b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
@@ -0,0 +1,103 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestCachingTokenFilter extends TestCase {
+ private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
+
+ public void testCaching() throws IOException {
+ Directory dir = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());
+ Document doc = new Document();
+ TokenStream stream = new TokenStream() {
+ private int index = 0;
+
+ public Token next() throws IOException {
+ if (index == tokens.length) {
+ return null;
+ } else {
+ return new Token(tokens[index++], 0, 0);
+ }
+ }
+
+ };
+
+ stream = new CachingTokenFilter(stream);
+
+ doc.add(new Field("preanalyzed", stream, TermVector.NO));
+
+ // 1) we consume all tokens twice before we add the doc to the index
+ checkTokens(stream);
+ stream.reset();
+ checkTokens(stream);
+
+ // 2) now add the document to the index and verify that all tokens are indexed.
+ // Don't reset the stream here; the DocumentWriter should do that implicitly.
+ writer.addDocument(doc);
+ writer.close();
+
+ IndexReader reader = IndexReader.open(dir);
+ TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
+ assertTrue(termPositions.next());
+ assertEquals(1, termPositions.freq());
+ assertEquals(0, termPositions.nextPosition());
+
+ termPositions.seek(new Term("preanalyzed", "term2"));
+ assertTrue(termPositions.next());
+ assertEquals(2, termPositions.freq());
+ assertEquals(1, termPositions.nextPosition());
+ assertEquals(3, termPositions.nextPosition());
+
+ termPositions.seek(new Term("preanalyzed", "term3"));
+ assertTrue(termPositions.next());
+ assertEquals(1, termPositions.freq());
+ assertEquals(2, termPositions.nextPosition());
+ reader.close();
+
+ // 3) reset stream and consume tokens again
+ stream.reset();
+ checkTokens(stream);
+ }
+
+ private void checkTokens(TokenStream stream) throws IOException {
+ int count = 0;
+ Token token;
+ while ((token = stream.next()) != null) {
+ assertTrue(count < tokens.length);
+ assertEquals(tokens[count], token.termText);
+ count++;
+ }
+
+ assertEquals(tokens.length, count);
+ }
+}
diff --git a/src/test/org/apache/lucene/index/TestDocumentWriter.java b/src/test/org/apache/lucene/index/TestDocumentWriter.java
index 9279c7ff039..df3560dfa46 100644
--- a/src/test/org/apache/lucene/index/TestDocumentWriter.java
+++ b/src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -19,10 +19,13 @@ package org.apache.lucene.index;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.*;
+import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;
@@ -124,4 +127,45 @@ public class TestDocumentWriter extends TestCase {
assertEquals(0, termPositions.nextPosition());
assertEquals(502, termPositions.nextPosition());
}
+
+ public void testPreAnalyzedField() throws IOException {
+ Similarity similarity = Similarity.getDefault();
+ DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
+ Document doc = new Document();
+
+ doc.add(new Field("preanalyzed", new TokenStream() {
+ private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
+ private int index = 0;
+
+ public Token next() throws IOException {
+ if (index == tokens.length) {
+ return null;
+ } else {
+ return new Token(tokens[index++], 0, 0);
+ }
+ }
+
+ }, TermVector.NO));
+
+ String segName = "test";
+ writer.addDocument(segName, doc);
+ SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+
+ TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
+ assertTrue(termPositions.next());
+ assertEquals(1, termPositions.freq());
+ assertEquals(0, termPositions.nextPosition());
+
+ termPositions.seek(new Term("preanalyzed", "term2"));
+ assertTrue(termPositions.next());
+ assertEquals(2, termPositions.freq());
+ assertEquals(1, termPositions.nextPosition());
+ assertEquals(3, termPositions.nextPosition());
+
+ termPositions.seek(new Term("preanalyzed", "term3"));
+ assertTrue(termPositions.next());
+ assertEquals(1, termPositions.freq());
+ assertEquals(2, termPositions.nextPosition());
+ }
}