LUCENE-580:

- Added the public method reset() to TokenStream.
- Added a new constructor to Field that takes a TokenStream as argument, useful for pre-analyzed fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@533549 13f79535-47bb-0310-9956-ffa450edef68
Michael Busch 2007-04-29 19:26:11 +00:00
parent 3c60a00b69
commit 80c0f267f6
9 changed files with 364 additions and 51 deletions
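
Taken together, these changes let an application hand Lucene a ready-made token
sequence instead of a String or Reader. A minimal usage sketch, assuming the
post-commit API and an open IndexWriter named writer (the field name "contents"
and the WhitespaceTokenizer input are illustrative, not part of this commit):

import java.io.StringReader;

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

// Wrap the tokenizer in CachingTokenFilter so the tokens can be
// consumed more than once; its reset() rewinds to the first Token.
TokenStream stream = new CachingTokenFilter(
    new WhitespaceTokenizer(new StringReader("some pre-analyzed text")));

Document doc = new Document();
doc.add(new Field("contents", stream));  // the new TokenStream-based constructor
writer.addDocument(doc);                 // the stream is read here, not before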

CHANGES.txt

@@ -47,7 +47,16 @@ API Changes

    (Chris Hostetter, Otis Gospodnetic)

 8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory
    to enable extensibility of these classes. (Michael Busch)

 9. LUCENE-580: Added the public method reset() to TokenStream. This method does
    nothing by default, but may be overridden by subclasses to support consuming
    the TokenStream more than once. (Michael Busch)

10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as
    argument, available as tokenStreamValue(). This is useful to avoid the need
    for "dummy analyzers" for pre-analyzed fields. (Karl Wettin, Michael Busch)

Bug fixes

src/java/org/apache/lucene/analysis/CachingTokenFilter.java

@@ -0,0 +1,68 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * This class can be used if the Tokens of a TokenStream
 * are intended to be consumed more than once. It caches
 * all Tokens locally in a List.
 *
 * CachingTokenFilter implements the optional method
 * {@link TokenStream#reset()}, which repositions the
 * stream to the first Token.
 */
public class CachingTokenFilter extends TokenFilter {
  private List cache;
  private int index;

  public CachingTokenFilter(TokenStream input) {
    super(input);
  }

  public Token next() throws IOException {
    if (cache == null) {
      // fill cache lazily
      cache = new LinkedList();
      fillCache();
    }

    if (index == cache.size()) {
      // the cache is exhausted, return null
      return null;
    }

    return (Token) cache.get(index++);
  }

  public void reset() throws IOException {
    index = 0;
  }

  private void fillCache() throws IOException {
    Token token;
    while ( (token = input.next()) != null) {
      cache.add(token);
    }
  }
}
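
A hedged sketch of the two-pass behavior this enables; LowerCaseTokenizer and
the input text are arbitrary examples, not part of this commit:

import java.io.StringReader;

// First pass fills the cache from the wrapped stream; reset() then
// replays the cached Tokens without touching the input again.
TokenStream cached = new CachingTokenFilter(
    new LowerCaseTokenizer(new StringReader("Some Text")));

for (Token t = cached.next(); t != null; t = cached.next()) {
  System.out.println(t.termText());   // first pass: caches as it goes
}
cached.reset();                        // reposition to the first cached Token
for (Token t = cached.next(); t != null; t = cached.next()) {
  System.out.println(t.termText());   // second pass: served from the cache
}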

src/java/org/apache/lucene/analysis/TokenStream.java

@@ -35,6 +35,15 @@ public abstract class TokenStream {

  /** Returns the next token in the stream, or null at EOS. */
  public abstract Token next() throws IOException;

  /** Resets this stream to the beginning. This is an
   * optional operation, so subclasses may or may not
   * implement this method. reset() is not needed for
   * the standard indexing process. However, if the Tokens
   * of a TokenStream are intended to be consumed more than
   * once, it is necessary to implement reset().
   */
  public void reset() throws IOException {}

  /** Releases resources associated with this stream. */
  public void close() throws IOException {}
}
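
To make the contract concrete, a hedged sketch (not part of the patch) of a
subclass that supports reset() by rewinding an in-memory position:

import java.io.IOException;

// Illustrative only: a resettable stream over pre-computed terms.
class ArrayTokenStream extends TokenStream {
  private final String[] terms;
  private int index = 0;

  ArrayTokenStream(String[] terms) {
    this.terms = terms;
  }

  public Token next() throws IOException {
    if (index == terms.length)
      return null;                     // EOS
    return new Token(terms[index++], 0, 0);
  }

  public void reset() throws IOException {
    index = 0;                         // rewind to the first Token
  }
}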

src/java/org/apache/lucene/document/Field.java

@@ -17,6 +17,7 @@ package org.apache.lucene.document;
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexWriter;   // for javadoc
import org.apache.lucene.util.Parameter;

@@ -134,21 +135,26 @@ public final class Field extends AbstractField implements Fieldable, Serializable
  }

  /** The value of the field as a String, or null. If null, the Reader value,
   * binary value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }

  /** The value of the field as a Reader, or null. If null, the String value,
   * binary value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }

  /** The value of the field in Binary, or null. If null, the Reader value,
   * String value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public byte[] binaryValue() { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }

  /** The value of the field as a TokenStream, or null. If null, the Reader value,
   * String value, or binary value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }

  /**
   * Create a field by specifying its name, value and how it will
   * be saved in the index. Term vectors will not be stored in the index.

@@ -280,6 +286,54 @@
    setStoreTermVector(termVector);
  }
  /**
   * Create a tokenized and indexed field that is not stored. Term vectors will
   * not be stored. This is useful for pre-analyzed fields.
   * The TokenStream is read only when the Document is added to the index,
   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
   * has been called.
   *
   * @param name The name of the field
   * @param tokenStream The TokenStream with the content
   * @throws NullPointerException if name or tokenStream is <code>null</code>
   */
  public Field(String name, TokenStream tokenStream) {
    this(name, tokenStream, TermVector.NO);
  }

  /**
   * Create a tokenized and indexed field that is not stored, optionally with
   * storing term vectors. This is useful for pre-analyzed fields.
   * The TokenStream is read only when the Document is added to the index,
   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
   * has been called.
   *
   * @param name The name of the field
   * @param tokenStream The TokenStream with the content
   * @param termVector Whether term vector should be stored
   * @throws NullPointerException if name or tokenStream is <code>null</code>
   */
  public Field(String name, TokenStream tokenStream, TermVector termVector) {
    if (name == null)
      throw new NullPointerException("name cannot be null");
    if (tokenStream == null)
      throw new NullPointerException("tokenStream cannot be null");

    this.name = name.intern();        // field names are interned
    this.fieldsData = tokenStream;

    this.isStored = false;
    this.isCompressed = false;
    this.isIndexed = true;
    this.isTokenized = true;
    this.isBinary = false;

    setStoreTermVector(termVector);
  }

  /**
   * Create a stored field with binary value. Optionally the value may be compressed.
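
A usage sketch for the TermVector-taking constructor, assuming an open
IndexWriter named writer; the field name and input text are illustrative:

TokenStream stream = new CachingTokenFilter(
    new WhitespaceTokenizer(new StringReader("quick brown fox")));

Document doc = new Document();
// Index the pre-analyzed tokens and additionally store term vectors
// with positions and offsets for this field.
doc.add(new Field("body", stream, Field.TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc);   // don't close the stream before this call returns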

src/java/org/apache/lucene/document/Fieldable.java

@@ -19,6 +19,8 @@ package org.apache.lucene.document;

import java.io.Reader;
import java.io.Serializable;

import org.apache.lucene.analysis.TokenStream;

/**
 * Synonymous with {@link Field}.
 *

@@ -60,20 +62,25 @@ public interface Fieldable extends Serializable {
   */
  String name();

  /** The value of the field as a String, or null. If null, the Reader value,
   * binary value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public String stringValue();

  /** The value of the field as a Reader, or null. If null, the String value,
   * binary value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public Reader readerValue();

  /** The value of the field in Binary, or null. If null, the Reader value,
   * String value, or TokenStream value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public byte[] binaryValue();

  /** The value of the field as a TokenStream, or null. If null, the Reader value,
   * String value, or binary value is used. Exactly one of stringValue(),
   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
  public TokenStream tokenStreamValue();

  /** True iff the value of the field is to be stored in the index for return
      with search hits. It is an error for this to be true if a field is

src/java/org/apache/lucene/index/DocumentWriter.java

@@ -162,18 +162,28 @@ final class DocumentWriter {
        offset += stringValue.length();
        length++;
      } else
      { // tokenized field
        TokenStream stream = field.tokenStreamValue();

        // the field does not have a TokenStream,
        // so we have to obtain one from the analyzer
        if (stream == null) {
          Reader reader;          // find or make Reader
          if (field.readerValue() != null)
            reader = field.readerValue();
          else if (field.stringValue() != null)
            reader = new StringReader(field.stringValue());
          else
            throw new IllegalArgumentException
              ("field must have either String or Reader value");

          // Tokenize field and add to postingTable
          stream = analyzer.tokenStream(fieldName, reader);
        }

        // reset the TokenStream to the first token
        stream.reset();

        try {
          Token lastToken = null;
          for (Token t = stream.next(); t != null; t = stream.next()) {

src/java/org/apache/lucene/index/FieldsReader.java

@@ -17,6 +17,7 @@ package org.apache.lucene.index;
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;

@@ -331,11 +332,9 @@ final class FieldsReader {
      return localFieldsStream;
    }

    /** The value of the field in Binary, or null. If null, the Reader value,
     * String value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public byte[] binaryValue() {
      ensureOpen();
      if (fieldsData == null) {

@@ -358,21 +357,26 @@ final class FieldsReader {
      return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
    }

    /** The value of the field as a Reader, or null. If null, the String value,
     * binary value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public Reader readerValue() {
      ensureOpen();
      return fieldsData instanceof Reader ? (Reader) fieldsData : null;
    }

    /** The value of the field as a TokenStream, or null. If null, the Reader value,
     * String value, or binary value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public TokenStream tokenStreamValue() {
      ensureOpen();
      return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
    }

    /** The value of the field as a String, or null. If null, the Reader value,
     * binary value, or TokenStream value is used. Exactly one of stringValue(),
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public String stringValue() {
      ensureOpen();
      if (fieldsData == null) {

@@ -462,6 +466,11 @@ final class FieldsReader {
      public byte[] binaryValue() {
        return (byte[]) this.fieldsData;
      }

      public TokenStream tokenStreamValue() {
        // not needed for merge
        return null;
      }

      public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
        this.isStored = true;

src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java

@@ -0,0 +1,103 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import junit.framework.TestCase;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TestCachingTokenFilter extends TestCase {
  private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};

  public void testCaching() throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());
    Document doc = new Document();
    TokenStream stream = new TokenStream() {
      private int index = 0;

      public Token next() throws IOException {
        if (index == tokens.length) {
          return null;
        } else {
          return new Token(tokens[index++], 0, 0);
        }
      }
    };

    stream = new CachingTokenFilter(stream);

    doc.add(new Field("preanalyzed", stream, TermVector.NO));

    // 1) we consume all tokens twice before we add the doc to the index
    checkTokens(stream);
    stream.reset();
    checkTokens(stream);

    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term2"));
    assertTrue(termPositions.next());
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term3"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());

    reader.close();

    // 3) reset stream and consume tokens again
    stream.reset();
    checkTokens(stream);
  }

  private void checkTokens(TokenStream stream) throws IOException {
    int count = 0;
    Token token;
    while ((token = stream.next()) != null) {
      assertTrue(count < tokens.length);
      assertEquals(tokens[count], token.termText);
      count++;
    }
    assertEquals(tokens.length, count);
  }
}

src/test/org/apache/lucene/index/TestDocumentWriter.java

@@ -19,10 +19,13 @@ package org.apache.lucene.index;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;

@@ -124,4 +127,45 @@ public class TestDocumentWriter extends TestCase {
    assertEquals(0, termPositions.nextPosition());
    assertEquals(502, termPositions.nextPosition());
  }

  public void testPreAnalyzedField() throws IOException {
    Similarity similarity = Similarity.getDefault();
    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
    Document doc = new Document();

    doc.add(new Field("preanalyzed", new TokenStream() {
      private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
      private int index = 0;

      public Token next() throws IOException {
        if (index == tokens.length) {
          return null;
        } else {
          return new Token(tokens[index++], 0, 0);
        }
      }
    }, TermVector.NO));

    String segName = "test";
    writer.addDocument(segName, doc);

    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));

    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term2"));
    assertTrue(termPositions.next());
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term3"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
  }
}