LUCENE-580:

- Added the public method reset() to TokenStream.
- Added a new constructor to Field that takes a TokenStream as argument, useful for pre-analyzed fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@533549 13f79535-47bb-0310-9956-ffa450edef68
Michael Busch 2007-04-29 19:26:11 +00:00
parent 3c60a00b69
commit 80c0f267f6
9 changed files with 364 additions and 51 deletions

CHANGES.txt

@@ -47,7 +47,16 @@ API Changes
(Chris Hostetter, Otis Gospodnetic)
8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory
to enable extensibility of these classes.
to enable extensibility of these classes. (Michael Busch)
9. LUCENE-580: Added the public method reset() to TokenStream. This method does
nothing by default, but may be overridden by subclasses to support consuming
the TokenStream more than once. (Michael Busch)
10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as
argument, available via tokenStreamValue(). This is useful to avoid the need
for "dummy analyzers" for pre-analyzed fields. (Karl Wettin, Michael Busch)
Bug fixes
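
A minimal sketch of how the two entries above combine (the class name and setup are illustrative; only the Field(String, TokenStream) constructor and TokenStream.reset() come from this change):

import java.io.IOException;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class PreAnalyzedFieldSketch {
  public static void main(String[] args) throws IOException {
    // Tokens produced by some external analysis pipeline
    final String[] terms = {"term1", "term2", "term3"};
    TokenStream stream = new TokenStream() {
      private int index = 0;
      public Token next() throws IOException {
        return index == terms.length ? null : new Token(terms[index++], 0, 0);
      }
    };

    Document doc = new Document();
    // no Analyzer runs over this field; the Tokens are indexed as-is
    doc.add(new Field("preanalyzed", stream));

    IndexWriter writer = new IndexWriter(new RAMDirectory(), new SimpleAnalyzer());
    writer.addDocument(doc); // the TokenStream is consumed here
    writer.close();
  }
}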

src/java/org/apache/lucene/analysis/CachingTokenFilter.java

@@ -0,0 +1,68 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
/**
* This class can be used if the Tokens of a TokenStream
* are intended to be consumed more than once. It caches
* all Tokens locally in a List.
*
* CachingTokenFilter implements the optional method
* {@link TokenStream#reset()}, which repositions the
* stream to the first Token.
*
*/
public class CachingTokenFilter extends TokenFilter {
private List cache;
private int index;
public CachingTokenFilter(TokenStream input) {
super(input);
}
public Token next() throws IOException {
if (cache == null) {
// fill cache lazily
cache = new LinkedList();
fillCache();
}
if (index == cache.size()) {
// the cache is exhausted, return null
return null;
}
return (Token) cache.get(index++);
}
public void reset() throws IOException {
index = 0;
}
private void fillCache() throws IOException {
Token token;
while ( (token = input.next()) != null) {
cache.add(token);
}
}
}
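
A quick usage sketch (illustrative, not part of the commit): wrapping any TokenStream in a CachingTokenFilter makes it consumable more than once via reset().

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class CachingSketch {
  public static void main(String[] args) throws IOException {
    TokenStream stream = new CachingTokenFilter(
        new WhitespaceTokenizer(new StringReader("term1 term2 term3")));

    // first pass fills the cache lazily
    for (Token t = stream.next(); t != null; t = stream.next())
      System.out.println("pass 1: " + t.termText());

    stream.reset(); // reposition to the first cached Token

    // second pass is served entirely from the cache
    for (Token t = stream.next(); t != null; t = stream.next())
      System.out.println("pass 2: " + t.termText());
  }
}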

src/java/org/apache/lucene/analysis/TokenStream.java

@@ -35,6 +35,15 @@ public abstract class TokenStream {
/** Returns the next token in the stream, or null at EOS. */
public abstract Token next() throws IOException;
/** Resets this stream to the beginning. This is an
* optional operation, so subclasses may or may not
* implement this method. Calling reset() is not needed
* for the standard indexing process. However, if the
* Tokens of a TokenStream are intended to be consumed
* more than once, it is necessary to implement reset().
*/
public void reset() throws IOException {}
/** Releases resources associated with this stream. */
public void close() throws IOException {}
}
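
A sketch of what a subclass supporting this optional operation might look like (the class itself is hypothetical):

import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/** Hypothetical TokenStream over a fixed Token array that supports reset(). */
public class ArrayTokenStream extends TokenStream {
  private final Token[] tokens;
  private int index = 0;

  public ArrayTokenStream(Token[] tokens) {
    this.tokens = tokens;
  }

  public Token next() throws IOException {
    return index < tokens.length ? tokens[index++] : null;
  }

  /** Rewinds the stream so it can be consumed again. */
  public void reset() throws IOException {
    index = 0;
  }
}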

src/java/org/apache/lucene/document/Field.java

@@ -17,6 +17,7 @@ package org.apache.lucene.document;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexWriter; // for javadoc
import org.apache.lucene.util.Parameter;
@@ -134,21 +135,26 @@ public final class Field extends AbstractField implements Fieldable, Serializable
}
/** The value of the field as a String, or null. If null, the Reader value
* or binary value is used. Exactly one of stringValue(), readerValue(), and
* binaryValue() must be set. */
/** The value of the field as a String, or null. If null, the Reader value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
/** The value of the field as a Reader, or null. If null, the String value
* or binary value is used. Exactly one of stringValue(), readerValue(),
* and binaryValue() must be set. */
/** The value of the field as a Reader, or null. If null, the String value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
/** The value of the field in Binary, or null. If null, the Reader or
* String value is used. Exactly one of stringValue(), readerValue() and
* binaryValue() must be set. */
/** The value of the field in Binary, or null. If null, the Reader value,
* String value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public byte[] binaryValue() { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }
/** The value of the field as a TokenStream, or null. If null, the Reader value,
* String value, or binary value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
/**
* Create a field by specifying its name, value and how it will
* be saved in the index. Term vectors will not be stored in the index.
@@ -280,6 +286,54 @@ public final class Field extends AbstractField implements Fieldable, Serializable
setStoreTermVector(termVector);
}
/**
* Create a tokenized and indexed field that is not stored. Term vectors will
* not be stored. This is useful for pre-analyzed fields.
* The TokenStream is not consumed until the Document is added to the index,
* i.e. you must not close the TokenStream until {@link IndexWriter#addDocument(Document)}
* has been called.
*
* @param name The name of the field
* @param tokenStream The TokenStream with the content
* @throws NullPointerException if name or tokenStream is <code>null</code>
*/
public Field(String name, TokenStream tokenStream) {
this(name, tokenStream, TermVector.NO);
}
/**
* Create a tokenized and indexed field that is not stored, optionally storing
* term vectors. This is useful for pre-analyzed fields.
* The TokenStream is not consumed until the Document is added to the index,
* i.e. you must not close the TokenStream until {@link IndexWriter#addDocument(Document)}
* has been called.
*
* @param name The name of the field
* @param tokenStream The TokenStream with the content
* @param termVector Whether term vector should be stored
* @throws NullPointerException if name or tokenStream is <code>null</code>
*/
public Field(String name, TokenStream tokenStream, TermVector termVector) {
if (name == null)
throw new NullPointerException("name cannot be null");
if (tokenStream == null)
throw new NullPointerException("tokenStream cannot be null");
this.name = name.intern(); // field names are interned
this.fieldsData = tokenStream;
this.isStored = false;
this.isCompressed = false;
this.isIndexed = true;
this.isTokenized = true;
this.isBinary = false;
setStoreTermVector(termVector);
}
/**
* Create a stored field with binary value. Optionally the value may be compressed.
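
The lifetime rule from the constructor javadocs above, as a sketch (the helper method is hypothetical):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class StreamLifetimeSketch {
  static void addPreAnalyzed(IndexWriter writer, String name, TokenStream stream)
      throws IOException {
    Document doc = new Document();
    doc.add(new Field(name, stream, Field.TermVector.YES)); // nothing is read yet
    writer.addDocument(doc); // the TokenStream is consumed only here
    stream.close();          // safe to close only after addDocument() returns
  }
}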

src/java/org/apache/lucene/document/Fieldable.java

@@ -19,6 +19,8 @@ package org.apache.lucene.document;
import java.io.Reader;
import java.io.Serializable;
import org.apache.lucene.analysis.TokenStream;
/**
* Synonymous with {@link Field}.
*
@@ -60,20 +62,25 @@ public interface Fieldable extends Serializable {
*/
String name();
/** The value of the field as a String, or null. If null, the Reader value
* or binary value is used. Exactly one of stringValue(), readerValue(), and
* binaryValue() must be set. */
String stringValue();
/** The value of the field as a Reader, or null. If null, the String value
* or binary value is used. Exactly one of stringValue(), readerValue(),
* and binaryValue() must be set. */
Reader readerValue();
/** The value of the field in Binary, or null. If null, the Reader or
* String value is used. Exactly one of stringValue(), readerValue() and
* binaryValue() must be set. */
byte[] binaryValue();
/** The value of the field as a String, or null. If null, the Reader value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public String stringValue();
/** The value of the field as a Reader, or null. If null, the String value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public Reader readerValue();
/** The value of the field in Binary, or null. If null, the Reader value,
* String value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public byte[] binaryValue();
/** The value of the field as a TokenStream, or null. If null, the Reader value,
* String value, or binary value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public TokenStream tokenStreamValue();
/** True iff the value of the field is to be stored in the index for return
with search hits. It is an error for this to be true if a field is

src/java/org/apache/lucene/index/DocumentWriter.java

@@ -162,18 +162,28 @@ final class DocumentWriter {
offset += stringValue.length();
length++;
} else
{
Reader reader; // find or make Reader
if (field.readerValue() != null)
reader = field.readerValue();
else if (field.stringValue() != null)
reader = new StringReader(field.stringValue());
else
throw new IllegalArgumentException
("field must have either String or Reader value");
// Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader);
{ // tokenized field
TokenStream stream = field.tokenStreamValue();
// the field does not have a TokenStream,
// so we have to obtain one from the analyzer
if (stream == null) {
Reader reader; // find or make Reader
if (field.readerValue() != null)
reader = field.readerValue();
else if (field.stringValue() != null)
reader = new StringReader(field.stringValue());
else
throw new IllegalArgumentException
("field must have either String or Reader value");
// Tokenize field and add to postingTable
stream = analyzer.tokenStream(fieldName, reader);
}
// reset the TokenStream to the first token
stream.reset();
try {
Token lastToken = null;
for (Token t = stream.next(); t != null; t = stream.next()) {

src/java/org/apache/lucene/index/FieldsReader.java

@@ -17,6 +17,7 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
@@ -331,11 +332,9 @@
return localFieldsStream;
}
/**
* The value of the field in Binary, or null. If null, the Reader or
* String value is used. Exactly one of stringValue(), readerValue() and
* binaryValue() must be set.
*/
/** The value of the field in Binary, or null. If null, the Reader value,
* String value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public byte[] binaryValue() {
ensureOpen();
if (fieldsData == null) {
@@ -358,21 +357,26 @@
return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
}
/**
* The value of the field as a Reader, or null. If null, the String value
* or binary value is used. Exactly one of stringValue(), readerValue(),
* and binaryValue() must be set.
*/
/** The value of the field as a Reader, or null. If null, the String value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public Reader readerValue() {
ensureOpen();
return fieldsData instanceof Reader ? (Reader) fieldsData : null;
}
/**
* The value of the field as a String, or null. If null, the Reader value
* or binary value is used. Exactly one of stringValue(), readerValue(), and
* binaryValue() must be set.
*/
/** The value of the field as a TokenStream, or null. If null, the Reader value,
* String value, or binary value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public TokenStream tokenStreamValue() {
ensureOpen();
return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
}
/** The value of the field as a String, or null. If null, the Reader value,
* binary value, or TokenStream value is used. Exactly one of stringValue(),
* readerValue(), binaryValue(), and tokenStreamValue() must be set. */
public String stringValue() {
ensureOpen();
if (fieldsData == null) {
@@ -462,6 +466,11 @@
public byte[] binaryValue() {
return (byte[]) this.fieldsData;
}
public TokenStream tokenStreamValue() {
// not needed for merge
return null;
}
public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
this.isStored = true;

src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java

@@ -0,0 +1,103 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import junit.framework.TestCase;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
public class TestCachingTokenFilter extends TestCase {
private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
public void testCaching() throws IOException {
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());
Document doc = new Document();
TokenStream stream = new TokenStream() {
private int index = 0;
public Token next() throws IOException {
if (index == tokens.length) {
return null;
} else {
return new Token(tokens[index++], 0, 0);
}
}
};
stream = new CachingTokenFilter(stream);
doc.add(new Field("preanalyzed", stream, TermVector.NO));
// 1) we consume all tokens twice before we add the doc to the index
checkTokens(stream);
stream.reset();
checkTokens(stream);
// 2) now add the document to the index and verify if all tokens are indexed
// don't reset the stream here; the DocumentWriter should do that implicitly
writer.addDocument(doc);
writer.close();
IndexReader reader = IndexReader.open(dir);
TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
assertTrue(termPositions.next());
assertEquals(1, termPositions.freq());
assertEquals(0, termPositions.nextPosition());
termPositions.seek(new Term("preanalyzed", "term2"));
assertTrue(termPositions.next());
assertEquals(2, termPositions.freq());
assertEquals(1, termPositions.nextPosition());
assertEquals(3, termPositions.nextPosition());
termPositions.seek(new Term("preanalyzed", "term3"));
assertTrue(termPositions.next());
assertEquals(1, termPositions.freq());
assertEquals(2, termPositions.nextPosition());
reader.close();
// 3) reset stream and consume tokens again
stream.reset();
checkTokens(stream);
}
private void checkTokens(TokenStream stream) throws IOException {
int count = 0;
Token token;
while ((token = stream.next()) != null) {
assertTrue(count < tokens.length);
assertEquals(tokens[count], token.termText());
count++;
}
assertEquals(tokens.length, count);
}
}

src/test/org/apache/lucene/index/TestDocumentWriter.java

@@ -19,10 +19,13 @@ package org.apache.lucene.index;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;
@@ -124,4 +127,45 @@ public class TestDocumentWriter extends TestCase {
assertEquals(0, termPositions.nextPosition());
assertEquals(502, termPositions.nextPosition());
}
public void testPreAnalyzedField() throws IOException {
Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
Document doc = new Document();
doc.add(new Field("preanalyzed", new TokenStream() {
private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
private int index = 0;
public Token next() throws IOException {
if (index == tokens.length) {
return null;
} else {
return new Token(tokens[index++], 0, 0);
}
}
}, TermVector.NO));
String segName = "test";
writer.addDocument(segName, doc);
SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
assertTrue(termPositions.next());
assertEquals(1, termPositions.freq());
assertEquals(0, termPositions.nextPosition());
termPositions.seek(new Term("preanalyzed", "term2"));
assertTrue(termPositions.next());
assertEquals(2, termPositions.freq());
assertEquals(1, termPositions.nextPosition());
assertEquals(3, termPositions.nextPosition());
termPositions.seek(new Term("preanalyzed", "term3"));
assertTrue(termPositions.next());
assertEquals(1, termPositions.freq());
assertEquals(2, termPositions.nextPosition());
}
}