From 80c0f267f6ea53820392ef658024dea4b0ffe84e Mon Sep 17 00:00:00 2001
From: Michael Busch
Date: Sun, 29 Apr 2007 19:26:11 +0000
Subject: [PATCH] LUCENE-580:
- Added the public method reset() to TokenStream.
- Added a new constructor to Field that takes a TokenStream as argument,
  useful for pre-analyzed fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@533549 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |  11 +-
 .../lucene/analysis/CachingTokenFilter.java   |  68 ++++++++++++
 .../apache/lucene/analysis/TokenStream.java   |   9 ++
 .../org/apache/lucene/document/Field.java     |  72 ++++++++++--
 .../org/apache/lucene/document/Fieldable.java |  35 +++---
 .../apache/lucene/index/DocumentWriter.java   |  34 ++++--
 .../org/apache/lucene/index/FieldsReader.java |  39 ++++---
 .../analysis/TestCachingTokenFilter.java      | 103 ++++++++++++++++++
 .../lucene/index/TestDocumentWriter.java      |  44 ++++++++
 9 files changed, 364 insertions(+), 51 deletions(-)
 create mode 100644 src/java/org/apache/lucene/analysis/CachingTokenFilter.java
 create mode 100644 src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java

diff --git a/CHANGES.txt b/CHANGES.txt
index 0bb9ffe099d..6100929aacd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -47,7 +47,16 @@ API Changes
     (Chris Hostetter, Otis Gospodnetic)
 
  8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory
-    to enable extensibility of these classes.
+    to enable extensibility of these classes. (Michael Busch)
+
+ 9. LUCENE-580: Added the public method reset() to TokenStream. This method does
+    nothing by default, but may be overridden by subclasses to support consuming
+    the TokenStream more than once. (Michael Busch)
+
+10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as
+    argument, available as tokenStreamValue(). This is useful to avoid the need
+    for "dummy analyzers" for pre-analyzed fields. (Karl Wettin, Michael Busch)
+
 
 Bug fixes
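With the new constructor a field can be indexed from an already-built TokenStream, bypassing the Analyzer entirely. A minimal usage sketch of the API this patch adds (the class name and field name are illustrative, not part of the patch; the analyzer passed to IndexWriter is not consulted for the pre-analyzed field):

    import java.io.StringReader;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class PreAnalyzedFieldExample {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());

        // any pre-built TokenStream works; a tokenizer is used here for brevity
        TokenStream stream =
            new WhitespaceTokenizer(new StringReader("term1 term2 term3"));

        Document doc = new Document();
        doc.add(new Field("preanalyzed", stream));  // indexed, tokenized, not stored

        writer.addDocument(doc);  // the stream is consumed here; don't close it earlier
        writer.close();
      }
    }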
diff --git a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
new file mode 100644
index 00000000000..c49729eca22
--- /dev/null
+++ b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
@@ -0,0 +1,68 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * This class can be used if the Tokens of a TokenStream
+ * are intended to be consumed more than once. It caches
+ * all Tokens locally in a List.
+ *
+ * CachingTokenFilter implements the optional method
+ * {@link TokenStream#reset()}, which repositions the
+ * stream to the first Token.
+ */
+public class CachingTokenFilter extends TokenFilter {
+  private List cache;
+  private int index;
+
+  public CachingTokenFilter(TokenStream input) {
+    super(input);
+  }
+
+  public Token next() throws IOException {
+    if (cache == null) {
+      // fill cache lazily
+      cache = new LinkedList();
+      fillCache();
+    }
+
+    if (index == cache.size()) {
+      // the cache is exhausted, return null
+      return null;
+    }
+
+    return (Token) cache.get(index++);
+  }
+
+  public void reset() throws IOException {
+    index = 0;
+  }
+
+  private void fillCache() throws IOException {
+    Token token;
+    while ( (token = input.next()) != null) {
+      cache.add(token);
+    }
+  }
+
+}
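Because most TokenStreams are one-shot, CachingTokenFilter is the general way to obtain a replayable stream. A small sketch of the replay pattern, mirroring what the test below does (names are illustrative; termText() is the Token accessor in this code base):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    public class CachingExample {
      public static void main(String[] args) throws IOException {
        // wrap a one-shot stream so it can be replayed
        TokenStream stream = new CachingTokenFilter(
            new WhitespaceTokenizer(new StringReader("a b a")));

        for (Token t = stream.next(); t != null; t = stream.next()) {
          System.out.println(t.termText());  // first pass fills the cache
        }
        stream.reset();                      // rewind to the first cached Token
        for (Token t = stream.next(); t != null; t = stream.next()) {
          System.out.println(t.termText());  // second pass is served from the cache
        }
      }
    }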
diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java
index a1f97b0ecbf..98ba85a1a4a 100644
--- a/src/java/org/apache/lucene/analysis/TokenStream.java
+++ b/src/java/org/apache/lucene/analysis/TokenStream.java
@@ -35,6 +35,15 @@ public abstract class TokenStream {
   /** Returns the next token in the stream, or null at EOS. */
   public abstract Token next() throws IOException;
 
+  /** Resets this stream to the beginning. This is an
+   *  optional operation, so subclasses may or may not
+   *  implement this method. reset() is not needed for
+   *  the standard indexing process. However, if the Tokens
+   *  of a TokenStream are intended to be consumed more than
+   *  once, it is necessary to implement reset().
+   */
+  public void reset() throws IOException {}
+
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
 }
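A stream that owns its source can support the new optional operation directly by rewinding an index. A hypothetical subclass, not part of this patch:

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // hypothetical example: a replayable stream over a fixed Token array
    public class ArrayTokenStream extends TokenStream {
      private final Token[] tokens;
      private int index = 0;

      public ArrayTokenStream(Token[] tokens) {
        this.tokens = tokens;
      }

      public Token next() {
        return index < tokens.length ? tokens[index++] : null;
      }

      public void reset() {  // the optional operation, trivially supported here
        index = 0;
      }
    }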
diff --git a/src/java/org/apache/lucene/document/Field.java b/src/java/org/apache/lucene/document/Field.java
index 840d66e8b79..5a1b4c3e581 100644
--- a/src/java/org/apache/lucene/document/Field.java
+++ b/src/java/org/apache/lucene/document/Field.java
@@ -17,6 +17,7 @@ package org.apache.lucene.document;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexWriter;   // for javadoc
 import org.apache.lucene.util.Parameter;
 
@@ -134,21 +135,26 @@ public final class Field extends AbstractField implements Fieldable, Serializable {
   }
 
-  /** The value of the field as a String, or null.  If null, the Reader value
-   * or binary value is used.  Exactly one of stringValue(), readerValue(), and
-   * binaryValue() must be set. */
+  /** The value of the field as a String, or null.  If null, the Reader value,
+   * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }
 
-  /** The value of the field as a Reader, or null.  If null, the String value
-   * or binary value is used.  Exactly one of stringValue(), readerValue(),
-   * and binaryValue() must be set. */
+  /** The value of the field as a Reader, or null.  If null, the String value,
+   * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
 
-  /** The value of the field in Binary, or null.  If null, the Reader or
-   * String value is used.  Exactly one of stringValue(), readerValue() and
-   * binaryValue() must be set. */
+  /** The value of the field in Binary, or null.  If null, the Reader value,
+   * String value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public byte[] binaryValue()   { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }
 
+  /** The value of the field as a TokenStream, or null.  If null, the Reader value,
+   * String value, or binary value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public TokenStream tokenStreamValue()   { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
+
   /**
    * Create a field by specifying its name, value and how it will
    * be saved in the index. Term vectors will not be stored in the index.
@@ -280,6 +286,54 @@ public final class Field extends AbstractField implements Fieldable, Serializable {
 
     setStoreTermVector(termVector);
   }
+
+  /**
+   * Create a tokenized and indexed field that is not stored. Term vectors will
+   * not be stored. This is useful for pre-analyzed fields.
+   * The TokenStream is read only when the Document is added to the index,
+   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+   * has been called.
+   *
+   * @param name The name of the field
+   * @param tokenStream The TokenStream with the content
+   * @throws NullPointerException if name or tokenStream is null
+   */
+  public Field(String name, TokenStream tokenStream) {
+    this(name, tokenStream, TermVector.NO);
+  }
+
+  /**
+   * Create a tokenized and indexed field that is not stored, optionally with
+   * storing term vectors. This is useful for pre-analyzed fields.
+   * The TokenStream is read only when the Document is added to the index,
+   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+   * has been called.
+   *
+   * @param name The name of the field
+   * @param tokenStream The TokenStream with the content
+   * @param termVector Whether term vectors should be stored
+   * @throws NullPointerException if name or tokenStream is null
+   */
+  public Field(String name, TokenStream tokenStream, TermVector termVector) {
+    if (name == null)
+      throw new NullPointerException("name cannot be null");
+    if (tokenStream == null)
+      throw new NullPointerException("tokenStream cannot be null");
+
+    this.name = name.intern();        // field names are interned
+    this.fieldsData = tokenStream;
+
+    this.isStored = false;
+    this.isCompressed = false;
+
+    this.isIndexed = true;
+    this.isTokenized = true;
+
+    this.isBinary = false;
+
+    setStoreTermVector(termVector);
+  }
 
   /**
    * Create a stored field with binary value. Optionally the value may be compressed.
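The two-argument constructor defaults to TermVector.NO; term vectors for a pre-analyzed field are requested through the three-argument form. A hypothetical factory method showing that variant (the class, method, and field names are illustrative):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.document.Field;

    public class TermVectorFieldExample {
      // hypothetical factory: a pre-analyzed field that also stores term vectors
      public static Field bodyField(String text) {
        TokenStream stream = new WhitespaceTokenizer(new StringReader(text));
        return new Field("body", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
      }
    }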
diff --git a/src/java/org/apache/lucene/document/Fieldable.java b/src/java/org/apache/lucene/document/Fieldable.java
index 58494382b26..dba02262b69 100755
--- a/src/java/org/apache/lucene/document/Fieldable.java
+++ b/src/java/org/apache/lucene/document/Fieldable.java
@@ -19,6 +19,8 @@ package org.apache.lucene.document;
 import java.io.Reader;
 import java.io.Serializable;
 
+import org.apache.lucene.analysis.TokenStream;
+
 /**
  * Synonymous with {@link Field}.
  *
@@ -60,20 +62,25 @@ public interface Fieldable extends Serializable {
    */
   String name();
 
-  /** The value of the field as a String, or null.  If null, the Reader value
-   * or binary value is used.  Exactly one of stringValue(), readerValue(), and
-   * binaryValue() must be set. */
-  String stringValue();
-
-  /** The value of the field as a Reader, or null.  If null, the String value
-   * or binary value is used.  Exactly one of stringValue(), readerValue(),
-   * and binaryValue() must be set. */
-  Reader readerValue();
-
-  /** The value of the field in Binary, or null.  If null, the Reader or
-   * String value is used.  Exactly one of stringValue(), readerValue() and
-   * binaryValue() must be set. */
-  byte[] binaryValue();
+  /** The value of the field as a String, or null.  If null, the Reader value,
+   * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public String stringValue();
+
+  /** The value of the field as a Reader, or null.  If null, the String value,
+   * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public Reader readerValue();
+
+  /** The value of the field in Binary, or null.  If null, the Reader value,
+   * String value, or TokenStream value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public byte[] binaryValue();
+
+  /** The value of the field as a TokenStream, or null.  If null, the Reader value,
+   * String value, or binary value is used.  Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public TokenStream tokenStreamValue();
 
   /** True iff the value of the field is to be stored in the index for return
     with search hits.  It is an error for this to be true if a field is
diff --git a/src/java/org/apache/lucene/index/DocumentWriter.java b/src/java/org/apache/lucene/index/DocumentWriter.java
index 6d482dc5c7c..45ed02f7170 100644
--- a/src/java/org/apache/lucene/index/DocumentWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentWriter.java
@@ -162,18 +162,28 @@ final class DocumentWriter {
           offset += stringValue.length();
           length++;
         } else 
-        {
-          Reader reader;			  // find or make Reader
-          if (field.readerValue() != null)
-            reader = field.readerValue();
-          else if (field.stringValue() != null)
-            reader = new StringReader(field.stringValue());
-          else
-            throw new IllegalArgumentException
-              ("field must have either String or Reader value");
-
-          // Tokenize field and add to postingTable
-          TokenStream stream = analyzer.tokenStream(fieldName, reader);
+        { // tokenized field
+          TokenStream stream = field.tokenStreamValue();
+
+          // the field does not have a TokenStream,
+          // so we have to obtain one from the analyzer
+          if (stream == null) {
+            Reader reader;			  // find or make Reader
+            if (field.readerValue() != null)
+              reader = field.readerValue();
+            else if (field.stringValue() != null)
+              reader = new StringReader(field.stringValue());
+            else
+              throw new IllegalArgumentException
+                ("field must have either String or Reader value");
+
+            // Tokenize field and add to postingTable
+            stream = analyzer.tokenStream(fieldName, reader);
+          }
+
+          // reset the TokenStream to the first token
+          stream.reset();
+
           try {
             Token lastToken = null;
             for (Token t = stream.next(); t != null; t = stream.next()) {
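The new dispatch order in DocumentWriter is the core behavioral change: an attached TokenStream wins, otherwise a stream is built from the Reader or String value, and the chosen stream is reset before consumption. A standalone sketch of that decision (the helper class and method names are hypothetical):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.document.Fieldable;

    // hypothetical helper mirroring the fallback order DocumentWriter now uses
    final class FieldStreams {
      static TokenStream streamFor(Fieldable field, Analyzer analyzer) {
        TokenStream stream = field.tokenStreamValue();
        if (stream != null)
          return stream;  // a pre-analyzed field takes precedence
        if (field.readerValue() != null)
          return analyzer.tokenStream(field.name(), field.readerValue());
        if (field.stringValue() != null)
          return analyzer.tokenStream(field.name(),
                                      new StringReader(field.stringValue()));
        throw new IllegalArgumentException(
            "field must have either TokenStream, String or Reader value");
      }
    }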
diff --git a/src/java/org/apache/lucene/index/FieldsReader.java b/src/java/org/apache/lucene/index/FieldsReader.java
index 6a56883c38a..6cd9667b669 100644
--- a/src/java/org/apache/lucene/index/FieldsReader.java
+++ b/src/java/org/apache/lucene/index/FieldsReader.java
@@ -17,6 +17,7 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
@@ -331,11 +332,9 @@ final class FieldsReader {
       return localFieldsStream;
     }
 
-    /**
-     * The value of the field in Binary, or null.  If null, the Reader or
-     * String value is used.  Exactly one of stringValue(), readerValue() and
-     * binaryValue() must be set.
-     */
+    /** The value of the field in Binary, or null.  If null, the Reader value,
+     * String value, or TokenStream value is used.  Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public byte[] binaryValue() {
       ensureOpen();
       if (fieldsData == null) {
@@ -358,21 +357,26 @@ final class FieldsReader {
       return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
     }
 
-    /**
-     * The value of the field as a Reader, or null.  If null, the String value
-     * or binary value is used.  Exactly one of stringValue(), readerValue(),
-     * and binaryValue() must be set.
-     */
+    /** The value of the field as a Reader, or null.  If null, the String value,
+     * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public Reader readerValue() {
       ensureOpen();
       return fieldsData instanceof Reader ? (Reader) fieldsData : null;
     }
 
-    /**
-     * The value of the field as a String, or null.  If null, the Reader value
-     * or binary value is used.  Exactly one of stringValue(), readerValue(), and
-     * binaryValue() must be set.
-     */
+    /** The value of the field as a TokenStream, or null.  If null, the Reader value,
+     * String value, or binary value is used.  Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+    public TokenStream tokenStreamValue() {
+      ensureOpen();
+      return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
+    }
+
+
+    /** The value of the field as a String, or null.  If null, the Reader value,
+     * binary value, or TokenStream value is used.  Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public String stringValue() {
       ensureOpen();
       if (fieldsData == null) {
@@ -462,6 +466,11 @@ final class FieldsReader {
     public byte[] binaryValue() {
       return (byte[]) this.fieldsData;
     }
+
+    public TokenStream tokenStreamValue() {
+      // not needed for merge
+      return null;
+    }
 
     public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
       this.isStored = true;
diff --git a/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
new file mode 100644
index 00000000000..94a17313343
--- /dev/null
+++ b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java
@@ -0,0 +1,103 @@
+package org.apache.lucene.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestCachingTokenFilter extends TestCase {
+  private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
+
+  public void testCaching() throws IOException {
+    Directory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());
+    Document doc = new Document();
+    TokenStream stream = new TokenStream() {
+      private int index = 0;
+
+      public Token next() throws IOException {
+        if (index == tokens.length) {
+          return null;
+        } else {
+          return new Token(tokens[index++], 0, 0);
+        }
+      }
+
+    };
+
+    stream = new CachingTokenFilter(stream);
+
+    doc.add(new Field("preanalyzed", stream, TermVector.NO));
+
+    // 1) we consume all tokens twice before we add the doc to the index
+    checkTokens(stream);
+    stream.reset();
+    checkTokens(stream);
+
+    // 2) now add the document to the index and verify that all tokens are indexed
+    //    don't reset the stream here, the DocumentWriter should do that implicitly
+    writer.addDocument(doc);
+    writer.close();
+
+    IndexReader reader = IndexReader.open(dir);
+    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(0, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term2"));
+    assertTrue(termPositions.next());
+    assertEquals(2, termPositions.freq());
+    assertEquals(1, termPositions.nextPosition());
+    assertEquals(3, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term3"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(2, termPositions.nextPosition());
+    reader.close();
+
+    // 3) reset the stream and consume the tokens again
+    stream.reset();
+    checkTokens(stream);
+  }
+
+  private void checkTokens(TokenStream stream) throws IOException {
+    int count = 0;
+    Token token;
+    while ((token = stream.next()) != null) {
+      assertTrue(count < tokens.length);
+      assertEquals(tokens[count], token.termText);
+      count++;
+    }
+
+    assertEquals(tokens.length, count);
+  }
+}
diff --git a/src/test/org/apache/lucene/index/TestDocumentWriter.java b/src/test/org/apache/lucene/index/TestDocumentWriter.java
index 9279c7ff039..df3560dfa46 100644
--- a/src/test/org/apache/lucene/index/TestDocumentWriter.java
+++ b/src/test/org/apache/lucene/index/TestDocumentWriter.java
@@ -19,10 +19,13 @@ package org.apache.lucene.index;
 
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.document.*;
+import org.apache.lucene.document.Field.TermVector;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;
 
@@ -124,4 +127,45 @@ public class TestDocumentWriter extends TestCase {
     assertEquals(0, termPositions.nextPosition());
     assertEquals(502, termPositions.nextPosition());
   }
+
+  public void testPreAnalyzedField() throws IOException {
+    Similarity similarity = Similarity.getDefault();
+    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
+    Document doc = new Document();
+
+    doc.add(new Field("preanalyzed", new TokenStream() {
+      private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
+      private int index = 0;
+
+      public Token next() throws IOException {
+        if (index == tokens.length) {
+          return null;
+        } else {
+          return new Token(tokens[index++], 0, 0);
+        }
+      }
+
+    }, TermVector.NO));
+
+    String segName = "test";
+    writer.addDocument(segName, doc);
+    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+
+    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(0, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term2"));
+    assertTrue(termPositions.next());
+    assertEquals(2, termPositions.freq());
+    assertEquals(1, termPositions.nextPosition());
+    assertEquals(3, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term3"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(2, termPositions.nextPosition());
+
+  }
 }