mirror of https://github.com/apache/lucene.git
LUCENE-580:
- Added the public method reset() to TokenStream.
- Added a new constructor to Field that takes a TokenStream as argument, useful for pre-analyzed fields.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@533549 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 3c60a00b69
commit 80c0f267f6
CHANGES.txt
@@ -47,7 +47,16 @@ API Changes
     (Chris Hostetter, Otis Gospodnetic)

  8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory
-    to enable extensibility of these classes.
+    to enable extensibility of these classes. (Michael Busch)
+
+ 9. LUCENE-580: Added the public method reset() to TokenStream. This method does
+    nothing by default, but may be overridden by subclasses to support consuming
+    the TokenStream more than once. (Michael Busch)
+
+10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as
+    argument, available as tokenStreamValue(). This is useful to avoid the need
+    for "dummy analyzers" for pre-analyzed fields. (Karl Wettin, Michael Busch)

 Bug fixes
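Taken together, entries 9 and 10 let a caller index tokens produced outside of an Analyzer. The following is an editor's sketch, not part of the commit: the field name, terms, and in-memory directory are invented for illustration; the IndexWriter and Token usage mirrors the tests added further down.

// Editor's sketch: indexing a pre-analyzed field with the new
// Field(String, TokenStream) constructor. Names are illustrative.
import java.io.IOException;

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.RAMDirectory;

public class PreAnalyzedFieldExample {
  public static void main(String[] args) throws IOException {
    RAMDirectory dir = new RAMDirectory();
    // an Analyzer is still required by IndexWriter, but it is bypassed
    // for the pre-analyzed field below
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());

    // tokens produced by some external analysis step
    final String[] terms = {"hello", "world"};
    TokenStream stream = new TokenStream() {
      private int index = 0;
      public Token next() {
        return index < terms.length ? new Token(terms[index++], 0, 0) : null;
      }
    };

    Document doc = new Document();
    doc.add(new Field("body", stream)); // no "dummy analyzer" needed
    writer.addDocument(doc);            // the stream is consumed here
    writer.close();
  }
}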
org/apache/lucene/analysis/CachingTokenFilter.java (new file)
@@ -0,0 +1,68 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

/**
 * This class can be used if the Tokens of a TokenStream
 * are intended to be consumed more than once. It caches
 * all Tokens locally in a List.
 *
 * CachingTokenFilter implements the optional method
 * {@link TokenStream#reset()}, which repositions the
 * stream to the first Token.
 */
public class CachingTokenFilter extends TokenFilter {
  private List cache;
  private int index;

  public CachingTokenFilter(TokenStream input) {
    super(input);
  }

  public Token next() throws IOException {
    if (cache == null) {
      // fill cache lazily
      cache = new LinkedList();
      fillCache();
    }

    if (index == cache.size()) {
      // the cache is exhausted, return null
      return null;
    }

    return (Token) cache.get(index++);
  }

  public void reset() throws IOException {
    index = 0;
  }

  private void fillCache() throws IOException {
    Token token;
    while ((token = input.next()) != null) {
      cache.add(token);
    }
  }
}
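A minimal usage sketch (editor's illustration, not part of the commit; the tokenizer and input text are arbitrary): wrapping a stream in CachingTokenFilter lets the same tokens be consumed twice, with reset() replaying from the cache instead of re-tokenizing.

// Editor's sketch: consuming a stream twice via CachingTokenFilter.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class CachingTokenFilterExample {
  public static void main(String[] args) throws IOException {
    TokenStream stream =
        new CachingTokenFilter(new WhitespaceTokenizer(new StringReader("a b c")));

    // first pass lazily fills the cache from the underlying tokenizer
    for (Token t = stream.next(); t != null; t = stream.next()) {
      System.out.println("pass 1: " + t.termText());
    }

    // reset() repositions to the first cached Token; the underlying
    // tokenizer is not consulted again
    stream.reset();
    for (Token t = stream.next(); t != null; t = stream.next()) {
      System.out.println("pass 2: " + t.termText());
    }
  }
}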
org/apache/lucene/analysis/TokenStream.java
@@ -35,6 +35,15 @@ public abstract class TokenStream {
   /** Returns the next token in the stream, or null at EOS. */
   public abstract Token next() throws IOException;

+  /** Resets this stream to the beginning. This is an
+   * optional operation, so subclasses may or may not
+   * implement this method. reset() is not needed for
+   * the standard indexing process. However, if the Tokens
+   * of a TokenStream are intended to be consumed more than
+   * once, it is necessary to implement reset().
+   */
+  public void reset() throws IOException {}
+
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
 }
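For a subclass that opts into the optional operation, reset() only needs to rewind whatever state next() advances. An editor's sketch over a fixed term array (the class and its names are invented for illustration, in the same spirit as the anonymous streams in the tests below):

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Editor's sketch of a resettable TokenStream; illustrative only.
public class ReplayTokenStream extends TokenStream {
  private final String[] terms;
  private int index = 0;

  public ReplayTokenStream(String[] terms) {
    this.terms = terms;
  }

  public Token next() throws IOException {
    if (index == terms.length) {
      return null; // EOS
    }
    return new Token(terms[index++], 0, 0);
  }

  // the optional operation: rewind to the first token
  public void reset() throws IOException {
    index = 0;
  }
}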
org/apache/lucene/document/Field.java
@@ -17,6 +17,7 @@ package org.apache.lucene.document;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexWriter;   // for javadoc
 import org.apache.lucene.util.Parameter;

@@ -134,21 +135,26 @@ public final class Field extends AbstractField implements Fieldable, Serializable
   }

-  /** The value of the field as a String, or null. If null, the Reader value
-   * or binary value is used. Exactly one of stringValue(), readerValue(), and
-   * binaryValue() must be set. */
+  /** The value of the field as a String, or null. If null, the Reader value,
+   * binary value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }

-  /** The value of the field as a Reader, or null. If null, the String value
-   * or binary value is used. Exactly one of stringValue(), readerValue(),
-   * and binaryValue() must be set. */
+  /** The value of the field as a Reader, or null. If null, the String value,
+   * binary value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }

-  /** The value of the field in Binary, or null. If null, the Reader or
-   * String value is used. Exactly one of stringValue(), readerValue() and
-   * binaryValue() must be set. */
+  /** The value of the field in Binary, or null. If null, the Reader value,
+   * String value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
   public byte[] binaryValue()   { return fieldsData instanceof byte[] ? (byte[])fieldsData : null; }

+  /** The value of the field as a TokenStream, or null. If null, the Reader value,
+   * String value, or binary value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public TokenStream tokenStreamValue()   { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
+
   /**
    * Create a field by specifying its name, value and how it will
    * be saved in the index. Term vectors will not be stored in the index.

@@ -280,6 +286,54 @@ public final class Field extends AbstractField implements Fieldable, Serializable

     setStoreTermVector(termVector);
   }

+  /**
+   * Create a tokenized and indexed field that is not stored. Term vectors will
+   * not be stored. This is useful for pre-analyzed fields.
+   * The TokenStream is read only when the Document is added to the index,
+   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+   * has been called.
+   *
+   * @param name The name of the field
+   * @param tokenStream The TokenStream with the content
+   * @throws NullPointerException if name or tokenStream is <code>null</code>
+   */
+  public Field(String name, TokenStream tokenStream) {
+    this(name, tokenStream, TermVector.NO);
+  }
+
+  /**
+   * Create a tokenized and indexed field that is not stored, optionally with
+   * storing term vectors. This is useful for pre-analyzed fields.
+   * The TokenStream is read only when the Document is added to the index,
+   * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)}
+   * has been called.
+   *
+   * @param name The name of the field
+   * @param tokenStream The TokenStream with the content
+   * @param termVector Whether term vector should be stored
+   * @throws NullPointerException if name or tokenStream is <code>null</code>
+   */
+  public Field(String name, TokenStream tokenStream, TermVector termVector) {
+    if (name == null)
+      throw new NullPointerException("name cannot be null");
+    if (tokenStream == null)
+      throw new NullPointerException("tokenStream cannot be null");
+
+    this.name = name.intern();        // field names are interned
+    this.fieldsData = tokenStream;
+
+    this.isStored = false;
+    this.isCompressed = false;
+
+    this.isIndexed = true;
+    this.isTokenized = true;
+
+    this.isBinary = false;
+
+    setStoreTermVector(termVector);
+  }
+
   /**
    * Create a stored field with binary value. Optionally the value may be compressed.
org/apache/lucene/document/Fieldable.java
@@ -19,6 +19,8 @@ package org.apache.lucene.document;
 import java.io.Reader;
 import java.io.Serializable;

+import org.apache.lucene.analysis.TokenStream;
+
 /**
  * Synonymous with {@link Field}.
  *

@@ -60,20 +62,25 @@ public interface Fieldable extends Serializable {
    */
   String name();

-  /** The value of the field as a String, or null. If null, the Reader value
-   * or binary value is used. Exactly one of stringValue(), readerValue(), and
-   * binaryValue() must be set. */
-  String stringValue();
-
-  /** The value of the field as a Reader, or null. If null, the String value
-   * or binary value is used. Exactly one of stringValue(), readerValue(),
-   * and binaryValue() must be set. */
-  Reader readerValue();
-
-  /** The value of the field in Binary, or null. If null, the Reader or
-   * String value is used. Exactly one of stringValue(), readerValue() and
-   * binaryValue() must be set. */
-  byte[] binaryValue();
+  /** The value of the field as a String, or null. If null, the Reader value,
+   * binary value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public String stringValue();
+
+  /** The value of the field as a Reader, or null. If null, the String value,
+   * binary value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public Reader readerValue();
+
+  /** The value of the field in Binary, or null. If null, the Reader value,
+   * String value, or TokenStream value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public byte[] binaryValue();
+
+  /** The value of the field as a TokenStream, or null. If null, the Reader value,
+   * String value, or binary value is used. Exactly one of stringValue(),
+   * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+  public TokenStream tokenStreamValue();

   /** True iff the value of the field is to be stored in the index for return
       with search hits. It is an error for this to be true if a field is
org/apache/lucene/index/DocumentWriter.java
@@ -162,18 +162,28 @@ final class DocumentWriter {
       offset += stringValue.length();
       length++;
     } else
-    {
-      Reader reader;                        // find or make Reader
-      if (field.readerValue() != null)
-        reader = field.readerValue();
-      else if (field.stringValue() != null)
-        reader = new StringReader(field.stringValue());
-      else
-        throw new IllegalArgumentException
-          ("field must have either String or Reader value");
-
-      // Tokenize field and add to postingTable
-      TokenStream stream = analyzer.tokenStream(fieldName, reader);
+    { // tokenized field
+      TokenStream stream = field.tokenStreamValue();
+
+      // the field does not have a TokenStream,
+      // so we have to obtain one from the analyzer
+      if (stream == null) {
+        Reader reader;                      // find or make Reader
+        if (field.readerValue() != null)
+          reader = field.readerValue();
+        else if (field.stringValue() != null)
+          reader = new StringReader(field.stringValue());
+        else
+          throw new IllegalArgumentException
+            ("field must have either String or Reader value");
+
+        // Tokenize field and add to postingTable
+        stream = analyzer.tokenStream(fieldName, reader);
+      }
+
+      // reset the TokenStream to the first token
+      stream.reset();
+
       try {
         Token lastToken = null;
         for (Token t = stream.next(); t != null; t = stream.next()) {
org/apache/lucene/index/FieldsReader.java
@@ -17,6 +17,7 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;

@@ -331,11 +332,9 @@ final class FieldsReader {
       return localFieldsStream;
     }

-    /**
-     * The value of the field in Binary, or null. If null, the Reader or
-     * String value is used. Exactly one of stringValue(), readerValue() and
-     * binaryValue() must be set.
-     */
+    /** The value of the field in Binary, or null. If null, the Reader value,
+     * String value, or TokenStream value is used. Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public byte[] binaryValue() {
       ensureOpen();
       if (fieldsData == null) {

@@ -358,21 +357,26 @@ final class FieldsReader {
       return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
     }

-    /**
-     * The value of the field as a Reader, or null. If null, the String value
-     * or binary value is used. Exactly one of stringValue(), readerValue(),
-     * and binaryValue() must be set.
-     */
+    /** The value of the field as a Reader, or null. If null, the String value,
+     * binary value, or TokenStream value is used. Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public Reader readerValue() {
       ensureOpen();
       return fieldsData instanceof Reader ? (Reader) fieldsData : null;
     }

-    /**
-     * The value of the field as a String, or null. If null, the Reader value
-     * or binary value is used. Exactly one of stringValue(), readerValue(), and
-     * binaryValue() must be set.
-     */
+    /** The value of the field as a TokenStream, or null. If null, the Reader value,
+     * String value, or binary value is used. Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
+    public TokenStream tokenStreamValue() {
+      ensureOpen();
+      return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
+    }
+
+    /** The value of the field as a String, or null. If null, the Reader value,
+     * binary value, or TokenStream value is used. Exactly one of stringValue(),
+     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
     public String stringValue() {
       ensureOpen();
       if (fieldsData == null) {

@@ -462,6 +466,11 @@ final class FieldsReader {
     public byte[] binaryValue() {
       return (byte[]) this.fieldsData;
     }

+    public TokenStream tokenStreamValue() {
+      // not needed for merge
+      return null;
+    }
+
     public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
       this.isStored = true;
org/apache/lucene/analysis/TestCachingTokenFilter.java (new file)
@@ -0,0 +1,103 @@
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import junit.framework.TestCase;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class TestCachingTokenFilter extends TestCase {
  private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};

  public void testCaching() throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer());
    Document doc = new Document();
    TokenStream stream = new TokenStream() {
      private int index = 0;

      public Token next() throws IOException {
        if (index == tokens.length) {
          return null;
        } else {
          return new Token(tokens[index++], 0, 0);
        }
      }
    };

    stream = new CachingTokenFilter(stream);

    doc.add(new Field("preanalyzed", stream, TermVector.NO));

    // 1) we consume all tokens twice before we add the doc to the index
    checkTokens(stream);
    stream.reset();
    checkTokens(stream);

    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term2"));
    assertTrue(termPositions.next());
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());

    termPositions.seek(new Term("preanalyzed", "term3"));
    assertTrue(termPositions.next());
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
    reader.close();

    // 3) reset stream and consume tokens again
    stream.reset();
    checkTokens(stream);
  }

  private void checkTokens(TokenStream stream) throws IOException {
    int count = 0;
    Token token;
    while ((token = stream.next()) != null) {
      assertTrue(count < tokens.length);
      assertEquals(tokens[count], token.termText);
      count++;
    }

    assertEquals(tokens.length, count);
  }
}
org/apache/lucene/index/TestDocumentWriter.java
@@ -19,10 +19,13 @@ package org.apache.lucene.index;

 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.document.*;
+import org.apache.lucene.document.Field.TermVector;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory;

@@ -124,4 +127,45 @@ public class TestDocumentWriter extends TestCase {
     assertEquals(0, termPositions.nextPosition());
     assertEquals(502, termPositions.nextPosition());
   }
+
+  public void testPreAnalyzedField() throws IOException {
+    Similarity similarity = Similarity.getDefault();
+    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(), similarity, 50);
+    Document doc = new Document();
+
+    doc.add(new Field("preanalyzed", new TokenStream() {
+      private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
+      private int index = 0;
+
+      public Token next() throws IOException {
+        if (index == tokens.length) {
+          return null;
+        } else {
+          return new Token(tokens[index++], 0, 0);
+        }
+      }
+
+    }, TermVector.NO));
+
+    String segName = "test";
+    writer.addDocument(segName, doc);
+    SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+
+    TermPositions termPositions = reader.termPositions(new Term("preanalyzed", "term1"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(0, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term2"));
+    assertTrue(termPositions.next());
+    assertEquals(2, termPositions.freq());
+    assertEquals(1, termPositions.nextPosition());
+    assertEquals(3, termPositions.nextPosition());
+
+    termPositions.seek(new Term("preanalyzed", "term3"));
+    assertTrue(termPositions.next());
+    assertEquals(1, termPositions.freq());
+    assertEquals(2, termPositions.nextPosition());
+  }
 }