From d42de329840f0a224fc93e51296d81c34efd4730 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Fri, 10 Aug 2007 18:34:33 +0000 Subject: [PATCH] LUCENE-969: deprecate Token.termText() & optimize core tokenizers by re-using tokens & TokenStreams git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@564715 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 10 + .../byTask/tasks/ReadTokensTask.java | 6 +- .../org/apache/lucene/analysis/Analyzer.java | 29 +- .../apache/lucene/analysis/CharArraySet.java | 149 +++++++++ .../apache/lucene/analysis/CharTokenizer.java | 45 +-- .../analysis/ISOLatin1AccentFilter.java | 300 +++++++++-------- .../lucene/analysis/KeywordAnalyzer.java | 11 +- .../lucene/analysis/KeywordTokenizer.java | 19 +- .../apache/lucene/analysis/LengthFilter.java | 6 +- .../lucene/analysis/LowerCaseFilter.java | 17 +- .../analysis/PerFieldAnalyzerWrapper.java | 9 + .../lucene/analysis/PorterStemFilter.java | 17 +- .../lucene/analysis/SimpleAnalyzer.java | 11 + .../apache/lucene/analysis/StopAnalyzer.java | 17 + .../apache/lucene/analysis/StopFilter.java | 65 ++-- .../org/apache/lucene/analysis/Token.java | 306 +++++++++++++----- .../apache/lucene/analysis/TokenFilter.java | 2 + .../apache/lucene/analysis/TokenStream.java | 29 +- .../org/apache/lucene/analysis/Tokenizer.java | 9 + .../lucene/analysis/WhitespaceAnalyzer.java | 11 + .../analysis/standard/StandardAnalyzer.java | 19 ++ .../analysis/standard/StandardFilter.java | 38 +-- .../analysis/standard/StandardTokenizer.java | 22 +- .../standard/StandardTokenizerImpl.java | 17 +- .../standard/StandardTokenizerImpl.jflex | 9 + .../apache/lucene/index/DocumentsWriter.java | 82 ++--- .../analysis/TestCachingTokenFilter.java | 2 +- .../org/apache/lucene/analysis/TestToken.java | 56 ++++ 28 files changed, 927 insertions(+), 386 deletions(-) create mode 100644 src/java/org/apache/lucene/analysis/CharArraySet.java create mode 100644 src/test/org/apache/lucene/analysis/TestToken.java diff --git a/CHANGES.txt b/CHANGES.txt index 40bac2e47c9..4adc9738873 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -22,6 +22,12 @@ API Changes Field instance during indexing. This is a sizable performance gain, especially for small documents. (Mike McCandless) + 4. LUCENE-969: Add new APIs to Token, TokenStream and Analyzer to + permit re-using of Token and TokenStream instances during + indexing. Changed Token to use a char[] as the store for the + termText instead of String. This gives faster tokenization + performance (~10-15%). (Mike McCandless) + Bug fixes 1. LUCENE-933: QueryParser fixed to not produce empty sub @@ -107,6 +113,10 @@ Optimizations JavaCC to generate the tokenizer. (Stanislaw Osinski via Mike McCandless) + 8. LUCENE-969: Changed core tokenizers & filters to re-use Token and + TokenStream instances when possible to improve tokenization + performance (~10-15%). 
(Mike McCandless) + Documentation Build diff --git a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java index e402dffa31b..a2f802a091a 100644 --- a/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java +++ b/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java @@ -73,7 +73,7 @@ public class ReadTokensTask extends PerfTask { super.tearDown(); } - Token token = new Token("", 0, 0); + Token token = new Token(); public int doLogic() throws Exception { List fields = doc.getFields(); @@ -104,13 +104,13 @@ public class ReadTokensTask extends PerfTask { } // Tokenize field - stream = analyzer.tokenStream(field.name(), reader); + stream = analyzer.reusableTokenStream(field.name(), reader); } // reset the TokenStream to the first token stream.reset(); - while(stream.next() != null) + while(stream.next(token) != null) tokenCount++; } totalTokenCount += tokenCount; diff --git a/src/java/org/apache/lucene/analysis/Analyzer.java b/src/java/org/apache/lucene/analysis/Analyzer.java index 3665a43b24e..59d84841566 100644 --- a/src/java/org/apache/lucene/analysis/Analyzer.java +++ b/src/java/org/apache/lucene/analysis/Analyzer.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis; */ import java.io.Reader; +import java.io.IOException; /** An Analyzer builds TokenStreams, which analyze text. It thus represents a * policy for extracting index terms from text. @@ -37,6 +38,33 @@ public abstract class Analyzer { field name for backward compatibility. */ public abstract TokenStream tokenStream(String fieldName, Reader reader); + /** Creates a TokenStream that is allowed to be re-used + * from the previous time that the same thread called + * this method. Callers that do not need to use more + * than one TokenStream at the same time from this + * analyzer should use this method for better + * performance. + */ + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + return tokenStream(fieldName, reader); + } + + private ThreadLocal tokenStreams = new ThreadLocal(); + + /** Used by Analyzers that implement reusableTokenStream + * to retrieve previously saved TokenStreams for re-use + * by the same thread. */ + protected Object getPreviousTokenStream() { + return tokenStreams.get(); + } + + /** Used by Analyzers that implement reusableTokenStream + * to save a TokenStream for later re-use by the same + * thread. */ + protected void setPreviousTokenStream(Object obj) { + tokenStreams.set(obj); + } + /** * Invoked before indexing a Fieldable instance if @@ -56,4 +84,3 @@ public abstract class Analyzer { return 0; } } - diff --git a/src/java/org/apache/lucene/analysis/CharArraySet.java b/src/java/org/apache/lucene/analysis/CharArraySet.java new file mode 100644 index 00000000000..5307d73282c --- /dev/null +++ b/src/java/org/apache/lucene/analysis/CharArraySet.java @@ -0,0 +1,149 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A simple class that can store & retrieve char[]'s in a
+ * hash table.  Note that this is not a general purpose
+ * class.  For example, it cannot remove char[]'s from the
+ * set, nor does it resize its hash table to be smaller,
+ * etc.  It is designed for use with StopFilter to enable
+ * quick filtering based on the char[] termBuffer in a
+ * Token.
+ */
+
+final class CharArraySet {
+
+  private final static int INIT_SIZE = 8;
+  private final static double MAX_LOAD_FACTOR = 0.75;
+  private int mask;
+  private char[][] entries;
+  private int count;
+  private boolean ignoreCase;
+
+  /** Create set with enough capacity to hold startSize
+   *  terms */
+  public CharArraySet(int startSize, boolean ignoreCase) {
+    this.ignoreCase = ignoreCase;
+    int size = INIT_SIZE;
+    while(((double) startSize)/size >= MAX_LOAD_FACTOR)
+      size *= 2;
+    mask = size-1;
+    entries = new char[size][];
+  }
+
+  /** Returns true if the characters in text, up to length
+   *  len, are present in the set. */
+  public boolean contains(char[] text, int len) {
+    int code = getHashCode(text, len);
+    int pos = code & mask;
+    char[] text2 = entries[pos];
+    if (text2 != null && !equals(text, len, text2)) {
+      final int inc = code*1347|1;
+      do {
+        code += inc;
+        pos = code & mask;
+        text2 = entries[pos];
+      } while (text2 != null && !equals(text, len, text2));
+    }
+    return text2 != null;
+  }
+
+  /** Add this String into the set */
+  public void add(String text) {
+    add(text.toCharArray());
+  }
+
+  /** Add this text into the set */
+  public void add(char[] text) {
+    if (ignoreCase)
+      for(int i=0;i<text.length;i++)
+        text[i] = Character.toLowerCase(text[i]);
+    int code = getHashCode(text, text.length);
+    int pos = code & mask;
+    char[] text2 = entries[pos];
+    if (text2 != null) {
+      final int inc = code*1347|1;
+      do {
+        code += inc;
+        pos = code & mask;
+        text2 = entries[pos];
+      } while (text2 != null);
+    }
+    entries[pos] = text;
+    count++;
+
+    if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
+      rehash();
+    }
+  }
+
+  private boolean equals(char[] text1, int len, char[] text2) {
+    if (len != text2.length)
+      return false;
+    for(int i=0;i<len;i++)
+      if (text1[i] != text2[i])
+        return false;
+    return true;
+  }
+
+  private void rehash() {
+    final int newSize = 2*entries.length;
+    mask = newSize-1;
+    char[][] newEntries = new char[newSize][];
+    for(int i=0;i<entries.length;i++) {
+      char[] text = entries[i];
+      if (text != null) {
+        int code = getHashCode(text, text.length);
+        int pos = code & mask;
+        if (newEntries[pos] != null) {
+          final int inc = code*1347|1;
+          do {
+            code += inc;
+            pos = code & mask;
+          } while (newEntries[pos] != null);
+        }
+        newEntries[pos] = text;
+      }
+    }
+    entries = newEntries;
+  }
+
+  private int getHashCode(char[] text, int len) {
+    int code = 0;
+    int downto = len;
+    while (downto > 0) {
+      final char c;
+      if (ignoreCase)
+        c = Character.toLowerCase(text[--downto]);
+      else
+        c = text[--downto];
+      code = (code*31) + c;
+    }
+    return code;
+  }
+}
diff --git a/src/java/org/apache/lucene/analysis/CharTokenizer.java b/src/java/org/apache/lucene/analysis/CharTokenizer.java
index 2bf8c904746..c394f76dc1a 100644
--- a/src/java/org/apache/lucene/analysis/CharTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java
@@ -28,8 +28,7 @@ public abstract class CharTokenizer extends Tokenizer {
   private int offset = 0, bufferIndex = 0, dataLen = 0;
   private static final int MAX_WORD_LEN = 255;
-  private static final int IO_BUFFER_SIZE = 1024;
-  private final char[] buffer = new char[MAX_WORD_LEN];
+  private static final int IO_BUFFER_SIZE = 4096;
   private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
 
   /** Returns true iff a character should be included in a token.  This
@@ -45,31 +44,32 @@ public abstract class CharTokenizer extends Tokenizer {
     return c;
   }
 
-  /** Returns the next token in the stream, or null at EOS.
*/ - public final Token next() throws IOException { + public final Token next(Token token) throws IOException { int length = 0; - int start = offset; + int start = bufferIndex; + char[] buffer = token.termBuffer(); while (true) { - final char c; - offset++; if (bufferIndex >= dataLen) { + offset += dataLen; dataLen = input.read(ioBuffer); + if (dataLen == -1) { + if (length > 0) + break; + else + return null; + } bufferIndex = 0; } - ; - if (dataLen == -1) { - if (length > 0) - break; - else - return null; - } else - c = ioBuffer[bufferIndex++]; + + final char c = ioBuffer[bufferIndex++]; if (isTokenChar(c)) { // if it's a token char if (length == 0) // start of token - start = offset - 1; + start = offset + bufferIndex - 1; + else if (length == buffer.length) + buffer = token.resizeTermBuffer(1+length); buffer[length++] = normalize(c); // buffer it, normalized @@ -78,9 +78,18 @@ public abstract class CharTokenizer extends Tokenizer { } else if (length > 0) // at non-Letter w/ chars break; // return 'em - } - return new Token(new String(buffer, 0, length), start, start + length); + token.termLength = length; + token.startOffset = start; + token.endOffset = start+length; + return token; + } + + public void reset(Reader input) throws IOException { + super.reset(input); + bufferIndex = 0; + offset = 0; + dataLen = 0; } } diff --git a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java index c259f24de37..c930a8dbe85 100644 --- a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java +++ b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java @@ -25,144 +25,166 @@ package org.apache.lucene.analysis; *

*/ public class ISOLatin1AccentFilter extends TokenFilter { - public ISOLatin1AccentFilter(TokenStream input) { - super(input); - } + public ISOLatin1AccentFilter(TokenStream input) { + super(input); + } - public final Token next() throws java.io.IOException { - final Token t = input.next(); - if (t != null) - t.setTermText(removeAccents(t.termText())); - return t; - } + private char[] output = new char[256]; + private int outputPos; - /** - * To replace accented characters in a String by unaccented equivalents. - */ - public final static String removeAccents(String input) { - final StringBuffer output = new StringBuffer(); - for (int i = 0; i < input.length(); i++) { - switch (input.charAt(i)) { - case '\u00C0' : // À - case '\u00C1' : // Á - case '\u00C2' : //  - case '\u00C3' : // à - case '\u00C4' : // Ä - case '\u00C5' : // Å - output.append("A"); - break; - case '\u00C6' : // Æ - output.append("AE"); - break; - case '\u00C7' : // Ç - output.append("C"); - break; - case '\u00C8' : // È - case '\u00C9' : // É - case '\u00CA' : // Ê - case '\u00CB' : // Ë - output.append("E"); - break; - case '\u00CC' : // Ì - case '\u00CD' : // Í - case '\u00CE' : // Î - case '\u00CF' : // Ï - output.append("I"); - break; - case '\u00D0' : // Ð - output.append("D"); - break; - case '\u00D1' : // Ñ - output.append("N"); - break; - case '\u00D2' : // Ò - case '\u00D3' : // Ó - case '\u00D4' : // Ô - case '\u00D5' : // Õ - case '\u00D6' : // Ö - case '\u00D8' : // Ø - output.append("O"); - break; - case '\u0152' : // Œ - output.append("OE"); - break; - case '\u00DE' : // Þ - output.append("TH"); - break; - case '\u00D9' : // Ù - case '\u00DA' : // Ú - case '\u00DB' : // Û - case '\u00DC' : // Ü - output.append("U"); - break; - case '\u00DD' : // Ý - case '\u0178' : // Ÿ - output.append("Y"); - break; - case '\u00E0' : // à - case '\u00E1' : // á - case '\u00E2' : // â - case '\u00E3' : // ã - case '\u00E4' : // ä - case '\u00E5' : // å - output.append("a"); - break; - case '\u00E6' : // æ - output.append("ae"); - break; - case '\u00E7' : // ç - output.append("c"); - break; - case '\u00E8' : // è - case '\u00E9' : // é - case '\u00EA' : // ê - case '\u00EB' : // ë - output.append("e"); - break; - case '\u00EC' : // ì - case '\u00ED' : // í - case '\u00EE' : // î - case '\u00EF' : // ï - output.append("i"); - break; - case '\u00F0' : // ð - output.append("d"); - break; - case '\u00F1' : // ñ - output.append("n"); - break; - case '\u00F2' : // ò - case '\u00F3' : // ó - case '\u00F4' : // ô - case '\u00F5' : // õ - case '\u00F6' : // ö - case '\u00F8' : // ø - output.append("o"); - break; - case '\u0153' : // œ - output.append("oe"); - break; - case '\u00DF' : // ß - output.append("ss"); - break; - case '\u00FE' : // þ - output.append("th"); - break; - case '\u00F9' : // ù - case '\u00FA' : // ú - case '\u00FB' : // û - case '\u00FC' : // ü - output.append("u"); - break; - case '\u00FD' : // ý - case '\u00FF' : // ÿ - output.append("y"); - break; - default : - output.append(input.charAt(i)); - break; - } - } - return output.toString(); - } -} \ No newline at end of file + public final Token next(Token result) throws java.io.IOException { + result = input.next(result); + if (result != null) { + outputPos = 0; + removeAccents(result.termBuffer(), result.termLength()); + result.setTermBuffer(output, 0, outputPos); + return result; + } else + return null; + } + + private final void addChar(char c) { + if (outputPos == output.length) { + char[] newArray = new char[2*output.length]; + System.arraycopy(output, 0, 
newArray, 0, output.length);
+      output = newArray;
+    }
+    output[outputPos++] = c;
+  }
+
+  /**
+   * Replaces accented characters by their unaccented equivalents.
+   */
+  public final void removeAccents(char[] input, int length) {
+    int pos = 0;
+    for (int i=0; i<length; i++) {
+      final char c = input[i];
+      switch(c) {
+      case '\u00C0' : // À
+      case '\u00C1' : // Á
+      case '\u00C2' : // Â
+      case '\u00C3' : // Ã
+      case '\u00C4' : // Ä
+      case '\u00C5' : // Å
+        addChar('A');
+        break;
+      case '\u00C6' : // Æ
+        addChar('A');
+        addChar('E');
+        break;
+      case '\u00C7' : // Ç
+        addChar('C');
+        break;
+      case '\u00C8' : // È
+      case '\u00C9' : // É
+      case '\u00CA' : // Ê
+      case '\u00CB' : // Ë
+        addChar('E');
+        break;
+      case '\u00CC' : // Ì
+      case '\u00CD' : // Í
+      case '\u00CE' : // Î
+      case '\u00CF' : // Ï
+        addChar('I');
+        break;
+      case '\u00D0' : // Ð
+        addChar('D');
+        break;
+      case '\u00D1' : // Ñ
+        addChar('N');
+        break;
+      case '\u00D2' : // Ò
+      case '\u00D3' : // Ó
+      case '\u00D4' : // Ô
+      case '\u00D5' : // Õ
+      case '\u00D6' : // Ö
+      case '\u00D8' : // Ø
+        addChar('O');
+        break;
+      case '\u0152' : // Œ
+        addChar('O');
+        addChar('E');
+        break;
+      case '\u00DE' : // Þ
+        addChar('T');
+        addChar('H');
+        break;
+      case '\u00D9' : // Ù
+      case '\u00DA' : // Ú
+      case '\u00DB' : // Û
+      case '\u00DC' : // Ü
+        addChar('U');
+        break;
+      case '\u00DD' : // Ý
+      case '\u0178' : // Ÿ
+        addChar('Y');
+        break;
+      case '\u00E0' : // à
+      case '\u00E1' : // á
+      case '\u00E2' : // â
+      case '\u00E3' : // ã
+      case '\u00E4' : // ä
+      case '\u00E5' : // å
+        addChar('a');
+        break;
+      case '\u00E6' : // æ
+        addChar('a');
+        addChar('e');
+        break;
+      case '\u00E7' : // ç
+        addChar('c');
+        break;
+      case '\u00E8' : // è
+      case '\u00E9' : // é
+      case '\u00EA' : // ê
+      case '\u00EB' : // ë
+        addChar('e');
+        break;
+      case '\u00EC' : // ì
+      case '\u00ED' : // í
+      case '\u00EE' : // î
+      case '\u00EF' : // ï
+        addChar('i');
+        break;
+      case '\u00F0' : // ð
+        addChar('d');
+        break;
+      case '\u00F1' : // ñ
+        addChar('n');
+        break;
+      case '\u00F2' : // ò
+      case '\u00F3' : // ó
+      case '\u00F4' : // ô
+      case '\u00F5' : // õ
+      case '\u00F6' : // ö
+      case '\u00F8' : // ø
+        addChar('o');
+        break;
+      case '\u0153' : // œ
+        addChar('o');
+        addChar('e');
+        break;
+      case '\u00DF' : // ß
+        addChar('s');
+        addChar('s');
+        break;
+      case '\u00FE' : // þ
+        addChar('t');
+        addChar('h');
+        break;
+      case '\u00F9' : // ù
+      case '\u00FA' : // ú
+      case '\u00FB' : // û
+      case '\u00FC' : // ü
+        addChar('u');
+        break;
+      case '\u00FD' : // ý
+      case '\u00FF' : // ÿ
+        addChar('y');
+        break;
+      default :
+        addChar(c);
+        break;
+      }
+    }
+  }
+}
diff --git a/src/java/org/apache/lucene/analysis/LengthFilter.java b/src/java/org/apache/lucene/analysis/LengthFilter.java
-  public final Token next() throws IOException
+  public final Token next(Token result) throws IOException
   {
     // return the first non-stop word found
-    for (Token token = input.next(); token != null; token = input.next())
+    for (Token token = input.next(result); token != null; token = input.next(result))
     {
-      int len = token.termText().length();
+      int len = token.termLength();
       if (len >= min && len <= max) {
           return token;
       }
diff --git a/src/java/org/apache/lucene/analysis/LowerCaseFilter.java b/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
index 6c1df8eafba..669f2b0113f 100644
--- a/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
+++ b/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
@@ -29,14 +29,17 @@ public final class LowerCaseFilter extends TokenFilter {
     super(in);
   }
 
-  public final Token next() throws IOException {
-    Token t = input.next();
+  public final Token next(Token result) throws IOException {
+    result = input.next(result);
+    if (result != null) {
 
-    if (t == null)
+      final char[] buffer = result.termBuffer();
+      final int length = result.termLength;
+      for(int i=0;i<length;i++)
+        buffer[i] = Character.toLowerCase(buffer[i]);
+
+      return result;
+    } else
       return null;
   }
 }
diff --git a/src/java/org/apache/lucene/analysis/Token.java b/src/java/org/apache/lucene/analysis/Token.java
--- a/src/java/org/apache/lucene/analysis/Token.java
+++ b/src/java/org/apache/lucene/analysis/Token.java
 /** A Token is an occurrence of a term from the text of a field.  It consists
   of a term's text, the start and end offset of the term in the text of the
   field, and a type string.

+  <p>

NOTE: As of 2.3, Token stores the term text
+  internally as a malleable char[] termBuffer instead of
+  String termText.  The indexing code and core tokenizers
+  have been changed to re-use a single Token instance, changing
+  its buffer and other fields in-place as the Token is
+  processed.  This provides substantially better indexing
+  performance as it saves the GC cost of new'ing a Token and
+  String for every term.  The APIs that accept String
+  termText are still available but a warning about the
+  associated performance cost has been added (below).  The
+  {@link #termText()} method has been deprecated.

+  <p>

Tokenizers and filters should try to re-use a Token
+  instance when possible for best performance, by
+  implementing the {@link TokenStream#next(Token)} API.
+  Failing that, to create a new Token you should first use
+  one of the constructors that start with null text.  Then
+  you should call either {@link #termBuffer()} or {@link
+  #resizeTermBuffer(int)} to retrieve the Token's
+  termBuffer.  Fill in the characters of your term into this
+  buffer, and finally call {@link #setTermLength(int)} to
+  set the length of the term text.  See LUCENE-969
+  for details.
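+  <p>
+  For example (an illustrative sketch only; the
+  <code>fill</code> helper is hypothetical and not part of
+  this API):
+  <pre>
+    void fill(Token t, String word) {
+      char[] buffer = t.resizeTermBuffer(word.length());
+      word.getChars(0, word.length(), buffer, 0);
+      t.setTermLength(word.length());
+    }
+  </pre>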

+  @see org.apache.lucene.index.Payload
- */
-  // TODO: Remove warning after API has been finalized
+*/
+
+// TODO: Remove warning after API has been finalized
+
 public class Token implements Cloneable {
-  String termText;                  // the text of the term
+
+  private static final String DEFAULT_TYPE = "word";
+  private static int MIN_BUFFER_SIZE = 10;
+
+  /** @deprecated we will remove this when we remove the
+   *  deprecated APIs */
+  private String termText;
+
+  char[] termBuffer;                // characters for the term text
+  int termLength;                   // length of term text in buffer
+
   int startOffset;                  // start in source text
   int endOffset;                    // end in source text
-  String type = "word";             // lexical type
+  String type = DEFAULT_TYPE;       // lexical type
 
   Payload payload;
 
-  // For better indexing speed, use termBuffer (and
-  // termBufferOffset/termBufferLength) instead of termText
-  // to save new'ing a String per token
-  char[] termBuffer;
-  int termBufferOffset;
-  int termBufferLength;
+  int positionIncrement = 1;
 
-  private int positionIncrement = 1;
+  /** Constructs a Token with null text. */
+  public Token() {
+  }
 
-  /** Constructs a Token with the given term text, and start & end offsets.
-      The type defaults to "word." */
+  /** Constructs a Token with null text and start & end
+   *  offsets.
+   *  @param start start offset
+   *  @param end end offset */
+  public Token(int start, int end) {
+    startOffset = start;
+    endOffset = end;
+  }
+
+  /** Constructs a Token with null text and start & end
+   *  offsets plus the Token type.
+   *  @param start start offset
+   *  @param end end offset
+   *  @param typ token type */
+  public Token(int start, int end, String typ) {
+    startOffset = start;
+    endOffset = end;
+    type = typ;
+  }
+
+  /** Constructs a Token with the given term text, and start
+   *  & end offsets.  The type defaults to "word."
+   *  NOTE: for better indexing speed you should
+   *  instead use the char[] termBuffer methods to set the
+   *  term text.
+   *  @param text term text
+   *  @param start start offset
+   *  @param end end offset */
   public Token(String text, int start, int end) {
     termText = text;
     startOffset = start;
     endOffset = end;
   }
 
-  /** Constructs a Token with the given term text buffer
-   *  starting at offset for length lenth, and start & end offsets.
-   *  The type defaults to "word." */
-  public Token(char[] text, int offset, int length, int start, int end) {
-    termBuffer = text;
-    termBufferOffset = offset;
-    termBufferLength = length;
-    startOffset = start;
-    endOffset = end;
-  }
-
-  /** Constructs a Token with the given text, start and end offsets, & type. */
+  /** Constructs a Token with the given text, start and end
+   *  offsets, & type.  NOTE: for better indexing
+   *  speed you should instead use the char[] termBuffer
+   *  methods to set the term text.
+   *  @param text term text
+   *  @param start start offset
+   *  @param end end offset
+   *  @param typ token type */
   public Token(String text, int start, int end, String typ) {
     termText = text;
     startOffset = start;
@@ -91,19 +148,6 @@ public class Token implements Cloneable {
     type = typ;
   }
 
-  /** Constructs a Token with the given term text buffer
-   *  starting at offset for length lenth, and start & end
-   *  offsets, & type. */
-  public Token(char[] text, int offset, int length, int start, int end, String typ) {
-    termBuffer = text;
-    termBufferOffset = offset;
-    termBufferLength = length;
-    startOffset = start;
-    endOffset = end;
-    type = typ;
-  }
-
-
   /** Set the position increment.  This determines the position of this token
    * relative to the previous Token in a {@link TokenStream}, used in phrase
    * searching.
@@ -139,28 +183,103 @@ public class Token implements Cloneable { /** Returns the position increment of this Token. * @see #setPositionIncrement */ - public int getPositionIncrement() { return positionIncrement; } + public int getPositionIncrement() { + return positionIncrement; + } - /** Sets the Token's term text. */ + /** Sets the Token's term text. NOTE: for better + * indexing speed you should instead use the char[] + * termBuffer methods to set the term text. */ public void setTermText(String text) { termText = text; + termBuffer = null; } - /** Returns the Token's term text. */ - public final String termText() { return termText; } - public final char[] termBuffer() { return termBuffer; } - public final int termBufferOffset() { return termBufferOffset; } - public final int termBufferLength() { return termBufferLength; } - - public void setStartOffset(int offset) {this.startOffset = offset;} - public void setEndOffset(int offset) {this.endOffset = offset;} + /** Returns the Token's term text. + * + * @deprecated Use {@link #termBuffer()} and {@link + * #termLength()} instead. */ + public final String termText() { + if (termText == null && termBuffer != null) + termText = new String(termBuffer, 0, termLength); + return termText; + } + /** Copies the contents of buffer, starting at offset for + * length characters, into the termBuffer + * array. NOTE: for better indexing speed you + * should instead retrieve the termBuffer, using {@link + * #termBuffer()} or {@link #resizeTermBuffer(int)}, and + * fill it in directly to set the term text. This saves + * an extra copy. */ public final void setTermBuffer(char[] buffer, int offset, int length) { - this.termBuffer = buffer; - this.termBufferOffset = offset; - this.termBufferLength = length; + resizeTermBuffer(length); + System.arraycopy(buffer, offset, termBuffer, 0, length); + termLength = length; + } + + /** Returns the internal termBuffer character array which + * you can then directly alter. If the array is too + * small for your token, use {@link + * #resizeTermBuffer(int)} to increase it. After + * altering the buffer be sure to call {@link + * #setTermLength} to record the number of valid + * characters that were placed into the termBuffer. */ + public final char[] termBuffer() { + initTermBuffer(); + return termBuffer; + } + + /** Grows the termBuffer to at least size newSize. + * @param newSize minimum size of the new termBuffer + * @return newly created termBuffer with length >= newSize + */ + public char[] resizeTermBuffer(int newSize) { + initTermBuffer(); + if (newSize > termBuffer.length) { + int size = termBuffer.length; + while(size < newSize) + size *= 2; + char[] newBuffer = new char[size]; + System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length); + termBuffer = newBuffer; + } + return termBuffer; + } + + // TODO: once we remove the deprecated termText() method + // and switch entirely to char[] termBuffer we don't need + // to use this method anymore + private void initTermBuffer() { + if (termBuffer == null) { + if (termText == null) { + termBuffer = new char[MIN_BUFFER_SIZE]; + termLength = 0; + } else { + int length = termText.length(); + if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE; + termBuffer = new char[length]; + termLength = termText.length(); + termText.getChars(0, termText.length(), termBuffer, 0); + termText = null; + } + } else if (termText != null) + termText = null; + } + + /** Return number of valid characters (length of the term) + * in the termBuffer array. 
*/ + public final int termLength() { + initTermBuffer(); + return termLength; + } + + /** Set number of valid characters (length of the term) in + * the termBuffer array. */ + public final void setTermLength(int length) { + initTermBuffer(); + termLength = length; } - /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. @@ -168,14 +287,50 @@ public class Token implements Cloneable { Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter. */ - public final int startOffset() { return startOffset; } + public final int startOffset() { + return startOffset; + } + + /** Set the starting offset. + @see #startOffset() */ + public void setStartOffset(int offset) { + this.startOffset = offset; + } /** Returns this Token's ending offset, one greater than the position of the last character corresponding to this token in the source text. */ - public final int endOffset() { return endOffset; } + public final int endOffset() { + return endOffset; + } + + /** Set the ending offset. + @see #endOffset() */ + public void setEndOffset(int offset) { + this.endOffset = offset; + } /** Returns this Token's lexical type. Defaults to "word". */ - public final String type() { return type; } + public final String type() { + return type; + } + + /** Set the lexical type. + @see #type() */ + public final void setType(String type) { + this.type = type; + } + + /** + * Returns this Token's payload. + *

+ * WARNING: The status of the Payloads feature is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ + // TODO: Remove warning after API has been finalized + public Payload getPayload() { + return this.payload; + } /** * Sets this Token's payload. @@ -189,21 +344,15 @@ public class Token implements Cloneable { this.payload = payload; } - /** - * Returns this Token's payload. - *

- * WARNING: The status of the Payloads feature is experimental. - * The APIs introduced here might change in the future and will not be - * supported anymore in such a case. - */ - // TODO: Remove warning after API has been finalized - public Payload getPayload() { - return this.payload; - } - public String toString() { StringBuffer sb = new StringBuffer(); - sb.append("(" + termText + "," + startOffset + "," + endOffset); + sb.append("("); + initTermBuffer(); + if (termBuffer == null) + sb.append("null"); + else + sb.append(termBuffer, 0, termLength); + sb.append("," + startOffset + "," + endOffset); if (!type.equals("word")) sb.append(",type="+type); if (positionIncrement != 1) @@ -212,11 +361,14 @@ public class Token implements Cloneable { return sb.toString(); } - public Object clone() { - try { - return super.clone(); - } catch (CloneNotSupportedException e) { - throw new RuntimeException(e); // shouldn't happen since we implement Cloneable - } + /** Reset all state for this token back to defaults. */ + public void clear() { + payload = null; + // Leave termBuffer to allow re-use + termLength = 0; + termText = null; + positionIncrement = 1; + startOffset = endOffset = 0; + type = DEFAULT_TYPE; } } diff --git a/src/java/org/apache/lucene/analysis/TokenFilter.java b/src/java/org/apache/lucene/analysis/TokenFilter.java index 52e91ac7743..dc6cf9366d7 100644 --- a/src/java/org/apache/lucene/analysis/TokenFilter.java +++ b/src/java/org/apache/lucene/analysis/TokenFilter.java @@ -22,6 +22,8 @@ import java.io.IOException; /** A TokenFilter is a TokenStream whose input is another token stream.

This is an abstract class. + NOTE: subclasses must override at least one of {@link + #next()} or {@link #next(Token)}. */ public abstract class TokenFilter extends TokenStream { /** The source of tokens for this filter. */ diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java index 98ba85a1a4a..61bbe3a0409 100644 --- a/src/java/org/apache/lucene/analysis/TokenStream.java +++ b/src/java/org/apache/lucene/analysis/TokenStream.java @@ -29,11 +29,36 @@ import java.io.IOException;

  <li>{@link TokenFilter}, a TokenStream whose input is another TokenStream.
+  NOTE: subclasses must override at least one of {@link
+  #next()} or {@link #next(Token)}.
 */
 
 public abstract class TokenStream {
-  /** Returns the next token in the stream, or null at EOS. */
-  public abstract Token next() throws IOException;
+
+  /** Returns the next token in the stream, or null at EOS.
+   *  The returned Token is a "full private copy" (not
+   *  re-used across calls to next()) but will be slower
+   *  than calling {@link #next(Token)} instead. */
+  public Token next() throws IOException {
+    Token result = next(new Token());
+    return result;
+  }
+
+  /** Returns the next token in the stream, or null at EOS.
+   *  When possible, the input Token should be used as the
+   *  returned Token (this gives fastest tokenization
+   *  performance), but this is not required and a new Token
+   *  may be returned.  Callers may re-use a single Token
+   *  instance for successive calls to this method and must
+   *  therefore fully consume the previously returned Token
+   *  before calling this method again.
+   *  @param result a Token that may or may not be used to
+   *  return the next token
+   *  @return next token in the stream or null if
+   *  end-of-stream was hit */
+  public Token next(Token result) throws IOException {
+    return next();
+  }
 
   /** Resets this stream to the beginning.  This is an
    *  optional operation, so subclasses may or may not
diff --git a/src/java/org/apache/lucene/analysis/Tokenizer.java b/src/java/org/apache/lucene/analysis/Tokenizer.java
index 0f5bef9d207..83b3b46d539 100644
--- a/src/java/org/apache/lucene/analysis/Tokenizer.java
+++ b/src/java/org/apache/lucene/analysis/Tokenizer.java
@@ -23,6 +23,8 @@ import java.io.IOException;
 /** A Tokenizer is a TokenStream whose input is a Reader.

    This is an abstract class. + NOTE: subclasses must override at least one of {@link + #next()} or {@link #next(Token)}. */ public abstract class Tokenizer extends TokenStream { @@ -41,5 +43,12 @@ public abstract class Tokenizer extends TokenStream { public void close() throws IOException { input.close(); } + + /** Reset the tokenizer to a new reader. Typically, an + * analyzer (in its reusableTokenStream method) will use + * this to re-use a previously created tokenizer. */ + protected void reset(Reader input) throws IOException { + this.input = input; + } } diff --git a/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java b/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java index 852562a32a0..c76392af875 100644 --- a/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis; */ import java.io.Reader; +import java.io.IOException; /** An Analyzer that uses WhitespaceTokenizer. */ @@ -25,4 +26,14 @@ public final class WhitespaceAnalyzer extends Analyzer { public TokenStream tokenStream(String fieldName, Reader reader) { return new WhitespaceTokenizer(reader); } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); + if (tokenizer == null) { + tokenizer = new WhitespaceTokenizer(reader); + setPreviousTokenStream(tokenizer); + } else + tokenizer.reset(reader); + return tokenizer; + } } diff --git a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java index abd8099663b..964e15b54a1 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java @@ -75,4 +75,23 @@ public class StandardAnalyzer extends Analyzer { result = new StopFilter(result, stopSet); return result; } + + private class SavedStreams { + StandardTokenizer tokenStream; + TokenStream filteredTokenStream; + }; + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + SavedStreams streams = (SavedStreams) getPreviousTokenStream(); + if (streams == null) { + streams = new SavedStreams(); + setPreviousTokenStream(streams); + streams.tokenStream = new StandardTokenizer(reader); + streams.filteredTokenStream = new StandardFilter(streams.tokenStream); + streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream); + streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet); + } else + streams.tokenStream.reset(reader); + + return streams.filteredTokenStream; + } } diff --git a/src/java/org/apache/lucene/analysis/standard/StandardFilter.java b/src/java/org/apache/lucene/analysis/standard/StandardFilter.java index 3c526d4e2f0..7f18d7e2f28 100644 --- a/src/java/org/apache/lucene/analysis/standard/StandardFilter.java +++ b/src/java/org/apache/lucene/analysis/standard/StandardFilter.java @@ -18,6 +18,7 @@ package org.apache.lucene.analysis.standard; */ import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; /** Normalizes tokens extracted with {@link StandardTokenizer}. */ @@ -37,33 +38,32 @@ public final class StandardFilter extends TokenFilter { *

    Removes 's from the end of words. *

    Removes dots from acronyms.
  */
-  public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
-    org.apache.lucene.analysis.Token t = input.next();
+  public final Token next(Token result) throws java.io.IOException {
+    Token t = input.next(result);
 
     if (t == null)
       return null;
 
-    String text = t.termText();
-    String type = t.type();
+    char[] buffer = t.termBuffer();
+    final int bufferLength = t.termLength();
+    final String type = t.type();
 
     if (type == APOSTROPHE_TYPE &&      // remove 's
-        (text.endsWith("'s") || text.endsWith("'S"))) {
-      return new org.apache.lucene.analysis.Token
-        (text.substring(0,text.length()-2),
-         t.startOffset(), t.endOffset(), type);
-
+        bufferLength >= 2 &&
+        buffer[bufferLength-2] == '\'' &&
+        (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
+      // Strip last 2 characters off
+      t.setTermLength(bufferLength - 2);
     } else if (type == ACRONYM_TYPE) {      // remove dots
-      StringBuffer trimmed = new StringBuffer();
-      for (int i = 0; i < text.length(); i++) {
-        char c = text.charAt(i);
-        if (c != '.')
-          trimmed.append(c);
+      int upto = 0;
+      for(int i=0;i<bufferLength;i++) {
+        char c = buffer[i];
+        if (c != '.')
+          buffer[upto++] = c;
+      }
+      t.setTermLength(upto);
     }
 
     return t;
   }
 }
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java
 /**
+ * This class is a scanner generated by JFlex 1.4.1
+ * on 8/9/07 10:15 AM from the specification file
+ * /tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+ */
 class StandardTokenizerImpl {
 
   /** This character denotes the end of file */
@@ -297,6 +305,13 @@ public final int yychar()
     return yychar;
 }
 
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t) {
+  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+
 
 /**
  * Creates a new scanner
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
index 9872fe24afa..a9637aeb16a 100644
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.standard;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.Token;
+
 %%
 
 %class StandardTokenizerImpl
@@ -52,6 +54,13 @@ public final int yychar()
 {
     return yychar;
 }
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t) {
+  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
 %}
 
 // basic word: a sequence of digits & letters
diff --git a/src/java/org/apache/lucene/index/DocumentsWriter.java b/src/java/org/apache/lucene/index/DocumentsWriter.java
index 450d1cb37b7..ed6737e673a 100644
--- a/src/java/org/apache/lucene/index/DocumentsWriter.java
+++ b/src/java/org/apache/lucene/index/DocumentsWriter.java
@@ -960,27 +960,17 @@ final class DocumentsWriter {
 
     /** Test whether the text for current Posting p equals
      *  current tokenText. */
-    boolean postingEquals(final String tokenString, final char[] tokenText,
-                          final int tokenTextLen, final int tokenTextOffset) {
+    boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
 
       final char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
       assert text != null;
       int pos = p.textStart & CHAR_BLOCK_MASK;
 
-      if (tokenText == null) {
-        // Compare to String
-        for(int i=0;i<tokenTextLen;i++)
-          if (tokenString.charAt(i) != text[pos++])
-            return false;
-      } else {
-        // Compare to char[]
-        for(int i=0;i<tokenTextLen;i++)
-          if (tokenText[tokenTextOffset+i] != text[pos++])
-            return false;
-      }
-      return true;
+      for(int i=0;i<tokenTextLen;i++)
+        if (tokenText[i] != text[pos++])
+          return false;
+      return true;
     }
 
           if (++length >= maxFieldLength) {
             if (infoStream != null)
              infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
@@ -1357,55 +1347,32 @@
      *  for every term of every document.
      *  Its job is to
      *  update the postings byte stream (Postings hash)
      *  based on the occurrence of a single term. */
-    private void addPosition() {
+    private void addPosition(Token token) {
 
       final Payload payload = token.getPayload();
 
-      final String tokenString;
-      final int tokenTextLen;
-      final int tokenTextOffset;
-
       // Get the text of this term.  Term can either
       // provide a String token or offset into a char[]
       // array
       final char[] tokenText = token.termBuffer();
+      final int tokenTextLen = token.termLength();
 
       int code = 0;
       int code2 = 0;
 
-      if (tokenText == null) {
+      // Compute hashcode
+      int downto = tokenTextLen;
+      while (downto > 0)
+        code = (code*31) + tokenText[--downto];
 
-        // Fallback to String token
-        tokenString = token.termText();
-        tokenTextLen = tokenString.length();
-        tokenTextOffset = 0;
-
-        // Compute hashcode.
-        int downto = tokenTextLen;
-        while (downto > 0)
-          code = (code*31) + tokenString.charAt(--downto);
-
-        // System.out.println("  addPosition: field=" + fieldInfo.name + " string=" + tokenString + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset+token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
-
-      } else {
-        tokenString = null;
-        tokenTextLen = token.termBufferLength();
-        tokenTextOffset = token.termBufferOffset();
-
-        // Compute hashcode
-        int downto = tokenTextLen+tokenTextOffset;
-        while (downto > tokenTextOffset)
-          code = (code*31) + tokenText[--downto];
-
-        // System.out.println("  addPosition: buffer=" + new String(tokenText, tokenTextOffset, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
-      }
+      // System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
 
       int hashPos = code & postingsHashMask;
 
       // Locate Posting in hash
       p = postingsHash[hashPos];
 
-      if (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset)) {
+      if (p != null && !postingEquals(tokenText, tokenTextLen)) {
 
         // Conflict: keep searching different locations in
        // the hash table.
final int inc = code*1347|1; @@ -1413,7 +1380,7 @@ final class DocumentsWriter { code += inc; hashPos = code & postingsHashMask; p = postingsHash[hashPos]; - } while (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset)); + } while (p != null && !postingEquals(tokenText, tokenTextLen)); } final int proxCode; @@ -1492,10 +1459,7 @@ final class DocumentsWriter { p.textStart = textUpto + charPool.byteOffset; charPool.byteUpto += textLen1; - if (tokenString == null) - System.arraycopy(tokenText, tokenTextOffset, text, textUpto, tokenTextLen); - else - tokenString.getChars(0, tokenTextLen, text, textUpto); + System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen); text[textUpto+tokenTextLen] = 0xffff; diff --git a/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java index 94a17313343..ead291f3fe6 100644 --- a/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java +++ b/src/test/org/apache/lucene/analysis/TestCachingTokenFilter.java @@ -94,7 +94,7 @@ public class TestCachingTokenFilter extends TestCase { Token token; while ((token = stream.next()) != null) { assertTrue(count < tokens.length); - assertEquals(tokens[count], token.termText); + assertEquals(tokens[count], token.termText()); count++; } diff --git a/src/test/org/apache/lucene/analysis/TestToken.java b/src/test/org/apache/lucene/analysis/TestToken.java new file mode 100644 index 00000000000..a95fa321013 --- /dev/null +++ b/src/test/org/apache/lucene/analysis/TestToken.java @@ -0,0 +1,56 @@ +package org.apache.lucene.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.*; +import junit.framework.*; + +public class TestToken extends TestCase { + + public TestToken(String name) { + super(name); + } + + public void testToString() throws Exception { + char[] b = {'a', 'l', 'o', 'h', 'a'}; + Token t = new Token("", 0, 5); + t.setTermBuffer(b, 0, 5); + assertEquals("(aloha,0,5)", t.toString()); + + t.setTermText("hi there"); + assertEquals("(hi there,0,5)", t.toString()); + } + + public void testMixedStringArray() throws Exception { + Token t = new Token("hello", 0, 5); + assertEquals(t.termText(), "hello"); + assertEquals(t.termLength(), 5); + assertEquals(new String(t.termBuffer(), 0, 5), "hello"); + t.setTermText("hello2"); + assertEquals(t.termLength(), 6); + assertEquals(new String(t.termBuffer(), 0, 6), "hello2"); + t.setTermBuffer("hello3".toCharArray(), 0, 6); + assertEquals(t.termText(), "hello3"); + + // Make sure if we get the buffer and change a character + // that termText() reflects the change + char[] buffer = t.termBuffer(); + buffer[1] = 'o'; + assertEquals(t.termText(), "hollo3"); + } +}
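
To make the new reuse contract concrete, the following is a small usage sketch written against the APIs this patch adds. It is illustrative only and not part of the commit: the ReuseDemo and UpperCaseFilter classes and the "body" field name are made up for the example; everything they call (reusableTokenStream, next(Token), termBuffer, termLength) is defined above.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class ReuseDemo {

  // A user filter written against the new API: it overrides
  // next(Token) and edits the re-used Token's termBuffer in
  // place, mirroring the LowerCaseFilter change in this patch.
  static final class UpperCaseFilter extends TokenFilter {
    UpperCaseFilter(TokenStream in) {
      super(in);
    }
    public Token next(Token result) throws IOException {
      result = input.next(result);
      if (result == null)
        return null;
      final char[] buffer = result.termBuffer();
      final int length = result.termLength();
      for (int i = 0; i < length; i++)
        buffer[i] = Character.toUpperCase(buffer[i]);
      return result;
    }
  }

  public static void main(String[] args) throws IOException {
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    String[] docs = { "aloha honua", "hello world" };

    // One scratch Token is recycled for every term of every document.
    // The Token actually returned may be a different instance, so the
    // loop always consumes the return value, per the next(Token) contract.
    Token reusable = new Token();

    for (int i = 0; i < docs.length; i++) {
      // reusableTokenStream returns the same thread-private tokenizer
      // each time, reset against the new Reader.
      TokenStream stream = new UpperCaseFilter(
          analyzer.reusableTokenStream("body", new StringReader(docs[i])));
      for (Token t = stream.next(reusable); t != null; t = stream.next(reusable)) {
        // The term lives in the char[] termBuffer; a String is built
        // here only for printing.
        System.out.println(new String(t.termBuffer(), 0, t.termLength()));
      }
    }
  }
}

Note the two reuse levels: the analyzer hands back one thread-private TokenStream per thread, and the caller recycles one scratch Token across all next(Token) calls, which is the same pattern DocumentsWriter follows during indexing after this change.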