LUCENE-969: deprecate Token.termText() & optimize core tokenizers by re-using tokens & TokenStreams

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@564715 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2007-08-10 18:34:33 +00:00
parent 82eb074afd
commit d42de32984
28 changed files with 927 additions and 386 deletions

View File: CHANGES.txt

@@ -22,6 +22,12 @@ API Changes
     Field instance during indexing.  This is a sizable performance
     gain, especially for small documents.  (Mike McCandless)
 
+ 4. LUCENE-969: Add new APIs to Token, TokenStream and Analyzer to
+    permit re-using of Token and TokenStream instances during
+    indexing.  Changed Token to use a char[] as the store for the
+    termText instead of String.  This gives faster tokenization
+    performance (~10-15%).  (Mike McCandless)
+
 Bug fixes
 
  1. LUCENE-933: QueryParser fixed to not produce empty sub
@@ -107,6 +113,10 @@ Optimizations
     JavaCC to generate the tokenizer.
     (Stanislaw Osinski via Mike McCandless)
 
+ 8. LUCENE-969: Changed core tokenizers & filters to re-use Token and
+    TokenStream instances when possible to improve tokenization
+    performance (~10-15%).  (Mike McCandless)
+
 Documentation
 
 Build

View File: ReadTokensTask.java

@@ -73,7 +73,7 @@ public class ReadTokensTask extends PerfTask {
     super.tearDown();
   }
 
-  Token token = new Token("", 0, 0);
+  Token token = new Token();
 
   public int doLogic() throws Exception {
     List fields = doc.getFields();
@@ -104,13 +104,13 @@ public class ReadTokensTask extends PerfTask {
       }
 
       // Tokenize field
-      stream = analyzer.tokenStream(field.name(), reader);
+      stream = analyzer.reusableTokenStream(field.name(), reader);
     }
 
     // reset the TokenStream to the first token
     stream.reset();
 
-    while(stream.next() != null)
+    while(stream.next(token) != null)
       tokenCount++;
   }
   totalTokenCount += tokenCount;

View File: Analyzer.java

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
  */
 
 import java.io.Reader;
+import java.io.IOException;
 
 /** An Analyzer builds TokenStreams, which analyze text.  It thus represents a
  *  policy for extracting index terms from text.
@@ -37,6 +38,33 @@ public abstract class Analyzer {
       field name for backward compatibility. */
   public abstract TokenStream tokenStream(String fieldName, Reader reader);
 
+  /** Creates a TokenStream that is allowed to be re-used
+   *  from the previous time that the same thread called
+   *  this method.  Callers that do not need to use more
+   *  than one TokenStream at the same time from this
+   *  analyzer should use this method for better
+   *  performance.
+   */
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    return tokenStream(fieldName, reader);
+  }
+
+  private ThreadLocal tokenStreams = new ThreadLocal();
+
+  /** Used by Analyzers that implement reusableTokenStream
+   *  to retrieve previously saved TokenStreams for re-use
+   *  by the same thread. */
+  protected Object getPreviousTokenStream() {
+    return tokenStreams.get();
+  }
+
+  /** Used by Analyzers that implement reusableTokenStream
+   *  to save a TokenStream for later re-use by the same
+   *  thread. */
+  protected void setPreviousTokenStream(Object obj) {
+    tokenStreams.set(obj);
+  }
+
   /**
    * Invoked before indexing a Fieldable instance if
@@ -56,4 +84,3 @@ public abstract class Analyzer {
     return 0;
   }
-
 }
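For illustration (not part of this commit): the saved stream lives in a ThreadLocal, so one shared Analyzer hands each thread its own re-usable stream. This sketch only uses APIs added or touched by this change; the demo class name is hypothetical.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

// Sketch only: each thread quietly gets, and re-uses, its own stream
// instance from the shared Analyzer's ThreadLocal.
public class ThreadReuseDemo {
  public static void main(String[] args) {
    final Analyzer analyzer = new WhitespaceAnalyzer();
    Runnable job = new Runnable() {
      public void run() {
        try {
          TokenStream first = analyzer.reusableTokenStream("f", new StringReader("a b"));
          TokenStream second = analyzer.reusableTokenStream("f", new StringReader("c d"));
          System.out.println(first == second);  // true: same instance within one thread
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    };
    new Thread(job).start();  // each thread gets its own saved tokenizer
    new Thread(job).start();
  }
}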

View File: CharArraySet.java

@@ -0,0 +1,149 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A simple class that can store & retrieve char[]'s in a
* hash table. Note that this is not a general purpose
* class. For example, it cannot remove char[]'s from the
* set, nor does it resize its hash table to be smaller,
* etc. It is designed for use with StopFilter to enable
* quick filtering based on the char[] termBuffer in a
* Token.
*/
final class CharArraySet {
private final static int INIT_SIZE = 8;
private final static double MAX_LOAD_FACTOR = 0.75;
private int mask;
private char[][] entries;
private int count;
private boolean ignoreCase;
/** Create set with enough capacity to hold startSize
* terms */
public CharArraySet(int startSize, boolean ignoreCase) {
this.ignoreCase = ignoreCase;
int size = INIT_SIZE;
while(((double) startSize)/size >= MAX_LOAD_FACTOR)
size *= 2;
mask = size-1;
entries = new char[size][];
}
/** Returns true if the characters in text up to length
* len are present in the set. */
public boolean contains(char[] text, int len) {
int code = getHashCode(text, len);
int pos = code & mask;
char[] text2 = entries[pos];
if (text2 != null && !equals(text, len, text2)) {
final int inc = code*1347|1;
do {
code += inc;
pos = code & mask;
text2 = entries[pos];
} while (text2 != null && !equals(text, len, text2));
}
return text2 != null;
}
/** Add this String into the set */
public void add(String text) {
add(text.toCharArray());
}
/** Add this text into the set */
public void add(char[] text) {
if (ignoreCase)
for(int i=0;i<text.length;i++)
text[i] = Character.toLowerCase(text[i]);
int code = getHashCode(text, text.length);
int pos = code & mask;
char[] text2 = entries[pos];
if (text2 != null) {
final int inc = code*1347|1;
do {
code += inc;
pos = code & mask;
text2 = entries[pos];
} while (text2 != null);
}
entries[pos] = text;
count++;
if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
rehash();
}
}
private boolean equals(char[] text1, int len, char[] text2) {
if (len != text2.length)
return false;
for(int i=0;i<len;i++) {
if (ignoreCase) {
if (Character.toLowerCase(text1[i]) != text2[i])
return false;
} else {
if (text1[i] != text2[i])
return false;
}
}
return true;
}
private void rehash() {
final int newSize = 2*count;
mask = newSize-1;
char[][] newEntries = new char[newSize][];
for(int i=0;i<entries.length;i++) {
char[] text = entries[i];
if (text != null) {
int code = getHashCode(text, text.length);
int pos = code & mask;
if (newEntries[pos] != null) {
final int inc = code*1347|1;
do {
code += inc;
pos = code & mask;
} while (newEntries[pos] != null);
}
newEntries[pos] = text;
}
}
entries = newEntries;
}
private int getHashCode(char[] text, int len) {
int downto = len;
int code = 0;
while (downto > 0) {
final char c;
if (ignoreCase)
c = Character.toLowerCase(text[--downto]);
else
c = text[--downto];
code = (code*31) + c;
}
return code;
}
}
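For illustration (not part of this commit): exercising the char[]-based membership test that StopFilter relies on. CharArraySet is package-private, so the sketch assumes it is compiled into org.apache.lucene.analysis; the demo class is hypothetical.

package org.apache.lucene.analysis;

// Sketch only: the char[]-based lookup avoids allocating a String per token.
class CharArraySetDemo {
  public static void main(String[] args) {
    CharArraySet stops = new CharArraySet(2, /*ignoreCase=*/true);
    stops.add("the");
    stops.add("and");

    char[] termBuffer = {'T', 'h', 'e', 'x', 'x'};   // only the first 3 chars are valid
    System.out.println(stops.contains(termBuffer, 3)); // true: "The" matches, ignoring case
    System.out.println(stops.contains(termBuffer, 4)); // false: "Thex" is not in the set
  }
}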

View File: CharTokenizer.java

@@ -28,8 +28,7 @@ public abstract class CharTokenizer extends Tokenizer {
   private int offset = 0, bufferIndex = 0, dataLen = 0;
   private static final int MAX_WORD_LEN = 255;
-  private static final int IO_BUFFER_SIZE = 1024;
-  private final char[] buffer = new char[MAX_WORD_LEN];
+  private static final int IO_BUFFER_SIZE = 4096;
   private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
 
   /** Returns true iff a character should be included in a token.  This
@@ -45,31 +44,32 @@ public abstract class CharTokenizer extends Tokenizer {
     return c;
   }
 
-  /** Returns the next token in the stream, or null at EOS. */
-  public final Token next() throws IOException {
+  public final Token next(Token token) throws IOException {
     int length = 0;
-    int start = offset;
+    int start = bufferIndex;
+    char[] buffer = token.termBuffer();
     while (true) {
-      final char c;
-
-      offset++;
       if (bufferIndex >= dataLen) {
+        offset += dataLen;
         dataLen = input.read(ioBuffer);
-        bufferIndex = 0;
-      }
-      ;
       if (dataLen == -1) {
         if (length > 0)
           break;
         else
           return null;
-      } else
-        c = ioBuffer[bufferIndex++];
+      }
+        bufferIndex = 0;
+      }
+
+      final char c = ioBuffer[bufferIndex++];
 
       if (isTokenChar(c)) {               // if it's a token char
 
         if (length == 0)                  // start of token
-          start = offset - 1;
+          start = offset + bufferIndex - 1;
+        else if (length == buffer.length)
+          buffer = token.resizeTermBuffer(1+length);
 
         buffer[length++] = normalize(c);  // buffer it, normalized
@@ -78,9 +78,18 @@ public abstract class CharTokenizer extends Tokenizer {
       } else if (length > 0)              // at non-Letter w/ chars
         break;                            // return 'em
     }
 
-    return new Token(new String(buffer, 0, length), start, start + length);
+    token.termLength = length;
+    token.startOffset = start;
+    token.endOffset = start+length;
+    return token;
+  }
+
+  public void reset(Reader input) throws IOException {
+    super.reset(input);
+    bufferIndex = 0;
+    offset = 0;
+    dataLen = 0;
   }
 }
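For illustration (not part of this commit): the subclass contract for CharTokenizer is unchanged by this patch. A subclass supplies isTokenChar and, optionally, normalize, exactly as Lucene's LetterTokenizer and LowerCaseTokenizer do; the class name here is hypothetical.

import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;

// Hypothetical subclass: isTokenChar() picks the characters that belong
// to a token; normalize() folds each one as next(Token) buffers it.
public class LowercasedLetterTokenizer extends CharTokenizer {
  public LowercasedLetterTokenizer(Reader input) {
    super(input);
  }

  protected boolean isTokenChar(char c) {
    return Character.isLetter(c);      // split on anything that is not a letter
  }

  protected char normalize(char c) {
    return Character.toLowerCase(c);   // applied per char by the reusable next(Token)
  }
}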

View File: ISOLatin1AccentFilter.java

@@ -29,51 +29,68 @@ public class ISOLatin1AccentFilter extends TokenFilter {
     super(input);
   }
 
-  public final Token next() throws java.io.IOException {
-    final Token t = input.next();
-    if (t != null)
-      t.setTermText(removeAccents(t.termText()));
-    return t;
+  private char[] output = new char[256];
+  private int outputPos;
+
+  public final Token next(Token result) throws java.io.IOException {
+    result = input.next(result);
+    if (result != null) {
+      outputPos = 0;
+      removeAccents(result.termBuffer(), result.termLength());
+      result.setTermBuffer(output, 0, outputPos);
+      return result;
+    } else
+      return null;
+  }
+
+  private final void addChar(char c) {
+    if (outputPos == output.length) {
+      char[] newArray = new char[2*output.length];
+      System.arraycopy(output, 0, newArray, 0, output.length);
+      output = newArray;
+    }
+    output[outputPos++] = c;
   }
 
   /**
    * To replace accented characters in a String by unaccented equivalents.
   */
-  public final static String removeAccents(String input) {
-    final StringBuffer output = new StringBuffer();
-    for (int i = 0; i < input.length(); i++) {
-      switch (input.charAt(i)) {
+  public final void removeAccents(char[] input, int length) {
+    int pos = 0;
+    for (int i=0; i<length; i++, pos++) {
+      switch (input[pos]) {
       case '\u00C0' : // À
       case '\u00C1' : // Á
       case '\u00C2' : // Â
       case '\u00C3' : // Ã
       case '\u00C4' : // Ä
       case '\u00C5' : // Å
-        output.append("A");
+        addChar('A');
         break;
       case '\u00C6' : // Æ
-        output.append("AE");
+        addChar('A');
+        addChar('E');
         break;
       case '\u00C7' : // Ç
-        output.append("C");
+        addChar('C');
         break;
       case '\u00C8' : // È
       case '\u00C9' : // É
       case '\u00CA' : // Ê
       case '\u00CB' : // Ë
-        output.append("E");
+        addChar('E');
         break;
       case '\u00CC' : // Ì
       case '\u00CD' : // Í
       case '\u00CE' : // Î
       case '\u00CF' : // Ï
-        output.append("I");
+        addChar('I');
         break;
       case '\u00D0' : // Ð
-        output.append("D");
+        addChar('D');
         break;
       case '\u00D1' : // Ñ
-        output.append("N");
+        addChar('N');
         break;
       case '\u00D2' : // Ò
       case '\u00D3' : // Ó
@@ -81,23 +98,25 @@ public class ISOLatin1AccentFilter extends TokenFilter {
       case '\u00D5' : // Õ
       case '\u00D6' : // Ö
       case '\u00D8' : // Ø
-        output.append("O");
+        addChar('O');
         break;
       case '\u0152' : // Œ
-        output.append("OE");
+        addChar('O');
+        addChar('E');
         break;
       case '\u00DE' : // Þ
-        output.append("TH");
+        addChar('T');
+        addChar('H');
         break;
       case '\u00D9' : // Ù
       case '\u00DA' : // Ú
       case '\u00DB' : // Û
       case '\u00DC' : // Ü
-        output.append("U");
+        addChar('U');
         break;
       case '\u00DD' : // Ý
       case '\u0178' : // Ÿ
-        output.append("Y");
+        addChar('Y');
         break;
       case '\u00E0' : // à
       case '\u00E1' : // á
@@ -105,31 +124,32 @@ public class ISOLatin1AccentFilter extends TokenFilter {
       case '\u00E3' : // ã
       case '\u00E4' : // ä
       case '\u00E5' : // å
-        output.append("a");
+        addChar('a');
         break;
       case '\u00E6' : // æ
-        output.append("ae");
+        addChar('a');
+        addChar('e');
         break;
       case '\u00E7' : // ç
-        output.append("c");
+        addChar('c');
         break;
       case '\u00E8' : // è
       case '\u00E9' : // é
       case '\u00EA' : // ê
       case '\u00EB' : // ë
-        output.append("e");
+        addChar('e');
         break;
       case '\u00EC' : // ì
       case '\u00ED' : // í
       case '\u00EE' : // î
       case '\u00EF' : // ï
-        output.append("i");
+        addChar('i');
         break;
       case '\u00F0' : // ð
-        output.append("d");
+        addChar('d');
         break;
       case '\u00F1' : // ñ
-        output.append("n");
+        addChar('n');
         break;
       case '\u00F2' : // ò
       case '\u00F3' : // ó
@@ -137,32 +157,34 @@ public class ISOLatin1AccentFilter extends TokenFilter {
       case '\u00F5' : // õ
       case '\u00F6' : // ö
       case '\u00F8' : // ø
-        output.append("o");
+        addChar('o');
         break;
       case '\u0153' : // œ
-        output.append("oe");
+        addChar('o');
+        addChar('e');
         break;
       case '\u00DF' : // ß
-        output.append("ss");
+        addChar('s');
+        addChar('s');
         break;
       case '\u00FE' : // þ
-        output.append("th");
+        addChar('t');
+        addChar('h');
         break;
       case '\u00F9' : // ù
       case '\u00FA' : // ú
       case '\u00FB' : // û
       case '\u00FC' : // ü
-        output.append("u");
+        addChar('u');
         break;
       case '\u00FD' : // ý
       case '\u00FF' : // ÿ
-        output.append("y");
+        addChar('y');
         break;
       default :
-        output.append(input.charAt(i));
+        addChar(input[pos]);
         break;
       }
     }
-    return output.toString();
   }
 }
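For illustration (not part of this commit): a usage sketch draining the filter through the reusable next(Token) API. The demo class is hypothetical.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class AccentDemo {
  public static void main(String[] args) throws IOException {
    TokenStream stream =
      new ISOLatin1AccentFilter(new WhitespaceTokenizer(new StringReader("Æsop déjà vu")));
    final Token reusable = new Token();  // one Token re-used across calls
    Token t;
    while ((t = stream.next(reusable)) != null)
      // termBuffer()/termLength() expose the folded text: AEsop, deja, vu
      System.out.println(new String(t.termBuffer(), 0, t.termLength()));
  }
}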

View File: KeywordAnalyzer.java

@@ -28,4 +28,13 @@ public class KeywordAnalyzer extends Analyzer {
                                    final Reader reader) {
     return new KeywordTokenizer(reader);
   }
+
+  public TokenStream reusableTokenStream(String fieldName,
+                                         final Reader reader) {
+    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
+    if (tokenizer == null) {
+      tokenizer = new KeywordTokenizer(reader);
+      setPreviousTokenStream(tokenizer);
+    }
+    return tokenizer;
+  }
 }

View File: KeywordTokenizer.java

@@ -28,7 +28,6 @@ public class KeywordTokenizer extends Tokenizer {
   private static final int DEFAULT_BUFFER_SIZE = 256;
 
   private boolean done;
-  private final char[] buffer;
 
   public KeywordTokenizer(Reader input) {
     this(input, DEFAULT_BUFFER_SIZE);
@@ -36,23 +35,23 @@ public class KeywordTokenizer extends Tokenizer {
   public KeywordTokenizer(Reader input, int bufferSize) {
     super(input);
-    this.buffer = new char[bufferSize];
     this.done = false;
   }
 
-  public Token next() throws IOException {
+  public Token next(Token result) throws IOException {
     if (!done) {
       done = true;
-      StringBuffer buffer = new StringBuffer();
-      int length;
+      int upto = 0;
+      char[] buffer = result.termBuffer();
       while (true) {
-        length = input.read(this.buffer);
+        final int length = input.read(buffer, upto, buffer.length-upto);
         if (length == -1) break;
-
-        buffer.append(this.buffer, 0, length);
+        upto += length;
+        if (upto == buffer.length)
+          buffer = result.resizeTermBuffer(1+buffer.length);
       }
-      String text = buffer.toString();
-      return new Token(text, 0, text.length());
+      result.termLength = upto;
+      return result;
     }
     return null;
   }

View File: LengthFilter.java

@@ -44,12 +44,12 @@ public final class LengthFilter extends TokenFilter {
   /**
    * Returns the next input Token whose termText() is the right len
    */
-  public final Token next() throws IOException
+  public final Token next(Token result) throws IOException
   {
     // return the first non-stop word found
-    for (Token token = input.next(); token != null; token = input.next())
+    for (Token token = input.next(result); token != null; token = input.next(result))
     {
-      int len = token.termText().length();
+      int len = token.termLength();
       if (len >= min && len <= max) {
         return token;
       }

View File: LowerCaseFilter.java

@@ -29,14 +29,17 @@ public final class LowerCaseFilter extends TokenFilter {
     super(in);
   }
 
-  public final Token next() throws IOException {
-    Token t = input.next();
-
-    if (t == null)
+  public final Token next(Token result) throws IOException {
+    result = input.next(result);
+    if (result != null) {
+      final char[] buffer = result.termBuffer();
+      final int length = result.termLength;
+      for(int i=0;i<length;i++)
+        buffer[i] = Character.toLowerCase(buffer[i]);
+      return result;
+    } else
       return null;
-
-    t.termText = t.termText.toLowerCase();
-
-    return t;
   }
 }

View File: PerFieldAnalyzerWrapper.java

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
  */
 
 import java.io.Reader;
+import java.io.IOException;
 import java.util.Map;
 import java.util.HashMap;
 
@@ -75,6 +76,14 @@ public class PerFieldAnalyzerWrapper extends Analyzer {
     return analyzer.tokenStream(fieldName, reader);
   }
 
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);
+    if (analyzer == null)
+      analyzer = defaultAnalyzer;
+
+    return analyzer.reusableTokenStream(fieldName, reader);
+  }
+
   /** Return the positionIncrementGap from the analyzer assigned to fieldName */
   public int getPositionIncrementGap(String fieldName) {
     Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);

View File: PorterStemFilter.java

@@ -45,16 +45,13 @@ public final class PorterStemFilter extends TokenFilter {
     stemmer = new PorterStemmer();
   }
 
-  /** Returns the next input Token, after being stemmed */
-  public final Token next() throws IOException {
-    Token token = input.next();
-    if (token == null)
-      return null;
-    else {
-      String s = stemmer.stem(token.termText);
-      if (s != token.termText) // Yes, I mean object reference comparison here
-        token.termText = s;
-      return token;
-    }
+  public final Token next(Token result) throws IOException {
+    result = input.next(result);
+    if (result != null) {
+      if (stemmer.stem(result.termBuffer(), 0, result.termLength))
+        result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
+      return result;
+    } else
+      return null;
   }
 }

View File: SimpleAnalyzer.java

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
  */
 
 import java.io.Reader;
+import java.io.IOException;
 
 /** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
 
@@ -25,4 +26,14 @@ public final class SimpleAnalyzer extends Analyzer {
   public TokenStream tokenStream(String fieldName, Reader reader) {
     return new LowerCaseTokenizer(reader);
   }
+
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
+    if (tokenizer == null) {
+      tokenizer = new LowerCaseTokenizer(reader);
+      setPreviousTokenStream(tokenizer);
+    } else
+      tokenizer.reset(reader);
+    return tokenizer;
+  }
 }

View File: StopAnalyzer.java

@@ -71,5 +71,22 @@ public final class StopAnalyzer extends Analyzer {
   public TokenStream tokenStream(String fieldName, Reader reader) {
     return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
   }
+
+  /** Filters LowerCaseTokenizer with StopFilter. */
+  private class SavedStreams {
+    Tokenizer source;
+    TokenStream result;
+  };
+
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+    if (streams == null) {
+      streams = new SavedStreams();
+      streams.source = new LowerCaseTokenizer(reader);
+      streams.result = new StopFilter(streams.source, stopWords);
+      setPreviousTokenStream(streams);
+    } else
+      streams.source.reset(reader);
+    return streams.result;
+  }
 }

View File: StopFilter.java

@@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Set;
 
 /**
@@ -27,7 +28,7 @@ import java.util.Set;
 public final class StopFilter extends TokenFilter {
 
-  private final Set stopWords;
+  private final CharArraySet stopWords;
   private final boolean ignoreCase;
 
   /**
@@ -45,7 +46,7 @@ public final class StopFilter extends TokenFilter {
   public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
     super(in);
     this.ignoreCase = ignoreCase;
-    this.stopWords = makeStopSet(stopWords, ignoreCase);
+    this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
   }
 
@@ -59,7 +60,10 @@ public final class StopFilter extends TokenFilter {
   {
     super(input);
     this.ignoreCase = ignoreCase;
-    this.stopWords = stopWords;
+    this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
+    Iterator it = stopWords.iterator();
+    while(it.hasNext())
+      this.stopWords.add((String) it.next());
   }
 
   /**
@@ -99,16 +103,21 @@ public final class StopFilter extends TokenFilter {
     return stopTable;
   }
 
+  private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
+    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
+    for (int i = 0; i < stopWords.length; i++)
+      stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
+    return stopSet;
+  }
+
   /**
    * Returns the next input Token whose termText() is not a stop word.
    */
-  public final Token next() throws IOException {
+  public final Token next(Token result) throws IOException {
     // return the first non-stop word found
-    for (Token token = input.next(); token != null; token = input.next())
-    {
-      String termText = ignoreCase ? token.termText.toLowerCase() : token.termText;
-      if (!stopWords.contains(termText))
-        return token;
+    while((result = input.next(result)) != null) {
+      if (!stopWords.contains(result.termBuffer(), result.termLength))
+        return result;
     }
     // reached EOS -- return null
     return null;
View File: Token.java

@@ -1,8 +1,5 @@
 package org.apache.lucene.analysis;
 
-import org.apache.lucene.index.Payload;
-import org.apache.lucene.index.TermPositions;
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -20,6 +17,9 @@ import org.apache.lucene.index.TermPositions;
  * limitations under the License.
  */
 
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.index.TermPositions;
+
 /** A Token is an occurrence of a term from the text of a field.  It consists of
    a term's text, the start and end offset of the term in the text of the field,
    and a type string.
@@ -44,46 +44,103 @@ import org.apache.lucene.index.TermPositions;
   The APIs introduced here might change in the future and will not be
   supported anymore in such a case.</font>
 
+  <br><br>
+
+  <p><b>NOTE:</b> As of 2.3, Token stores the term text
+  internally as a malleable char[] termBuffer instead of
+  String termText.  The indexing code and core tokenizers
+  have been changed to re-use a single Token instance, changing
+  its buffer and other fields in-place as the Token is
+  processed.  This provides substantially better indexing
+  performance as it saves the GC cost of new'ing a Token and
+  String for every term.  The APIs that accept String
+  termText are still available but a warning about the
+  associated performance cost has been added (below).  The
+  {@link #termText()} method has been deprecated.</p>
+
+  <p>Tokenizers and filters should try to re-use a Token
+  instance when possible for best performance, by
+  implementing the {@link TokenStream#next(Token)} API.
+  Failing that, to create a new Token you should first use
+  one of the constructors that starts with null text.  Then
+  you should call either {@link #termBuffer()} or {@link
+  #resizeTermBuffer(int)} to retrieve the Token's
+  termBuffer.  Fill in the characters of your term into this
+  buffer, and finally call {@link #setTermLength(int)} to
+  set the length of the term text.  See <a target="_top"
+  href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
+  for details.</p>
+
   @see org.apache.lucene.index.Payload
   */
 // TODO: Remove warning after API has been finalized
 public class Token implements Cloneable {
 
-  String termText;                        // the text of the term
+  private static final String DEFAULT_TYPE = "word";
+  private static int MIN_BUFFER_SIZE = 10;
+
+  /** @deprecated: we will remove this when we remove the
+   *  deprecated APIs */
+  private String termText;
+
+  char[] termBuffer;                      // characters for the term text
+  int termLength;                         // length of term text in buffer
+
   int startOffset;                        // start in source text
   int endOffset;                          // end in source text
-  String type = "word";                   // lexical type
+  String type = DEFAULT_TYPE;             // lexical type
 
   Payload payload;
 
-  // For better indexing speed, use termBuffer (and
-  // termBufferOffset/termBufferLength) instead of termText
-  // to save new'ing a String per token
-  char[] termBuffer;
-  int termBufferOffset;
-  int termBufferLength;
-
-  private int positionIncrement = 1;
+  int positionIncrement = 1;
 
-  /** Constructs a Token with the given term text, and start & end offsets.
-      The type defaults to "word." */
+  /** Constructs a Token with null text. */
+  public Token() {
+  }
+
+  /** Constructs a Token with null text and start & end
+   *  offsets.
+   *  @param start start offset
+   *  @param end end offset */
+  public Token(int start, int end) {
+    startOffset = start;
+    endOffset = end;
+  }
+
+  /** Constructs a Token with null text and start & end
+   *  offsets plus the Token type.
+   *  @param start start offset
+   *  @param end end offset */
+  public Token(int start, int end, String typ) {
+    startOffset = start;
+    endOffset = end;
+    type = typ;
+  }
+
+  /** Constructs a Token with the given term text, and start
+   *  & end offsets.  The type defaults to "word."
+   *  <b>NOTE:</b> for better indexing speed you should
+   *  instead use the char[] termBuffer methods to set the
+   *  term text.
+   *  @param text term text
+   *  @param start start offset
+   *  @param end end offset */
   public Token(String text, int start, int end) {
     termText = text;
     startOffset = start;
     endOffset = end;
   }
 
-  /** Constructs a Token with the given term text buffer
-   *  starting at offset for length lenth, and start & end offsets.
-   *  The type defaults to "word." */
-  public Token(char[] text, int offset, int length, int start, int end) {
-    termBuffer = text;
-    termBufferOffset = offset;
-    termBufferLength = length;
-    startOffset = start;
-    endOffset = end;
-  }
-
-  /** Constructs a Token with the given text, start and end offsets, & type. */
+  /** Constructs a Token with the given text, start and end
+   *  offsets, & type.  <b>NOTE:</b> for better indexing
+   *  speed you should instead use the char[] termBuffer
+   *  methods to set the term text.
+   *  @param text term text
+   *  @param start start offset
+   *  @param end end offset
+   *  @param typ token type */
   public Token(String text, int start, int end, String typ) {
     termText = text;
     startOffset = start;
@@ -91,19 +148,6 @@ public class Token implements Cloneable {
     type = typ;
   }
 
-  /** Constructs a Token with the given term text buffer
-   *  starting at offset for length lenth, and start & end
-   *  offsets, & type. */
-  public Token(char[] text, int offset, int length, int start, int end, String typ) {
-    termBuffer = text;
-    termBufferOffset = offset;
-    termBufferLength = length;
-    startOffset = start;
-    endOffset = end;
-    type = typ;
-  }
-
   /** Set the position increment.  This determines the position of this token
    *  relative to the previous Token in a {@link TokenStream}, used in phrase
    *  searching.
@@ -139,28 +183,103 @@ public class Token implements Cloneable {
   /** Returns the position increment of this Token.
    *  @see #setPositionIncrement
    */
-  public int getPositionIncrement() { return positionIncrement; }
+  public int getPositionIncrement() {
+    return positionIncrement;
+  }
 
-  /** Sets the Token's term text. */
+  /** Sets the Token's term text.  <b>NOTE:</b> for better
+   *  indexing speed you should instead use the char[]
+   *  termBuffer methods to set the term text. */
   public void setTermText(String text) {
     termText = text;
+    termBuffer = null;
   }
 
-  /** Returns the Token's term text. */
-  public final String termText() { return termText; }
-
-  public final char[] termBuffer() { return termBuffer; }
-  public final int termBufferOffset() { return termBufferOffset; }
-  public final int termBufferLength() { return termBufferLength; }
-
-  public void setStartOffset(int offset) {this.startOffset = offset;}
-  public void setEndOffset(int offset) {this.endOffset = offset;}
+  /** Returns the Token's term text.
+   *
+   *  @deprecated Use {@link #termBuffer()} and {@link
+   *  #termLength()} instead. */
+  public final String termText() {
+    if (termText == null && termBuffer != null)
+      termText = new String(termBuffer, 0, termLength);
+    return termText;
+  }
 
+  /** Copies the contents of buffer, starting at offset for
+   *  length characters, into the termBuffer
+   *  array. <b>NOTE:</b> for better indexing speed you
+   *  should instead retrieve the termBuffer, using {@link
+   *  #termBuffer()} or {@link #resizeTermBuffer(int)}, and
+   *  fill it in directly to set the term text.  This saves
+   *  an extra copy. */
   public final void setTermBuffer(char[] buffer, int offset, int length) {
-    this.termBuffer = buffer;
-    this.termBufferOffset = offset;
-    this.termBufferLength = length;
+    resizeTermBuffer(length);
+    System.arraycopy(buffer, offset, termBuffer, 0, length);
+    termLength = length;
   }
+
+  /** Returns the internal termBuffer character array which
+   *  you can then directly alter.  If the array is too
+   *  small for your token, use {@link
+   *  #resizeTermBuffer(int)} to increase it.  After
+   *  altering the buffer be sure to call {@link
+   *  #setTermLength} to record the number of valid
+   *  characters that were placed into the termBuffer. */
+  public final char[] termBuffer() {
+    initTermBuffer();
+    return termBuffer;
+  }
+
+  /** Grows the termBuffer to at least size newSize.
+   *  @param newSize minimum size of the new termBuffer
+   *  @return newly created termBuffer with length >= newSize
+   */
+  public char[] resizeTermBuffer(int newSize) {
+    initTermBuffer();
+    if (newSize > termBuffer.length) {
+      int size = termBuffer.length;
+      while(size < newSize)
+        size *= 2;
+      char[] newBuffer = new char[size];
+      System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
+      termBuffer = newBuffer;
+    }
+    return termBuffer;
+  }
+
+  // TODO: once we remove the deprecated termText() method
+  // and switch entirely to char[] termBuffer we don't need
+  // to use this method anymore
+  private void initTermBuffer() {
+    if (termBuffer == null) {
+      if (termText == null) {
+        termBuffer = new char[MIN_BUFFER_SIZE];
+        termLength = 0;
+      } else {
+        int length = termText.length();
+        if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE;
+        termBuffer = new char[length];
+        termLength = termText.length();
+        termText.getChars(0, termText.length(), termBuffer, 0);
+        termText = null;
+      }
+    } else if (termText != null)
+      termText = null;
+  }
+
+  /** Return number of valid characters (length of the term)
+   *  in the termBuffer array. */
+  public final int termLength() {
+    initTermBuffer();
+    return termLength;
+  }
+
+  /** Set number of valid characters (length of the term) in
+   *  the termBuffer array. */
+  public final void setTermLength(int length) {
+    initTermBuffer();
+    termLength = length;
+  }
 
   /** Returns this Token's starting offset, the position of the first character
     corresponding to this token in the source text.
@@ -168,25 +287,37 @@ public class Token implements Cloneable {
     Note that the difference between endOffset() and startOffset() may not be
     equal to termText.length(), as the term text may have been altered by a
     stemmer or some other filter. */
-  public final int startOffset() { return startOffset; }
+  public final int startOffset() {
+    return startOffset;
+  }
+
+  /** Set the starting offset.
+      @see #startOffset() */
+  public void setStartOffset(int offset) {
+    this.startOffset = offset;
+  }
 
   /** Returns this Token's ending offset, one greater than the position of the
     last character corresponding to this token in the source text. */
-  public final int endOffset() { return endOffset; }
+  public final int endOffset() {
+    return endOffset;
+  }
+
+  /** Set the ending offset.
+      @see #endOffset() */
+  public void setEndOffset(int offset) {
+    this.endOffset = offset;
+  }
 
   /** Returns this Token's lexical type.  Defaults to "word". */
-  public final String type() { return type; }
+  public final String type() {
+    return type;
+  }
 
-  /**
-   * Sets this Token's payload.
-   * <p><font color="#FF0000">
-   * WARNING: The status of the <b>Payloads</b> feature is experimental.
-   * The APIs introduced here might change in the future and will not be
-   * supported anymore in such a case.</font>
-   */
-  // TODO: Remove warning after API has been finalized
-  public void setPayload(Payload payload) {
-    this.payload = payload;
+  /** Set the lexical type.
+      @see #type() */
+  public final void setType(String type) {
+    this.type = type;
   }
 
   /**
@@ -201,9 +332,27 @@ public class Token implements Cloneable {
     return this.payload;
   }
 
+  /**
+   * Sets this Token's payload.
+   * <p><font color="#FF0000">
+   * WARNING: The status of the <b>Payloads</b> feature is experimental.
+   * The APIs introduced here might change in the future and will not be
+   * supported anymore in such a case.</font>
+   */
+  // TODO: Remove warning after API has been finalized
+  public void setPayload(Payload payload) {
+    this.payload = payload;
+  }
+
   public String toString() {
     StringBuffer sb = new StringBuffer();
-    sb.append("(" + termText + "," + startOffset + "," + endOffset);
+    sb.append("(");
+    initTermBuffer();
+    if (termBuffer == null)
+      sb.append("null");
+    else
+      sb.append(termBuffer, 0, termLength);
+    sb.append("," + startOffset + "," + endOffset);
     if (!type.equals("word"))
       sb.append(",type="+type);
     if (positionIncrement != 1)
@@ -212,11 +361,14 @@ public class Token implements Cloneable {
     return sb.toString();
   }
 
-  public Object clone() {
-    try {
-      return super.clone();
-    } catch (CloneNotSupportedException e) {
-      throw new RuntimeException(e);  // shouldn't happen since we implement Cloneable
-    }
+  /** Reset all state for this token back to defaults. */
+  public void clear() {
+    payload = null;
+    // Leave termBuffer to allow re-use
+    termLength = 0;
+    termText = null;
+    positionIncrement = 1;
+    startOffset = endOffset = 0;
+    type = DEFAULT_TYPE;
   }
 }
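For illustration (not part of this commit): the fill pattern the new Token javadoc prescribes, i.e. resizeTermBuffer, write the characters, then setTermLength. The helper class and method here are hypothetical.

import org.apache.lucene.analysis.Token;

public class TokenBufferDemo {
  // Sketch: overwrite a Token's text with `text`, growing the buffer as
  // needed, without allocating a new String per token.
  static void setText(Token t, String text) {
    char[] buffer = t.resizeTermBuffer(text.length()); // buffer.length >= text.length()
    text.getChars(0, text.length(), buffer, 0);        // copy chars directly into the buffer
    t.setTermLength(text.length());                    // record how many chars are valid
  }

  public static void main(String[] args) {
    Token t = new Token(0, 5);  // one of the new null-text constructors
    setText(t, "hello");
    System.out.println(new String(t.termBuffer(), 0, t.termLength())); // hello
  }
}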

View File: TokenFilter.java

@@ -22,6 +22,8 @@ import java.io.IOException;
 /** A TokenFilter is a TokenStream whose input is another token stream.
   <p>
   This is an abstract class.
+  NOTE: subclasses must override at least one of {@link
+  #next()} or {@link #next(Token)}.
   */
 public abstract class TokenFilter extends TokenStream {
   /** The source of tokens for this filter. */

View File: TokenStream.java

@@ -29,11 +29,36 @@ import java.io.IOException;
     <li>{@link TokenFilter}, a TokenStream
         whose input is another TokenStream.
     </ul>
+    NOTE: subclasses must override at least one of {@link
+    #next()} or {@link #next(Token)}.
   */
 
 public abstract class TokenStream {
-  /** Returns the next token in the stream, or null at EOS. */
-  public abstract Token next() throws IOException;
+
+  /** Returns the next token in the stream, or null at EOS.
+   *  The returned Token is a "full private copy" (not
+   *  re-used across calls to next()) but will be slower
+   *  than calling {@link #next(Token)} instead. */
+  public Token next() throws IOException {
+    Token result = next(new Token());
+    return result;
+  }
+
+  /** Returns the next token in the stream, or null at EOS.
+   *  When possible, the input Token should be used as the
+   *  returned Token (this gives fastest tokenization
+   *  performance), but this is not required and a new Token
+   *  may be returned.  Callers may re-use a single Token
+   *  instance for successive calls to this method and must
+   *  therefore fully consume the previously returned Token
+   *  before calling this method again.
+   *  @param result a Token that may or may not be used to
+   *  return
+   *  @return next token in the stream or null if
+   *  end-of-stream was hit */
+  public Token next(Token result) throws IOException {
+    return next();
+  }
 
   /** Resets this stream to the beginning.  This is an
    *  optional operation, so subclasses may or may not
View File: Tokenizer.java

@@ -23,6 +23,8 @@ import java.io.IOException;
 /** A Tokenizer is a TokenStream whose input is a Reader.
   <p>
   This is an abstract class.
+  NOTE: subclasses must override at least one of {@link
+  #next()} or {@link #next(Token)}.
   */
 
 public abstract class Tokenizer extends TokenStream {
@@ -41,5 +43,12 @@ public abstract class Tokenizer extends TokenStream {
   public void close() throws IOException {
     input.close();
   }
+
+  /** Reset the tokenizer to a new reader.  Typically, an
+   *  analyzer (in its reusableTokenStream method) will use
+   *  this to re-use a previously created tokenizer. */
+  protected void reset(Reader input) throws IOException {
+    this.input = input;
+  }
 }

View File: WhitespaceAnalyzer.java

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
  */
 
 import java.io.Reader;
+import java.io.IOException;
 
 /** An Analyzer that uses WhitespaceTokenizer. */
 
@@ -25,4 +26,14 @@ public final class WhitespaceAnalyzer extends Analyzer {
   public TokenStream tokenStream(String fieldName, Reader reader) {
     return new WhitespaceTokenizer(reader);
   }
+
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
+    if (tokenizer == null) {
+      tokenizer = new WhitespaceTokenizer(reader);
+      setPreviousTokenStream(tokenizer);
+    } else
+      tokenizer.reset(reader);
+    return tokenizer;
+  }
 }

View File: StandardAnalyzer.java

@@ -75,4 +75,23 @@ public class StandardAnalyzer extends Analyzer {
     result = new StopFilter(result, stopSet);
     return result;
   }
+
+  private class SavedStreams {
+    StandardTokenizer tokenStream;
+    TokenStream filteredTokenStream;
+  };
+
+  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
+    if (streams == null) {
+      streams = new SavedStreams();
+      setPreviousTokenStream(streams);
+      streams.tokenStream = new StandardTokenizer(reader);
+      streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
+      streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
+      streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
+    } else
+      streams.tokenStream.reset(reader);
+    return streams.filteredTokenStream;
+  }
 }
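The SavedStreams idiom above caches the whole filter chain, not just the tokenizer, because each filter wraps its input stream once at construction time; only the source needs to be re-pointed at the new Reader. A hedged sketch for a custom analyzer follows (class names hypothetical; it assumes the same package, since Tokenizer.reset(Reader) is protected in this patch).

package org.apache.lucene.analysis;

import java.io.IOException;
import java.io.Reader;

// Sketch only: cache tokenizer + filter chain per thread, reset the source.
public class MySavedStreamsAnalyzer extends Analyzer {
  private static class Streams {
    Tokenizer source;
    TokenStream result;
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new LowerCaseFilter(new WhitespaceTokenizer(reader));
  }

  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    Streams streams = (Streams) getPreviousTokenStream();
    if (streams == null) {
      streams = new Streams();
      streams.source = new WhitespaceTokenizer(reader);
      streams.result = new LowerCaseFilter(streams.source);
      setPreviousTokenStream(streams);
    } else
      streams.source.reset(reader);  // only the source needs the new Reader
    return streams.result;
  }
}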

View File: StandardFilter.java

@@ -18,6 +18,7 @@ package org.apache.lucene.analysis.standard;
  */
 
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
 /** Normalizes tokens extracted with {@link StandardTokenizer}. */
@@ -37,33 +38,32 @@ public final class StandardFilter extends TokenFilter {
    * <p>Removes <tt>'s</tt> from the end of words.
    * <p>Removes dots from acronyms.
    */
-  public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
-    org.apache.lucene.analysis.Token t = input.next();
+  public final Token next(Token result) throws java.io.IOException {
+    Token t = input.next(result);
 
     if (t == null)
       return null;
 
-    String text = t.termText();
-    String type = t.type();
+    char[] buffer = t.termBuffer();
+    final int bufferLength = t.termLength();
+    final String type = t.type();
 
     if (type == APOSTROPHE_TYPE &&          // remove 's
-        (text.endsWith("'s") || text.endsWith("'S"))) {
-      return new org.apache.lucene.analysis.Token
-        (text.substring(0,text.length()-2),
-         t.startOffset(), t.endOffset(), type);
+        bufferLength >= 2 &&
+        buffer[bufferLength-2] == '\'' &&
+        (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
+      // Strip last 2 characters off
+      t.setTermLength(bufferLength - 2);
     } else if (type == ACRONYM_TYPE) {      // remove dots
-      StringBuffer trimmed = new StringBuffer();
-      for (int i = 0; i < text.length(); i++) {
-        char c = text.charAt(i);
+      int upto = 0;
+      for(int i=0;i<bufferLength;i++) {
+        char c = buffer[i];
         if (c != '.')
-          trimmed.append(c);
+          buffer[upto++] = c;
       }
-      return new org.apache.lucene.analysis.Token
-        (trimmed.toString(), t.startOffset(), t.endOffset(), type);
-    } else {
-      return t;
+      t.setTermLength(upto);
     }
-  }
+
+    return t;
   }
 }

View File: StandardTokenizer.java

@@ -43,6 +43,9 @@ import org.apache.lucene.analysis.Tokenizer;
 public class StandardTokenizer extends Tokenizer {
   /** A private instance of the JFlex-constructed scanner */
   private final StandardTokenizerImpl scanner;
+
+  void setInput(Reader reader) {
+    this.input = reader;
+  }
 
   /**
    * Creates a new instance of the {@link StandardTokenizer}. Attaches the
@@ -58,19 +61,19 @@ public class StandardTokenizer extends Tokenizer {
    *
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
-  public Token next() throws IOException {
+  public Token next(Token result) throws IOException {
     int tokenType = scanner.getNextToken();
 
     if (tokenType == StandardTokenizerImpl.YYEOF) {
       return null;
     }
 
-    int startPosition = scanner.yychar();
-
-    final String tokenImage = scanner.yytext();
-    return new Token(tokenImage, startPosition, startPosition
-        + tokenImage.length(),
-        StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+    scanner.getText(result);
+    final int start = scanner.yychar();
+    result.setStartOffset(start);
+    result.setEndOffset(start+result.termLength());
+    result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+    return result;
   }
 
   /*
@@ -82,4 +85,9 @@ public class StandardTokenizer extends Tokenizer {
     super.reset();
     scanner.yyreset(input);
   }
+
+  public void reset(Reader reader) throws IOException {
+    input = reader;
+    reset();
+  }
 }

View File: StandardTokenizerImpl.java

@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.4.1 on 8/8/07 10:18 PM */
+/* The following code was generated by JFlex 1.4.1 on 8/9/07 10:15 AM */
 
 package org.apache.lucene.analysis.standard;
 
@@ -19,7 +19,15 @@ package org.apache.lucene.analysis.standard;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.Token;
 
+/**
+ * This class is a scanner generated by
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
+ * on 8/9/07 10:15 AM from the specification file
+ * <tt>/tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
+ */
 class StandardTokenizerImpl {
 
 /** This character denotes the end of file */
@@ -297,6 +305,13 @@ public final int yychar()
     return yychar;
 }
 
+  /**
+   * Fills Lucene token with the current token text.
+   */
+  final void getText(Token t) {
+    t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+  }
+
 /**
  * Creates a new scanner
View File: StandardTokenizerImpl.jflex

@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.standard;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.Token;
+
 %%
 
 %class StandardTokenizerImpl
@@ -52,6 +54,13 @@ public final int yychar()
 {
     return yychar;
 }
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t) {
+  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
 %}
 
 // basic word: a sequence of digits & letters

View File: DocumentsWriter.java

@@ -960,28 +960,18 @@ final class DocumentsWriter {
     /** Test whether the text for current Posting p equals
      *  current tokenText. */
-    boolean postingEquals(final String tokenString, final char[] tokenText,
-                          final int tokenTextLen, final int tokenTextOffset) {
+    boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
 
       final char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
       assert text != null;
       int pos = p.textStart & CHAR_BLOCK_MASK;
 
-      if (tokenText == null) {
-        // Compare to String
-        for(int i=0;i<tokenTextLen;i++)
-          if (tokenString.charAt(i) != text[pos++])
-            return false;
-        return text[pos] == 0xffff;
-      } else {
-        int tokenPos = tokenTextOffset;
-        final int stopAt = tokenTextLen+tokenPos;
-        for(;tokenPos<stopAt;pos++,tokenPos++)
-          if (tokenText[tokenPos] != text[pos])
-            return false;
-        return 0xffff == text[pos];
-      }
+      int tokenPos = 0;
+      for(;tokenPos<tokenTextLen;pos++,tokenPos++)
+        if (tokenText[tokenPos] != text[pos])
+          return false;
+      return 0xffff == text[pos];
     }
 
     /** Compares term text for two Posting instance and
      *  returns -1 if p1 < p2; 1 if p1 > p2; else 0.
@@ -1241,8 +1231,7 @@ final class DocumentsWriter {
     }
 
     int offsetEnd;
-    Token token;
-    Token localToken = new Token("", 0, 0);
+    Token localToken = new Token();
 
     /* Invert one occurrence of one field in the document */
     public void invertField(Fieldable field, Analyzer analyzer, final int maxFieldLength) throws IOException {
@@ -1251,12 +1240,12 @@ final class DocumentsWriter {
         position += analyzer.getPositionIncrementGap(fieldInfo.name);
 
       if (!field.isTokenized()) {             // un-tokenized field
-        token = localToken;
         String stringValue = field.stringValue();
+        Token token = localToken;
         token.setTermText(stringValue);
         token.setStartOffset(offset);
        token.setEndOffset(offset + stringValue.length());
-        addPosition();
+        addPosition(token);
         offset += stringValue.length();
         length++;
       } else {                                // tokenized field
@@ -1282,7 +1271,7 @@ final class DocumentsWriter {
         }
 
         // Tokenize field and add to postingTable
-        stream = analyzer.tokenStream(fieldInfo.name, reader);
+        stream = analyzer.reusableTokenStream(fieldInfo.name, reader);
       }
 
       // reset the TokenStream to the first token
@@ -1290,9 +1279,10 @@ final class DocumentsWriter {
       try {
         offsetEnd = offset-1;
-        for (token = stream.next(); token != null; token = stream.next()) {
+        Token token;
+        while((token = stream.next(localToken)) != null) {
           position += (token.getPositionIncrement() - 1);
-          addPosition();
+          addPosition(token);
           if (++length >= maxFieldLength) {
            if (infoStream != null)
              infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
@@ -1357,55 +1347,32 @@ final class DocumentsWriter {
      *  for every term of every document.  Its job is to
      *  update the postings byte stream (Postings hash)
      *  based on the occurrence of a single term. */
-    private void addPosition() {
+    private void addPosition(Token token) {
 
       final Payload payload = token.getPayload();
 
-      final String tokenString;
-      final int tokenTextLen;
-      final int tokenTextOffset;
-
       // Get the text of this term.  Term can either
       // provide a String token or offset into a char[]
      // array
      final char[] tokenText = token.termBuffer();
+      final int tokenTextLen = token.termLength();
 
      int code = 0;
      int code2 = 0;
 
-      if (tokenText == null) {
-        // Fallback to String token
-        tokenString = token.termText();
-        tokenTextLen = tokenString.length();
-        tokenTextOffset = 0;
-        // Compute hashcode.
-        int downto = tokenTextLen;
-        while (downto > 0)
-          code = (code*31) + tokenString.charAt(--downto);
-        // System.out.println("  addPosition: field=" + fieldInfo.name + " string=" + tokenString + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset+token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
-      } else {
-        tokenString = null;
-        tokenTextLen = token.termBufferLength();
-        tokenTextOffset = token.termBufferOffset();
-        // Compute hashcode
-        int downto = tokenTextLen+tokenTextOffset;
-        while (downto > tokenTextOffset)
-          code = (code*31) + tokenText[--downto];
-        // System.out.println("  addPosition: buffer=" + new String(tokenText, tokenTextOffset, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
-      }
+      // Compute hashcode
+      int downto = tokenTextLen;
+      while (downto > 0)
+        code = (code*31) + tokenText[--downto];
+      // System.out.println("  addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
 
      int hashPos = code & postingsHashMask;
 
      // Locate Posting in hash
      p = postingsHash[hashPos];
 
-      if (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset)) {
+      if (p != null && !postingEquals(tokenText, tokenTextLen)) {
        // Conflict: keep searching different locations in
        // the hash table.
        final int inc = code*1347|1;
@@ -1413,7 +1380,7 @@ final class DocumentsWriter {
          code += inc;
          hashPos = code & postingsHashMask;
          p = postingsHash[hashPos];
-        } while (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset));
+        } while (p != null && !postingEquals(tokenText, tokenTextLen));
      }
 
      final int proxCode;
@@ -1492,10 +1459,7 @@ final class DocumentsWriter {
        p.textStart = textUpto + charPool.byteOffset;
        charPool.byteUpto += textLen1;
 
-        if (tokenString == null)
-          System.arraycopy(tokenText, tokenTextOffset, text, textUpto, tokenTextLen);
-        else
-          tokenString.getChars(0, tokenTextLen, text, textUpto);
+        System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
 
        text[textUpto+tokenTextLen] = 0xffff;
View File: TestCachingTokenFilter.java

@@ -94,7 +94,7 @@ public class TestCachingTokenFilter extends TestCase {
       Token token;
       while ((token = stream.next()) != null) {
         assertTrue(count < tokens.length);
-        assertEquals(tokens[count], token.termText);
+        assertEquals(tokens[count], token.termText());
         count++;
       }

View File: TestToken.java

@@ -0,0 +1,56 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
import junit.framework.*;
public class TestToken extends TestCase {
public TestToken(String name) {
super(name);
}
public void testToString() throws Exception {
char[] b = {'a', 'l', 'o', 'h', 'a'};
Token t = new Token("", 0, 5);
t.setTermBuffer(b, 0, 5);
assertEquals("(aloha,0,5)", t.toString());
t.setTermText("hi there");
assertEquals("(hi there,0,5)", t.toString());
}
public void testMixedStringArray() throws Exception {
Token t = new Token("hello", 0, 5);
assertEquals(t.termText(), "hello");
assertEquals(t.termLength(), 5);
assertEquals(new String(t.termBuffer(), 0, 5), "hello");
t.setTermText("hello2");
assertEquals(t.termLength(), 6);
assertEquals(new String(t.termBuffer(), 0, 6), "hello2");
t.setTermBuffer("hello3".toCharArray(), 0, 6);
assertEquals(t.termText(), "hello3");
// Make sure if we get the buffer and change a character
// that termText() reflects the change
char[] buffer = t.termBuffer();
buffer[1] = 'o';
assertEquals(t.termText(), "hollo3");
}
}