mirror of https://github.com/apache/lucene.git
LUCENE-969: deprecate Token.termText() & optimize core tokenizers by re-using tokens & TokenStreams
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@564715 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
82eb074afd
commit
d42de32984
10
CHANGES.txt
10
CHANGES.txt
|
@ -22,6 +22,12 @@ API Changes
|
||||||
Field instance during indexing. This is a sizable performance
|
Field instance during indexing. This is a sizable performance
|
||||||
gain, especially for small documents. (Mike McCandless)
|
gain, especially for small documents. (Mike McCandless)
|
||||||
|
|
||||||
|
4. LUCENE-969: Add new APIs to Token, TokenStream and Analyzer to
|
||||||
|
permit re-using of Token and TokenStream instances during
|
||||||
|
indexing. Changed Token to use a char[] as the store for the
|
||||||
|
termText instead of String. This gives faster tokenization
|
||||||
|
performance (~10-15%). (Mike McCandless)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
1. LUCENE-933: QueryParser fixed to not produce empty sub
|
1. LUCENE-933: QueryParser fixed to not produce empty sub
|
||||||
|
@ -107,6 +113,10 @@ Optimizations
|
||||||
JavaCC to generate the tokenizer.
|
JavaCC to generate the tokenizer.
|
||||||
(Stanislaw Osinski via Mike McCandless)
|
(Stanislaw Osinski via Mike McCandless)
|
||||||
|
|
||||||
|
8. LUCENE-969: Changed core tokenizers & filters to re-use Token and
|
||||||
|
TokenStream instances when possible to improve tokenization
|
||||||
|
performance (~10-15%). (Mike McCandless)
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
|
@ -73,7 +73,7 @@ public class ReadTokensTask extends PerfTask {
|
||||||
super.tearDown();
|
super.tearDown();
|
||||||
}
|
}
|
||||||
|
|
||||||
Token token = new Token("", 0, 0);
|
Token token = new Token();
|
||||||
|
|
||||||
public int doLogic() throws Exception {
|
public int doLogic() throws Exception {
|
||||||
List fields = doc.getFields();
|
List fields = doc.getFields();
|
||||||
|
@ -104,13 +104,13 @@ public class ReadTokensTask extends PerfTask {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize field
|
// Tokenize field
|
||||||
stream = analyzer.tokenStream(field.name(), reader);
|
stream = analyzer.reusableTokenStream(field.name(), reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
// reset the TokenStream to the first token
|
// reset the TokenStream to the first token
|
||||||
stream.reset();
|
stream.reset();
|
||||||
|
|
||||||
while(stream.next() != null)
|
while(stream.next(token) != null)
|
||||||
tokenCount++;
|
tokenCount++;
|
||||||
}
|
}
|
||||||
totalTokenCount += tokenCount;
|
totalTokenCount += tokenCount;
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
|
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
|
||||||
* policy for extracting index terms from text.
|
* policy for extracting index terms from text.
|
||||||
|
@ -37,6 +38,33 @@ public abstract class Analyzer {
|
||||||
field name for backward compatibility. */
|
field name for backward compatibility. */
|
||||||
public abstract TokenStream tokenStream(String fieldName, Reader reader);
|
public abstract TokenStream tokenStream(String fieldName, Reader reader);
|
||||||
|
|
||||||
|
/** Creates a TokenStream that is allowed to be re-used
|
||||||
|
* from the previous time that the same thread called
|
||||||
|
* this method. Callers that do not need to use more
|
||||||
|
* than one TokenStream at the same time from this
|
||||||
|
* analyzer should use this method for better
|
||||||
|
* performance.
|
||||||
|
*/
|
||||||
|
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||||
|
return tokenStream(fieldName, reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
private ThreadLocal tokenStreams = new ThreadLocal();
|
||||||
|
|
||||||
|
/** Used by Analyzers that implement reusableTokenStream
|
||||||
|
* to retrieve previously saved TokenStreams for re-use
|
||||||
|
* by the same thread. */
|
||||||
|
protected Object getPreviousTokenStream() {
|
||||||
|
return tokenStreams.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Used by Analyzers that implement reusableTokenStream
|
||||||
|
* to save a TokenStream for later re-use by the same
|
||||||
|
* thread. */
|
||||||
|
protected void setPreviousTokenStream(Object obj) {
|
||||||
|
tokenStreams.set(obj);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Invoked before indexing a Fieldable instance if
|
* Invoked before indexing a Fieldable instance if
|
||||||
|
@ -56,4 +84,3 @@ public abstract class Analyzer {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,149 @@
|
||||||
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * A simple class that can store and retrieve char[]'s in a
 * hash table.  Note that this is not a general purpose
 * class.  For example, it cannot remove char[]'s from the
 * set, nor does it resize its hash table to be smaller,
 * etc.  It is designed for use with StopFilter to enable
 * quick filtering based on the char[] termBuffer in a
 * Token.
 *
 * <p>The table uses open addressing.  Its length is always a
 * power of two so that <code>hash &amp; (length-1)</code> can
 * reach every slot, and the probe step is always odd so that a
 * probe sequence eventually visits every slot.  Together with
 * the load factor limit this guarantees that lookups and
 * insertions terminate.
 */
final class CharArraySet {

  /** Smallest table size; must be a power of two. */
  private final static int INIT_SIZE = 8;

  /** The table is doubled once count/length exceeds this ratio. */
  private final static double MAX_LOAD_FACTOR = 0.75;

  /** Open-addressed table; a null slot is empty.  Length is a power of two. */
  private char[][] entries;

  /** Number of distinct words stored in entries. */
  private int count;

  /** When true, words are stored lower-cased and lookups fold case per char. */
  private final boolean ignoreCase;

  /** Create set with enough capacity to hold startSize
   *  terms without rehashing.
   *
   *  @param startSize expected number of terms
   *  @param ignoreCase true if matching should ignore case */
  public CharArraySet(int startSize, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    int size = INIT_SIZE;
    while (((double) startSize)/size >= MAX_LOAD_FACTOR)
      size *= 2;
    entries = new char[size][];
  }

  /** Returns the number of distinct words in the set. */
  public int size() {
    return count;
  }

  /** Returns true if the first len characters of text are
   *  present in the set.
   *
   *  @param text buffer holding the candidate word
   *  @param len number of leading chars of text to consider */
  public boolean contains(char[] text, int len) {
    return entries[findSlot(entries, text, len)] != null;
  }

  /** Add this String into the set (no-op if already present). */
  public void add(String text) {
    // toCharArray() already returns a fresh array, so no extra copy is needed
    insert(text.toCharArray());
  }

  /** Add this text into the set (no-op if already present).
   *  The set stores its own copy; the caller's array is neither
   *  modified nor retained. */
  public void add(char[] text) {
    final char[] copy = new char[text.length];
    System.arraycopy(text, 0, copy, 0, text.length);
    insert(copy);
  }

  /** Stores an array owned by this set, lower-casing it first
   *  when case is ignored, and grows the table if it gets too full. */
  private void insert(char[] owned) {
    if (ignoreCase)
      for(int i=0;i<owned.length;i++)
        owned[i] = Character.toLowerCase(owned[i]);

    final int pos = findSlot(entries, owned, owned.length);
    if (entries[pos] != null)
      return;                       // already present: keep count exact

    entries[pos] = owned;
    count++;

    if (((double) count)/entries.length > MAX_LOAD_FACTOR)
      rehash();
  }

  /** Returns the index in table of the slot that holds the first
   *  len chars of text, or of the empty slot where it belongs. */
  private int findSlot(char[][] table, char[] text, int len) {
    final int tableMask = table.length-1;   // valid because length is a power of two
    int code = getHashCode(text, len);
    int pos = code & tableMask;
    char[] candidate = table[pos];
    if (candidate != null && !matches(text, len, candidate)) {
      // odd step => the probe sequence cycles through every slot
      final int inc = (code*1347)|1;
      do {
        code += inc;
        pos = code & tableMask;
        candidate = table[pos];
      } while (candidate != null && !matches(text, len, candidate));
    }
    return pos;
  }

  /** Compares the first len chars of text1 against a stored entry;
   *  text1 is case-folded per char when case is ignored (stored
   *  entries are already lower-case in that mode). */
  private boolean matches(char[] text1, int len, char[] stored) {
    if (len != stored.length)
      return false;
    for(int i=0;i<len;i++) {
      final char c = ignoreCase ? Character.toLowerCase(text1[i]) : text1[i];
      if (c != stored[i])
        return false;
    }
    return true;
  }

  /** Doubles the table and re-inserts every entry. */
  private void rehash() {
    // Double the table length (NOT 2*count): the length must stay a
    // power of two or the mask would leave slots unreachable.
    final char[][] grown = new char[2*entries.length][];
    for(int i=0;i<entries.length;i++) {
      final char[] text = entries[i];
      if (text != null)
        grown[findSlot(grown, text, text.length)] = text;
    }
    entries = grown;
  }

  /** Hash of the first len chars of text, walking from the end
   *  towards the start; case-folded per char when case is ignored
   *  so that it agrees with matches(). */
  private int getHashCode(char[] text, int len) {
    int downto = len;
    int code = 0;
    while (downto > 0) {
      final char c;
      if (ignoreCase)
        c = Character.toLowerCase(text[--downto]);
      else
        c = text[--downto];
      code = (code*31) + c;
    }
    return code;
  }
}
|
|
@ -28,8 +28,7 @@ public abstract class CharTokenizer extends Tokenizer {
|
||||||
|
|
||||||
private int offset = 0, bufferIndex = 0, dataLen = 0;
|
private int offset = 0, bufferIndex = 0, dataLen = 0;
|
||||||
private static final int MAX_WORD_LEN = 255;
|
private static final int MAX_WORD_LEN = 255;
|
||||||
private static final int IO_BUFFER_SIZE = 1024;
|
private static final int IO_BUFFER_SIZE = 4096;
|
||||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
|
||||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||||
|
|
||||||
/** Returns true iff a character should be included in a token. This
|
/** Returns true iff a character should be included in a token. This
|
||||||
|
@ -45,31 +44,32 @@ public abstract class CharTokenizer extends Tokenizer {
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the next token in the stream, or null at EOS. */
|
public final Token next(Token token) throws IOException {
|
||||||
public final Token next() throws IOException {
|
|
||||||
int length = 0;
|
int length = 0;
|
||||||
int start = offset;
|
int start = bufferIndex;
|
||||||
|
char[] buffer = token.termBuffer();
|
||||||
while (true) {
|
while (true) {
|
||||||
final char c;
|
|
||||||
|
|
||||||
offset++;
|
|
||||||
if (bufferIndex >= dataLen) {
|
if (bufferIndex >= dataLen) {
|
||||||
|
offset += dataLen;
|
||||||
dataLen = input.read(ioBuffer);
|
dataLen = input.read(ioBuffer);
|
||||||
bufferIndex = 0;
|
|
||||||
}
|
|
||||||
;
|
|
||||||
if (dataLen == -1) {
|
if (dataLen == -1) {
|
||||||
if (length > 0)
|
if (length > 0)
|
||||||
break;
|
break;
|
||||||
else
|
else
|
||||||
return null;
|
return null;
|
||||||
} else
|
}
|
||||||
c = ioBuffer[bufferIndex++];
|
bufferIndex = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
final char c = ioBuffer[bufferIndex++];
|
||||||
|
|
||||||
if (isTokenChar(c)) { // if it's a token char
|
if (isTokenChar(c)) { // if it's a token char
|
||||||
|
|
||||||
if (length == 0) // start of token
|
if (length == 0) // start of token
|
||||||
start = offset - 1;
|
start = offset + bufferIndex - 1;
|
||||||
|
else if (length == buffer.length)
|
||||||
|
buffer = token.resizeTermBuffer(1+length);
|
||||||
|
|
||||||
buffer[length++] = normalize(c); // buffer it, normalized
|
buffer[length++] = normalize(c); // buffer it, normalized
|
||||||
|
|
||||||
|
@ -78,9 +78,18 @@ public abstract class CharTokenizer extends Tokenizer {
|
||||||
|
|
||||||
} else if (length > 0) // at non-Letter w/ chars
|
} else if (length > 0) // at non-Letter w/ chars
|
||||||
break; // return 'em
|
break; // return 'em
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return new Token(new String(buffer, 0, length), start, start + length);
|
token.termLength = length;
|
||||||
|
token.startOffset = start;
|
||||||
|
token.endOffset = start+length;
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reset(Reader input) throws IOException {
|
||||||
|
super.reset(input);
|
||||||
|
bufferIndex = 0;
|
||||||
|
offset = 0;
|
||||||
|
dataLen = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,51 +29,68 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
super(input);
|
super(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Token next() throws java.io.IOException {
|
private char[] output = new char[256];
|
||||||
final Token t = input.next();
|
private int outputPos;
|
||||||
if (t != null)
|
|
||||||
t.setTermText(removeAccents(t.termText()));
|
public final Token next(Token result) throws java.io.IOException {
|
||||||
return t;
|
result = input.next(result);
|
||||||
|
if (result != null) {
|
||||||
|
outputPos = 0;
|
||||||
|
removeAccents(result.termBuffer(), result.termLength());
|
||||||
|
result.setTermBuffer(output, 0, outputPos);
|
||||||
|
return result;
|
||||||
|
} else
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private final void addChar(char c) {
|
||||||
|
if (outputPos == output.length) {
|
||||||
|
char[] newArray = new char[2*output.length];
|
||||||
|
System.arraycopy(output, 0, newArray, 0, output.length);
|
||||||
|
output = newArray;
|
||||||
|
}
|
||||||
|
output[outputPos++] = c;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* To replace accented characters in a String by unaccented equivalents.
|
* To replace accented characters in a String by unaccented equivalents.
|
||||||
*/
|
*/
|
||||||
public final static String removeAccents(String input) {
|
public final void removeAccents(char[] input, int length) {
|
||||||
final StringBuffer output = new StringBuffer();
|
int pos = 0;
|
||||||
for (int i = 0; i < input.length(); i++) {
|
for (int i=0; i<length; i++, pos++) {
|
||||||
switch (input.charAt(i)) {
|
switch (input[pos]) {
|
||||||
case '\u00C0' : // À
|
case '\u00C0' : // À
|
||||||
case '\u00C1' : // Á
|
case '\u00C1' : // Á
|
||||||
case '\u00C2' : // Â
|
case '\u00C2' : // Â
|
||||||
case '\u00C3' : // Ã
|
case '\u00C3' : // Ã
|
||||||
case '\u00C4' : // Ä
|
case '\u00C4' : // Ä
|
||||||
case '\u00C5' : // Å
|
case '\u00C5' : // Å
|
||||||
output.append("A");
|
addChar('A');
|
||||||
break;
|
break;
|
||||||
case '\u00C6' : // Æ
|
case '\u00C6' : // Æ
|
||||||
output.append("AE");
|
addChar('A');
|
||||||
|
addChar('E');
|
||||||
break;
|
break;
|
||||||
case '\u00C7' : // Ç
|
case '\u00C7' : // Ç
|
||||||
output.append("C");
|
addChar('C');
|
||||||
break;
|
break;
|
||||||
case '\u00C8' : // È
|
case '\u00C8' : // È
|
||||||
case '\u00C9' : // É
|
case '\u00C9' : // É
|
||||||
case '\u00CA' : // Ê
|
case '\u00CA' : // Ê
|
||||||
case '\u00CB' : // Ë
|
case '\u00CB' : // Ë
|
||||||
output.append("E");
|
addChar('E');
|
||||||
break;
|
break;
|
||||||
case '\u00CC' : // Ì
|
case '\u00CC' : // Ì
|
||||||
case '\u00CD' : // Í
|
case '\u00CD' : // Í
|
||||||
case '\u00CE' : // Î
|
case '\u00CE' : // Î
|
||||||
case '\u00CF' : // Ï
|
case '\u00CF' : // Ï
|
||||||
output.append("I");
|
addChar('I');
|
||||||
break;
|
break;
|
||||||
case '\u00D0' : // Ð
|
case '\u00D0' : // Ð
|
||||||
output.append("D");
|
addChar('D');
|
||||||
break;
|
break;
|
||||||
case '\u00D1' : // Ñ
|
case '\u00D1' : // Ñ
|
||||||
output.append("N");
|
addChar('N');
|
||||||
break;
|
break;
|
||||||
case '\u00D2' : // Ò
|
case '\u00D2' : // Ò
|
||||||
case '\u00D3' : // Ó
|
case '\u00D3' : // Ó
|
||||||
|
@ -81,23 +98,25 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
case '\u00D5' : // Õ
|
case '\u00D5' : // Õ
|
||||||
case '\u00D6' : // Ö
|
case '\u00D6' : // Ö
|
||||||
case '\u00D8' : // Ø
|
case '\u00D8' : // Ø
|
||||||
output.append("O");
|
addChar('O');
|
||||||
break;
|
break;
|
||||||
case '\u0152' : // Œ
|
case '\u0152' : // Œ
|
||||||
output.append("OE");
|
addChar('O');
|
||||||
|
addChar('E');
|
||||||
break;
|
break;
|
||||||
case '\u00DE' : // Þ
|
case '\u00DE' : // Þ
|
||||||
output.append("TH");
|
addChar('T');
|
||||||
|
addChar('H');
|
||||||
break;
|
break;
|
||||||
case '\u00D9' : // Ù
|
case '\u00D9' : // Ù
|
||||||
case '\u00DA' : // Ú
|
case '\u00DA' : // Ú
|
||||||
case '\u00DB' : // Û
|
case '\u00DB' : // Û
|
||||||
case '\u00DC' : // Ü
|
case '\u00DC' : // Ü
|
||||||
output.append("U");
|
addChar('U');
|
||||||
break;
|
break;
|
||||||
case '\u00DD' : // Ý
|
case '\u00DD' : // Ý
|
||||||
case '\u0178' : // Ÿ
|
case '\u0178' : // Ÿ
|
||||||
output.append("Y");
|
addChar('Y');
|
||||||
break;
|
break;
|
||||||
case '\u00E0' : // à
|
case '\u00E0' : // à
|
||||||
case '\u00E1' : // á
|
case '\u00E1' : // á
|
||||||
|
@ -105,31 +124,32 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
case '\u00E3' : // ã
|
case '\u00E3' : // ã
|
||||||
case '\u00E4' : // ä
|
case '\u00E4' : // ä
|
||||||
case '\u00E5' : // å
|
case '\u00E5' : // å
|
||||||
output.append("a");
|
addChar('a');
|
||||||
break;
|
break;
|
||||||
case '\u00E6' : // æ
|
case '\u00E6' : // æ
|
||||||
output.append("ae");
|
addChar('a');
|
||||||
|
addChar('e');
|
||||||
break;
|
break;
|
||||||
case '\u00E7' : // ç
|
case '\u00E7' : // ç
|
||||||
output.append("c");
|
addChar('c');
|
||||||
break;
|
break;
|
||||||
case '\u00E8' : // è
|
case '\u00E8' : // è
|
||||||
case '\u00E9' : // é
|
case '\u00E9' : // é
|
||||||
case '\u00EA' : // ê
|
case '\u00EA' : // ê
|
||||||
case '\u00EB' : // ë
|
case '\u00EB' : // ë
|
||||||
output.append("e");
|
addChar('e');
|
||||||
break;
|
break;
|
||||||
case '\u00EC' : // ì
|
case '\u00EC' : // ì
|
||||||
case '\u00ED' : // í
|
case '\u00ED' : // í
|
||||||
case '\u00EE' : // î
|
case '\u00EE' : // î
|
||||||
case '\u00EF' : // ï
|
case '\u00EF' : // ï
|
||||||
output.append("i");
|
addChar('i');
|
||||||
break;
|
break;
|
||||||
case '\u00F0' : // ð
|
case '\u00F0' : // ð
|
||||||
output.append("d");
|
addChar('d');
|
||||||
break;
|
break;
|
||||||
case '\u00F1' : // ñ
|
case '\u00F1' : // ñ
|
||||||
output.append("n");
|
addChar('n');
|
||||||
break;
|
break;
|
||||||
case '\u00F2' : // ò
|
case '\u00F2' : // ò
|
||||||
case '\u00F3' : // ó
|
case '\u00F3' : // ó
|
||||||
|
@ -137,32 +157,34 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
case '\u00F5' : // õ
|
case '\u00F5' : // õ
|
||||||
case '\u00F6' : // ö
|
case '\u00F6' : // ö
|
||||||
case '\u00F8' : // ø
|
case '\u00F8' : // ø
|
||||||
output.append("o");
|
addChar('o');
|
||||||
break;
|
break;
|
||||||
case '\u0153' : // œ
|
case '\u0153' : // œ
|
||||||
output.append("oe");
|
addChar('o');
|
||||||
|
addChar('e');
|
||||||
break;
|
break;
|
||||||
case '\u00DF' : // ß
|
case '\u00DF' : // ß
|
||||||
output.append("ss");
|
addChar('s');
|
||||||
|
addChar('s');
|
||||||
break;
|
break;
|
||||||
case '\u00FE' : // þ
|
case '\u00FE' : // þ
|
||||||
output.append("th");
|
addChar('t');
|
||||||
|
addChar('h');
|
||||||
break;
|
break;
|
||||||
case '\u00F9' : // ù
|
case '\u00F9' : // ù
|
||||||
case '\u00FA' : // ú
|
case '\u00FA' : // ú
|
||||||
case '\u00FB' : // û
|
case '\u00FB' : // û
|
||||||
case '\u00FC' : // ü
|
case '\u00FC' : // ü
|
||||||
output.append("u");
|
addChar('u');
|
||||||
break;
|
break;
|
||||||
case '\u00FD' : // ý
|
case '\u00FD' : // ý
|
||||||
case '\u00FF' : // ÿ
|
case '\u00FF' : // ÿ
|
||||||
output.append("y");
|
addChar('y');
|
||||||
break;
|
break;
|
||||||
default :
|
default :
|
||||||
output.append(input.charAt(i));
|
addChar(input[pos]);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return output.toString();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -28,4 +28,13 @@ public class KeywordAnalyzer extends Analyzer {
|
||||||
final Reader reader) {
|
final Reader reader) {
|
||||||
return new KeywordTokenizer(reader);
|
return new KeywordTokenizer(reader);
|
||||||
}
|
}
|
||||||
|
public TokenStream reusableTokenStream(String fieldName,
|
||||||
|
final Reader reader) {
|
||||||
|
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
|
||||||
|
if (tokenizer == null) {
|
||||||
|
tokenizer = new KeywordTokenizer(reader);
|
||||||
|
setPreviousTokenStream(tokenizer);
|
||||||
|
}
|
||||||
|
return tokenizer;
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -28,7 +28,6 @@ public class KeywordTokenizer extends Tokenizer {
|
||||||
private static final int DEFAULT_BUFFER_SIZE = 256;
|
private static final int DEFAULT_BUFFER_SIZE = 256;
|
||||||
|
|
||||||
private boolean done;
|
private boolean done;
|
||||||
private final char[] buffer;
|
|
||||||
|
|
||||||
public KeywordTokenizer(Reader input) {
|
public KeywordTokenizer(Reader input) {
|
||||||
this(input, DEFAULT_BUFFER_SIZE);
|
this(input, DEFAULT_BUFFER_SIZE);
|
||||||
|
@ -36,23 +35,23 @@ public class KeywordTokenizer extends Tokenizer {
|
||||||
|
|
||||||
public KeywordTokenizer(Reader input, int bufferSize) {
|
public KeywordTokenizer(Reader input, int bufferSize) {
|
||||||
super(input);
|
super(input);
|
||||||
this.buffer = new char[bufferSize];
|
|
||||||
this.done = false;
|
this.done = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Token next() throws IOException {
|
public Token next(Token result) throws IOException {
|
||||||
if (!done) {
|
if (!done) {
|
||||||
done = true;
|
done = true;
|
||||||
StringBuffer buffer = new StringBuffer();
|
int upto = 0;
|
||||||
int length;
|
char[] buffer = result.termBuffer();
|
||||||
while (true) {
|
while (true) {
|
||||||
length = input.read(this.buffer);
|
final int length = input.read(buffer, upto, buffer.length-upto);
|
||||||
if (length == -1) break;
|
if (length == -1) break;
|
||||||
|
upto += length;
|
||||||
buffer.append(this.buffer, 0, length);
|
if (upto == buffer.length)
|
||||||
|
buffer = result.resizeTermBuffer(1+buffer.length);
|
||||||
}
|
}
|
||||||
String text = buffer.toString();
|
result.termLength = upto;
|
||||||
return new Token(text, 0, text.length());
|
return result;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,12 +44,12 @@ public final class LengthFilter extends TokenFilter {
|
||||||
/**
|
/**
|
||||||
* Returns the next input Token whose termText() is the right len
|
* Returns the next input Token whose termText() is the right len
|
||||||
*/
|
*/
|
||||||
public final Token next() throws IOException
|
public final Token next(Token result) throws IOException
|
||||||
{
|
{
|
||||||
// return the first non-stop word found
|
// return the first non-stop word found
|
||||||
for (Token token = input.next(); token != null; token = input.next())
|
for (Token token = input.next(result); token != null; token = input.next(result))
|
||||||
{
|
{
|
||||||
int len = token.termText().length();
|
int len = token.termLength();
|
||||||
if (len >= min && len <= max) {
|
if (len >= min && len <= max) {
|
||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,14 +29,17 @@ public final class LowerCaseFilter extends TokenFilter {
|
||||||
super(in);
|
super(in);
|
||||||
}
|
}
|
||||||
|
|
||||||
public final Token next() throws IOException {
|
public final Token next(Token result) throws IOException {
|
||||||
Token t = input.next();
|
result = input.next(result);
|
||||||
|
if (result != null) {
|
||||||
|
|
||||||
if (t == null)
|
final char[] buffer = result.termBuffer();
|
||||||
|
final int length = result.termLength;
|
||||||
|
for(int i=0;i<length;i++)
|
||||||
|
buffer[i] = Character.toLowerCase(buffer[i]);
|
||||||
|
|
||||||
|
return result;
|
||||||
|
} else
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
t.termText = t.termText.toLowerCase();
|
|
||||||
|
|
||||||
return t;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
@ -75,6 +76,14 @@ public class PerFieldAnalyzerWrapper extends Analyzer {
|
||||||
return analyzer.tokenStream(fieldName, reader);
|
return analyzer.tokenStream(fieldName, reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||||
|
Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);
|
||||||
|
if (analyzer == null)
|
||||||
|
analyzer = defaultAnalyzer;
|
||||||
|
|
||||||
|
return analyzer.reusableTokenStream(fieldName, reader);
|
||||||
|
}
|
||||||
|
|
||||||
/** Return the positionIncrementGap from the analyzer assigned to fieldName */
|
/** Return the positionIncrementGap from the analyzer assigned to fieldName */
|
||||||
public int getPositionIncrementGap(String fieldName) {
|
public int getPositionIncrementGap(String fieldName) {
|
||||||
Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);
|
Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);
|
||||||
|
|
|
@ -45,16 +45,13 @@ public final class PorterStemFilter extends TokenFilter {
|
||||||
stemmer = new PorterStemmer();
|
stemmer = new PorterStemmer();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the next input Token, after being stemmed */
|
public final Token next(Token result) throws IOException {
|
||||||
public final Token next() throws IOException {
|
result = input.next(result);
|
||||||
Token token = input.next();
|
if (result != null) {
|
||||||
if (token == null)
|
if (stemmer.stem(result.termBuffer(), 0, result.termLength))
|
||||||
|
result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
|
||||||
|
return result;
|
||||||
|
} else
|
||||||
return null;
|
return null;
|
||||||
else {
|
|
||||||
String s = stemmer.stem(token.termText);
|
|
||||||
if (s != token.termText) // Yes, I mean object reference comparison here
|
|
||||||
token.termText = s;
|
|
||||||
return token;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
|
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
|
||||||
|
|
||||||
|
@ -25,4 +26,14 @@ public final class SimpleAnalyzer extends Analyzer {
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
return new LowerCaseTokenizer(reader);
|
return new LowerCaseTokenizer(reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||||
|
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
|
||||||
|
if (tokenizer == null) {
|
||||||
|
tokenizer = new LowerCaseTokenizer(reader);
|
||||||
|
setPreviousTokenStream(tokenizer);
|
||||||
|
} else
|
||||||
|
tokenizer.reset(reader);
|
||||||
|
return tokenizer;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -71,5 +71,22 @@ public final class StopAnalyzer extends Analyzer {
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
|
return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Filters LowerCaseTokenizer with StopFilter. */
|
||||||
|
private class SavedStreams {
|
||||||
|
Tokenizer source;
|
||||||
|
TokenStream result;
|
||||||
|
};
|
||||||
|
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||||
|
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||||
|
if (streams == null) {
|
||||||
|
streams = new SavedStreams();
|
||||||
|
streams.source = new LowerCaseTokenizer(reader);
|
||||||
|
streams.result = new StopFilter(streams.source, stopWords);
|
||||||
|
setPreviousTokenStream(streams);
|
||||||
|
} else
|
||||||
|
streams.source.reset(reader);
|
||||||
|
return streams.result;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -27,7 +28,7 @@ import java.util.Set;
|
||||||
|
|
||||||
public final class StopFilter extends TokenFilter {
|
public final class StopFilter extends TokenFilter {
|
||||||
|
|
||||||
private final Set stopWords;
|
private final CharArraySet stopWords;
|
||||||
private final boolean ignoreCase;
|
private final boolean ignoreCase;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -45,7 +46,7 @@ public final class StopFilter extends TokenFilter {
|
||||||
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
|
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
|
||||||
super(in);
|
super(in);
|
||||||
this.ignoreCase = ignoreCase;
|
this.ignoreCase = ignoreCase;
|
||||||
this.stopWords = makeStopSet(stopWords, ignoreCase);
|
this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -59,7 +60,10 @@ public final class StopFilter extends TokenFilter {
|
||||||
{
|
{
|
||||||
super(input);
|
super(input);
|
||||||
this.ignoreCase = ignoreCase;
|
this.ignoreCase = ignoreCase;
|
||||||
this.stopWords = stopWords;
|
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
|
||||||
|
Iterator it = stopWords.iterator();
|
||||||
|
while(it.hasNext())
|
||||||
|
this.stopWords.add((String) it.next());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -99,16 +103,21 @@ public final class StopFilter extends TokenFilter {
|
||||||
return stopTable;
|
return stopTable;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
|
||||||
|
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
|
||||||
|
for (int i = 0; i < stopWords.length; i++)
|
||||||
|
stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
|
||||||
|
return stopSet;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the next input Token whose termText() is not a stop word.
|
* Returns the next input Token whose termText() is not a stop word.
|
||||||
*/
|
*/
|
||||||
public final Token next() throws IOException {
|
public final Token next(Token result) throws IOException {
|
||||||
// return the first non-stop word found
|
// return the first non-stop word found
|
||||||
for (Token token = input.next(); token != null; token = input.next())
|
while((result = input.next(result)) != null) {
|
||||||
{
|
if (!stopWords.contains(result.termBuffer(), result.termLength))
|
||||||
String termText = ignoreCase ? token.termText.toLowerCase() : token.termText;
|
return result;
|
||||||
if (!stopWords.contains(termText))
|
|
||||||
return token;
|
|
||||||
}
|
}
|
||||||
// reached EOS -- return null
|
// reached EOS -- return null
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -1,8 +1,5 @@
|
||||||
package org.apache.lucene.analysis;
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
import org.apache.lucene.index.Payload;
|
|
||||||
import org.apache.lucene.index.TermPositions;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -20,6 +17,9 @@ import org.apache.lucene.index.TermPositions;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Payload;
|
||||||
|
import org.apache.lucene.index.TermPositions;
|
||||||
|
|
||||||
/** A Token is an occurrence of a term from the text of a field. It consists of
|
/** A Token is an occurrence of a term from the text of a field. It consists of
|
||||||
a term's text, the start and end offset of the term in the text of the field,
|
a term's text, the start and end offset of the term in the text of the field,
|
||||||
and a type string.
|
and a type string.
|
||||||
|
@ -44,46 +44,103 @@ import org.apache.lucene.index.TermPositions;
|
||||||
The APIs introduced here might change in the future and will not be
|
The APIs introduced here might change in the future and will not be
|
||||||
supported anymore in such a case.</font>
|
supported anymore in such a case.</font>
|
||||||
|
|
||||||
|
<br><br>
|
||||||
|
|
||||||
|
<p><b>NOTE:</b> As of 2.3, Token stores the term text
|
||||||
|
internally as a malleable char[] termBuffer instead of
|
||||||
|
String termText. The indexing code and core tokenizers
|
||||||
|
have been changed to re-use a single Token instance, changing
|
||||||
|
its buffer and other fields in-place as the Token is
|
||||||
|
processed. This provides substantially better indexing
|
||||||
|
performance as it saves the GC cost of new'ing a Token and
|
||||||
|
String for every term. The APIs that accept String
|
||||||
|
termText are still available but a warning about the
|
||||||
|
associated performance cost has been added (below). The
|
||||||
|
{@link #termText()} method has been deprecated.</p>
|
||||||
|
|
||||||
|
<p>Tokenizers and filters should try to re-use a Token
|
||||||
|
instance when possible for best performance, by
|
||||||
|
implementing the {@link TokenStream#next(Token)} API.
|
||||||
|
Failing that, to create a new Token you should first use
|
||||||
|
one of the constructors that starts with null text. Then
|
||||||
|
you should call either {@link #termBuffer()} or {@link
|
||||||
|
#resizeTermBuffer(int)} to retrieve the Token's
|
||||||
|
termBuffer. Fill in the characters of your term into this
|
||||||
|
buffer, and finally call {@link #setTermLength(int)} to
|
||||||
|
set the length of the term text. See <a target="_top"
|
||||||
|
href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
|
||||||
|
for details.</p>
|
||||||
|
|
||||||
@see org.apache.lucene.index.Payload
|
@see org.apache.lucene.index.Payload
|
||||||
*/
|
*/
|
||||||
// TODO: Remove warning after API has been finalized
|
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
|
||||||
public class Token implements Cloneable {
|
public class Token implements Cloneable {
|
||||||
String termText; // the text of the term
|
|
||||||
|
private static final String DEFAULT_TYPE = "word";
|
||||||
|
private static int MIN_BUFFER_SIZE = 10;
|
||||||
|
|
||||||
|
/** @deprecated: we will remove this when we remove the
|
||||||
|
* deprecated APIs */
|
||||||
|
private String termText;
|
||||||
|
|
||||||
|
char[] termBuffer; // characters for the term text
|
||||||
|
int termLength; // length of term text in buffer
|
||||||
|
|
||||||
int startOffset; // start in source text
|
int startOffset; // start in source text
|
||||||
int endOffset; // end in source text
|
int endOffset; // end in source text
|
||||||
String type = "word"; // lexical type
|
String type = DEFAULT_TYPE; // lexical type
|
||||||
|
|
||||||
Payload payload;
|
Payload payload;
|
||||||
|
|
||||||
// For better indexing speed, use termBuffer (and
|
int positionIncrement = 1;
|
||||||
// termBufferOffset/termBufferLength) instead of termText
|
|
||||||
// to save new'ing a String per token
|
|
||||||
char[] termBuffer;
|
|
||||||
int termBufferOffset;
|
|
||||||
int termBufferLength;
|
|
||||||
|
|
||||||
private int positionIncrement = 1;
|
/** Constructs a Token with null text. */
|
||||||
|
public Token() {
|
||||||
|
}
|
||||||
|
|
||||||
/** Constructs a Token with the given term text, and start & end offsets.
|
/** Constructs a Token with null text and start & end
|
||||||
The type defaults to "word." */
|
* offsets.
|
||||||
|
* @param start start offset
|
||||||
|
* @param end end offset */
|
||||||
|
public Token(int start, int end) {
|
||||||
|
startOffset = start;
|
||||||
|
endOffset = end;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Constructs a Token with null text and start & end
|
||||||
|
* offsets plus the Token type.
|
||||||
|
* @param start start offset
|
||||||
|
* @param end end offset */
|
||||||
|
public Token(int start, int end, String typ) {
|
||||||
|
startOffset = start;
|
||||||
|
endOffset = end;
|
||||||
|
type = typ;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Constructs a Token with the given term text, and start
|
||||||
|
* & end offsets. The type defaults to "word."
|
||||||
|
* <b>NOTE:</b> for better indexing speed you should
|
||||||
|
* instead use the char[] termBuffer methods to set the
|
||||||
|
* term text.
|
||||||
|
* @param text term text
|
||||||
|
* @param start start offset
|
||||||
|
* @param end end offset */
|
||||||
public Token(String text, int start, int end) {
|
public Token(String text, int start, int end) {
|
||||||
termText = text;
|
termText = text;
|
||||||
startOffset = start;
|
startOffset = start;
|
||||||
endOffset = end;
|
endOffset = end;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Constructs a Token with the given term text buffer
|
/** Constructs a Token with the given text, start and end
|
||||||
* starting at offset for length lenth, and start & end offsets.
|
* offsets, & type. <b>NOTE:</b> for better indexing
|
||||||
* The type defaults to "word." */
|
* speed you should instead use the char[] termBuffer
|
||||||
public Token(char[] text, int offset, int length, int start, int end) {
|
* methods to set the term text.
|
||||||
termBuffer = text;
|
* @param text term text
|
||||||
termBufferOffset = offset;
|
* @param start start offset
|
||||||
termBufferLength = length;
|
* @param end end offset
|
||||||
startOffset = start;
|
* @param typ token type */
|
||||||
endOffset = end;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Constructs a Token with the given text, start and end offsets, & type. */
|
|
||||||
public Token(String text, int start, int end, String typ) {
|
public Token(String text, int start, int end, String typ) {
|
||||||
termText = text;
|
termText = text;
|
||||||
startOffset = start;
|
startOffset = start;
|
||||||
|
@ -91,19 +148,6 @@ public class Token implements Cloneable {
|
||||||
type = typ;
|
type = typ;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Constructs a Token with the given term text buffer
|
|
||||||
* starting at offset for length lenth, and start & end
|
|
||||||
* offsets, & type. */
|
|
||||||
public Token(char[] text, int offset, int length, int start, int end, String typ) {
|
|
||||||
termBuffer = text;
|
|
||||||
termBufferOffset = offset;
|
|
||||||
termBufferLength = length;
|
|
||||||
startOffset = start;
|
|
||||||
endOffset = end;
|
|
||||||
type = typ;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/** Set the position increment. This determines the position of this token
|
/** Set the position increment. This determines the position of this token
|
||||||
* relative to the previous Token in a {@link TokenStream}, used in phrase
|
* relative to the previous Token in a {@link TokenStream}, used in phrase
|
||||||
* searching.
|
* searching.
|
||||||
|
@ -139,28 +183,103 @@ public class Token implements Cloneable {
|
||||||
/** Returns the position increment of this Token.
|
/** Returns the position increment of this Token.
|
||||||
* @see #setPositionIncrement
|
* @see #setPositionIncrement
|
||||||
*/
|
*/
|
||||||
public int getPositionIncrement() { return positionIncrement; }
|
public int getPositionIncrement() {
|
||||||
|
return positionIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
/** Sets the Token's term text. */
|
/** Sets the Token's term text. <b>NOTE:</b> for better
|
||||||
|
* indexing speed you should instead use the char[]
|
||||||
|
* termBuffer methods to set the term text. */
|
||||||
public void setTermText(String text) {
|
public void setTermText(String text) {
|
||||||
termText = text;
|
termText = text;
|
||||||
|
termBuffer = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the Token's term text. */
|
/** Returns the Token's term text.
|
||||||
public final String termText() { return termText; }
|
*
|
||||||
public final char[] termBuffer() { return termBuffer; }
|
* @deprecated Use {@link #termBuffer()} and {@link
|
||||||
public final int termBufferOffset() { return termBufferOffset; }
|
* #termLength()} instead. */
|
||||||
public final int termBufferLength() { return termBufferLength; }
|
public final String termText() {
|
||||||
|
if (termText == null && termBuffer != null)
|
||||||
public void setStartOffset(int offset) {this.startOffset = offset;}
|
termText = new String(termBuffer, 0, termLength);
|
||||||
public void setEndOffset(int offset) {this.endOffset = offset;}
|
return termText;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Copies the contents of buffer, starting at offset for
|
||||||
|
* length characters, into the termBuffer
|
||||||
|
* array. <b>NOTE:</b> for better indexing speed you
|
||||||
|
* should instead retrieve the termBuffer, using {@link
|
||||||
|
* #termBuffer()} or {@link #resizeTermBuffer(int)}, and
|
||||||
|
* fill it in directly to set the term text. This saves
|
||||||
|
* an extra copy. */
|
||||||
public final void setTermBuffer(char[] buffer, int offset, int length) {
|
public final void setTermBuffer(char[] buffer, int offset, int length) {
|
||||||
this.termBuffer = buffer;
|
resizeTermBuffer(length);
|
||||||
this.termBufferOffset = offset;
|
System.arraycopy(buffer, offset, termBuffer, 0, length);
|
||||||
this.termBufferLength = length;
|
termLength = length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Returns the internal termBuffer character array which
|
||||||
|
* you can then directly alter. If the array is too
|
||||||
|
* small for your token, use {@link
|
||||||
|
* #resizeTermBuffer(int)} to increase it. After
|
||||||
|
* altering the buffer be sure to call {@link
|
||||||
|
* #setTermLength} to record the number of valid
|
||||||
|
* characters that were placed into the termBuffer. */
|
||||||
|
public final char[] termBuffer() {
|
||||||
|
initTermBuffer();
|
||||||
|
return termBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Grows the termBuffer to at least size newSize.
|
||||||
|
* @param newSize minimum size of the new termBuffer
|
||||||
|
* @return newly created termBuffer with length >= newSize
|
||||||
|
*/
|
||||||
|
public char[] resizeTermBuffer(int newSize) {
|
||||||
|
initTermBuffer();
|
||||||
|
if (newSize > termBuffer.length) {
|
||||||
|
int size = termBuffer.length;
|
||||||
|
while(size < newSize)
|
||||||
|
size *= 2;
|
||||||
|
char[] newBuffer = new char[size];
|
||||||
|
System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
|
||||||
|
termBuffer = newBuffer;
|
||||||
|
}
|
||||||
|
return termBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: once we remove the deprecated termText() method
|
||||||
|
// and switch entirely to char[] termBuffer we don't need
|
||||||
|
// to use this method anymore
|
||||||
|
private void initTermBuffer() {
|
||||||
|
if (termBuffer == null) {
|
||||||
|
if (termText == null) {
|
||||||
|
termBuffer = new char[MIN_BUFFER_SIZE];
|
||||||
|
termLength = 0;
|
||||||
|
} else {
|
||||||
|
int length = termText.length();
|
||||||
|
if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE;
|
||||||
|
termBuffer = new char[length];
|
||||||
|
termLength = termText.length();
|
||||||
|
termText.getChars(0, termText.length(), termBuffer, 0);
|
||||||
|
termText = null;
|
||||||
|
}
|
||||||
|
} else if (termText != null)
|
||||||
|
termText = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return number of valid characters (length of the term)
|
||||||
|
* in the termBuffer array. */
|
||||||
|
public final int termLength() {
|
||||||
|
initTermBuffer();
|
||||||
|
return termLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Set number of valid characters (length of the term) in
|
||||||
|
* the termBuffer array. */
|
||||||
|
public final void setTermLength(int length) {
|
||||||
|
initTermBuffer();
|
||||||
|
termLength = length;
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns this Token's starting offset, the position of the first character
|
/** Returns this Token's starting offset, the position of the first character
|
||||||
corresponding to this token in the source text.
|
corresponding to this token in the source text.
|
||||||
|
@ -168,25 +287,37 @@ public class Token implements Cloneable {
|
||||||
Note that the difference between endOffset() and startOffset() may not be
|
Note that the difference between endOffset() and startOffset() may not be
|
||||||
equal to termText.length(), as the term text may have been altered by a
|
equal to termText.length(), as the term text may have been altered by a
|
||||||
stemmer or some other filter. */
|
stemmer or some other filter. */
|
||||||
public final int startOffset() { return startOffset; }
|
public final int startOffset() {
|
||||||
|
return startOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Set the starting offset.
|
||||||
|
@see #startOffset() */
|
||||||
|
public void setStartOffset(int offset) {
|
||||||
|
this.startOffset = offset;
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns this Token's ending offset, one greater than the position of the
|
/** Returns this Token's ending offset, one greater than the position of the
|
||||||
last character corresponding to this token in the source text. */
|
last character corresponding to this token in the source text. */
|
||||||
public final int endOffset() { return endOffset; }
|
public final int endOffset() {
|
||||||
|
return endOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Set the ending offset.
|
||||||
|
@see #endOffset() */
|
||||||
|
public void setEndOffset(int offset) {
|
||||||
|
this.endOffset = offset;
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns this Token's lexical type. Defaults to "word". */
|
/** Returns this Token's lexical type. Defaults to "word". */
|
||||||
public final String type() { return type; }
|
public final String type() {
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/** Set the lexical type.
|
||||||
* Sets this Token's payload.
|
@see #type() */
|
||||||
* <p><font color="#FF0000">
|
public final void setType(String type) {
|
||||||
* WARNING: The status of the <b>Payloads</b> feature is experimental.
|
this.type = type;
|
||||||
* The APIs introduced here might change in the future and will not be
|
|
||||||
* supported anymore in such a case.</font>
|
|
||||||
*/
|
|
||||||
// TODO: Remove warning after API has been finalized
|
|
||||||
public void setPayload(Payload payload) {
|
|
||||||
this.payload = payload;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -201,9 +332,27 @@ public class Token implements Cloneable {
|
||||||
return this.payload;
|
return this.payload;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets this Token's payload.
|
||||||
|
* <p><font color="#FF0000">
|
||||||
|
* WARNING: The status of the <b>Payloads</b> feature is experimental.
|
||||||
|
* The APIs introduced here might change in the future and will not be
|
||||||
|
* supported anymore in such a case.</font>
|
||||||
|
*/
|
||||||
|
// TODO: Remove warning after API has been finalized
|
||||||
|
public void setPayload(Payload payload) {
|
||||||
|
this.payload = payload;
|
||||||
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
StringBuffer sb = new StringBuffer();
|
StringBuffer sb = new StringBuffer();
|
||||||
sb.append("(" + termText + "," + startOffset + "," + endOffset);
|
sb.append("(");
|
||||||
|
initTermBuffer();
|
||||||
|
if (termBuffer == null)
|
||||||
|
sb.append("null");
|
||||||
|
else
|
||||||
|
sb.append(termBuffer, 0, termLength);
|
||||||
|
sb.append("," + startOffset + "," + endOffset);
|
||||||
if (!type.equals("word"))
|
if (!type.equals("word"))
|
||||||
sb.append(",type="+type);
|
sb.append(",type="+type);
|
||||||
if (positionIncrement != 1)
|
if (positionIncrement != 1)
|
||||||
|
@ -212,11 +361,14 @@ public class Token implements Cloneable {
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object clone() {
|
/** Reset all state for this token back to defaults. */
|
||||||
try {
|
public void clear() {
|
||||||
return super.clone();
|
payload = null;
|
||||||
} catch (CloneNotSupportedException e) {
|
// Leave termBuffer to allow re-use
|
||||||
throw new RuntimeException(e); // shouldn't happen since we implement Cloneable
|
termLength = 0;
|
||||||
}
|
termText = null;
|
||||||
|
positionIncrement = 1;
|
||||||
|
startOffset = endOffset = 0;
|
||||||
|
type = DEFAULT_TYPE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,6 +22,8 @@ import java.io.IOException;
|
||||||
/** A TokenFilter is a TokenStream whose input is another token stream.
|
/** A TokenFilter is a TokenStream whose input is another token stream.
|
||||||
<p>
|
<p>
|
||||||
This is an abstract class.
|
This is an abstract class.
|
||||||
|
NOTE: subclasses must override at least one of {@link
|
||||||
|
#next()} or {@link #next(Token)}.
|
||||||
*/
|
*/
|
||||||
public abstract class TokenFilter extends TokenStream {
|
public abstract class TokenFilter extends TokenStream {
|
||||||
/** The source of tokens for this filter. */
|
/** The source of tokens for this filter. */
|
||||||
|
|
|
@ -29,11 +29,36 @@ import java.io.IOException;
|
||||||
<li>{@link TokenFilter}, a TokenStream
|
<li>{@link TokenFilter}, a TokenStream
|
||||||
whose input is another TokenStream.
|
whose input is another TokenStream.
|
||||||
</ul>
|
</ul>
|
||||||
|
NOTE: subclasses must override at least one of {@link
|
||||||
|
#next()} or {@link #next(Token)}.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public abstract class TokenStream {
|
public abstract class TokenStream {
|
||||||
/** Returns the next token in the stream, or null at EOS. */
|
|
||||||
public abstract Token next() throws IOException;
|
/** Returns the next token in the stream, or null at EOS.
|
||||||
|
* The returned Token is a "full private copy" (not
|
||||||
|
* re-used across calls to next()) but will be slower
|
||||||
|
* than calling {@link #next(Token)} instead. */
|
||||||
|
public Token next() throws IOException {
|
||||||
|
Token result = next(new Token());
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the next token in the stream, or null at EOS.
|
||||||
|
* When possible, the input Token should be used as the
|
||||||
|
* returned Token (this gives fastest tokenization
|
||||||
|
* performance), but this is not required and a new Token
|
||||||
|
* may be returned. Callers may re-use a single Token
|
||||||
|
* instance for successive calls to this method and must
|
||||||
|
* therefore fully consume the previously returned Token
|
||||||
|
* before calling this method again.
|
||||||
|
* @param result a Token that may or may not be used to
|
||||||
|
* return the next token
|
||||||
|
* @return next token in the stream or null if
|
||||||
|
* end-of-stream was hit*/
|
||||||
|
public Token next(Token result) throws IOException {
|
||||||
|
return next();
|
||||||
|
}
|
||||||
|
|
||||||
/** Resets this stream to the beginning. This is an
|
/** Resets this stream to the beginning. This is an
|
||||||
* optional operation, so subclasses may or may not
|
* optional operation, so subclasses may or may not
|
||||||
|
|
|
@ -23,6 +23,8 @@ import java.io.IOException;
|
||||||
/** A Tokenizer is a TokenStream whose input is a Reader.
|
/** A Tokenizer is a TokenStream whose input is a Reader.
|
||||||
<p>
|
<p>
|
||||||
This is an abstract class.
|
This is an abstract class.
|
||||||
|
NOTE: subclasses must override at least one of {@link
|
||||||
|
#next()} or {@link #next(Token)}.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public abstract class Tokenizer extends TokenStream {
|
public abstract class Tokenizer extends TokenStream {
|
||||||
|
@ -41,5 +43,12 @@ public abstract class Tokenizer extends TokenStream {
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
input.close();
|
input.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Reset the tokenizer to a new reader. Typically, an
|
||||||
|
* analyzer (in its reusableTokenStream method) will use
|
||||||
|
* this to re-use a previously created tokenizer. */
|
||||||
|
protected void reset(Reader input) throws IOException {
|
||||||
|
this.input = input;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
/** An Analyzer that uses WhitespaceTokenizer. */
|
/** An Analyzer that uses WhitespaceTokenizer. */
|
||||||
|
|
||||||
|
@ -25,4 +26,14 @@ public final class WhitespaceAnalyzer extends Analyzer {
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
return new WhitespaceTokenizer(reader);
|
return new WhitespaceTokenizer(reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||||
|
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
|
||||||
|
if (tokenizer == null) {
|
||||||
|
tokenizer = new WhitespaceTokenizer(reader);
|
||||||
|
setPreviousTokenStream(tokenizer);
|
||||||
|
} else
|
||||||
|
tokenizer.reset(reader);
|
||||||
|
return tokenizer;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -75,4 +75,23 @@ public class StandardAnalyzer extends Analyzer {
|
||||||
result = new StopFilter(result, stopSet);
|
result = new StopFilter(result, stopSet);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private class SavedStreams {
|
||||||
|
StandardTokenizer tokenStream;
|
||||||
|
TokenStream filteredTokenStream;
|
||||||
|
};
|
||||||
|
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||||
|
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||||
|
if (streams == null) {
|
||||||
|
streams = new SavedStreams();
|
||||||
|
setPreviousTokenStream(streams);
|
||||||
|
streams.tokenStream = new StandardTokenizer(reader);
|
||||||
|
streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
|
||||||
|
streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
|
||||||
|
streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
|
||||||
|
} else
|
||||||
|
streams.tokenStream.reset(reader);
|
||||||
|
|
||||||
|
return streams.filteredTokenStream;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.standard;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
|
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
|
||||||
|
@ -37,33 +38,32 @@ public final class StandardFilter extends TokenFilter {
|
||||||
* <p>Removes <tt>'s</tt> from the end of words.
|
* <p>Removes <tt>'s</tt> from the end of words.
|
||||||
* <p>Removes dots from acronyms.
|
* <p>Removes dots from acronyms.
|
||||||
*/
|
*/
|
||||||
public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
|
public final Token next(Token result) throws java.io.IOException {
|
||||||
org.apache.lucene.analysis.Token t = input.next();
|
Token t = input.next(result);
|
||||||
|
|
||||||
if (t == null)
|
if (t == null)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
String text = t.termText();
|
char[] buffer = t.termBuffer();
|
||||||
String type = t.type();
|
final int bufferLength = t.termLength();
|
||||||
|
final String type = t.type();
|
||||||
|
|
||||||
if (type == APOSTROPHE_TYPE && // remove 's
|
if (type == APOSTROPHE_TYPE && // remove 's
|
||||||
(text.endsWith("'s") || text.endsWith("'S"))) {
|
bufferLength >= 2 &&
|
||||||
return new org.apache.lucene.analysis.Token
|
buffer[bufferLength-2] == '\'' &&
|
||||||
(text.substring(0,text.length()-2),
|
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
|
||||||
t.startOffset(), t.endOffset(), type);
|
// Strip last 2 characters off
|
||||||
|
t.setTermLength(bufferLength - 2);
|
||||||
} else if (type == ACRONYM_TYPE) { // remove dots
|
} else if (type == ACRONYM_TYPE) { // remove dots
|
||||||
StringBuffer trimmed = new StringBuffer();
|
int upto = 0;
|
||||||
for (int i = 0; i < text.length(); i++) {
|
for(int i=0;i<bufferLength;i++) {
|
||||||
char c = text.charAt(i);
|
char c = buffer[i];
|
||||||
if (c != '.')
|
if (c != '.')
|
||||||
trimmed.append(c);
|
buffer[upto++] = c;
|
||||||
|
}
|
||||||
|
t.setTermLength(upto);
|
||||||
}
|
}
|
||||||
return new org.apache.lucene.analysis.Token
|
|
||||||
(trimmed.toString(), t.startOffset(), t.endOffset(), type);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,6 +43,9 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
public class StandardTokenizer extends Tokenizer {
|
public class StandardTokenizer extends Tokenizer {
|
||||||
/** A private instance of the JFlex-constructed scanner */
|
/** A private instance of the JFlex-constructed scanner */
|
||||||
private final StandardTokenizerImpl scanner;
|
private final StandardTokenizerImpl scanner;
|
||||||
|
void setInput(Reader reader) {
|
||||||
|
this.input = reader;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
|
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
|
||||||
|
@ -58,19 +61,19 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
*
|
*
|
||||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||||
*/
|
*/
|
||||||
public Token next() throws IOException {
|
public Token next(Token result) throws IOException {
|
||||||
int tokenType = scanner.getNextToken();
|
int tokenType = scanner.getNextToken();
|
||||||
|
|
||||||
if (tokenType == StandardTokenizerImpl.YYEOF) {
|
if (tokenType == StandardTokenizerImpl.YYEOF) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
int startPosition = scanner.yychar();
|
scanner.getText(result);
|
||||||
|
final int start = scanner.yychar();
|
||||||
final String tokenImage = scanner.yytext();
|
result.setStartOffset(start);
|
||||||
return new Token(tokenImage, startPosition, startPosition
|
result.setEndOffset(start+result.termLength());
|
||||||
+ tokenImage.length(),
|
result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
|
||||||
StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -82,4 +85,9 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
super.reset();
|
super.reset();
|
||||||
scanner.yyreset(input);
|
scanner.yyreset(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void reset(Reader reader) throws IOException {
|
||||||
|
input = reader;
|
||||||
|
reset();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.4.1 on 8/8/07 10:18 PM */
|
/* The following code was generated by JFlex 1.4.1 on 8/9/07 10:15 AM */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.standard;
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
|
@ -19,7 +19,15 @@ package org.apache.lucene.analysis.standard;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class is a scanner generated by
|
||||||
|
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
|
||||||
|
* on 8/9/07 10:15 AM from the specification file
|
||||||
|
* <tt>/tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
|
||||||
|
*/
|
||||||
class StandardTokenizerImpl {
|
class StandardTokenizerImpl {
|
||||||
|
|
||||||
/** This character denotes the end of file */
|
/** This character denotes the end of file */
|
||||||
|
@ -297,6 +305,13 @@ public final int yychar()
|
||||||
return yychar;
|
return yychar;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills Lucene token with the current token text.
|
||||||
|
*/
|
||||||
|
final void getText(Token t) {
|
||||||
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new scanner
|
* Creates a new scanner
|
||||||
|
|
|
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.standard;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
|
||||||
%class StandardTokenizerImpl
|
%class StandardTokenizerImpl
|
||||||
|
@ -52,6 +54,13 @@ public final int yychar()
|
||||||
{
|
{
|
||||||
return yychar;
|
return yychar;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills Lucene token with the current token text.
|
||||||
|
*/
|
||||||
|
final void getText(Token t) {
|
||||||
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
|
}
|
||||||
%}
|
%}
|
||||||
|
|
||||||
// basic word: a sequence of digits & letters
|
// basic word: a sequence of digits & letters
|
||||||
|
|
|
@ -960,28 +960,18 @@ final class DocumentsWriter {
|
||||||
|
|
||||||
/** Test whether the text for current Posting p equals
|
/** Test whether the text for current Posting p equals
|
||||||
* current tokenText. */
|
* current tokenText. */
|
||||||
boolean postingEquals(final String tokenString, final char[] tokenText,
|
boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
|
||||||
final int tokenTextLen, final int tokenTextOffset) {
|
|
||||||
|
|
||||||
final char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
|
final char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
|
||||||
assert text != null;
|
assert text != null;
|
||||||
int pos = p.textStart & CHAR_BLOCK_MASK;
|
int pos = p.textStart & CHAR_BLOCK_MASK;
|
||||||
|
|
||||||
if (tokenText == null) {
|
int tokenPos = 0;
|
||||||
// Compare to String
|
for(;tokenPos<tokenTextLen;pos++,tokenPos++)
|
||||||
for(int i=0;i<tokenTextLen;i++)
|
|
||||||
if (tokenString.charAt(i) != text[pos++])
|
|
||||||
return false;
|
|
||||||
return text[pos] == 0xffff;
|
|
||||||
} else {
|
|
||||||
int tokenPos = tokenTextOffset;
|
|
||||||
final int stopAt = tokenTextLen+tokenPos;
|
|
||||||
for(;tokenPos<stopAt;pos++,tokenPos++)
|
|
||||||
if (tokenText[tokenPos] != text[pos])
|
if (tokenText[tokenPos] != text[pos])
|
||||||
return false;
|
return false;
|
||||||
return 0xffff == text[pos];
|
return 0xffff == text[pos];
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/** Compares term text for two Posting instance and
|
/** Compares term text for two Posting instance and
|
||||||
* returns -1 if p1 < p2; 1 if p1 > p2; else 0.
|
* returns -1 if p1 < p2; 1 if p1 > p2; else 0.
|
||||||
|
@ -1241,8 +1231,7 @@ final class DocumentsWriter {
|
||||||
}
|
}
|
||||||
|
|
||||||
int offsetEnd;
|
int offsetEnd;
|
||||||
Token token;
|
Token localToken = new Token();
|
||||||
Token localToken = new Token("", 0, 0);
|
|
||||||
|
|
||||||
/* Invert one occurrence of one field in the document */
|
/* Invert one occurrence of one field in the document */
|
||||||
public void invertField(Fieldable field, Analyzer analyzer, final int maxFieldLength) throws IOException {
|
public void invertField(Fieldable field, Analyzer analyzer, final int maxFieldLength) throws IOException {
|
||||||
|
@ -1251,12 +1240,12 @@ final class DocumentsWriter {
|
||||||
position += analyzer.getPositionIncrementGap(fieldInfo.name);
|
position += analyzer.getPositionIncrementGap(fieldInfo.name);
|
||||||
|
|
||||||
if (!field.isTokenized()) { // un-tokenized field
|
if (!field.isTokenized()) { // un-tokenized field
|
||||||
token = localToken;
|
|
||||||
String stringValue = field.stringValue();
|
String stringValue = field.stringValue();
|
||||||
|
Token token = localToken;
|
||||||
token.setTermText(stringValue);
|
token.setTermText(stringValue);
|
||||||
token.setStartOffset(offset);
|
token.setStartOffset(offset);
|
||||||
token.setEndOffset(offset + stringValue.length());
|
token.setEndOffset(offset + stringValue.length());
|
||||||
addPosition();
|
addPosition(token);
|
||||||
offset += stringValue.length();
|
offset += stringValue.length();
|
||||||
length++;
|
length++;
|
||||||
} else { // tokenized field
|
} else { // tokenized field
|
||||||
|
@ -1282,7 +1271,7 @@ final class DocumentsWriter {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize field and add to postingTable
|
// Tokenize field and add to postingTable
|
||||||
stream = analyzer.tokenStream(fieldInfo.name, reader);
|
stream = analyzer.reusableTokenStream(fieldInfo.name, reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
// reset the TokenStream to the first token
|
// reset the TokenStream to the first token
|
||||||
|
@ -1290,9 +1279,10 @@ final class DocumentsWriter {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
offsetEnd = offset-1;
|
offsetEnd = offset-1;
|
||||||
for (token = stream.next(); token != null; token = stream.next()) {
|
Token token;
|
||||||
|
while((token = stream.next(localToken)) != null) {
|
||||||
position += (token.getPositionIncrement() - 1);
|
position += (token.getPositionIncrement() - 1);
|
||||||
addPosition();
|
addPosition(token);
|
||||||
if (++length >= maxFieldLength) {
|
if (++length >= maxFieldLength) {
|
||||||
if (infoStream != null)
|
if (infoStream != null)
|
||||||
infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
|
infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
|
||||||
|
@ -1357,55 +1347,32 @@ final class DocumentsWriter {
|
||||||
* for every term of every document. Its job is to *
|
* for every term of every document. Its job is to *
|
||||||
* update the postings byte stream (Postings hash) *
|
* update the postings byte stream (Postings hash) *
|
||||||
* based on the occurence of a single term. */
|
* based on the occurence of a single term. */
|
||||||
private void addPosition() {
|
private void addPosition(Token token) {
|
||||||
|
|
||||||
final Payload payload = token.getPayload();
|
final Payload payload = token.getPayload();
|
||||||
|
|
||||||
final String tokenString;
|
|
||||||
final int tokenTextLen;
|
|
||||||
final int tokenTextOffset;
|
|
||||||
|
|
||||||
// Get the text of this term. Term can either
|
// Get the text of this term. Term can either
|
||||||
// provide a String token or offset into a char[]
|
// provide a String token or offset into a char[]
|
||||||
// array
|
// array
|
||||||
final char[] tokenText = token.termBuffer();
|
final char[] tokenText = token.termBuffer();
|
||||||
|
final int tokenTextLen = token.termLength();
|
||||||
|
|
||||||
int code = 0;
|
int code = 0;
|
||||||
int code2 = 0;
|
int code2 = 0;
|
||||||
|
|
||||||
if (tokenText == null) {
|
// Compute hashcode
|
||||||
|
|
||||||
// Fallback to String token
|
|
||||||
tokenString = token.termText();
|
|
||||||
tokenTextLen = tokenString.length();
|
|
||||||
tokenTextOffset = 0;
|
|
||||||
|
|
||||||
// Compute hashcode.
|
|
||||||
int downto = tokenTextLen;
|
int downto = tokenTextLen;
|
||||||
while (downto > 0)
|
while (downto > 0)
|
||||||
code = (code*31) + tokenString.charAt(--downto);
|
|
||||||
|
|
||||||
// System.out.println(" addPosition: field=" + fieldInfo.name + " string=" + tokenString + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset+token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
tokenString = null;
|
|
||||||
tokenTextLen = token.termBufferLength();
|
|
||||||
tokenTextOffset = token.termBufferOffset();
|
|
||||||
|
|
||||||
// Compute hashcode
|
|
||||||
int downto = tokenTextLen+tokenTextOffset;
|
|
||||||
while (downto > tokenTextOffset)
|
|
||||||
code = (code*31) + tokenText[--downto];
|
code = (code*31) + tokenText[--downto];
|
||||||
|
|
||||||
// System.out.println(" addPosition: buffer=" + new String(tokenText, tokenTextOffset, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
|
// System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
|
||||||
}
|
|
||||||
|
|
||||||
int hashPos = code & postingsHashMask;
|
int hashPos = code & postingsHashMask;
|
||||||
|
|
||||||
// Locate Posting in hash
|
// Locate Posting in hash
|
||||||
p = postingsHash[hashPos];
|
p = postingsHash[hashPos];
|
||||||
|
|
||||||
if (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset)) {
|
if (p != null && !postingEquals(tokenText, tokenTextLen)) {
|
||||||
// Conflict: keep searching different locations in
|
// Conflict: keep searching different locations in
|
||||||
// the hash table.
|
// the hash table.
|
||||||
final int inc = code*1347|1;
|
final int inc = code*1347|1;
|
||||||
|
@ -1413,7 +1380,7 @@ final class DocumentsWriter {
|
||||||
code += inc;
|
code += inc;
|
||||||
hashPos = code & postingsHashMask;
|
hashPos = code & postingsHashMask;
|
||||||
p = postingsHash[hashPos];
|
p = postingsHash[hashPos];
|
||||||
} while (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset));
|
} while (p != null && !postingEquals(tokenText, tokenTextLen));
|
||||||
}
|
}
|
||||||
|
|
||||||
final int proxCode;
|
final int proxCode;
|
||||||
|
@ -1492,10 +1459,7 @@ final class DocumentsWriter {
|
||||||
p.textStart = textUpto + charPool.byteOffset;
|
p.textStart = textUpto + charPool.byteOffset;
|
||||||
charPool.byteUpto += textLen1;
|
charPool.byteUpto += textLen1;
|
||||||
|
|
||||||
if (tokenString == null)
|
System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
|
||||||
System.arraycopy(tokenText, tokenTextOffset, text, textUpto, tokenTextLen);
|
|
||||||
else
|
|
||||||
tokenString.getChars(0, tokenTextLen, text, textUpto);
|
|
||||||
|
|
||||||
text[textUpto+tokenTextLen] = 0xffff;
|
text[textUpto+tokenTextLen] = 0xffff;
|
||||||
|
|
||||||
|
|
|
@ -94,7 +94,7 @@ public class TestCachingTokenFilter extends TestCase {
|
||||||
Token token;
|
Token token;
|
||||||
while ((token = stream.next()) != null) {
|
while ((token = stream.next()) != null) {
|
||||||
assertTrue(count < tokens.length);
|
assertTrue(count < tokens.length);
|
||||||
assertEquals(tokens[count], token.termText);
|
assertEquals(tokens[count], token.termText());
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
package org.apache.lucene.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import junit.framework.*;
|
||||||
|
|
||||||
|
public class TestToken extends TestCase {
|
||||||
|
|
||||||
|
public TestToken(String name) {
|
||||||
|
super(name);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testToString() throws Exception {
|
||||||
|
char[] b = {'a', 'l', 'o', 'h', 'a'};
|
||||||
|
Token t = new Token("", 0, 5);
|
||||||
|
t.setTermBuffer(b, 0, 5);
|
||||||
|
assertEquals("(aloha,0,5)", t.toString());
|
||||||
|
|
||||||
|
t.setTermText("hi there");
|
||||||
|
assertEquals("(hi there,0,5)", t.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMixedStringArray() throws Exception {
|
||||||
|
Token t = new Token("hello", 0, 5);
|
||||||
|
assertEquals(t.termText(), "hello");
|
||||||
|
assertEquals(t.termLength(), 5);
|
||||||
|
assertEquals(new String(t.termBuffer(), 0, 5), "hello");
|
||||||
|
t.setTermText("hello2");
|
||||||
|
assertEquals(t.termLength(), 6);
|
||||||
|
assertEquals(new String(t.termBuffer(), 0, 6), "hello2");
|
||||||
|
t.setTermBuffer("hello3".toCharArray(), 0, 6);
|
||||||
|
assertEquals(t.termText(), "hello3");
|
||||||
|
|
||||||
|
// Make sure if we get the buffer and change a character
|
||||||
|
// that termText() reflects the change
|
||||||
|
char[] buffer = t.termBuffer();
|
||||||
|
buffer[1] = 'o';
|
||||||
|
assertEquals(t.termText(), "hollo3");
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue