mirror of https://github.com/apache/lucene.git
Renamed NullTokenizer and Analyzer to WhitespaceTokenizer and Analyzer.
Also re-structured the implementation of several tokenizers so that they share code, basing them on the new class CharTokenizer. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149644 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b1ab12b556
commit
2ae22a31e3
|
@ -56,18 +56,10 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.Reader;
|
||||
|
||||
/** LowerCaseTokenizer performs the function of LetterTokenizer
|
||||
and LowerCaseFilter together. It divides text at non-letters and converts
|
||||
them to lower case. While it is functionally equivalent to the combination
|
||||
of LetterTokenizer and LowerCaseFilter, there is a performance advantage
|
||||
to doing the two tasks at once, hence this (redundant) implementation.
|
||||
|
||||
Note: this does a decent job for most European languages, but does a terrible
|
||||
job for some Asian languages, where words are not separated by spaces. */
|
||||
|
||||
public final class NullTokenizer extends Tokenizer {
|
||||
public NullTokenizer(Reader in) {
|
||||
input = in;
|
||||
/** An abstract base class for simple, character-oriented tokenizers.*/
|
||||
public abstract class CharTokenizer extends Tokenizer {
|
||||
public CharTokenizer(Reader input) {
|
||||
this.input = input;
|
||||
}
|
||||
|
||||
private int offset = 0, bufferIndex=0, dataLen=0;
|
||||
|
@ -76,6 +68,18 @@ public final class NullTokenizer extends Tokenizer {
|
|||
private final char[] buffer = new char[MAX_WORD_LEN];
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
/** Returns true iff a character should be included in a token. This
|
||||
* tokenizer generates as tokens adjacent sequences of characters which
|
||||
* satisfy this predicate. Characters for which this is false are used to
|
||||
* define token boundaries and are not included in tokens. */
|
||||
protected abstract boolean isTokenChar(char c);
|
||||
|
||||
/** Called on each token character to normalize it before it is added to the
|
||||
* token. The default implementation does nothing. Subclasses may use this
|
||||
* to, e.g., lowercase tokens. */
|
||||
protected char normalize(char c) { return c; }
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public final Token next() throws java.io.IOException {
|
||||
int length = 0;
|
||||
int start = offset;
|
||||
|
@ -96,20 +100,19 @@ public final class NullTokenizer extends Tokenizer {
|
|||
else
|
||||
c = (char) ioBuffer[bufferIndex++];
|
||||
|
||||
if (Character.isWhitespace(c)) {
|
||||
if (length > 0)
|
||||
break;
|
||||
else
|
||||
continue;
|
||||
}
|
||||
if (isTokenChar(c)) { // if it's a token char
|
||||
|
||||
if (length == 0) // start of token
|
||||
start = offset-1;
|
||||
if (length == 0) // start of token
|
||||
start = offset-1;
|
||||
|
||||
buffer[length++] = normalize(c); // buffer it, normalized
|
||||
|
||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
||||
break;
|
||||
|
||||
} else if (length > 0) // at non-Letter w/ chars
|
||||
break; // return 'em
|
||||
|
||||
buffer[length++] = c;
|
||||
// buffer it
|
||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
||||
break;
|
||||
}
|
||||
|
||||
return new Token(new String(buffer, 0, length), start, start+length);
|
|
@ -63,52 +63,15 @@ import java.io.Reader;
|
|||
Note: this does a decent job for most European languages, but does a terrible
|
||||
job for some Asian languages, where words are not separated by spaces. */
|
||||
|
||||
public final class LetterTokenizer extends Tokenizer {
|
||||
public class LetterTokenizer extends CharTokenizer {
|
||||
/** Construct a new LetterTokenizer. */
|
||||
public LetterTokenizer(Reader in) {
|
||||
input = in;
|
||||
super(in);
|
||||
}
|
||||
|
||||
private int offset = 0, bufferIndex=0, dataLen=0;
|
||||
private final static int MAX_WORD_LEN = 255;
|
||||
private final static int IO_BUFFER_SIZE = 1024;
|
||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
public final Token next() throws java.io.IOException {
|
||||
int length = 0;
|
||||
int start = offset;
|
||||
while (true) {
|
||||
final char c;
|
||||
|
||||
offset++;
|
||||
if (bufferIndex >= dataLen) {
|
||||
dataLen = input.read(ioBuffer);
|
||||
bufferIndex = 0;
|
||||
};
|
||||
if (dataLen == -1) {
|
||||
if (length > 0)
|
||||
break;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
else
|
||||
c = (char) ioBuffer[bufferIndex++];
|
||||
|
||||
if (Character.isLetter(c)) { // if it's a letter
|
||||
|
||||
if (length == 0) // start of token
|
||||
start = offset-1;
|
||||
|
||||
buffer[length++] = c; // buffer it
|
||||
|
||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
||||
break;
|
||||
|
||||
} else if (length > 0) // at non-Letter w/ chars
|
||||
break; // return 'em
|
||||
|
||||
}
|
||||
|
||||
return new Token(new String(buffer, 0, length), start, start+length);
|
||||
/** Collects only characters which satisfy
|
||||
* {@link Character#isLetter(char)}.*/
|
||||
protected boolean isTokenChar(char c) {
|
||||
return Character.isLetter(c);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -65,52 +65,15 @@ import java.io.Reader;
|
|||
Note: this does a decent job for most European languages, but does a terrible
|
||||
job for some Asian languages, where words are not separated by spaces. */
|
||||
|
||||
public final class LowerCaseTokenizer extends Tokenizer {
|
||||
public final class LowerCaseTokenizer extends LetterTokenizer {
|
||||
/** Construct a new LowerCaseTokenizer. */
|
||||
public LowerCaseTokenizer(Reader in) {
|
||||
input = in;
|
||||
super(in);
|
||||
}
|
||||
|
||||
private int offset = 0, bufferIndex=0, dataLen=0;
|
||||
private final static int MAX_WORD_LEN = 255;
|
||||
private final static int IO_BUFFER_SIZE = 1024;
|
||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
public final Token next() throws java.io.IOException {
|
||||
int length = 0;
|
||||
int start = offset;
|
||||
while (true) {
|
||||
final char c;
|
||||
|
||||
offset++;
|
||||
if (bufferIndex >= dataLen) {
|
||||
dataLen = input.read(ioBuffer);
|
||||
bufferIndex = 0;
|
||||
};
|
||||
if (dataLen == -1) {
|
||||
if (length > 0)
|
||||
break;
|
||||
else
|
||||
return null;
|
||||
}
|
||||
else
|
||||
c = (char) ioBuffer[bufferIndex++];
|
||||
|
||||
if (Character.isLetter(c)) { // if it's a letter
|
||||
|
||||
if (length == 0) // start of token
|
||||
start = offset-1;
|
||||
|
||||
buffer[length++] = Character.toLowerCase(c);
|
||||
// buffer it
|
||||
if (length == MAX_WORD_LEN) // buffer overflow!
|
||||
break;
|
||||
|
||||
} else if (length > 0) // at non-Letter w/ chars
|
||||
break; // return 'em
|
||||
|
||||
}
|
||||
|
||||
return new Token(new String(buffer, 0, length), start, start+length);
|
||||
/** Converts each token character to lower case using
|
||||
* {@link Character#toLowerCase(char)}.*/
|
||||
protected char normalize(char c) {
|
||||
return Character.toLowerCase(c);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -56,10 +56,10 @@ package org.apache.lucene.analysis;
|
|||
|
||||
import java.io.Reader;
|
||||
|
||||
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
|
||||
/** An Analyzer that uses WhitespaceTokenizer. */
|
||||
|
||||
public final class NullAnalyzer extends Analyzer {
|
||||
public final class WhitespaceAnalyzer extends Analyzer {
|
||||
public final TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
return new NullTokenizer(reader);
|
||||
return new WhitespaceTokenizer(reader);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,73 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
|
||||
* Adjacent sequences of non-Whitespace characters form tokens. */
|
||||
|
||||
public class WhitespaceTokenizer extends CharTokenizer {
|
||||
/** Construct a new WhitespaceTokenizer. */
|
||||
public WhitespaceTokenizer(Reader in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/** Collects only characters which do not satisfy
|
||||
* {@link Character#isWhitespace(char)}.*/
|
||||
protected boolean isTokenChar(char c) {
|
||||
return !Character.isWhitespace(c);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue