Renamed NullTokenizer and NullAnalyzer to WhitespaceTokenizer and WhitespaceAnalyzer.

Also restructured the implementation of several tokenizers so that they
share code, basing them on the new class CharTokenizer.


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149644 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Doug Cutting 2002-01-24 19:02:52 +00:00
parent b1ab12b556
commit 2ae22a31e3
5 changed files with 117 additions and 115 deletions

View File

@ -56,18 +56,10 @@ package org.apache.lucene.analysis;
import java.io.Reader;
/** LowerCaseTokenizer performs the function of LetterTokenizer
and LowerCaseFilter together. It divides text at non-letters and converts
them to lower case. While it is functionally equivalent to the combination
of LetterTokenizer and LowerCaseFilter, there is a performance advantage
to doing the two tasks at once, hence this (redundant) implementation.
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
public final class NullTokenizer extends Tokenizer {
public NullTokenizer(Reader in) {
input = in;
/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
public CharTokenizer(Reader input) {
this.input = input;
}
private int offset = 0, bufferIndex=0, dataLen=0;
@ -76,6 +68,18 @@ public final class NullTokenizer extends Tokenizer {
private final char[] buffer = new char[MAX_WORD_LEN];
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
* satisfy this predicate. Characters for which this is false are used to
* define token boundaries and are not included in tokens. */
protected abstract boolean isTokenChar(char c);
/** Called on each token character to normalize it before it is added to the
* token. The default implementation does nothing. Subclasses may use this
* to, e.g., lowercase tokens. */
protected char normalize(char c) { return c; }
/** Returns the next token in the stream, or null at EOS. */
public final Token next() throws java.io.IOException {
int length = 0;
int start = offset;
@ -96,20 +100,19 @@ public final class NullTokenizer extends Tokenizer {
else
c = (char) ioBuffer[bufferIndex++];
if (Character.isWhitespace(c)) {
if (length > 0)
break;
else
continue;
}
if (isTokenChar(c)) { // if it's a token char
if (length == 0) // start of token
start = offset-1;
if (length == 0) // start of token
start = offset-1;
buffer[length++] = normalize(c); // buffer it, normalized
if (length == MAX_WORD_LEN) // buffer overflow!
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
buffer[length++] = c;
// buffer it
if (length == MAX_WORD_LEN) // buffer overflow!
break;
}
return new Token(new String(buffer, 0, length), start, start+length);

View File

@ -63,52 +63,15 @@ import java.io.Reader;
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
public final class LetterTokenizer extends Tokenizer {
public class LetterTokenizer extends CharTokenizer {
/** Construct a new LetterTokenizer. */
public LetterTokenizer(Reader in) {
input = in;
super(in);
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;
private final char[] buffer = new char[MAX_WORD_LEN];
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
public final Token next() throws java.io.IOException {
int length = 0;
int start = offset;
while (true) {
final char c;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
};
if (dataLen == -1) {
if (length > 0)
break;
else
return null;
}
else
c = (char) ioBuffer[bufferIndex++];
if (Character.isLetter(c)) { // if it's a letter
if (length == 0) // start of token
start = offset-1;
buffer[length++] = c; // buffer it
if (length == MAX_WORD_LEN) // buffer overflow!
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
return new Token(new String(buffer, 0, length), start, start+length);
/** Collects only characters which satisfy
* {@link Character#isLetter(char)}.*/
protected boolean isTokenChar(char c) {
return Character.isLetter(c);
}
}

View File

@ -65,52 +65,15 @@ import java.io.Reader;
Note: this does a decent job for most European languages, but does a terrible
job for some Asian languages, where words are not separated by spaces. */
public final class LowerCaseTokenizer extends Tokenizer {
public final class LowerCaseTokenizer extends LetterTokenizer {
/** Construct a new LowerCaseTokenizer. */
public LowerCaseTokenizer(Reader in) {
input = in;
super(in);
}
private int offset = 0, bufferIndex=0, dataLen=0;
private final static int MAX_WORD_LEN = 255;
private final static int IO_BUFFER_SIZE = 1024;
private final char[] buffer = new char[MAX_WORD_LEN];
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
public final Token next() throws java.io.IOException {
int length = 0;
int start = offset;
while (true) {
final char c;
offset++;
if (bufferIndex >= dataLen) {
dataLen = input.read(ioBuffer);
bufferIndex = 0;
};
if (dataLen == -1) {
if (length > 0)
break;
else
return null;
}
else
c = (char) ioBuffer[bufferIndex++];
if (Character.isLetter(c)) { // if it's a letter
if (length == 0) // start of token
start = offset-1;
buffer[length++] = Character.toLowerCase(c);
// buffer it
if (length == MAX_WORD_LEN) // buffer overflow!
break;
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
return new Token(new String(buffer, 0, length), start, start+length);
/** Converts each token character to lower case using
* {@link Character#toLowerCase(char)}.*/
protected char normalize(char c) {
return Character.toLowerCase(c);
}
}

View File

@ -56,10 +56,10 @@ package org.apache.lucene.analysis;
import java.io.Reader;
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
/** An Analyzer that uses WhitespaceTokenizer. */
public final class NullAnalyzer extends Analyzer {
public final class WhitespaceAnalyzer extends Analyzer {
public final TokenStream tokenStream(String fieldName, Reader reader) {
return new NullTokenizer(reader);
return new WhitespaceTokenizer(reader);
}
}

View File

@ -0,0 +1,73 @@
package org.apache.lucene.analysis;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
/**
 * Tokenizer that splits text at whitespace. Runs of adjacent
 * non-whitespace characters form the tokens.
 */
public class WhitespaceTokenizer extends CharTokenizer {

  /**
   * Creates a WhitespaceTokenizer reading from the given stream.
   *
   * @param in the reader passed on to the CharTokenizer constructor
   */
  public WhitespaceTokenizer(Reader in) {
    super(in);
  }

  /**
   * Decides whether a character is part of a token.
   *
   * @param c the character to classify
   * @return false when {@link Character#isWhitespace(char)} holds for
   *         {@code c}, true otherwise
   */
  protected boolean isTokenChar(char c) {
    final boolean whitespace = Character.isWhitespace(c);
    return !whitespace;
  }
}