Renamed NullTokenizer and Analyzer to WhitespaceTokenizer and Analyzer.

Also re-structured the implementation of several tokenizers so that they share code, basing them on the new class CharAnalyzer. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@149644 13f79535-47bb-0310-9956-ffa450edef68
2002-01-24 19:02:52 +00:00 · 2002-01-24 19:02:52 +00:00 · 2ae22a31e3
parent b1ab12b556
commit 2ae22a31e3
5 changed files with 117 additions and 115 deletions
--- a/src/java/org/apache/lucene/analysis/CharTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java
@ -56,18 +56,10 @@ package org.apache.lucene.analysis;

 import java.io.Reader;

-/** LowerCaseTokenizer performs the function of LetterTokenizer
-  and LowerCaseFilter together.  It divides text at non-letters and converts
-  them to lower case.  While it is functionally equivalent to the combination
-  of LetterTokenizer and LowerCaseFilter, there is a performance advantage
-  to doing the two tasks at once, hence this (redundent) implementation.
-
-  Note: this does a decent job for most European languages, but does a terrible
-  job for some Asian languages, where words are not separated by spaces. */
-
-public final class NullTokenizer extends Tokenizer {
-  public NullTokenizer(Reader in) {
-    input = in;
+/** An abstract base class for simple, character-oriented tokenizers.*/
+public abstract class CharTokenizer extends Tokenizer {
+  public CharTokenizer(Reader input) {
+    this.input = input;
  }

  private int offset = 0, bufferIndex=0, dataLen=0;
@ -76,6 +68,18 @@ public final class NullTokenizer extends Tokenizer {
  private final char[] buffer = new char[MAX_WORD_LEN];
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

+  /** Returns true iff a character should be included in a token.  This
+   * tokenizer generates as tokens adjacent sequences of characters which
+   * satisfy this predicate.  Characters for which this is false are used to
+   * define token boundaries and are not included in tokens. */
+  protected abstract boolean isTokenChar(char c);
+
+  /** Called on each token character to normalize it before it is added to the
+   * token.  The default implementation does nothing.  Subclasses may use this
+   * to, e.g., lowercase tokens. */
+  protected char normalize(char c) { return c; }
+
+  /** Returns the next token in the stream, or null at EOS. */
  public final Token next() throws java.io.IOException {
    int length = 0;
    int start = offset;
@ -96,20 +100,19 @@ public final class NullTokenizer extends Tokenizer {
      else
        c = (char) ioBuffer[bufferIndex++];
      
-      if (Character.isWhitespace(c)) {
-        if (length > 0)
-          break;
-        else
-          continue;
-      }
+      if (isTokenChar(c)) {                       // if it's a token char

-      if (length == 0)			  // start of token
-        start = offset-1;
+	if (length == 0)			  // start of token
+	  start = offset-1;
+
+	buffer[length++] = normalize(c);          // buffer it, normalized
+
+	if (length == MAX_WORD_LEN)		  // buffer overflow!
+	  break;
+
+      } else if (length > 0)			  // at non-Letter w/ chars
+	break;					  // return 'em

-      buffer[length++] = c;
-                                                  // buffer it
-      if (length == MAX_WORD_LEN)		  // buffer overflow!
-        break;
    }

    return new Token(new String(buffer, 0, length), start, start+length);
--- a/src/java/org/apache/lucene/analysis/LetterTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/LetterTokenizer.java
@ -63,52 +63,15 @@ import java.io.Reader;
  Note: this does a decent job for most European languages, but does a terrible
  job for some Asian languages, where words are not separated by spaces. */

-public final class LetterTokenizer extends Tokenizer {
+public class LetterTokenizer extends CharTokenizer {
+  /** Construct a new LetterTokenizer. */
  public LetterTokenizer(Reader in) {
-    input = in;
+    super(in);
  }

-  private int offset = 0, bufferIndex=0, dataLen=0;
-  private final static int MAX_WORD_LEN = 255;
-  private final static int IO_BUFFER_SIZE = 1024;
-  private final char[] buffer = new char[MAX_WORD_LEN];
-  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-  public final Token next() throws java.io.IOException {
-    int length = 0;
-    int start = offset;
-    while (true) {
-      final char c;
-
-      offset++;
-      if (bufferIndex >= dataLen) {
-        dataLen = input.read(ioBuffer);
-        bufferIndex = 0;
-      };
-      if (dataLen == -1) {
-	if (length > 0)
-	  break;
-	else
-	  return null;
-      }
-      else
-        c = (char) ioBuffer[bufferIndex++];
-      
-      if (Character.isLetter(c)) {		  // if it's a letter
-
-	if (length == 0)			  // start of token
-	  start = offset-1;
-
-	buffer[length++] = c;			  // buffer it
-
-	if (length == MAX_WORD_LEN)		  // buffer overflow!
-	  break;
-
-      } else if (length > 0)			  // at non-Letter w/ chars
-	break;					  // return 'em
-
-    }
-
-    return new Token(new String(buffer, 0, length), start, start+length);
+  /** Collects only characters which satisfy
+   * {@link Character.isLetter(char)}.*/
+  protected boolean isTokenChar(char c) {
+    return Character.isLetter(c);
  }
 }
--- a/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
@ -65,52 +65,15 @@ import java.io.Reader;
  Note: this does a decent job for most European languages, but does a terrible
  job for some Asian languages, where words are not separated by spaces. */

-public final class LowerCaseTokenizer extends Tokenizer {
+public final class LowerCaseTokenizer extends LetterTokenizer {
+  /** Construct a new LowerCaseTokenizer. */
  public LowerCaseTokenizer(Reader in) {
-    input = in;
+    super(in);
  }

-  private int offset = 0, bufferIndex=0, dataLen=0;
-  private final static int MAX_WORD_LEN = 255;
-  private final static int IO_BUFFER_SIZE = 1024;
-  private final char[] buffer = new char[MAX_WORD_LEN];
-  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-  public final Token next() throws java.io.IOException {
-    int length = 0;
-    int start = offset;
-    while (true) {
-      final char c;
-
-      offset++;
-      if (bufferIndex >= dataLen) {
-        dataLen = input.read(ioBuffer);
-        bufferIndex = 0;
-      };
-      if (dataLen == -1) {
-	if (length > 0)
-	  break;
-	else
-	  return null;
-      }
-      else
-        c = (char) ioBuffer[bufferIndex++];
-      
-      if (Character.isLetter(c)) {		  // if it's a letter
-
-	if (length == 0)			  // start of token
-	  start = offset-1;
-
-	buffer[length++] = Character.toLowerCase(c);
-                                                  // buffer it
-	if (length == MAX_WORD_LEN)		  // buffer overflow!
-	  break;
-
-      } else if (length > 0)			  // at non-Letter w/ chars
-	break;					  // return 'em
-
-    }
-
-    return new Token(new String(buffer, 0, length), start, start+length);
+  /** Collects only characters which satisfy
+   * {@link Character.isLetter(char)}.*/
+  protected char normalize(char c) {
+    return Character.toLowerCase(c);
  }
 }
--- a/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
@ -56,10 +56,10 @@ package org.apache.lucene.analysis;

 import java.io.Reader;

-/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
+/** An Analyzer that uses WhitespaceTokenizer. */

-public final class NullAnalyzer extends Analyzer {
+public final class WhitespaceAnalyzer extends Analyzer {
  public final TokenStream tokenStream(String fieldName, Reader reader) {
-    return new NullTokenizer(reader);
+    return new WhitespaceTokenizer(reader);
  }
 }
--- a/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
@ -0,0 +1,73 @@
+package org.apache.lucene.analysis;
+
+/* ====================================================================
+ * The Apache Software License, Version 1.1
+ *
+ * Copyright (c) 2001 The Apache Software Foundation.  All rights
+ * reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. The end-user documentation included with the redistribution,
+ *    if any, must include the following acknowledgment:
+ *       "This product includes software developed by the
+ *        Apache Software Foundation (http://www.apache.org/)."
+ *    Alternately, this acknowledgment may appear in the software itself,
+ *    if and wherever such third-party acknowledgments normally appear.
+ *
+ * 4. The names "Apache" and "Apache Software Foundation" and
+ *    "Apache Lucene" must not be used to endorse or promote products
+ *    derived from this software without prior written permission. For
+ *    written permission, please contact apache@apache.org.
+ *
+ * 5. Products derived from this software may not be called "Apache",
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
+ *    prior written permission of the Apache Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This software consists of voluntary contributions made by many
+ * individuals on behalf of the Apache Software Foundation.  For more
+ * information on the Apache Software Foundation, please see
+ * <http://www.apache.org/>.
+ */
+
+import java.io.Reader;
+
+/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+ * Adjacent sequences of non-Whitespace characters form tokens. */
+
+public class WhitespaceTokenizer extends CharTokenizer {
+  /** Construct a new WhitespaceTokenizer. */
+  public WhitespaceTokenizer(Reader in) {
+    super(in);
+  }
+
+  /** Collects only characters which do not satisfy
+   * {@link Character.isWhitespace(char)}.*/
+  protected boolean isTokenChar(char c) {
+    return !Character.isWhitespace(c);
+  }
+}