LUCENE-969: deprecate Token.termText() & optimize core tokenizers by re-using tokens & TokenStreams

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@564715 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2007-08-10 18:34:33 +00:00
parent 82eb074afd
commit d42de32984
28 changed files with 927 additions and 386 deletions

View File

@ -22,6 +22,12 @@ API Changes
Field instance during indexing. This is a sizable performance
gain, especially for small documents. (Mike McCandless)
4. LUCENE-969: Add new APIs to Token, TokenStream and Analyzer to
permit re-using of Token and TokenStream instances during
indexing. Changed Token to use a char[] as the store for the
termText instead of String. This gives faster tokenization
performance (~10-15%). (Mike McCandless)
Bug fixes
1. LUCENE-933: QueryParser fixed to not produce empty sub
@ -107,6 +113,10 @@ Optimizations
JavaCC to generate the tokenizer.
(Stanislaw Osinski via Mike McCandless)
8. LUCENE-969: Changed core tokenizers & filters to re-use Token and
TokenStream instances when possible to improve tokenization
performance (~10-15%). (Mike McCandless)
Documentation
Build

View File

@ -73,7 +73,7 @@ public class ReadTokensTask extends PerfTask {
super.tearDown();
}
Token token = new Token("", 0, 0);
Token token = new Token();
public int doLogic() throws Exception {
List fields = doc.getFields();
@ -104,13 +104,13 @@ public class ReadTokensTask extends PerfTask {
}
// Tokenize field
stream = analyzer.tokenStream(field.name(), reader);
stream = analyzer.reusableTokenStream(field.name(), reader);
}
// reset the TokenStream to the first token
stream.reset();
while(stream.next() != null)
while(stream.next(token) != null)
tokenCount++;
}
totalTokenCount += tokenCount;

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
*/
import java.io.Reader;
import java.io.IOException;
/** An Analyzer builds TokenStreams, which analyze text. It thus represents a
* policy for extracting index terms from text.
@ -37,6 +38,33 @@ public abstract class Analyzer {
field name for backward compatibility. */
public abstract TokenStream tokenStream(String fieldName, Reader reader);
/** Creates a TokenStream that is allowed to be re-used
* from the previous time that the same thread called
* this method. Callers that do not need to use more
* than one TokenStream at the same time from this
* analyzer should use this method for better
* performance.
*/
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
return tokenStream(fieldName, reader);
}
private ThreadLocal tokenStreams = new ThreadLocal();
/** Used by Analyzers that implement reusableTokenStream
* to retrieve previously saved TokenStreams for re-use
* by the same thread. */
protected Object getPreviousTokenStream() {
return tokenStreams.get();
}
/** Used by Analyzers that implement reusableTokenStream
* to save a TokenStream for later re-use by the same
* thread. */
protected void setPreviousTokenStream(Object obj) {
tokenStreams.set(obj);
}
/**
* Invoked before indexing a Fieldable instance if
@ -56,4 +84,3 @@ public abstract class Analyzer {
return 0;
}
}

View File

@ -0,0 +1,149 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A simple class that can store & retrieve char[]'s in a
* hash table. Note that this is not a general purpose
* class. For example, it cannot remove char[]'s from the
* set, nor does it resize its hash table to be smaller,
* etc. It is designed for use with StopFilter to enable
* quick filtering based on the char[] termBuffer in a
* Token.
*/
/**
 * A simple class that can store & retrieve char[]'s in a
 * hash table.  Note that this is not a general purpose
 * class.  For example, it cannot remove char[]'s from the
 * set, nor does it resize its hash table to be smaller,
 * etc.  It is designed for use with StopFilter to enable
 * quick filtering based on the char[] termBuffer in a
 * Token.
 */
final class CharArraySet {
  private final static int INIT_SIZE = 8;
  private final static double MAX_LOAD_FACTOR = 0.75;
  // entries.length is always a power of 2; mask == entries.length-1 so
  // that (code & mask) is a valid slot index and the odd probe increment
  // eventually visits every slot.
  private int mask;
  private char[][] entries;   // open-addressed table; a null slot is empty
  private int count;          // number of terms stored
  private boolean ignoreCase;

  /** Create set with enough capacity to hold startSize terms.
   *  @param startSize expected number of terms
   *  @param ignoreCase if true, terms are stored and compared lower-cased */
  public CharArraySet(int startSize, boolean ignoreCase) {
    this.ignoreCase = ignoreCase;
    int size = INIT_SIZE;
    // Grow size (kept a power of 2) until startSize fits under the load factor.
    while(((double) startSize)/size >= MAX_LOAD_FACTOR)
      size *= 2;
    mask = size-1;
    entries = new char[size][];
  }

  /** Returns true if the characters in text up to length
   *  len is present in the set. */
  public boolean contains(char[] text, int len) {
    int code = getHashCode(text, len);
    int pos = code & mask;
    char[] text2 = entries[pos];
    if (text2 != null && !equals(text, len, text2)) {
      // Probe with an odd increment: odd step + power-of-2 table size
      // guarantees the whole table is eventually scanned.
      final int inc = code*1347|1;
      do {
        code += inc;
        pos = code & mask;
        text2 = entries[pos];
      } while (text2 != null && !equals(text, len, text2));
    }
    return text2 != null;
  }

  /** Add this String into the set */
  public void add(String text) {
    add(text.toCharArray());
  }

  /** Add this text into the set.
   *  <p><b>NOTE:</b> when ignoreCase is true the caller's array is
   *  lower-cased in place; the array is also stored directly (not
   *  copied), so the caller must not modify it afterwards.  Adding
   *  the same term twice stores it twice (no duplicate check) —
   *  contains() still works, the table is just slightly fuller. */
  public void add(char[] text) {
    if (ignoreCase)
      for(int i=0;i<text.length;i++)
        text[i] = Character.toLowerCase(text[i]);
    int code = getHashCode(text, text.length);
    int pos = code & mask;
    char[] text2 = entries[pos];
    if (text2 != null) {
      final int inc = code*1347|1;
      do {
        code += inc;
        pos = code & mask;
        text2 = entries[pos];
      } while (text2 != null);
    }
    entries[pos] = text;
    count++;
    if (((double) count)/entries.length > MAX_LOAD_FACTOR) {
      rehash();
    }
  }

  /** Returns true if the first len chars of text1 equal the stored
   *  entry text2 (text1 compared lower-cased when ignoreCase). */
  private boolean equals(char[] text1, int len, char[] text2) {
    if (len != text2.length)
      return false;
    for(int i=0;i<len;i++) {
      if (ignoreCase) {
        if (Character.toLowerCase(text1[i]) != text2[i])
          return false;
      } else {
        if (text1[i] != text2[i])
          return false;
      }
    }
    return true;
  }

  /** Doubles the hash table.  The new size must stay a power of 2:
   *  the probe sequence relies on mask being all one-bits, so a
   *  non-power-of-2 size (such as 2*count) would leave slots
   *  unreachable and could make the probe loop spin forever once
   *  the reachable slots fill up. */
  private void rehash() {
    final int newSize = 2*entries.length;
    mask = newSize-1;
    char[][] newEntries = new char[newSize][];
    for(int i=0;i<entries.length;i++) {
      char[] text = entries[i];
      if (text != null) {
        int code = getHashCode(text, text.length);
        int pos = code & mask;
        if (newEntries[pos] != null) {
          final int inc = code*1347|1;
          do {
            code += inc;
            pos = code & mask;
          } while (newEntries[pos] != null);
        }
        newEntries[pos] = text;
      }
    }
    entries = newEntries;
  }

  /** Hash of the first len chars of text (lower-cased when
   *  ignoreCase), using the standard 31-multiplier scheme. */
  private int getHashCode(char[] text, int len) {
    int downto = len;
    int code = 0;
    while (downto > 0) {
      final char c;
      if (ignoreCase)
        c = Character.toLowerCase(text[--downto]);
      else
        c = text[--downto];
      code = (code*31) + c;
    }
    return code;
  }
}

View File

@ -28,8 +28,7 @@ public abstract class CharTokenizer extends Tokenizer {
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 1024;
private final char[] buffer = new char[MAX_WORD_LEN];
private static final int IO_BUFFER_SIZE = 4096;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
/** Returns true iff a character should be included in a token. This
@ -45,31 +44,32 @@ public abstract class CharTokenizer extends Tokenizer {
return c;
}
/** Returns the next token in the stream, or null at EOS. */
public final Token next() throws IOException {
public final Token next(Token token) throws IOException {
int length = 0;
int start = offset;
int start = bufferIndex;
char[] buffer = token.termBuffer();
while (true) {
final char c;
offset++;
if (bufferIndex >= dataLen) {
offset += dataLen;
dataLen = input.read(ioBuffer);
if (dataLen == -1) {
if (length > 0)
break;
else
return null;
}
bufferIndex = 0;
}
;
if (dataLen == -1) {
if (length > 0)
break;
else
return null;
} else
c = ioBuffer[bufferIndex++];
final char c = ioBuffer[bufferIndex++];
if (isTokenChar(c)) { // if it's a token char
if (length == 0) // start of token
start = offset - 1;
start = offset + bufferIndex - 1;
else if (length == buffer.length)
buffer = token.resizeTermBuffer(1+length);
buffer[length++] = normalize(c); // buffer it, normalized
@ -78,9 +78,18 @@ public abstract class CharTokenizer extends Tokenizer {
} else if (length > 0) // at non-Letter w/ chars
break; // return 'em
}
return new Token(new String(buffer, 0, length), start, start + length);
token.termLength = length;
token.startOffset = start;
token.endOffset = start+length;
return token;
}
public void reset(Reader input) throws IOException {
super.reset(input);
bufferIndex = 0;
offset = 0;
dataLen = 0;
}
}

View File

@ -25,144 +25,166 @@ package org.apache.lucene.analysis;
* <p>
*/
public class ISOLatin1AccentFilter extends TokenFilter {
public ISOLatin1AccentFilter(TokenStream input) {
super(input);
}
public ISOLatin1AccentFilter(TokenStream input) {
super(input);
}
public final Token next() throws java.io.IOException {
final Token t = input.next();
if (t != null)
t.setTermText(removeAccents(t.termText()));
return t;
}
private char[] output = new char[256];
private int outputPos;
/**
* To replace accented characters in a String by unaccented equivalents.
*/
public final static String removeAccents(String input) {
final StringBuffer output = new StringBuffer();
for (int i = 0; i < input.length(); i++) {
switch (input.charAt(i)) {
case '\u00C0' : // À
case '\u00C1' : // Á
case '\u00C2' : // Â
case '\u00C3' : // Ã
case '\u00C4' : // Ä
case '\u00C5' : // Å
output.append("A");
break;
case '\u00C6' : // Æ
output.append("AE");
break;
case '\u00C7' : // Ç
output.append("C");
break;
case '\u00C8' : // È
case '\u00C9' : // É
case '\u00CA' : // Ê
case '\u00CB' : // Ë
output.append("E");
break;
case '\u00CC' : // Ì
case '\u00CD' : // Í
case '\u00CE' : // Î
case '\u00CF' : // Ï
output.append("I");
break;
case '\u00D0' : // Ð
output.append("D");
break;
case '\u00D1' : // Ñ
output.append("N");
break;
case '\u00D2' : // Ò
case '\u00D3' : // Ó
case '\u00D4' : // Ô
case '\u00D5' : // Õ
case '\u00D6' : // Ö
case '\u00D8' : // Ø
output.append("O");
break;
case '\u0152' : // Œ
output.append("OE");
break;
case '\u00DE' : // Þ
output.append("TH");
break;
case '\u00D9' : // Ù
case '\u00DA' : // Ú
case '\u00DB' : // Û
case '\u00DC' : // Ü
output.append("U");
break;
case '\u00DD' : // Ý
case '\u0178' : // Ÿ
output.append("Y");
break;
case '\u00E0' : // à
case '\u00E1' : // á
case '\u00E2' : // â
case '\u00E3' : // ã
case '\u00E4' : // ä
case '\u00E5' : // å
output.append("a");
break;
case '\u00E6' : // æ
output.append("ae");
break;
case '\u00E7' : // ç
output.append("c");
break;
case '\u00E8' : // è
case '\u00E9' : // é
case '\u00EA' : // ê
case '\u00EB' : // ë
output.append("e");
break;
case '\u00EC' : // ì
case '\u00ED' : // í
case '\u00EE' : // î
case '\u00EF' : // ï
output.append("i");
break;
case '\u00F0' : // ð
output.append("d");
break;
case '\u00F1' : // ñ
output.append("n");
break;
case '\u00F2' : // ò
case '\u00F3' : // ó
case '\u00F4' : // ô
case '\u00F5' : // õ
case '\u00F6' : // ö
case '\u00F8' : // ø
output.append("o");
break;
case '\u0153' : // œ
output.append("oe");
break;
case '\u00DF' : // ß
output.append("ss");
break;
case '\u00FE' : // þ
output.append("th");
break;
case '\u00F9' : // ù
case '\u00FA' : // ú
case '\u00FB' : // û
case '\u00FC' : // ü
output.append("u");
break;
case '\u00FD' : // ý
case '\u00FF' : // ÿ
output.append("y");
break;
default :
output.append(input.charAt(i));
break;
}
}
return output.toString();
}
}
public final Token next(Token result) throws java.io.IOException {
result = input.next(result);
if (result != null) {
outputPos = 0;
removeAccents(result.termBuffer(), result.termLength());
result.setTermBuffer(output, 0, outputPos);
return result;
} else
return null;
}
private final void addChar(char c) {
if (outputPos == output.length) {
char[] newArray = new char[2*output.length];
System.arraycopy(output, 0, newArray, 0, output.length);
output = newArray;
}
output[outputPos++] = c;
}
/**
* To replace accented characters in a String by unaccented equivalents.
*/
public final void removeAccents(char[] input, int length) {
int pos = 0;
for (int i=0; i<length; i++, pos++) {
switch (input[pos]) {
case '\u00C0' : // À
case '\u00C1' : // Á
case '\u00C2' : // Â
case '\u00C3' : // Ã
case '\u00C4' : // Ä
case '\u00C5' : // Å
addChar('A');
break;
case '\u00C6' : // Æ
addChar('A');
addChar('E');
break;
case '\u00C7' : // Ç
addChar('C');
break;
case '\u00C8' : // È
case '\u00C9' : // É
case '\u00CA' : // Ê
case '\u00CB' : // Ë
addChar('E');
break;
case '\u00CC' : // Ì
case '\u00CD' : // Í
case '\u00CE' : // Î
case '\u00CF' : // Ï
addChar('I');
break;
case '\u00D0' : // Ð
addChar('D');
break;
case '\u00D1' : // Ñ
addChar('N');
break;
case '\u00D2' : // Ò
case '\u00D3' : // Ó
case '\u00D4' : // Ô
case '\u00D5' : // Õ
case '\u00D6' : // Ö
case '\u00D8' : // Ø
addChar('O');
break;
case '\u0152' : // Œ
addChar('O');
addChar('E');
break;
case '\u00DE' : // Þ
addChar('T');
addChar('H');
break;
case '\u00D9' : // Ù
case '\u00DA' : // Ú
case '\u00DB' : // Û
case '\u00DC' : // Ü
addChar('U');
break;
case '\u00DD' : // Ý
case '\u0178' : // Ÿ
addChar('Y');
break;
case '\u00E0' : // à
case '\u00E1' : // á
case '\u00E2' : // â
case '\u00E3' : // ã
case '\u00E4' : // ä
case '\u00E5' : // å
addChar('a');
break;
case '\u00E6' : // æ
addChar('a');
addChar('e');
break;
case '\u00E7' : // ç
addChar('c');
break;
case '\u00E8' : // è
case '\u00E9' : // é
case '\u00EA' : // ê
case '\u00EB' : // ë
addChar('e');
break;
case '\u00EC' : // ì
case '\u00ED' : // í
case '\u00EE' : // î
case '\u00EF' : // ï
addChar('i');
break;
case '\u00F0' : // ð
addChar('d');
break;
case '\u00F1' : // ñ
addChar('n');
break;
case '\u00F2' : // ò
case '\u00F3' : // ó
case '\u00F4' : // ô
case '\u00F5' : // õ
case '\u00F6' : // ö
case '\u00F8' : // ø
addChar('o');
break;
case '\u0153' : // œ
addChar('o');
addChar('e');
break;
case '\u00DF' : // ß
addChar('s');
addChar('s');
break;
case '\u00FE' : // þ
addChar('t');
addChar('h');
break;
case '\u00F9' : // ù
case '\u00FA' : // ú
case '\u00FB' : // û
case '\u00FC' : // ü
addChar('u');
break;
case '\u00FD' : // ý
case '\u00FF' : // ÿ
addChar('y');
break;
default :
addChar(input[pos]);
break;
}
}
}
}

View File

@ -28,4 +28,13 @@ public class KeywordAnalyzer extends Analyzer {
final Reader reader) {
return new KeywordTokenizer(reader);
}
}
public TokenStream reusableTokenStream(String fieldName,
final Reader reader) {
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
if (tokenizer == null) {
tokenizer = new KeywordTokenizer(reader);
setPreviousTokenStream(tokenizer);
}
return tokenizer;
}
}

View File

@ -28,7 +28,6 @@ public class KeywordTokenizer extends Tokenizer {
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
private final char[] buffer;
public KeywordTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
@ -36,23 +35,23 @@ public class KeywordTokenizer extends Tokenizer {
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
this.buffer = new char[bufferSize];
this.done = false;
}
public Token next() throws IOException {
public Token next(Token result) throws IOException {
if (!done) {
done = true;
StringBuffer buffer = new StringBuffer();
int length;
int upto = 0;
char[] buffer = result.termBuffer();
while (true) {
length = input.read(this.buffer);
final int length = input.read(buffer, upto, buffer.length-upto);
if (length == -1) break;
buffer.append(this.buffer, 0, length);
upto += length;
if (upto == buffer.length)
buffer = result.resizeTermBuffer(1+buffer.length);
}
String text = buffer.toString();
return new Token(text, 0, text.length());
result.termLength = upto;
return result;
}
return null;
}

View File

@ -44,12 +44,12 @@ public final class LengthFilter extends TokenFilter {
/**
* Returns the next input Token whose termText() is the right len
*/
public final Token next() throws IOException
public final Token next(Token result) throws IOException
{
// return the first non-stop word found
for (Token token = input.next(); token != null; token = input.next())
for (Token token = input.next(result); token != null; token = input.next(result))
{
int len = token.termText().length();
int len = token.termLength();
if (len >= min && len <= max) {
return token;
}

View File

@ -29,14 +29,17 @@ public final class LowerCaseFilter extends TokenFilter {
super(in);
}
public final Token next() throws IOException {
Token t = input.next();
public final Token next(Token result) throws IOException {
result = input.next(result);
if (result != null) {
if (t == null)
final char[] buffer = result.termBuffer();
final int length = result.termLength;
for(int i=0;i<length;i++)
buffer[i] = Character.toLowerCase(buffer[i]);
return result;
} else
return null;
t.termText = t.termText.toLowerCase();
return t;
}
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
*/
import java.io.Reader;
import java.io.IOException;
import java.util.Map;
import java.util.HashMap;
@ -75,6 +76,14 @@ public class PerFieldAnalyzerWrapper extends Analyzer {
return analyzer.tokenStream(fieldName, reader);
}
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);
if (analyzer == null)
analyzer = defaultAnalyzer;
return analyzer.reusableTokenStream(fieldName, reader);
}
/** Return the positionIncrementGap from the analyzer assigned to fieldName */
public int getPositionIncrementGap(String fieldName) {
Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName);

View File

@ -45,16 +45,13 @@ public final class PorterStemFilter extends TokenFilter {
stemmer = new PorterStemmer();
}
/** Returns the next input Token, after being stemmed */
public final Token next() throws IOException {
Token token = input.next();
if (token == null)
public final Token next(Token result) throws IOException {
result = input.next(result);
if (result != null) {
if (stemmer.stem(result.termBuffer(), 0, result.termLength))
result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return result;
} else
return null;
else {
String s = stemmer.stem(token.termText);
if (s != token.termText) // Yes, I mean object reference comparison here
token.termText = s;
return token;
}
}
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
*/
import java.io.Reader;
import java.io.IOException;
/** An Analyzer that filters LetterTokenizer with LowerCaseFilter. */
@ -25,4 +26,14 @@ public final class SimpleAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new LowerCaseTokenizer(reader);
}
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
if (tokenizer == null) {
tokenizer = new LowerCaseTokenizer(reader);
setPreviousTokenStream(tokenizer);
} else
tokenizer.reset(reader);
return tokenizer;
}
}

View File

@ -71,5 +71,22 @@ public final class StopAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
}
/** Filters LowerCaseTokenizer with StopFilter. */
private class SavedStreams {
Tokenizer source;
TokenStream result;
};
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
if (streams == null) {
streams = new SavedStreams();
streams.source = new LowerCaseTokenizer(reader);
streams.result = new StopFilter(streams.source, stopWords);
setPreviousTokenStream(streams);
} else
streams.source.reset(reader);
return streams.result;
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
/**
@ -27,16 +28,16 @@ import java.util.Set;
public final class StopFilter extends TokenFilter {
private final Set stopWords;
private final CharArraySet stopWords;
private final boolean ignoreCase;
/**
* Construct a token stream filtering the given input.
*/
public StopFilter(TokenStream input, String [] stopWords)
{
this(input, stopWords, false);
}
/**
* Construct a token stream filtering the given input.
*/
public StopFilter(TokenStream input, String [] stopWords)
{
this(input, stopWords, false);
}
/**
* Constructs a filter which removes words from the input
@ -45,22 +46,25 @@ public final class StopFilter extends TokenFilter {
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in);
this.ignoreCase = ignoreCase;
this.stopWords = makeStopSet(stopWords, ignoreCase);
this.stopWords = makeStopCharArraySet(stopWords, ignoreCase);
}
/**
* Construct a token stream filtering the given input.
* @param input
* @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
* @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words
*/
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
{
super(input);
this.ignoreCase = ignoreCase;
this.stopWords = stopWords;
}
/**
* Construct a token stream filtering the given input.
* @param input
* @param stopWords The set of Stop Words, as Strings. If ignoreCase is true, all strings should be lower cased
* @param ignoreCase -Ignore case when stopping. The stopWords set must be setup to contain only lower case words
*/
public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase)
{
super(input);
this.ignoreCase = ignoreCase;
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
Iterator it = stopWords.iterator();
while(it.hasNext())
this.stopWords.add((String) it.next());
}
/**
* Constructs a filter which removes words from the input
@ -97,18 +101,23 @@ public final class StopFilter extends TokenFilter {
for (int i = 0; i < stopWords.length; i++)
stopTable.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
return stopTable;
}
}
private static final CharArraySet makeStopCharArraySet(String[] stopWords, boolean ignoreCase) {
CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
for (int i = 0; i < stopWords.length; i++)
stopSet.add(ignoreCase ? stopWords[i].toLowerCase() : stopWords[i]);
return stopSet;
}
/**
* Returns the next input Token whose termText() is not a stop word.
*/
public final Token next() throws IOException {
public final Token next(Token result) throws IOException {
// return the first non-stop word found
for (Token token = input.next(); token != null; token = input.next())
{
String termText = ignoreCase ? token.termText.toLowerCase() : token.termText;
if (!stopWords.contains(termText))
return token;
while((result = input.next(result)) != null) {
if (!stopWords.contains(result.termBuffer(), result.termLength))
return result;
}
// reached EOS -- return null
return null;

View File

@ -1,8 +1,5 @@
package org.apache.lucene.analysis;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -20,6 +17,9 @@ import org.apache.lucene.index.TermPositions;
* limitations under the License.
*/
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions;
/** A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@ -44,46 +44,103 @@ import org.apache.lucene.index.TermPositions;
The APIs introduced here might change in the future and will not be
supported anymore in such a case.</font>
<br><br>
<p><b>NOTE:</b> As of 2.3, Token stores the term text
internally as a malleable char[] termBuffer instead of
String termText. The indexing code and core tokenizers
have been changed to re-use a single Token instance, changing
its buffer and other fields in-place as the Token is
processed. This provides substantially better indexing
performance as it saves the GC cost of new'ing a Token and
String for every term. The APIs that accept String
termText are still available but a warning about the
associated performance cost has been added (below). The
{@link #termText()} method has been deprecated.</p>
<p>Tokenizers and filters should try to re-use a Token
instance when possible for best performance, by
implementing the {@link TokenStream#next(Token)} API.
Failing that, to create a new Token you should first use
one of the constructors that starts with null text. Then
you should call either {@link #termBuffer()} or {@link
#resizeTermBuffer(int)} to retrieve the Token's
termBuffer. Fill in the characters of your term into this
buffer, and finally call {@link #setTermLength(int)} to
set the length of the term text. See <a target="_top"
href="https://issues.apache.org/jira/browse/LUCENE-969">LUCENE-969</a>
for details.</p>
@see org.apache.lucene.index.Payload
*/
// TODO: Remove warning after API has been finalized
*/
// TODO: Remove warning after API has been finalized
public class Token implements Cloneable {
String termText; // the text of the term
private static final String DEFAULT_TYPE = "word";
private static int MIN_BUFFER_SIZE = 10;
/** @deprecated: we will remove this when we remove the
* deprecated APIs */
private String termText;
char[] termBuffer; // characters for the term text
int termLength; // length of term text in buffer
int startOffset; // start in source text
int endOffset; // end in source text
String type = "word"; // lexical type
String type = DEFAULT_TYPE; // lexical type
Payload payload;
// For better indexing speed, use termBuffer (and
// termBufferOffset/termBufferLength) instead of termText
// to save new'ing a String per token
char[] termBuffer;
int termBufferOffset;
int termBufferLength;
int positionIncrement = 1;
private int positionIncrement = 1;
/** Constructs a Token with null text. */
public Token() {
}
/** Constructs a Token with the given term text, and start & end offsets.
The type defaults to "word." */
/** Constructs a Token with null text and start & end
* offsets.
* @param start start offset
* @param end end offset */
public Token(int start, int end) {
startOffset = start;
endOffset = end;
}
/** Constructs a Token with null text and start & end
* offsets plus the Token type.
* @param start start offset
* @param end end offset */
public Token(int start, int end, String typ) {
startOffset = start;
endOffset = end;
type = typ;
}
/** Constructs a Token with the given term text, and start
* & end offsets. The type defaults to "word."
* <b>NOTE:</b> for better indexing speed you should
* instead use the char[] termBuffer methods to set the
* term text.
* @param text term text
* @param start start offset
* @param end end offset */
public Token(String text, int start, int end) {
termText = text;
startOffset = start;
endOffset = end;
}
/** Constructs a Token with the given term text buffer
* starting at offset for length length, and start & end offsets.
* The type defaults to "word." */
public Token(char[] text, int offset, int length, int start, int end) {
termBuffer = text;
termBufferOffset = offset;
termBufferLength = length;
startOffset = start;
endOffset = end;
}
/** Constructs a Token with the given text, start and end offsets, & type. */
/** Constructs a Token with the given text, start and end
* offsets, & type. <b>NOTE:</b> for better indexing
* speed you should instead use the char[] termBuffer
* methods to set the term text.
* @param text term text
* @param start start offset
* @param end end offset
* @param typ token type */
public Token(String text, int start, int end, String typ) {
termText = text;
startOffset = start;
@ -91,19 +148,6 @@ public class Token implements Cloneable {
type = typ;
}
/** Constructs a Token with the given term text buffer
* starting at offset for length length, and start & end
* offsets, & type. */
public Token(char[] text, int offset, int length, int start, int end, String typ) {
termBuffer = text;
termBufferOffset = offset;
termBufferLength = length;
startOffset = start;
endOffset = end;
type = typ;
}
/** Set the position increment. This determines the position of this token
* relative to the previous Token in a {@link TokenStream}, used in phrase
* searching.
@ -139,28 +183,103 @@ public class Token implements Cloneable {
/** Returns the position increment of this Token.
* @see #setPositionIncrement
*/
public int getPositionIncrement() { return positionIncrement; }
public int getPositionIncrement() {
return positionIncrement;
}
/** Sets the Token's term text. */
/** Sets the Token's term text. <b>NOTE:</b> for better
* indexing speed you should instead use the char[]
* termBuffer methods to set the term text. */
public void setTermText(String text) {
termText = text;
termBuffer = null;
}
/** Returns the Token's term text. */
public final String termText() { return termText; }
public final char[] termBuffer() { return termBuffer; }
public final int termBufferOffset() { return termBufferOffset; }
public final int termBufferLength() { return termBufferLength; }
public void setStartOffset(int offset) {this.startOffset = offset;}
public void setEndOffset(int offset) {this.endOffset = offset;}
/** Returns the Token's term text.
*
* @deprecated Use {@link #termBuffer()} and {@link
* #termLength()} instead. */
public final String termText() {
if (termText == null && termBuffer != null)
termText = new String(termBuffer, 0, termLength);
return termText;
}
/** Copies the contents of buffer, starting at offset for
* length characters, into the termBuffer
* array. <b>NOTE:</b> for better indexing speed you
* should instead retrieve the termBuffer, using {@link
* #termBuffer()} or {@link #resizeTermBuffer(int)}, and
* fill it in directly to set the term text. This saves
* an extra copy. */
public final void setTermBuffer(char[] buffer, int offset, int length) {
this.termBuffer = buffer;
this.termBufferOffset = offset;
this.termBufferLength = length;
resizeTermBuffer(length);
System.arraycopy(buffer, offset, termBuffer, 0, length);
termLength = length;
}
/** Returns the internal termBuffer character array which
* you can then directly alter. If the array is too
* small for your token, use {@link
* #resizeTermBuffer(int)} to increase it. After
* altering the buffer be sure to call {@link
* #setTermLength} to record the number of valid
* characters that were placed into the termBuffer. */
public final char[] termBuffer() {
initTermBuffer();
return termBuffer;
}
/** Grows the termBuffer to at least size newSize.
* @param newSize minimum size of the new termBuffer
* @return newly created termBuffer with length >= newSize
*/
public char[] resizeTermBuffer(int newSize) {
initTermBuffer();
if (newSize > termBuffer.length) {
int size = termBuffer.length;
while(size < newSize)
size *= 2;
char[] newBuffer = new char[size];
System.arraycopy(termBuffer, 0, newBuffer, 0, termBuffer.length);
termBuffer = newBuffer;
}
return termBuffer;
}
// TODO: once we remove the deprecated termText() method
// and switch entirely to char[] termBuffer we don't need
// to use this method anymore
private void initTermBuffer() {
if (termBuffer == null) {
if (termText == null) {
termBuffer = new char[MIN_BUFFER_SIZE];
termLength = 0;
} else {
int length = termText.length();
if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE;
termBuffer = new char[length];
termLength = termText.length();
termText.getChars(0, termText.length(), termBuffer, 0);
termText = null;
}
} else if (termText != null)
termText = null;
}
/** Return number of valid characters (length of the term)
* in the termBuffer array. */
public final int termLength() {
initTermBuffer();
return termLength;
}
/** Set number of valid characters (length of the term) in
 * the termBuffer array.  Callers typically use this after
 * writing directly into {@link #termBuffer()}. */
public final void setTermLength(int length) {
initTermBuffer();
termLength = length;
}
/** Returns this Token's starting offset, the position of the first character
corresponding to this token in the source text.
@ -168,14 +287,50 @@ public class Token implements Cloneable {
Note that the difference between endOffset() and startOffset() may not be
equal to termText.length(), as the term text may have been altered by a
stemmer or some other filter. */
public final int startOffset() {
  // Single definition: the merged diff left both the old one-line and the
  // new multi-line version, which is a duplicate-method compile error.
  return startOffset;
}
/** Set the starting offset (position of this token's first
    character in the source text).
    @see #startOffset() */
public void setStartOffset(int offset) {
this.startOffset = offset;
}
/** Returns this Token's ending offset, one greater than the position of the
    last character corresponding to this token in the source text. */
public final int endOffset() {
  // Single definition: the merged diff kept both old and new versions,
  // which is a duplicate-method compile error.
  return endOffset;
}
/** Set the ending offset (one past this token's last character
    in the source text).
    @see #endOffset() */
public void setEndOffset(int offset) {
this.endOffset = offset;
}
/** Returns this Token's lexical type.  Defaults to "word". */
public final String type() {
  // Single definition: the merged diff kept both old and new versions,
  // which is a duplicate-method compile error.
  return type;
}
/** Set the lexical type (e.g. a tokenizer's token-class name).
    @see #type() */
public final void setType(String type) {
this.type = type;
}
/**
 * Returns this Token's payload, or null if none was set.
 * <p><font color="#FF0000">
 * WARNING: The status of the <b>Payloads</b> feature is experimental.
 * The APIs introduced here might change in the future and will not be
 * supported anymore in such a case.</font>
 * @return the payload attached to this token, possibly null
 */
// TODO: Remove warning after API has been finalized
public Payload getPayload() {
return this.payload;
}
/**
* Sets this Token's payload.
@ -189,21 +344,15 @@ public class Token implements Cloneable {
this.payload = payload;
}
/**
* Returns this Token's payload.
* <p><font color="#FF0000">
* WARNING: The status of the <b>Payloads</b> feature is experimental.
* The APIs introduced here might change in the future and will not be
* supported anymore in such a case.</font>
*/
// TODO: Remove warning after API has been finalized
public Payload getPayload() {
return this.payload;
}
public String toString() {
StringBuffer sb = new StringBuffer();
sb.append("(" + termText + "," + startOffset + "," + endOffset);
sb.append("(");
initTermBuffer();
if (termBuffer == null)
sb.append("null");
else
sb.append(termBuffer, 0, termLength);
sb.append("," + startOffset + "," + endOffset);
if (!type.equals("word"))
sb.append(",type="+type);
if (positionIncrement != 1)
@ -212,11 +361,14 @@ public class Token implements Cloneable {
return sb.toString();
}
public Object clone() {
try {
return super.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e); // shouldn't happen since we implement Cloneable
}
/** Resets this Token to its default state so the instance can be
 *  re-used for the next token.  The termBuffer allocation is kept
 *  on purpose so repeated use does not re-allocate. */
public void clear() {
  payload = null;
  termText = null;
  termLength = 0;   // termBuffer itself is intentionally retained
  positionIncrement = 1;
  startOffset = 0;
  endOffset = 0;
  type = DEFAULT_TYPE;
}
}

View File

@ -22,6 +22,8 @@ import java.io.IOException;
/** A TokenFilter is a TokenStream whose input is another token stream.
<p>
This is an abstract class.
NOTE: subclasses must override at least one of {@link
#next()} or {@link #next(Token)}.
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */

View File

@ -29,11 +29,36 @@ import java.io.IOException;
<li>{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
</ul>
NOTE: subclasses must override at least one of {@link
#next()} or {@link #next(Token)}.
*/
public abstract class TokenStream {
/** Returns the next token in the stream, or null at EOS. */
public abstract Token next() throws IOException;
/** Returns the next token in the stream, or null at EOS.
 *  Each call allocates a fresh Token, so the result is a "full
 *  private copy" (never re-used across calls); {@link #next(Token)}
 *  is the faster, Token-re-using variant. */
public Token next() throws IOException {
  return next(new Token());
}
/** Returns the next token in the stream, or null at EOS.
 * When possible, the input Token should be used as the
 * returned Token (this gives fastest tokenization
 * performance), but this is not required and a new Token
 * may be returned.  Callers may re-use a single Token
 * instance for successive calls to this method and must
 * therefore fully consume the previously returned Token
 * before calling this method again.
 * @param result a Token that may or may not be used to
 *  return the next token
 * @return next token in the stream or null if
 *  end-of-stream was hit */
public Token next(Token result) throws IOException {
return next();
}
/** Resets this stream to the beginning. This is an
* optional operation, so subclasses may or may not

View File

@ -23,6 +23,8 @@ import java.io.IOException;
/** A Tokenizer is a TokenStream whose input is a Reader.
<p>
This is an abstract class.
NOTE: subclasses must override at least one of {@link
#next()} or {@link #next(Token)}.
*/
public abstract class Tokenizer extends TokenStream {
@ -41,5 +43,12 @@ public abstract class Tokenizer extends TokenStream {
public void close() throws IOException {
input.close();
}
/** Reset the tokenizer to a new reader.  Typically, an
 * analyzer (in its reusableTokenStream method) will use
 * this to re-use a previously created tokenizer.
 * @param input the new character source to tokenize */
protected void reset(Reader input) throws IOException {
this.input = input;
}
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis;
*/
import java.io.Reader;
import java.io.IOException;
/** An Analyzer that uses WhitespaceTokenizer. */
@ -25,4 +26,14 @@ public final class WhitespaceAnalyzer extends Analyzer {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new WhitespaceTokenizer(reader);
}
/** Returns a per-thread re-usable WhitespaceTokenizer, resetting the
 *  previously-saved tokenizer (if any) onto the new reader instead of
 *  allocating a fresh one. */
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
  Tokenizer stream = (Tokenizer) getPreviousTokenStream();
  if (stream != null) {
    stream.reset(reader);
  } else {
    stream = new WhitespaceTokenizer(reader);
    setPreviousTokenStream(stream);
  }
  return stream;
}
}

View File

@ -75,4 +75,23 @@ public class StandardAnalyzer extends Analyzer {
result = new StopFilter(result, stopSet);
return result;
}
/** Per-thread holder for the saved tokenizer + filter chain
 *  (stored via setPreviousTokenStream for re-use). */
private class SavedStreams {
StandardTokenizer tokenStream; // source tokenizer; reset(Reader) on re-use
TokenStream filteredTokenStream; // full filter chain fed by tokenStream
};
/** Returns a per-thread re-usable analysis chain: on first use builds
 *  StandardTokenizer -&gt; StandardFilter -&gt; LowerCaseFilter -&gt; StopFilter
 *  and saves it; afterwards only re-points the saved tokenizer at the
 *  new reader. */
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
  SavedStreams streams = (SavedStreams) getPreviousTokenStream();
  if (streams != null) {
    streams.tokenStream.reset(reader);
  } else {
    streams = new SavedStreams();
    setPreviousTokenStream(streams);
    streams.tokenStream = new StandardTokenizer(reader);
    TokenStream chain = new StandardFilter(streams.tokenStream);
    chain = new LowerCaseFilter(chain);
    streams.filteredTokenStream = new StopFilter(chain, stopSet);
  }
  return streams.filteredTokenStream;
}
}

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.standard;
*/
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
@ -37,33 +38,32 @@ public final class StandardFilter extends TokenFilter {
* <p>Removes <tt>'s</tt> from the end of words.
* <p>Removes dots from acronyms.
*/
public final org.apache.lucene.analysis.Token next() throws java.io.IOException {
org.apache.lucene.analysis.Token t = input.next();
public final Token next(Token result) throws java.io.IOException {
Token t = input.next(result);
if (t == null)
return null;
String text = t.termText();
String type = t.type();
char[] buffer = t.termBuffer();
final int bufferLength = t.termLength();
final String type = t.type();
if (type == APOSTROPHE_TYPE && // remove 's
(text.endsWith("'s") || text.endsWith("'S"))) {
return new org.apache.lucene.analysis.Token
(text.substring(0,text.length()-2),
t.startOffset(), t.endOffset(), type);
bufferLength >= 2 &&
buffer[bufferLength-2] == '\'' &&
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
// Strip last 2 characters off
t.setTermLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) { // remove dots
StringBuffer trimmed = new StringBuffer();
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
if (c != '.')
trimmed.append(c);
int upto = 0;
for(int i=0;i<bufferLength;i++) {
char c = buffer[i];
if (c != '.')
buffer[upto++] = c;
}
return new org.apache.lucene.analysis.Token
(trimmed.toString(), t.startOffset(), t.endOffset(), type);
} else {
return t;
t.setTermLength(upto);
}
return t;
}
}

View File

@ -43,6 +43,9 @@ import org.apache.lucene.analysis.Tokenizer;
public class StandardTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private final StandardTokenizerImpl scanner;
// Points this tokenizer at a new character source (package-private).
// NOTE(review): reset(Reader) also reassigns input and additionally
// resets the scanner — confirm which one callers should use.
void setInput(Reader reader) {
this.input = reader;
}
/**
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
@ -58,19 +61,19 @@ public class StandardTokenizer extends Tokenizer {
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next() throws IOException {
public Token next(Token result) throws IOException {
int tokenType = scanner.getNextToken();
if (tokenType == StandardTokenizerImpl.YYEOF) {
return null;
}
int startPosition = scanner.yychar();
final String tokenImage = scanner.yytext();
return new Token(tokenImage, startPosition, startPosition
+ tokenImage.length(),
StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
scanner.getText(result);
final int start = scanner.yychar();
result.setStartOffset(start);
result.setEndOffset(start+result.termLength());
result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
return result;
}
/*
@ -82,4 +85,9 @@ public class StandardTokenizer extends Tokenizer {
super.reset();
scanner.yyreset(input);
}
/** Reset the tokenizer onto a new reader so the instance can be
 *  re-used; also resets the underlying JFlex scanner via reset(). */
public void reset(Reader reader) throws IOException {
input = reader;
reset();
}
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.4.1 on 8/8/07 10:18 PM */
/* The following code was generated by JFlex 1.4.1 on 8/9/07 10:15 AM */
package org.apache.lucene.analysis.standard;
@ -19,7 +19,15 @@ package org.apache.lucene.analysis.standard;
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
* on 8/9/07 10:15 AM from the specification file
* <tt>/tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
*/
class StandardTokenizerImpl {
/** This character denotes the end of file */
@ -297,6 +305,13 @@ public final int yychar()
return yychar;
}
/**
 * Fills the Lucene token with the current token text: copies the
 * scanner's current match (zzStartRead..zzMarkedPos) into the
 * Token's term buffer via setTermBuffer, avoiding a String allocation.
 */
final void getText(Token t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
/**
* Creates a new scanner

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.standard;
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
%%
%class StandardTokenizerImpl
@ -52,6 +54,13 @@ public final int yychar()
{
return yychar;
}
/**
* Fills Lucene token with the current token text.
*/
final void getText(Token t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
// basic word: a sequence of digits & letters

View File

@ -960,27 +960,17 @@ final class DocumentsWriter {
/** Test whether the text for current Posting p equals
* current tokenText. */
boolean postingEquals(final String tokenString, final char[] tokenText,
final int tokenTextLen, final int tokenTextOffset) {
boolean postingEquals(final char[] tokenText, final int tokenTextLen) {
final char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
assert text != null;
int pos = p.textStart & CHAR_BLOCK_MASK;
if (tokenText == null) {
// Compare to String
for(int i=0;i<tokenTextLen;i++)
if (tokenString.charAt(i) != text[pos++])
return false;
return text[pos] == 0xffff;
} else {
int tokenPos = tokenTextOffset;
final int stopAt = tokenTextLen+tokenPos;
for(;tokenPos<stopAt;pos++,tokenPos++)
if (tokenText[tokenPos] != text[pos])
return false;
return 0xffff == text[pos];
}
int tokenPos = 0;
for(;tokenPos<tokenTextLen;pos++,tokenPos++)
if (tokenText[tokenPos] != text[pos])
return false;
return 0xffff == text[pos];
}
/** Compares term text for two Posting instance and
@ -1241,8 +1231,7 @@ final class DocumentsWriter {
}
int offsetEnd;
Token token;
Token localToken = new Token("", 0, 0);
Token localToken = new Token();
/* Invert one occurrence of one field in the document */
public void invertField(Fieldable field, Analyzer analyzer, final int maxFieldLength) throws IOException {
@ -1251,12 +1240,12 @@ final class DocumentsWriter {
position += analyzer.getPositionIncrementGap(fieldInfo.name);
if (!field.isTokenized()) { // un-tokenized field
token = localToken;
String stringValue = field.stringValue();
Token token = localToken;
token.setTermText(stringValue);
token.setStartOffset(offset);
token.setEndOffset(offset + stringValue.length());
addPosition();
addPosition(token);
offset += stringValue.length();
length++;
} else { // tokenized field
@ -1282,7 +1271,7 @@ final class DocumentsWriter {
}
// Tokenize field and add to postingTable
stream = analyzer.tokenStream(fieldInfo.name, reader);
stream = analyzer.reusableTokenStream(fieldInfo.name, reader);
}
// reset the TokenStream to the first token
@ -1290,9 +1279,10 @@ final class DocumentsWriter {
try {
offsetEnd = offset-1;
for (token = stream.next(); token != null; token = stream.next()) {
Token token;
while((token = stream.next(localToken)) != null) {
position += (token.getPositionIncrement() - 1);
addPosition();
addPosition(token);
if (++length >= maxFieldLength) {
if (infoStream != null)
infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
@ -1357,55 +1347,32 @@ final class DocumentsWriter {
* for every term of every document. Its job is to *
* update the postings byte stream (Postings hash) *
* based on the occurence of a single term. */
private void addPosition() {
private void addPosition(Token token) {
final Payload payload = token.getPayload();
final String tokenString;
final int tokenTextLen;
final int tokenTextOffset;
// Get the text of this term. Term can either
// provide a String token or offset into a char[]
// array
final char[] tokenText = token.termBuffer();
final int tokenTextLen = token.termLength();
int code = 0;
int code2 = 0;
if (tokenText == null) {
// Compute hashcode
int downto = tokenTextLen;
while (downto > 0)
code = (code*31) + tokenText[--downto];
// Fallback to String token
tokenString = token.termText();
tokenTextLen = tokenString.length();
tokenTextOffset = 0;
// Compute hashcode.
int downto = tokenTextLen;
while (downto > 0)
code = (code*31) + tokenString.charAt(--downto);
// System.out.println(" addPosition: field=" + fieldInfo.name + " string=" + tokenString + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset+token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
} else {
tokenString = null;
tokenTextLen = token.termBufferLength();
tokenTextOffset = token.termBufferOffset();
// Compute hashcode
int downto = tokenTextLen+tokenTextOffset;
while (downto > tokenTextOffset)
code = (code*31) + tokenText[--downto];
// System.out.println(" addPosition: buffer=" + new String(tokenText, tokenTextOffset, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
}
// System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);
int hashPos = code & postingsHashMask;
// Locate Posting in hash
p = postingsHash[hashPos];
if (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset)) {
if (p != null && !postingEquals(tokenText, tokenTextLen)) {
// Conflict: keep searching different locations in
// the hash table.
final int inc = code*1347|1;
@ -1413,7 +1380,7 @@ final class DocumentsWriter {
code += inc;
hashPos = code & postingsHashMask;
p = postingsHash[hashPos];
} while (p != null && !postingEquals(tokenString, tokenText, tokenTextLen, tokenTextOffset));
} while (p != null && !postingEquals(tokenText, tokenTextLen));
}
final int proxCode;
@ -1492,10 +1459,7 @@ final class DocumentsWriter {
p.textStart = textUpto + charPool.byteOffset;
charPool.byteUpto += textLen1;
if (tokenString == null)
System.arraycopy(tokenText, tokenTextOffset, text, textUpto, tokenTextLen);
else
tokenString.getChars(0, tokenTextLen, text, textUpto);
System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);
text[textUpto+tokenTextLen] = 0xffff;

View File

@ -94,7 +94,7 @@ public class TestCachingTokenFilter extends TestCase {
Token token;
while ((token = stream.next()) != null) {
assertTrue(count < tokens.length);
assertEquals(tokens[count], token.termText);
assertEquals(tokens[count], token.termText());
count++;
}

View File

@ -0,0 +1,56 @@
package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
import junit.framework.*;
public class TestToken extends TestCase {
public TestToken(String name) {
super(name);
}
// Verifies toString() renders from whichever term store is active.
public void testToString() throws Exception {
// Installing a char[] buffer makes it the source for toString().
char[] b = {'a', 'l', 'o', 'h', 'a'};
Token t = new Token("", 0, 5);
t.setTermBuffer(b, 0, 5);
assertEquals("(aloha,0,5)", t.toString());
// Switching back to a String termText is reflected as well.
t.setTermText("hi there");
assertEquals("(hi there,0,5)", t.toString());
}
// Verifies the String termText and char[] termBuffer stores stay
// consistent no matter which one was set last.
public void testMixedStringArray() throws Exception {
// Token created in String (termText) form.
Token t = new Token("hello", 0, 5);
assertEquals(t.termText(), "hello");
assertEquals(t.termLength(), 5);
assertEquals(new String(t.termBuffer(), 0, 5), "hello");
// Replacing the String updates length and buffer view.
t.setTermText("hello2");
assertEquals(t.termLength(), 6);
assertEquals(new String(t.termBuffer(), 0, 6), "hello2");
// Installing a char[] buffer updates the String view.
t.setTermBuffer("hello3".toCharArray(), 0, 6);
assertEquals(t.termText(), "hello3");
// Make sure if we get the buffer and change a character
// that termText() reflects the change
char[] buffer = t.termBuffer();
buffer[1] = 'o';
assertEquals(t.termText(), "hollo3");
}
}