mirror of https://github.com/apache/lucene.git
LUCENE-7465: add SimplePatternTokenizer and SimplePatternSplitTokenizer, for tokenization using Lucene's regexp/automaton implementation
This commit is contained in:
parent 7dcf9de41f
commit 93fa72f77b
@@ -107,6 +107,11 @@ New Features
  SortedNumericSelector.Type can give a ValueSource view of a
  SortedNumericDocValues field. (Tomás Fernández Löbbe)

* LUCENE-7465: Add SimplePatternTokenizer and
  SimplePatternSplitTokenizer, using Lucene's regexp/automaton
  implementation for analysis/tokenization (Clinton Gormley, Mike
  McCandless)

Bug Fixes

* LUCENE-7630: Fix (Edge)NGramTokenFilter to no longer drop payloads
@@ -0,0 +1,258 @@ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizer.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pattern;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

/**
 * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens.
 * The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster.  This is just
 * like {@link SimplePatternTokenizer} except that the pattern should match valid token separator characters, like
 * {@code String.split}.  Empty string tokens are never produced.
 *
 * @lucene.experimental
 */

public final class SimplePatternSplitTokenizer extends Tokenizer {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final CharacterRunAutomaton runDFA;

  // TODO: this is copied from SimplePatternTokenizer, but there are subtle differences e.g. we track sepUpto and tokenUpto;
  // find a clean way to share it:

  // TODO: we could likely use a single rolling buffer instead of two separate char buffers here.  We could also use PushBackReader but I
  // suspect it's slowish:

  private char[] pendingChars = new char[8];
  private int tokenUpto;
  private int pendingLimit;
  private int pendingUpto;
  private int offset;
  private int sepUpto;
  private final char[] buffer = new char[1024];
  private int bufferLimit;
  private int bufferNextRead;

  /** See {@link RegExp} for the accepted syntax. */
  public SimplePatternSplitTokenizer(String regexp) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }

  /** Runs a pre-built automaton. */
  public SimplePatternSplitTokenizer(Automaton dfa) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
  }

  /** See {@link RegExp} for the accepted syntax. */
  public SimplePatternSplitTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) {
    this(factory, new RegExp(regexp).toAutomaton());
  }

  /** Runs a pre-built automaton. */
  public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
    super(factory);

    // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
    // realizing this ctor is otherwise trappy
    if (dfa.isDeterministic() == false) {
      throw new IllegalArgumentException("please determinize the incoming automaton first");
    }

    runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }

  private void fillToken(int offsetStart) {
    termAtt.setLength(tokenUpto);
    offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+tokenUpto));
  }

  @Override
  public boolean incrementToken() throws IOException {

    int offsetStart = offset;

    clearAttributes();

    tokenUpto = 0;

    while (true) {
      sepUpto = 0;

      // The runDFA operates in Unicode space, not UTF16 (java's char):
      int ch = nextCodePoint();
      if (ch == -1) {
        if (tokenUpto > 0) {
          fillToken(offsetStart);
          return true;
        } else {
          return false;
        }
      }
      int state = runDFA.step(0, ch);

      if (state != -1) {
        // a token separator just possibly started; keep scanning to see if the token is accepted:
        int lastAcceptLength = -1;
        do {

          if (runDFA.isAccept(state)) {
            // record that the token separator matches here, but keep scanning in case a longer match also works (greedy):
            lastAcceptLength = sepUpto;
          }

          ch = nextCodePoint();
          if (ch == -1) {
            break;
          }
          state = runDFA.step(state, ch);
        } while (state != -1);

        if (lastAcceptLength != -1) {
          // strip the trailing separator we just matched from the token:
          tokenUpto -= lastAcceptLength;
          // we found a token separator
          int extra = sepUpto - lastAcceptLength;
          if (extra != 0) {
            pushBack(extra);
          }
          if (tokenUpto > 0) {
            fillToken(offsetStart);
            return true;
          } else {
            // we matched one token separator immediately after another
            offsetStart = offset;
          }
        } else if (ch == -1) {
          if (tokenUpto > 0) {
            fillToken(offsetStart);
            return true;
          } else {
            return false;
          }
        } else {
          // false alarm: there was no token separator here; push back all but the first character we scanned
          pushBack(sepUpto-1);
        }
      }
    }
  }

  @Override
  public void end() throws IOException {
    super.end();
    final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
    offsetAtt.setOffset(ofs, ofs);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    offset = 0;
    pendingUpto = 0;
    pendingLimit = 0;
    sepUpto = 0;
    bufferNextRead = 0;
    bufferLimit = 0;
  }

  /** Pushes back the last {@code count} characters in current token's buffer. */
  private void pushBack(int count) {
    tokenUpto -= count;
    assert tokenUpto >= 0;
    if (pendingLimit == 0) {
      if (bufferNextRead >= count) {
        // optimize common case when the chars we are pushing back are still in the buffer
        bufferNextRead -= count;
      } else {
        if (count > pendingChars.length) {
          pendingChars = ArrayUtil.grow(pendingChars, count);
        }
        System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
        pendingLimit = count;
      }
    } else {
      // we are pushing back what is already in our pending buffer
      pendingUpto -= count;
      assert pendingUpto >= 0;
    }
    offset -= count;
  }

  private void appendToToken(char ch) {
    char[] buffer = termAtt.buffer();
    if (tokenUpto == buffer.length) {
      buffer = termAtt.resizeBuffer(tokenUpto + 1);
    }
    buffer[tokenUpto++] = ch;
    sepUpto++;
  }

  private int nextCodeUnit() throws IOException {
    int result;
    if (pendingUpto < pendingLimit) {
      result = pendingChars[pendingUpto++];
      if (pendingUpto == pendingLimit) {
        // We used up the pending buffer
        pendingUpto = 0;
        pendingLimit = 0;
      }
      appendToToken((char) result);
      offset++;
    } else if (bufferLimit == -1) {
      return -1;
    } else {
      assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
      if (bufferNextRead == bufferLimit) {
        bufferLimit = input.read(buffer, 0, buffer.length);
        if (bufferLimit == -1) {
          return -1;
        }
        bufferNextRead = 0;
      }
      result = buffer[bufferNextRead++];
      offset++;
      appendToToken((char) result);
    }
    return result;
  }

  private int nextCodePoint() throws IOException {

    int ch = nextCodeUnit();
    if (ch == -1) {
      return ch;
    }
    if (Character.isHighSurrogate((char) ch)) {
      return Character.toCodePoint((char) ch, (char) nextCodeUnit());
    } else {
      return ch;
    }
  }
}
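To see the new split tokenizer end to end, here is a minimal consumer sketch (not part of the commit) that drives it through the standard Lucene TokenStream contract; the whitespace separator pattern mirrors the factory javadoc example below, and the class name is only illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class SimplePatternSplitDemo {
  public static void main(String[] args) throws Exception {
    // the pattern describes token separators (like String.split), not the tokens themselves
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]+");
    t.setReader(new StringReader("a \tb  c"));
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsets = t.addAttribute(OffsetAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      // prints each token with its start/end offsets: a, b, c
      System.out.println(term + " [" + offsets.startOffset() + "," + offsets.endOffset() + ")");
    }
    t.end();
    t.close();
  }
}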
@@ -0,0 +1,76 @@ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternSplitTokenizerFactory.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.pattern;

import java.util.Map;

import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

/**
 * Factory for {@link SimplePatternSplitTokenizer}, for producing tokens by splitting according to the provided regexp.
 *
 * <p>This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
 * for the input stream.  The syntax is more limited than {@link PatternTokenizer}, but the
 * tokenization is quite a bit faster.  It takes two arguments:
 * <br>
 * <ul>
 *  <li>"pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}</li>
 *  <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determinized automaton computed from the regexp</li>
 * </ul>
 * <p>
 * The pattern matches the characters that should split tokens, like {@code String.split}, and the
 * matching is greedy such that the longest token separator matching at a given point is matched.  Empty
 * tokens are never created.
 *
 * <p>For example, to match tokens delimited by simple whitespace characters:
 *
 * <pre class="prettyprint">
 * <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.SimplePatternSplitTokenizerFactory" pattern="[ \t\r\n]+"/>
 *   </analyzer>
 * </fieldType></pre>
 *
 * @lucene.experimental
 *
 * @see SimplePatternSplitTokenizer
 */
public class SimplePatternSplitTokenizerFactory extends TokenizerFactory {
  public static final String PATTERN = "pattern";
  private final Automaton dfa;
  private final int maxDeterminizedStates;

  /** Creates a new SimplePatternSplitTokenizerFactory */
  public SimplePatternSplitTokenizerFactory(Map<String,String> args) {
    super(args);
    maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
    if (args.isEmpty() == false) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SimplePatternSplitTokenizer create(final AttributeFactory factory) {
    return new SimplePatternSplitTokenizer(factory, dfa);
  }
}
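The factory above consumes its recognized arguments and rejects anything left over. A hedged sketch (not part of the commit) of driving it programmatically outside Solr, assuming the standard TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY constant and an illustrative class name:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizerFactory;

public class SplitFactoryDemo {
  public static void main(String[] args) {
    Map<String,String> factoryArgs = new HashMap<>();
    factoryArgs.put("pattern", "[ \t\r\n]+");           // required
    factoryArgs.put("maxDeterminizedStates", "10000");  // optional; shown here with its default
    SimplePatternSplitTokenizerFactory factory = new SimplePatternSplitTokenizerFactory(factoryArgs);
    // recognized args are consumed by the constructor; an unknown key would trigger IllegalArgumentException
    Tokenizer tokenizer = factory.create(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY);
    // tokenizer.setReader(...), reset(), incrementToken() as usual
  }
}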
@@ -0,0 +1,242 @@ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizer.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pattern;

import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

/**
 * This tokenizer uses a Lucene {@link RegExp} or (expert usage) a pre-built determinized {@link Automaton}, to locate tokens.
 * The regexp syntax is more limited than {@link PatternTokenizer}, but the tokenization is quite a bit faster.  The provided
 * regex should match valid token characters (not token separator characters, like {@code String.split}).  The matching is greedy:
 * the longest match at a given start point will be the next token.  Empty string tokens are never produced.
 *
 * @lucene.experimental
 */

// TODO: the matcher here is naive and does have N^2 adversarial cases that are unlikely to arise in practice, e.g. if the pattern is
// aaaaaaaaaab and the input is aaaaaaaaaaa, the work we do here is N^2 where N is the number of a's.  This is because on failing to match
// a token, we skip one character forward and try again.  A better approach would be to compile something like this regexp
// instead: .* | <pattern>, because that automaton would not "forget" all the a's it had already seen, and would be a single pass
// through the input.  I think this is the same thing as Aho/Corasick's algorithm (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm).
// But we cannot implement this (I think?) until/unless Lucene regexps support sub-group capture, so we could know
// which specific characters the pattern matched.  SynonymFilter has this same limitation.

public final class SimplePatternTokenizer extends Tokenizer {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  private final CharacterRunAutomaton runDFA;

  // TODO: we could likely use a single rolling buffer instead of two separate char buffers here.  We could also use PushBackReader but I
  // suspect it's slowish:

  private char[] pendingChars = new char[8];
  private int pendingLimit;
  private int pendingUpto;
  private int offset;
  private int tokenUpto;
  private final char[] buffer = new char[1024];
  private int bufferLimit;
  private int bufferNextRead;

  /** See {@link RegExp} for the accepted syntax. */
  public SimplePatternTokenizer(String regexp) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }

  /** Runs a pre-built automaton. */
  public SimplePatternTokenizer(Automaton dfa) {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, dfa);
  }

  /** See {@link RegExp} for the accepted syntax. */
  public SimplePatternTokenizer(AttributeFactory factory, String regexp, int maxDeterminizedStates) {
    this(factory, new RegExp(regexp).toAutomaton());
  }

  /** Runs a pre-built automaton. */
  public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
    super(factory);

    // we require user to do this up front because it is a possibly very costly operation, and user may be creating us frequently, not
    // realizing this ctor is otherwise trappy
    if (dfa.isDeterministic() == false) {
      throw new IllegalArgumentException("please determinize the incoming automaton first");
    }

    runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }

  @Override
  public boolean incrementToken() throws IOException {

    clearAttributes();
    tokenUpto = 0;

    while (true) {

      int offsetStart = offset;

      // The runDFA operates in Unicode space, not UTF16 (java's char):

      int ch = nextCodePoint();
      if (ch == -1) {
        return false;
      }

      int state = runDFA.step(0, ch);

      if (state != -1) {
        // a token just possibly started; keep scanning to see if the token is accepted:
        int lastAcceptLength = -1;
        do {

          if (runDFA.isAccept(state)) {
            // record that the token matches here, but keep scanning in case a longer match also works (greedy):
            lastAcceptLength = tokenUpto;
          }

          ch = nextCodePoint();
          if (ch == -1) {
            break;
          }
          state = runDFA.step(state, ch);
        } while (state != -1);

        if (lastAcceptLength != -1) {
          // we found a token
          int extra = tokenUpto - lastAcceptLength;
          if (extra != 0) {
            pushBack(extra);
          }
          termAtt.setLength(lastAcceptLength);
          offsetAtt.setOffset(correctOffset(offsetStart), correctOffset(offsetStart+lastAcceptLength));
          return true;
        } else if (ch == -1) {
          return false;
        } else {
          // false alarm: there was no token here; push back all but the first character we scanned
          pushBack(tokenUpto-1);
          tokenUpto = 0;
        }
      } else {
        tokenUpto = 0;
      }
    }
  }

  @Override
  public void end() throws IOException {
    super.end();
    final int ofs = correctOffset(offset + pendingLimit - pendingUpto);
    offsetAtt.setOffset(ofs, ofs);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    offset = 0;
    pendingUpto = 0;
    pendingLimit = 0;
    tokenUpto = 0;
    bufferNextRead = 0;
    bufferLimit = 0;
  }

  /** Pushes back the last {@code count} characters in current token's buffer. */
  private void pushBack(int count) {

    if (pendingLimit == 0) {
      if (bufferNextRead >= count) {
        // optimize common case when the chars we are pushing back are still in the buffer
        bufferNextRead -= count;
      } else {
        if (count > pendingChars.length) {
          pendingChars = ArrayUtil.grow(pendingChars, count);
        }
        System.arraycopy(termAtt.buffer(), tokenUpto - count, pendingChars, 0, count);
        pendingLimit = count;
      }
    } else {
      // we are pushing back what is already in our pending buffer
      pendingUpto -= count;
      assert pendingUpto >= 0;
    }
    offset -= count;
  }

  private void appendToToken(char ch) {
    char[] buffer = termAtt.buffer();
    if (tokenUpto == buffer.length) {
      buffer = termAtt.resizeBuffer(tokenUpto + 1);
    }
    buffer[tokenUpto++] = ch;
  }

  private int nextCodeUnit() throws IOException {
    int result;
    if (pendingUpto < pendingLimit) {
      result = pendingChars[pendingUpto++];
      if (pendingUpto == pendingLimit) {
        // We used up the pending buffer
        pendingUpto = 0;
        pendingLimit = 0;
      }
      appendToToken((char) result);
      offset++;
    } else if (bufferLimit == -1) {
      return -1;
    } else {
      assert bufferNextRead <= bufferLimit: "bufferNextRead=" + bufferNextRead + " bufferLimit=" + bufferLimit;
      if (bufferNextRead == bufferLimit) {
        bufferLimit = input.read(buffer, 0, buffer.length);
        if (bufferLimit == -1) {
          return -1;
        }
        bufferNextRead = 0;
      }
      result = buffer[bufferNextRead++];
      offset++;
      appendToToken((char) result);
    }
    return result;
  }

  private int nextCodePoint() throws IOException {

    int ch = nextCodeUnit();
    if (ch == -1) {
      return ch;
    }
    if (Character.isHighSurrogate((char) ch)) {
      return Character.toCodePoint((char) ch, (char) nextCodeUnit());
    } else {
      return ch;
    }
  }
}
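A minimal sketch (not part of the commit) of the greedy, longest-match behavior the javadoc above describes; the expected output matches the testGreedy case added later in this commit, and the class name is only illustrative.

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SimplePatternDemo {
  public static void main(String[] args) throws Exception {
    // the pattern describes the tokens themselves; matching is greedy, so "foofoo" comes back as one token
    Tokenizer t = new SimplePatternTokenizer("(foo)+");
    t.setReader(new StringReader("bar foofoo baz"));
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      System.out.println(term.toString()); // prints: foofoo
    }
    t.end();
    t.close();
  }
}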
@@ -0,0 +1,76 @@ lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/SimplePatternTokenizerFactory.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.pattern;

import java.util.Map;

import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

/**
 * Factory for {@link SimplePatternTokenizer}, for matching tokens based on the provided regexp.
 *
 * <p>This tokenizer uses Lucene {@link RegExp} pattern matching to construct distinct tokens
 * for the input stream.  The syntax is more limited than {@link PatternTokenizer}, but the
 * tokenization is quite a bit faster.  It takes two arguments:
 * <br>
 * <ul>
 *  <li>"pattern" (required) is the regular expression, according to the syntax described at {@link RegExp}</li>
 *  <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the determinized automaton computed from the regexp</li>
 * </ul>
 * <p>
 * The pattern matches the characters to include in a token (not the split characters), and the
 * matching is greedy such that the longest token matching at a given point is created.  Empty
 * tokens are never created.
 *
 * <p>For example, to match tokens delimited by simple whitespace characters:
 *
 * <pre class="prettyprint">
 * <fieldType name="text_ptn" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.SimplePatternTokenizerFactory" pattern="[^ \t\r\n]+"/>
 *   </analyzer>
 * </fieldType></pre>
 *
 * @lucene.experimental
 *
 * @see SimplePatternTokenizer
 */
public class SimplePatternTokenizerFactory extends TokenizerFactory {
  public static final String PATTERN = "pattern";
  private final Automaton dfa;
  private final int maxDeterminizedStates;

  /** Creates a new SimplePatternTokenizerFactory */
  public SimplePatternTokenizerFactory(Map<String,String> args) {
    super(args);
    maxDeterminizedStates = getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    dfa = Operations.determinize(new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
    if (args.isEmpty() == false) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public SimplePatternTokenizer create(final AttributeFactory factory) {
    return new SimplePatternTokenizer(factory, dfa);
  }
}
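For programmatic (non-Solr) use, a factory like this can also be wired through CustomAnalyzer. The sketch below is illustrative only: it assumes CustomAnalyzer's class-based withTokenizer overload from lucene-analysis-common, which is not part of this commit, and the field/text values are made up.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.pattern.SimplePatternTokenizerFactory;

public class CustomAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // chain the new tokenizer factory into an analyzer; token filters could be appended the same way
    Analyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer(SimplePatternTokenizerFactory.class, "pattern", "[a-zA-Z0-9]+")
        .build();
    // analyzer.tokenStream("field", "Some text 123") would emit the alphanumeric runs as tokens
    analyzer.close();
  }
}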
@ -21,6 +21,8 @@ org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory
|
|||
org.apache.lucene.analysis.ngram.NGramTokenizerFactory
|
||||
org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory
|
||||
org.apache.lucene.analysis.pattern.PatternTokenizerFactory
|
||||
org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizerFactory
|
||||
org.apache.lucene.analysis.pattern.SimplePatternTokenizerFactory
|
||||
org.apache.lucene.analysis.standard.ClassicTokenizerFactory
|
||||
org.apache.lucene.analysis.standard.StandardTokenizerFactory
|
||||
org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory
|
||||
|
|
|
@ -96,7 +96,11 @@ import org.apache.lucene.util.CharsRef;
|
|||
import org.apache.lucene.util.Rethrow;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.AutomatonTestUtil;
|
||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
import org.apache.lucene.util.automaton.RegExp;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.tartarus.snowball.SnowballProgram;
|
||||
|
@ -494,6 +498,9 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
if (random.nextBoolean()) return null;
|
||||
return DateFormat.getDateInstance(DateFormat.DEFAULT, randomLocale(random));
|
||||
});
|
||||
put(Automaton.class, random -> {
|
||||
return Operations.determinize(new RegExp(AutomatonTestUtil.randomRegexp(random()), RegExp.NONE).toAutomaton(), Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
});
|
||||
}};
|
||||
|
||||
static final Set<Class<?>> allowedTokenizerArgs, allowedTokenFilterArgs, allowedCharFilterArgs;
|
||||
|
@ -503,7 +510,8 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
allowedTokenizerArgs.add(Reader.class);
|
||||
allowedTokenizerArgs.add(AttributeFactory.class);
|
||||
allowedTokenizerArgs.add(AttributeSource.class);
|
||||
|
||||
allowedTokenizerArgs.add(Automaton.class);
|
||||
|
||||
allowedTokenFilterArgs = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());
|
||||
allowedTokenFilterArgs.addAll(argProducers.keySet());
|
||||
allowedTokenFilterArgs.add(TokenStream.class);
|
||||
|
|
|
@@ -0,0 +1,273 @@ lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternSplitTokenizer.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pattern;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;

public class TestSimplePatternSplitTokenizer extends BaseTokenStreamTestCase {

  public void testGreedy() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("(foo)+");
    t.setReader(new StringReader("bar foofoo baz"));
    assertTokenStreamContents(t,
                              new String[] {"bar ", " baz"},
                              new int[] {0, 10},
                              new int[] {4, 14});
  }

  public void testBackToBack() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("foo");
    t.setReader(new StringReader("bar foofoo baz"));
    assertTokenStreamContents(t,
                              new String[] {"bar ", " baz"},
                              new int[] {0, 10},
                              new int[] {4, 14});
  }

  public void testBigLookahead() throws Exception {
    StringBuilder b = new StringBuilder();
    for(int i=0;i<100;i++) {
      b.append('a');
    }
    b.append('b');
    Tokenizer t = new SimplePatternSplitTokenizer(b.toString());
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);

    b = new StringBuilder();
    for(int i=0;i<200;i++) {
      b.append('a');
    }
    t.setReader(new StringReader(b.toString()));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals(b.toString(), termAtt.toString());
    assertFalse(t.incrementToken());
  }

  public void testNoTokens() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer(".*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    String s;
    while (true) {
      s = TestUtil.randomUnicodeString(random());
      if (s.length() > 0) {
        break;
      }
    }
    t.setReader(new StringReader(s));
    t.reset();
    assertFalse(t.incrementToken());
  }

  public void testEmptyStringPatternNoMatch() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("a*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("bbb"));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals("bbb", termAtt.toString());
    assertFalse(t.incrementToken());
  }

  public void testSplitSingleCharWhitespace() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("a \tb   c"));
    assertTokenStreamContents(t,
                              new String[] {"a", "b", "c"},
                              new int[] {0, 3, 7},
                              new int[] {1, 4, 8});
  }

  public void testSplitMultiCharWhitespace() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("a \tb   c"));
    assertTokenStreamContents(t,
                              new String[] {"a", "b", "c"},
                              new int[] {0, 3, 7},
                              new int[] {1, 4, 8});
  }

  public void testLeadingNonToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("    a c"));
    assertTokenStreamContents(t,
                              new String[] {"a", "c"},
                              new int[] {4, 6},
                              new int[] {5, 7});
  }

  public void testTrailingNonToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("a c "));
    assertTokenStreamContents(t,
                              new String[] {"a", "c"},
                              new int[] {0, 2},
                              new int[] {1, 3});
  }

  public void testEmptyStringPatternOneMatch() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("a*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("bbab"));
    assertTokenStreamContents(t,
                              new String[] {"bb", "b"},
                              new int[] {0, 3},
                              new int[] {2, 4});
  }

  public void testEndOffset() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("a+");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
    t.setReader(new StringReader("aaabbb"));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals("bbb", termAtt.toString());
    assertFalse(t.incrementToken());
    t.end();
    assertEquals(6, offsetAtt.endOffset());
  }

  public void testFixedToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("aaaa");

    t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
    assertTokenStreamContents(t,
                              new String[] {"aaa"},
                              new int[] {12},
                              new int[] {15});
  }

  public void testBasic() throws Exception
  {
    String[][] tests = {
      // pattern        input                    output
      { "--",           "aaa--bbb--ccc",         "aaa bbb ccc" },
      { ":",            "aaa:bbb:ccc",           "aaa bbb ccc" },
      { ":",            "boo:and:foo",           "boo and foo" },
      { "o",            "boo:and:foo",           "b :and:f" },
    };

    for(String[] test : tests) {
      TokenStream stream = new SimplePatternSplitTokenizer(test[0]);
      ((Tokenizer)stream).setReader(new StringReader(test[1]));
      String out = tsToString(stream);
      assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out);
    }
  }

  public void testNotDeterminized() throws Exception {
    Automaton a = new Automaton();
    int start = a.createState();
    int mid1 = a.createState();
    int mid2 = a.createState();
    int end = a.createState();
    a.setAccept(end, true);
    a.addTransition(start, mid1, 'a', 'z');
    a.addTransition(start, mid2, 'a', 'z');
    a.addTransition(mid1, end, 'b');
    a.addTransition(mid2, end, 'b');
    expectThrows(IllegalArgumentException.class, () -> {new SimplePatternSplitTokenizer(a);});
  }

  public void testOffsetCorrection() throws Exception {
    final String INPUT = "G&uuml;nther G&uuml;nther is here";

    // create MappingCharFilter
    List<String> mappingRules = new ArrayList<>();
    mappingRules.add( "\"&uuml;\" => \"ü\"" );
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.build();
    CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));

    // create SimplePatternSplitTokenizer
    Tokenizer stream = new SimplePatternSplitTokenizer("Günther");
    stream.setReader(charStream);
    assertTokenStreamContents(stream,
        new String[] { " ", " is here" },
        new int[] { 12, 25 },
        new int[] { 13, 33 },
        INPUT.length());
  }

  /**
   * TODO: rewrite tests not to use string comparison.
   */
  private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce that the state is not preserved and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
      if (out.length() > 0) {
        out.append(' ');
      }
      out.append(termAtt.toString());
      in.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
    }

    in.close();
    return out.toString();
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new SimplePatternSplitTokenizer("a");
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
    a.close();

    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new SimplePatternSplitTokenizer("a");
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
    b.close();
  }
}
@@ -0,0 +1,218 @@ lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestSimplePatternTokenizer.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.pattern;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.Automaton;

public class TestSimplePatternTokenizer extends BaseTokenStreamTestCase {

  public void testGreedy() throws Exception {
    Tokenizer t = new SimplePatternTokenizer("(foo)+");
    t.setReader(new StringReader("bar foofoo baz"));
    assertTokenStreamContents(t,
                              new String[] {"foofoo"},
                              new int[] {4},
                              new int[] {10});
  }

  public void testBigLookahead() throws Exception {
    StringBuilder b = new StringBuilder();
    for(int i=0;i<100;i++) {
      b.append('a');
    }
    b.append('b');
    Tokenizer t = new SimplePatternTokenizer(b.toString());

    b = new StringBuilder();
    for(int i=0;i<200;i++) {
      b.append('a');
    }
    t.setReader(new StringReader(b.toString()));
    t.reset();
    assertFalse(t.incrementToken());
  }

  public void testOneToken() throws Exception {
    Tokenizer t = new SimplePatternTokenizer(".*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    String s;
    while (true) {
      s = TestUtil.randomUnicodeString(random());
      if (s.length() > 0) {
        break;
      }
    }
    t.setReader(new StringReader(s));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals(s, termAtt.toString());
  }

  public void testEmptyStringPatternNoMatch() throws Exception {
    Tokenizer t = new SimplePatternTokenizer("a*");
    t.setReader(new StringReader("bbb"));
    t.reset();
    assertFalse(t.incrementToken());
  }

  public void testEmptyStringPatternOneMatch() throws Exception {
    Tokenizer t = new SimplePatternTokenizer("a*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("bbab"));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals("a", termAtt.toString());
    assertFalse(t.incrementToken());
  }

  public void testEndOffset() throws Exception {
    Tokenizer t = new SimplePatternTokenizer("a+");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
    t.setReader(new StringReader("aaabbb"));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals("aaa", termAtt.toString());
    assertFalse(t.incrementToken());
    t.end();
    assertEquals(6, offsetAtt.endOffset());
  }

  public void testFixedToken() throws Exception {
    Tokenizer t = new SimplePatternTokenizer("aaaa");

    t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
    assertTokenStreamContents(t,
                              new String[] {"aaaa", "aaaa", "aaaa"},
                              new int[] {0, 4, 8},
                              new int[] {4, 8, 12});
  }

  public void testBasic() throws Exception {
    String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
    String[][] tests = {
      // pattern        input                    output
      { ":",            "boo:and:foo",           ": :" },
      { qpattern,       "aaa 'bbb' 'ccc'",       "'bbb' 'ccc'" },
    };

    for(String[] test : tests) {
      TokenStream stream = new SimplePatternTokenizer(test[0]);
      ((Tokenizer)stream).setReader(new StringReader(test[1]));
      String out = tsToString(stream);

      assertEquals("pattern: "+test[0]+" with input: "+test[1], test[2], out);
    }
  }

  public void testNotDeterminized() throws Exception {
    Automaton a = new Automaton();
    int start = a.createState();
    int mid1 = a.createState();
    int mid2 = a.createState();
    int end = a.createState();
    a.setAccept(end, true);
    a.addTransition(start, mid1, 'a', 'z');
    a.addTransition(start, mid2, 'a', 'z');
    a.addTransition(mid1, end, 'b');
    a.addTransition(mid2, end, 'b');
    expectThrows(IllegalArgumentException.class, () -> {new SimplePatternTokenizer(a);});
  }

  public void testOffsetCorrection() throws Exception {
    final String INPUT = "G&uuml;nther G&uuml;nther is here";

    // create MappingCharFilter
    List<String> mappingRules = new ArrayList<>();
    mappingRules.add( "\"&uuml;\" => \"ü\"" );
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.build();
    CharFilter charStream = new MappingCharFilter( normMap, new StringReader(INPUT));

    // create SimplePatternTokenizer
    Tokenizer stream = new SimplePatternTokenizer("Günther");
    stream.setReader(charStream);
    assertTokenStreamContents(stream,
        new String[] { "Günther", "Günther" },
        new int[] { 0, 13 },
        new int[] { 12, 25 },
        INPUT.length());
  }

  /**
   * TODO: rewrite tests not to use string comparison.
   */
  private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety to enforce that the state is not preserved and also
    // assign bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
      if (out.length() > 0) {
        out.append(' ');
      }
      out.append(termAtt.toString());
      in.clearAttributes();
      termAtt.setEmpty().append("bogusTerm");
    }

    in.close();
    return out.toString();
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new SimplePatternTokenizer("a");
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);
    a.close();

    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new SimplePatternTokenizer("a");
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER);
    b.close();
  }
}
@ -27,9 +27,9 @@ public class ByteRunAutomaton extends RunAutomaton {
|
|||
this(a, false, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
|
||||
}
|
||||
|
||||
/** expert: if utf8 is true, the input is already byte-based */
|
||||
/** expert: if isBinary is true, the input is already byte-based */
|
||||
public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
|
||||
super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, true, maxDeterminizedStates);
|
||||
super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, maxDeterminizedStates);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@@ -36,7 +36,7 @@ public class CharacterRunAutomaton extends RunAutomaton {
   * it then a TooComplexToDeterminizeException is thrown.
   */
  public CharacterRunAutomaton(Automaton a, int maxDeterminizedStates) {
    super(a, Character.MAX_CODE_POINT, false, maxDeterminizedStates);
    super(a, Character.MAX_CODE_POINT+1, maxDeterminizedStates);
  }

  /**
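The new tokenizers hand an already determinized automaton to CharacterRunAutomaton via this constructor, and their expert constructors refuse non-deterministic input. A small sketch of that expert path (not part of the commit, class name illustrative), reusing only APIs that appear elsewhere in this diff:

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;

public class PrebuiltAutomatonDemo {
  public static void main(String[] args) {
    // determinize once, up front, exactly as the new factories do, then reuse across tokenizer instances
    Automaton dfa = Operations.determinize(new RegExp("[a-z]+").toAutomaton(),
                                           Operations.DEFAULT_MAX_DETERMINIZED_STATES);
    Tokenizer t = new SimplePatternTokenizer(dfa); // would throw IllegalArgumentException if non-deterministic
  }
}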
@@ -29,24 +29,24 @@

package org.apache.lucene.util.automaton;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Automata operations.
 *

@@ -335,7 +335,7 @@ final public class Operations {
    Transition[][] transitions2 = a2.getSortedTransitions();
    Automaton c = new Automaton();
    c.createState();
    LinkedList<StatePair> worklist = new LinkedList<>();
    ArrayDeque<StatePair> worklist = new ArrayDeque<>();
    HashMap<StatePair,StatePair> newstates = new HashMap<>();
    StatePair p = new StatePair(0, 0, 0);
    worklist.add(p);

@@ -435,7 +435,7 @@ final public class Operations {
    // TODO: cutover to iterators instead
    Transition[][] transitions1 = a1.getSortedTransitions();
    Transition[][] transitions2 = a2.getSortedTransitions();
    LinkedList<StatePair> worklist = new LinkedList<>();
    ArrayDeque<StatePair> worklist = new ArrayDeque<>();
    HashSet<StatePair> visited = new HashSet<>();
    StatePair p = new StatePair(0, 0);
    worklist.add(p);

@@ -682,7 +682,7 @@ final public class Operations {
    // Create state 0:
    b.createState();

    LinkedList<SortedIntSet.FrozenIntSet> worklist = new LinkedList<>();
    ArrayDeque<SortedIntSet.FrozenIntSet> worklist = new ArrayDeque<>();
    Map<SortedIntSet.FrozenIntSet,Integer> newstate = new HashMap<>();

    worklist.add(initialset);

@@ -804,7 +804,7 @@ final public class Operations {
      return false;
    }

    LinkedList<Integer> workList = new LinkedList<>();
    ArrayDeque<Integer> workList = new ArrayDeque<>();
    BitSet seen = new BitSet(a.getNumStates());
    workList.add(0);
    seen.set(0);

@@ -907,7 +907,7 @@ final public class Operations {
    if (numStates == 0) {
      return live;
    }
    LinkedList<Integer> workList = new LinkedList<>();
    ArrayDeque<Integer> workList = new ArrayDeque<>();
    live.set(0);
    workList.add(0);

@@ -946,7 +946,7 @@ final public class Operations {
    }
    Automaton a2 = builder.finish();

    LinkedList<Integer> workList = new LinkedList<>();
    ArrayDeque<Integer> workList = new ArrayDeque<>();
    BitSet live = new BitSet(numStates);
    BitSet acceptBits = a.getAcceptStates();
    int s = 0;

@@ -1010,22 +1010,6 @@ final public class Operations {
    return result;
  }

  /**
   * Finds the largest entry whose value is less than or equal to c, or 0 if
   * there is no such entry.
   */
  static int findIndex(int c, int[] points) {
    int a = 0;
    int b = points.length;
    while (b - a > 1) {
      int d = (a + b) >>> 1;
      if (points[d] > c) b = d;
      else if (points[d] < c) a = d;
      else return d;
    }
    return a;
  }

  /**
   * Returns true if the language of this automaton is finite.  The
   * automaton must not have any dead states.
@@ -38,13 +38,62 @@ import java.util.Arrays;
 */
public abstract class RunAutomaton {
  final Automaton automaton;
  final int maxInterval;
  final int alphabetSize;
  final int size;
  final boolean[] accept;
  final int[] transitions; // delta(state,c) = transitions[state*points.length +
                           //                                getCharClass(c)]
  final int[] points; // char interval start points
  final int[] classmap; // map from char number to class class
  final int[] classmap; // map from char number to class

  /**
   * Constructs a new <code>RunAutomaton</code> from a deterministic
   * <code>Automaton</code>.
   *
   * @param a an automaton
   */
  protected RunAutomaton(Automaton a, int alphabetSize) {
    this(a, alphabetSize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }

  /**
   * Constructs a new <code>RunAutomaton</code> from a deterministic
   * <code>Automaton</code>.
   *
   * @param a an automaton
   * @param maxDeterminizedStates maximum number of states that can be created
   *   while determinizing a
   */
  protected RunAutomaton(Automaton a, int alphabetSize, int maxDeterminizedStates) {
    this.alphabetSize = alphabetSize;
    a = Operations.determinize(a, maxDeterminizedStates);
    this.automaton = a;
    points = a.getStartPoints();
    size = Math.max(1,a.getNumStates());
    accept = new boolean[size];
    transitions = new int[size * points.length];
    Arrays.fill(transitions, -1);
    for (int n=0;n<size;n++) {
      accept[n] = a.isAccept(n);
      for (int c = 0; c < points.length; c++) {
        int dest = a.step(n, points[c]);
        assert dest == -1 || dest < size;
        transitions[n * points.length + c] = dest;
      }
    }

    /*
     * Set alphabet table for optimal run performance.
     */
    classmap = new int[Math.min(256, alphabetSize)];
    int i = 0;
    for (int j = 0; j < classmap.length; j++) {
      if (i + 1 < points.length && j == points[i + 1]) {
        i++;
      }
      classmap[j] = i;
    }
  }

  /**
   * Returns a string representation of this automaton.

@@ -63,7 +112,7 @@ public abstract class RunAutomaton {
      int min = points[j];
      int max;
      if (j + 1 < points.length) max = (points[j + 1] - 1);
      else max = maxInterval;
      else max = alphabetSize;
      b.append(" ");
      Automaton.appendCharString(min, b);
      if (min != max) {

@@ -103,63 +152,19 @@ public abstract class RunAutomaton {
   * Gets character class of given codepoint
   */
  final int getCharClass(int c) {
    return Operations.findIndex(c, points);
  }

  /**
   * Constructs a new <code>RunAutomaton</code> from a deterministic
   * <code>Automaton</code>.
   *
   * @param a an automaton
   */
  public RunAutomaton(Automaton a, int maxInterval, boolean tableize) {
    this(a, maxInterval, tableize, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }

  /**
   * Constructs a new <code>RunAutomaton</code> from a deterministic
   * <code>Automaton</code>.
   *
   * @param a an automaton
   * @param maxDeterminizedStates maximum number of states that can be created
   *   while determinizing a
   */
  public RunAutomaton(Automaton a, int maxInterval, boolean tableize,
      int maxDeterminizedStates) {
    this.maxInterval = maxInterval;
    a = Operations.determinize(a, maxDeterminizedStates);
    this.automaton = a;
    points = a.getStartPoints();
    size = Math.max(1,a.getNumStates());
    accept = new boolean[size];
    transitions = new int[size * points.length];
    Arrays.fill(transitions, -1);
    for (int n=0;n<size;n++) {
      accept[n] = a.isAccept(n);
      for (int c = 0; c < points.length; c++) {
        int dest = a.step(n, points[c]);
        assert dest == -1 || dest < size;
        transitions[n * points.length + c] = dest;
      }
    }

    /*
     * Set alphabet table for optimal run performance.
     */
    if (tableize) {
      classmap = new int[maxInterval + 1];
      int i = 0;
      for (int j = 0; j <= maxInterval; j++) {
        if (i + 1 < points.length && j == points[i + 1]) {
          i++;
        }
        classmap[j] = i;
      }
    } else {
      classmap = null;
    // binary search
    int a = 0;
    int b = points.length;
    while (b - a > 1) {
      int d = (a + b) >>> 1;
      if (points[d] > c) b = d;
      else if (points[d] < c) a = d;
      else return d;
    }
    return a;
  }

  /**
   * Returns the state obtained by reading the given char from the given state.
   * Returns -1 if not obtaining any such state.  (If the original

@@ -168,7 +173,8 @@ public abstract class RunAutomaton {
   * transition function.)
   */
  public final int step(int state, int c) {
    if (classmap == null) {
    assert c < alphabetSize;
    if (c >= classmap.length) {
      return transitions[state * points.length + getCharClass(c)];
    } else {
      return transitions[state * points.length + classmap[c]];

@@ -179,7 +185,7 @@ public abstract class RunAutomaton {
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime * result + maxInterval;
    result = prime * result + alphabetSize;
    result = prime * result + points.length;
    result = prime * result + size;
    return result;

@@ -191,7 +197,7 @@ public abstract class RunAutomaton {
    if (obj == null) return false;
    if (getClass() != obj.getClass()) return false;
    RunAutomaton other = (RunAutomaton) obj;
    if (maxInterval != other.maxInterval) return false;
    if (alphabetSize != other.alphabetSize) return false;
    if (size != other.size) return false;
    if (!Arrays.equals(points, other.points)) return false;
    if (!Arrays.equals(accept, other.accept)) return false;
@@ -367,7 +367,7 @@ class TermAutomatonScorer extends Scorer {

  static class TermRunAutomaton extends RunAutomaton {
    public TermRunAutomaton(Automaton a, int termCount) {
      super(a, termCount, true);
      super(a, termCount);
    }
  }