LUCENE-7329: Simplify CharacterUtils.

2016-06-13 15:23:08 +02:00 · 2016-06-13 15:23:08 +02:00 · af2ae05d6e
parent 5e2677e0fb
commit af2ae05d6e
12 changed files with 87 additions and 367 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/LowerCaseFilter.java
@ -28,7 +28,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
 * Normalizes token text to lower case.
 */
 public final class LowerCaseFilter extends TokenFilter {
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  
  /**
@ -43,7 +42,7 @@ public final class LowerCaseFilter extends TokenFilter {
  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
-      charUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
+      CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
      return true;
    } else
      return false;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/core/UpperCaseFilter.java
@ -33,7 +33,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
 * general search matching
 */
 public final class UpperCaseFilter extends TokenFilter {
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  
  /**
@ -48,7 +47,7 @@ public final class UpperCaseFilter extends TokenFilter {
  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
-      charUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
+      CharacterUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
      return true;
    } else
      return false;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
@ -21,7 +21,6 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;

 /**
 * Normalizes token text to lower case, removes some Greek diacritics,
@ -29,7 +28,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
 */
 public final class GreekLowerCaseFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();

  /**
   * Create a GreekLowerCaseFilter that normalizes Greek token text.
@ -47,7 +45,7 @@ public final class GreekLowerCaseFilter extends TokenFilter {
      int chLen = termAtt.length();
      for (int i = 0; i < chLen;) {
        i += Character.toChars(
-            lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
+            lowerCase(Character.codePointAt(chArray, i, chLen)), chArray, i);
       }
      return true;
    } else {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/EdgeNGramTokenFilter.java
@ -25,7 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;

 /**
 * Tokenizes the given token into n-grams of given size(s).
@ -38,7 +37,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
  public static final int DEFAULT_MAX_GRAM_SIZE = 1;
  public static final int DEFAULT_MIN_GRAM_SIZE = 1;

-  private final CharacterUtils charUtils;
  private final int minGram;
  private final int maxGram;
  private char[] curTermBuffer;
@ -73,7 +71,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }

-    this.charUtils = CharacterUtils.getInstance();
    this.minGram = minGram;
    this.maxGram = maxGram;
  }
@ -87,7 +84,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
        } else {
          curTermBuffer = termAtt.buffer().clone();
          curTermLength = termAtt.length();
-          curCodePointCount = charUtils.codePointCount(termAtt);
+          curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
          curGramSize = minGram;
          tokStart = offsetAtt.startOffset();
          tokEnd = offsetAtt.endOffset();
@ -108,7 +105,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
            posIncrAtt.setPositionIncrement(0);
          }
          posLenAtt.setPositionLength(savePosLen);
-          final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
+          final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
          termAtt.copyBuffer(curTermBuffer, 0, charLength);
          curGramSize++;
          return true;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenFilter.java
@ -26,7 +26,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;

 /**
 * Tokenizes the input into n-grams of the given size(s).
@ -56,9 +55,7 @@ public final class NGramTokenFilter extends TokenFilter {
  private int curPosInc, curPosLen;
  private int tokStart;
  private int tokEnd;
-  private boolean hasIllegalOffsets; // only if the length changed before this filter

-  private final CharacterUtils charUtils;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt;
  private final PositionLengthAttribute posLenAtt;
@ -72,7 +69,6 @@ public final class NGramTokenFilter extends TokenFilter {
   */
  public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
    super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
-    this.charUtils = CharacterUtils.getInstance();
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
@ -104,16 +100,13 @@ public final class NGramTokenFilter extends TokenFilter {
        } else {
          curTermBuffer = termAtt.buffer().clone();
          curTermLength = termAtt.length();
-          curCodePointCount = charUtils.codePointCount(termAtt);
+          curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
          curGramSize = minGram;
          curPos = 0;
          curPosInc = posIncAtt.getPositionIncrement();
          curPosLen = posLenAtt.getPositionLength();
          tokStart = offsetAtt.startOffset();
          tokEnd = offsetAtt.endOffset();
-          // if length by start + end offsets doesn't match the term text then assume
-          // this is a synonym and don't adjust the offsets.
-          hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
        }
      }

@ -123,8 +116,8 @@ public final class NGramTokenFilter extends TokenFilter {
      }
      if ((curPos + curGramSize) <= curCodePointCount) {
        clearAttributes();
-        final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
-        final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+        final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
+        final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
        termAtt.copyBuffer(curTermBuffer, start, end - start);
        posIncAtt.setPositionIncrement(curPosInc);
        curPosInc = 0;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/ngram/NGramTokenizer.java
@ -57,7 +57,6 @@ public class NGramTokenizer extends Tokenizer {
  public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

-  private CharacterUtils charUtils;
  private CharacterUtils.CharacterBuffer charBuffer;
  private int[] buffer; // like charBuffer, but converted to code points
  private int bufferStart, bufferEnd; // remaining slice in buffer
@ -110,7 +109,6 @@ public class NGramTokenizer extends Tokenizer {
  }

  private void init(int minGram, int maxGram, boolean edgesOnly) {
-    charUtils = CharacterUtils.getInstance();
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
@ -142,9 +140,9 @@ public class NGramTokenizer extends Tokenizer {
        bufferStart = 0;

        // fill in remaining space
-        exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
+        exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd);
        // convert to code points
-        bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
+        bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
      }

      // should we go to the next offset?
@ -168,7 +166,7 @@ public class NGramTokenizer extends Tokenizer {
        continue;
      }

-      final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
+      final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
      termAtt.setLength(length);
      posIncAtt.setPositionIncrement(1);
      posLenAtt.setPositionLength(1);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayMap.java
@ -40,7 +40,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
  private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();

  private final static int INIT_SIZE = 8;
-  private final CharacterUtils charUtils;
  private boolean ignoreCase;  
  private int count;
  char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
@ -63,7 +62,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
      size <<= 1;
    keys = new char[size][];
    values = (V[]) new Object[size];
-    this.charUtils = CharacterUtils.getInstance();
  }

  /**
@ -86,7 +84,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
    this.values = toCopy.values;
    this.ignoreCase = toCopy.ignoreCase;
    this.count = toCopy.count;
-    this.charUtils = toCopy.charUtils;
  }
  
  /** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
@ -192,7 +189,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
   */
  public V put(char[] text, V value) {
    if (ignoreCase) {
-      charUtils.toLowerCase(text, 0, text.length);
+      CharacterUtils.toLowerCase(text, 0, text.length);
    }
    int slot = getSlot(text, 0, text.length);
    if (keys[slot] != null) {
@ -237,8 +234,8 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
    final int limit = off+len;
    if (ignoreCase) {
      for(int i=0;i<len;) {
-        final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
-        if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
+        final int codePointAt = Character.codePointAt(text1, off+i, limit);
+        if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
          return false;
        i += Character.charCount(codePointAt); 
      }
@ -257,8 +254,8 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
      return false;
    if (ignoreCase) {
      for(int i=0;i<len;) {
-        final int codePointAt = charUtils.codePointAt(text1, i);
-        if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
+        final int codePointAt = Character.codePointAt(text1, i);
+        if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
          return false;
        i += Character.charCount(codePointAt);
      }
@ -278,7 +275,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
    final int stop = offset + len;
    if (ignoreCase) {
      for (int i=offset; i<stop;) {
-        final int codePointAt = charUtils.codePointAt(text, i, stop);
+        final int codePointAt = Character.codePointAt(text, i, stop);
        code = code*31 + Character.toLowerCase(codePointAt);
        i += Character.charCount(codePointAt);
      }
@ -297,7 +294,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
    int len = text.length();
    if (ignoreCase) {
      for (int i=0; i<len;) {
-        int codePointAt = charUtils.codePointAt(text, i);
+        int codePointAt = Character.codePointAt(text, i);
        code = code*31 + Character.toLowerCase(codePointAt);
        i += Character.charCount(codePointAt);
      }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharTokenizer.java
@ -199,7 +199,6 @@ public abstract class CharTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();
  private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
  
  /**
@ -229,7 +228,7 @@ public abstract class CharTokenizer extends Tokenizer {
    while (true) {
      if (bufferIndex >= dataLen) {
        offset += dataLen;
-        charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
+        CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
        if (ioBuffer.getLength() == 0) {
          dataLen = 0; // so next offset += dataLen won't decrement offset
          if (length > 0) {
@ -243,7 +242,7 @@ public abstract class CharTokenizer extends Tokenizer {
        bufferIndex = 0;
      }
      // use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
-      final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
+      final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
      final int charCount = Character.charCount(c);
      bufferIndex += charCount;

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CharacterUtils.java
@ -20,76 +20,13 @@ package org.apache.lucene.analysis.util;
 import java.io.IOException;
 import java.io.Reader;

-import org.apache.lucene.util.Version;
-
 /**
- * {@link CharacterUtils} provides a unified interface to Character-related
- * operations to implement backwards compatible character operations based on a
- * {@link Version} instance.
- * 
+ * Utility class to write tokenizers or token filters.
 * @lucene.internal
 */
-public abstract class CharacterUtils {
-  private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
-  private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
+public final class CharacterUtils {

-  /**
-   * Returns a {@link CharacterUtils} implementation.
-   * @return a {@link CharacterUtils} implementation according to the given
-   *         {@link Version} instance.
-   */
-  public static CharacterUtils getInstance() {
-    return JAVA_5;
-  }
-  
-  /** 
-   * explicitly returns a version matching java 4 semantics 
-   * @deprecated Only for n-gram backwards compat
-   */
-  @Deprecated
-  public static CharacterUtils getJava4Instance() {
-    return JAVA_4;
-  }
-
-  /**
-   * Returns the code point at the given index of the {@link CharSequence}.
-   * 
-   * @param seq
-   *          a character sequence
-   * @param offset
-   *          the offset to the char values in the chars array to be converted
-   * 
-   * @return the Unicode code point at the given index
-   * @throws NullPointerException
-   *           - if the sequence is null.
-   * @throws IndexOutOfBoundsException
-   *           - if the value offset is negative or not less than the length of
-   *           the character sequence.
-   */
-  public abstract int codePointAt(final CharSequence seq, final int offset);
-  
-  /**
-   * Returns the code point at the given index of the char array where only elements
-   * with index less than the limit are used.
-   * 
-   * @param chars
-   *          a character array
-   * @param offset
-   *          the offset to the char values in the chars array to be converted
-   * @param limit the index afer the last element that should be used to calculate
-   *        codepoint.  
-   * 
-   * @return the Unicode code point at the given index
-   * @throws NullPointerException
-   *           - if the array is null.
-   * @throws IndexOutOfBoundsException
-   *           - if the value offset is negative or not less than the length of
-   *           the char array.
-   */
-  public abstract int codePointAt(final char[] chars, final int offset, final int limit);
-
-  /** Return the number of characters in <code>seq</code>. */
-  public abstract int codePointCount(CharSequence seq);
+  private CharacterUtils() {} // no instantiation

  /**
   * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
@ -114,13 +51,13 @@ public abstract class CharacterUtils {
   * @param offset the offset to start at
   * @param limit the max char in the buffer to lower case
   */
-  public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
+  public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    assert offset <=0 && offset <= buffer.length;
    for (int i = offset; i < limit;) {
      i += Character.toChars(
              Character.toLowerCase(
-                  codePointAt(buffer, i, limit)), buffer, i);
+                  Character.codePointAt(buffer, i, limit)), buffer, i);
     }
  }

@ -131,25 +68,25 @@ public abstract class CharacterUtils {
   * @param offset the offset to start at
   * @param limit the max char in the buffer to lower case
   */
-  public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
+  public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
    assert buffer.length >= limit;
    assert offset <=0 && offset <= buffer.length;
    for (int i = offset; i < limit;) {
      i += Character.toChars(
              Character.toUpperCase(
-                  codePointAt(buffer, i, limit)), buffer, i);
+                  Character.codePointAt(buffer, i, limit)), buffer, i);
     }
  }

  /** Converts a sequence of Java characters to a sequence of unicode code points.
   *  @return the number of code points written to the destination buffer */
-  public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
+  public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
    if (srcLen < 0) {
      throw new IllegalArgumentException("srcLen must be >= 0");
    }
    int codePointCount = 0;
    for (int i = 0; i < srcLen; ) {
-      final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
+      final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
      final int charCount = Character.charCount(cp);
      dest[destOff + codePointCount++] = cp;
      i += charCount;
@ -159,7 +96,7 @@ public abstract class CharacterUtils {

  /** Converts a sequence of unicode code points to a sequence of Java characters.
   *  @return the number of chars written to the destination buffer */
-  public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
+  public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
    if (srcLen < 0) {
      throw new IllegalArgumentException("srcLen must be >= 0");
    }
@ -202,16 +139,44 @@ public abstract class CharacterUtils {
   * @throws IOException
   *           if the reader throws an {@link IOException}.
   */
-  public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
+  public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
+    assert buffer.buffer.length >= 2;
+    if (numChars < 2 || numChars > buffer.buffer.length) {
+      throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
+    }
+    final char[] charBuffer = buffer.buffer;
+    buffer.offset = 0;
+    final int offset;

-  /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
-  public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
-    return fill(buffer, reader, buffer.buffer.length);
+    // Install the previously saved ending high surrogate:
+    if (buffer.lastTrailingHighSurrogate != 0) {
+      charBuffer[0] = buffer.lastTrailingHighSurrogate;
+      buffer.lastTrailingHighSurrogate = 0;
+      offset = 1;
+    } else {
+      offset = 0;
+    }
+
+    final int read = readFully(reader, charBuffer, offset, numChars - offset);
+
+    buffer.length = offset + read;
+    final boolean result = buffer.length == numChars;
+    if (buffer.length < numChars) {
+      // We failed to fill the buffer. Even if the last char is a high
+      // surrogate, there is nothing we can do
+      return result;
+    }
+
+    if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
+      buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
+    }
+    return result;
  }

-  /** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
-   *  code points from <code>index</code>. */
-  public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
+  /** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
+  public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
+    return fill(buffer, reader, buffer.buffer.length);
+  }

  static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
    int read = 0;
@ -225,112 +190,6 @@ public abstract class CharacterUtils {
    return read;
  }

-  private static final class Java5CharacterUtils extends CharacterUtils {
-    Java5CharacterUtils() {
-    }
-
-    @Override
-    public int codePointAt(final CharSequence seq, final int offset) {
-      return Character.codePointAt(seq, offset);
-    }
-
-    @Override
-    public int codePointAt(final char[] chars, final int offset, final int limit) {
-     return Character.codePointAt(chars, offset, limit);
-    }
-
-    @Override
-    public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
-      assert buffer.buffer.length >= 2;
-      if (numChars < 2 || numChars > buffer.buffer.length) {
-        throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
-      }
-      final char[] charBuffer = buffer.buffer;
-      buffer.offset = 0;
-      final int offset;
-
-      // Install the previously saved ending high surrogate:
-      if (buffer.lastTrailingHighSurrogate != 0) {
-        charBuffer[0] = buffer.lastTrailingHighSurrogate;
-        buffer.lastTrailingHighSurrogate = 0;
-        offset = 1;
-      } else {
-        offset = 0;
-      }
-
-      final int read = readFully(reader, charBuffer, offset, numChars - offset);
-
-      buffer.length = offset + read;
-      final boolean result = buffer.length == numChars;
-      if (buffer.length < numChars) {
-        // We failed to fill the buffer. Even if the last char is a high
-        // surrogate, there is nothing we can do
-        return result;
-      }
-
-      if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
-        buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
-      }
-      return result;
-    }
-
-    @Override
-    public int codePointCount(CharSequence seq) {
-      return Character.codePointCount(seq, 0, seq.length());
-    }
-
-    @Override
-    public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
-      return Character.offsetByCodePoints(buf, start, count, index, offset);
-    }
-  }
-
-  private static final class Java4CharacterUtils extends CharacterUtils {
-    Java4CharacterUtils() {
-    }
-
-    @Override
-    public int codePointAt(final CharSequence seq, final int offset) {
-      return seq.charAt(offset);
-    }
-
-    @Override
-    public int codePointAt(final char[] chars, final int offset, final int limit) {
-      if(offset >= limit)
-        throw new IndexOutOfBoundsException("offset must be less than limit");
-      return chars[offset];
-    }
-
-    @Override
-    public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
-        throws IOException {
-      assert buffer.buffer.length >= 1;
-      if (numChars < 1 || numChars > buffer.buffer.length) {
-        throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
-      }
-      buffer.offset = 0;
-      final int read = readFully(reader, buffer.buffer, 0, numChars);
-      buffer.length = read;
-      buffer.lastTrailingHighSurrogate = 0;
-      return read == numChars;
-    }
-
-    @Override
-    public int codePointCount(CharSequence seq) {
-      return seq.length();
-    }
-
-    @Override
-    public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
-      final int result = index + offset;
-      if (result < 0 || result > count) {
-        throw new IndexOutOfBoundsException();
-      }
-      return result;
-    }
-
-  }
-  
  /**
   * A simple IO buffer to use with
   * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestStemmerOverrideFilter.java
@ -85,8 +85,6 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
    int numTerms = atLeast(50);
    boolean ignoreCase = random().nextBoolean();

-    CharacterUtils charUtils = CharacterUtils.getInstance();
-
    for (int i = 0; i < numTerms; i++) {
      String randomRealisticUnicodeString = TestUtil
          .randomRealisticUnicodeString(random());
@ -107,7 +105,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
        if (ignoreCase) {
          // TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)???
          char[] buffer = inputValue.toCharArray();
-          charUtils.toLowerCase(buffer, 0, buffer.length);
+          CharacterUtils.toLowerCase(buffer, 0, buffer.length);
          seenInputValue = buffer.toString();
        } else {
          seenInputValue = inputValue;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharacterUtils.java
@ -32,102 +32,15 @@ import org.junit.Test;
 */
 public class TestCharacterUtils extends LuceneTestCase {

-  @Test
-  public void testCodePointAtCharSequenceInt() {
-    CharacterUtils java4 = CharacterUtils.getJava4Instance();
-    String cpAt3 = "Abc\ud801\udc1c";
-    String highSurrogateAt3 = "Abc\ud801";
-    assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
-    assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
-    assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
-    expectThrows(IndexOutOfBoundsException.class, () -> {
-      java4.codePointAt(highSurrogateAt3, 4);
-    });
-
-    CharacterUtils java5 = CharacterUtils.getInstance();
-    assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
-    assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
-        cpAt3, 3));
-    assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
-    expectThrows(IndexOutOfBoundsException.class, () -> {
-      java5.codePointAt(highSurrogateAt3, 4);
-    });
-  }
-
-  @Test
-  public void testCodePointAtCharArrayIntInt() {
-    CharacterUtils java4 = CharacterUtils.getJava4Instance();
-    char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
-    char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
-    assertEquals((int) 'A', java4.codePointAt(cpAt3, 0, 2));
-    assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3, 5));
-    assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3, 4));
-
-    CharacterUtils java5 = CharacterUtils.getInstance();
-    assertEquals((int) 'A', java5.codePointAt(cpAt3, 0, 2));
-    assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
-        cpAt3, 3, 5));
-    assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4));
-  }
-
-  @Test
-  public void testCodePointCount() {
-    CharacterUtils java4 = CharacterUtils.getJava4Instance();
-    CharacterUtils java5 = CharacterUtils.getInstance();
-    final String s = TestUtil.randomUnicodeString(random());
-    assertEquals(s.length(), java4.codePointCount(s));
-    assertEquals(Character.codePointCount(s, 0, s.length()), java5.codePointCount(s));
-  }
-
-  @Test
-  public void testOffsetByCodePoint() {
-    CharacterUtils java4 = CharacterUtils.getJava4Instance();
-    CharacterUtils java5 = CharacterUtils.getInstance();
-    for (int i = 0; i < 10; ++i) {
-      final char[] s = TestUtil.randomUnicodeString(random()).toCharArray();
-      final int index = TestUtil.nextInt(random(), 0, s.length);
-      final int offset = random().nextInt(7) - 3;
-      try {
-        final int o = java4.offsetByCodePoints(s, 0, s.length, index, offset);
-        assertEquals(o, index + offset);
-      } catch (IndexOutOfBoundsException e) {
-        assertTrue((index + offset) < 0 || (index + offset) > s.length);
-      }
-  
-      int o;
-      try {
-        o = java5.offsetByCodePoints(s, 0, s.length, index, offset);
-      } catch (IndexOutOfBoundsException e) {
-        try {
-          Character.offsetByCodePoints(s, 0, s.length, index, offset);
-          fail();
-        } catch (IndexOutOfBoundsException e2) {
-          // OK
-        }
-        o = -1;
-      }
-      if (o >= 0) {
-        assertEquals(Character.offsetByCodePoints(s, 0, s.length, index, offset), o);
-      }
-    }
-  }
-
  public void testConversions() {
-    CharacterUtils java4 = CharacterUtils.getJava4Instance();
-    CharacterUtils java5 = CharacterUtils.getInstance();
-    testConversions(java4);
-    testConversions(java5);
-  }
-
-  private void testConversions(CharacterUtils charUtils) {
    final char[] orig = TestUtil.randomUnicodeString(random(), 100).toCharArray();
    final int[] buf = new int[orig.length];
    final char[] restored = new char[buf.length];
    final int o1 = TestUtil.nextInt(random(), 0, Math.min(5, orig.length));
    final int o2 = TestUtil.nextInt(random(), 0, o1);
    final int o3 = TestUtil.nextInt(random(), 0, o1);
-    final int codePointCount = charUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
-    final int charCount = charUtils.toChars(buf, o2, codePointCount, restored, o3);
+    final int codePointCount = CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
+    final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3);
    assertEquals(orig.length - o1, charCount);
    assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
  }
@ -152,71 +65,43 @@ public class TestCharacterUtils extends LuceneTestCase {

  @Test
  public void testFillNoHighSurrogate() throws IOException {
-    CharacterUtils versions[] = new CharacterUtils[] { 
-        CharacterUtils.getInstance(), 
-        CharacterUtils.getJava4Instance() };
-    for (CharacterUtils instance : versions) {
-      Reader reader = new StringReader("helloworld");
-      CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
-      assertTrue(instance.fill(buffer,reader));
-      assertEquals(0, buffer.getOffset());
-      assertEquals(6, buffer.getLength());
-      assertEquals("hellow", new String(buffer.getBuffer()));
-      assertFalse(instance.fill(buffer,reader));
-      assertEquals(4, buffer.getLength());
-      assertEquals(0, buffer.getOffset());
+    Reader reader = new StringReader("helloworld");
+    CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
+    assertTrue(CharacterUtils.fill(buffer,reader));
+    assertEquals(0, buffer.getOffset());
+    assertEquals(6, buffer.getLength());
+    assertEquals("hellow", new String(buffer.getBuffer()));
+    assertFalse(CharacterUtils.fill(buffer,reader));
+    assertEquals(4, buffer.getLength());
+    assertEquals(0, buffer.getOffset());

-      assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
-          buffer.getLength()));
-      assertFalse(instance.fill(buffer,reader));
-    }
+    assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
+        buffer.getLength()));
+    assertFalse(CharacterUtils.fill(buffer,reader));
  }

  @Test
-  public void testFillJava15() throws IOException {
+  public void testFill() throws IOException {
    String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
-    CharacterUtils instance = CharacterUtils.getInstance();
    Reader reader = new StringReader(input);
    CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
-    assertTrue(instance.fill(buffer, reader));
+    assertTrue(CharacterUtils.fill(buffer, reader));
    assertEquals(4, buffer.getLength());
    assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
        buffer.getLength()));
-    assertTrue(instance.fill(buffer, reader));
+    assertTrue(CharacterUtils.fill(buffer, reader));
    assertEquals(5, buffer.getLength());
    assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
-    assertTrue(instance.fill(buffer, reader));
+    assertTrue(CharacterUtils.fill(buffer, reader));
    assertEquals(4, buffer.getLength());
    assertEquals("123\ud801", new String(buffer.getBuffer(),
        buffer.getOffset(), buffer.getLength()));
-    assertFalse(instance.fill(buffer, reader));
+    assertFalse(CharacterUtils.fill(buffer, reader));
    assertEquals(3, buffer.getLength());
    assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
        .getOffset(), buffer.getLength()));
-    assertFalse(instance.fill(buffer, reader));
+    assertFalse(CharacterUtils.fill(buffer, reader));
    assertEquals(0, buffer.getLength());
  }

-  @Test
-  public void testFillJava14() throws IOException {
-    String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
-    CharacterUtils instance = CharacterUtils.getJava4Instance();
-    Reader reader = new StringReader(input);
-    CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
-    assertTrue(instance.fill(buffer, reader));
-    assertEquals(5, buffer.getLength());
-    assertEquals("1234\ud801", new String(buffer.getBuffer(), buffer
-        .getOffset(), buffer.getLength()));
-    assertTrue(instance.fill(buffer, reader));
-    assertEquals(5, buffer.getLength());
-    assertEquals("\udc1c7891", new String(buffer.getBuffer()));
-    buffer = CharacterUtils.newCharacterBuffer(6);
-    assertTrue(instance.fill(buffer, reader));
-    assertEquals(6, buffer.getLength());
-    assertEquals("23\ud801\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
-        .getOffset(), buffer.getLength()));
-    assertFalse(instance.fill(buffer, reader));
-
-  }
-
 }
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@ -34,7 +34,6 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.CharsRefBuilder;

 /**
@ -54,7 +53,6 @@ public class MorfologikFilter extends TokenFilter {
  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

  private final CharsRefBuilder scratch = new CharsRefBuilder();
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();

  private State current;
  private final TokenStream input;
@ -154,7 +152,7 @@ public class MorfologikFilter extends TokenFilter {
    char buffer[] = scratch.chars();
    for (int i = 0; i < length;) {
      i += Character.toChars(
-          Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i);      
+          Character.toLowerCase(Character.codePointAt(chs, i)), buffer, i);      
    }

    return scratch.get();