LUCENE-7329: Simplify CharacterUtils.

This commit is contained in:
Adrien Grand 2016-06-13 15:23:08 +02:00
parent 5e2677e0fb
commit af2ae05d6e
12 changed files with 87 additions and 367 deletions

View File

@ -28,7 +28,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
* Normalizes token text to lower case.
*/
public final class LowerCaseFilter extends TokenFilter {
private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
@ -43,7 +42,7 @@ public final class LowerCaseFilter extends TokenFilter {
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
charUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
return true;
} else
return false;

View File

@ -33,7 +33,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
* general search matching
*/
public final class UpperCaseFilter extends TokenFilter {
private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
@ -48,7 +47,7 @@ public final class UpperCaseFilter extends TokenFilter {
@Override
public final boolean incrementToken() throws IOException {
if (input.incrementToken()) {
charUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
CharacterUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
return true;
} else
return false;

View File

@ -21,7 +21,6 @@ import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
/**
* Normalizes token text to lower case, removes some Greek diacritics,
@ -29,7 +28,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
*/
public final class GreekLowerCaseFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final CharacterUtils charUtils = CharacterUtils.getInstance();
/**
* Create a GreekLowerCaseFilter that normalizes Greek token text.
@ -47,7 +45,7 @@ public final class GreekLowerCaseFilter extends TokenFilter {
int chLen = termAtt.length();
for (int i = 0; i < chLen;) {
i += Character.toChars(
lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
lowerCase(Character.codePointAt(chArray, i, chLen)), chArray, i);
}
return true;
} else {

View File

@ -25,7 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
/**
* Tokenizes the given token into n-grams of given size(s).
@ -38,7 +37,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
private final CharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private char[] curTermBuffer;
@ -73,7 +71,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.charUtils = CharacterUtils.getInstance();
this.minGram = minGram;
this.maxGram = maxGram;
}
@ -87,7 +84,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
@ -108,7 +105,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;

View File

@ -26,7 +26,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
/**
* Tokenizes the input into n-grams of the given size(s).
@ -56,9 +55,7 @@ public final class NGramTokenFilter extends TokenFilter {
private int curPosInc, curPosLen;
private int tokStart;
private int tokEnd;
private boolean hasIllegalOffsets; // only if the length changed before this filter
private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
@ -72,7 +69,6 @@ public final class NGramTokenFilter extends TokenFilter {
*/
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
this.charUtils = CharacterUtils.getInstance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -104,16 +100,13 @@ public final class NGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
curPosLen = posLenAtt.getPositionLength();
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
// if length by start + end offsets doesn't match the term text then assume
// this is a synonym and don't adjust the offsets.
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
}
}
@ -123,8 +116,8 @@ public final class NGramTokenFilter extends TokenFilter {
}
if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes();
final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;

View File

@ -57,7 +57,6 @@ public class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private CharacterUtils charUtils;
private CharacterUtils.CharacterBuffer charBuffer;
private int[] buffer; // like charBuffer, but converted to code points
private int bufferStart, bufferEnd; // remaining slice in buffer
@ -110,7 +109,6 @@ public class NGramTokenizer extends Tokenizer {
}
private void init(int minGram, int maxGram, boolean edgesOnly) {
charUtils = CharacterUtils.getInstance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -142,9 +140,9 @@ public class NGramTokenizer extends Tokenizer {
bufferStart = 0;
// fill in remaining space
exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd);
// convert to code points
bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
}
// should we go to the next offset?
@ -168,7 +166,7 @@ public class NGramTokenizer extends Tokenizer {
continue;
}
final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
termAtt.setLength(length);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);

View File

@ -40,7 +40,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();
private final static int INIT_SIZE = 8;
private final CharacterUtils charUtils;
private boolean ignoreCase;
private int count;
char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
@ -63,7 +62,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
size <<= 1;
keys = new char[size][];
values = (V[]) new Object[size];
this.charUtils = CharacterUtils.getInstance();
}
/**
@ -86,7 +84,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
this.values = toCopy.values;
this.ignoreCase = toCopy.ignoreCase;
this.count = toCopy.count;
this.charUtils = toCopy.charUtils;
}
/** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
@ -192,7 +189,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
*/
public V put(char[] text, V value) {
if (ignoreCase) {
charUtils.toLowerCase(text, 0, text.length);
CharacterUtils.toLowerCase(text, 0, text.length);
}
int slot = getSlot(text, 0, text.length);
if (keys[slot] != null) {
@ -237,8 +234,8 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
final int limit = off+len;
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
final int codePointAt = Character.codePointAt(text1, off+i, limit);
if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
@ -257,8 +254,8 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
return false;
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, i);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
final int codePointAt = Character.codePointAt(text1, i);
if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
@ -278,7 +275,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
final int stop = offset + len;
if (ignoreCase) {
for (int i=offset; i<stop;) {
final int codePointAt = charUtils.codePointAt(text, i, stop);
final int codePointAt = Character.codePointAt(text, i, stop);
code = code*31 + Character.toLowerCase(codePointAt);
i += Character.charCount(codePointAt);
}
@ -297,7 +294,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
int len = text.length();
if (ignoreCase) {
for (int i=0; i<len;) {
int codePointAt = charUtils.codePointAt(text, i);
int codePointAt = Character.codePointAt(text, i);
code = code*31 + Character.toLowerCase(codePointAt);
i += Character.charCount(codePointAt);
}

View File

@ -199,7 +199,6 @@ public abstract class CharTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
/**
@ -229,7 +228,7 @@ public abstract class CharTokenizer extends Tokenizer {
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
if (ioBuffer.getLength() == 0) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0) {
@ -243,7 +242,7 @@ public abstract class CharTokenizer extends Tokenizer {
bufferIndex = 0;
}
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
final int charCount = Character.charCount(c);
bufferIndex += charCount;

View File

@ -20,76 +20,13 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.util.Version;
/**
* {@link CharacterUtils} provides a unified interface to Character-related
* operations to implement backwards compatible character operations based on a
* {@link Version} instance.
*
* Utility class to write tokenizers or token filters.
* @lucene.internal
*/
public abstract class CharacterUtils {
private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
public final class CharacterUtils {
/**
* Returns a {@link CharacterUtils} implementation.
* @return a {@link CharacterUtils} implementation according to the given
* {@link Version} instance.
*/
public static CharacterUtils getInstance() {
return JAVA_5;
}
/**
* explicitly returns a version matching java 4 semantics
* @deprecated Only for n-gram backwards compat
*/
@Deprecated
public static CharacterUtils getJava4Instance() {
return JAVA_4;
}
/**
* Returns the code point at the given index of the {@link CharSequence}.
*
* @param seq
* a character sequence
* @param offset
* the offset to the char values in the chars array to be converted
*
* @return the Unicode code point at the given index
* @throws NullPointerException
* - if the sequence is null.
* @throws IndexOutOfBoundsException
* - if the value offset is negative or not less than the length of
* the character sequence.
*/
public abstract int codePointAt(final CharSequence seq, final int offset);
/**
* Returns the code point at the given index of the char array where only elements
* with index less than the limit are used.
*
* @param chars
* a character array
* @param offset
* the offset to the char values in the chars array to be converted
* @param limit the index after the last element that should be used to calculate
* codepoint.
*
* @return the Unicode code point at the given index
* @throws NullPointerException
* - if the array is null.
* @throws IndexOutOfBoundsException
* - if the value offset is negative or not less than the length of
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset, final int limit);
/** Return the number of code points in <code>seq</code>. */
public abstract int codePointCount(CharSequence seq);
private CharacterUtils() {} // no instantiation
/**
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
@ -114,13 +51,13 @@ public abstract class CharacterUtils {
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset <=0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toLowerCase(
codePointAt(buffer, i, limit)), buffer, i);
Character.codePointAt(buffer, i, limit)), buffer, i);
}
}
@ -131,25 +68,25 @@ public abstract class CharacterUtils {
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset <=0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toUpperCase(
codePointAt(buffer, i, limit)), buffer, i);
Character.codePointAt(buffer, i, limit)), buffer, i);
}
}
/** Converts a sequence of Java characters to a sequence of unicode code points.
* @return the number of code points written to the destination buffer */
public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
if (srcLen < 0) {
throw new IllegalArgumentException("srcLen must be >= 0");
}
int codePointCount = 0;
for (int i = 0; i < srcLen; ) {
final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
final int charCount = Character.charCount(cp);
dest[destOff + codePointCount++] = cp;
i += charCount;
@ -159,7 +96,7 @@ public abstract class CharacterUtils {
/** Converts a sequence of unicode code points to a sequence of Java characters.
* @return the number of chars written to the destination buffer */
public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
if (srcLen < 0) {
throw new IllegalArgumentException("srcLen must be >= 0");
}
@ -202,16 +139,44 @@ public abstract class CharacterUtils {
* @throws IOException
* if the reader throws an {@link IOException}.
*/
public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
assert buffer.buffer.length >= 2;
if (numChars < 2 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
}
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
final int offset;
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
return fill(buffer, reader, buffer.buffer.length);
// Install the previously saved ending high surrogate:
if (buffer.lastTrailingHighSurrogate != 0) {
charBuffer[0] = buffer.lastTrailingHighSurrogate;
buffer.lastTrailingHighSurrogate = 0;
offset = 1;
} else {
offset = 0;
}
final int read = readFully(reader, charBuffer, offset, numChars - offset);
buffer.length = offset + read;
final boolean result = buffer.length == numChars;
if (buffer.length < numChars) {
// We failed to fill the buffer. Even if the last char is a high
// surrogate, there is nothing we can do
return result;
}
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
}
return result;
}
/** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
* code points from <code>index</code>. */
public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
return fill(buffer, reader, buffer.buffer.length);
}
static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
@ -225,112 +190,6 @@ public abstract class CharacterUtils {
return read;
}
private static final class Java5CharacterUtils extends CharacterUtils {
Java5CharacterUtils() {
}
@Override
public int codePointAt(final CharSequence seq, final int offset) {
return Character.codePointAt(seq, offset);
}
@Override
public int codePointAt(final char[] chars, final int offset, final int limit) {
return Character.codePointAt(chars, offset, limit);
}
@Override
public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
assert buffer.buffer.length >= 2;
if (numChars < 2 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
}
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
final int offset;
// Install the previously saved ending high surrogate:
if (buffer.lastTrailingHighSurrogate != 0) {
charBuffer[0] = buffer.lastTrailingHighSurrogate;
buffer.lastTrailingHighSurrogate = 0;
offset = 1;
} else {
offset = 0;
}
final int read = readFully(reader, charBuffer, offset, numChars - offset);
buffer.length = offset + read;
final boolean result = buffer.length == numChars;
if (buffer.length < numChars) {
// We failed to fill the buffer. Even if the last char is a high
// surrogate, there is nothing we can do
return result;
}
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
}
return result;
}
@Override
public int codePointCount(CharSequence seq) {
return Character.codePointCount(seq, 0, seq.length());
}
@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
return Character.offsetByCodePoints(buf, start, count, index, offset);
}
}
private static final class Java4CharacterUtils extends CharacterUtils {
Java4CharacterUtils() {
}
@Override
public int codePointAt(final CharSequence seq, final int offset) {
return seq.charAt(offset);
}
@Override
public int codePointAt(final char[] chars, final int offset, final int limit) {
if(offset >= limit)
throw new IndexOutOfBoundsException("offset must be less than limit");
return chars[offset];
}
@Override
public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
throws IOException {
assert buffer.buffer.length >= 1;
if (numChars < 1 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
}
buffer.offset = 0;
final int read = readFully(reader, buffer.buffer, 0, numChars);
buffer.length = read;
buffer.lastTrailingHighSurrogate = 0;
return read == numChars;
}
@Override
public int codePointCount(CharSequence seq) {
return seq.length();
}
@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
final int result = index + offset;
if (result < 0 || result > count) {
throw new IndexOutOfBoundsException();
}
return result;
}
}
/**
* A simple IO buffer to use with
* {@link CharacterUtils#fill(CharacterBuffer, Reader)}.

View File

@ -85,8 +85,6 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
int numTerms = atLeast(50);
boolean ignoreCase = random().nextBoolean();
CharacterUtils charUtils = CharacterUtils.getInstance();
for (int i = 0; i < numTerms; i++) {
String randomRealisticUnicodeString = TestUtil
.randomRealisticUnicodeString(random());
@ -107,7 +105,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
if (ignoreCase) {
// TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)???
char[] buffer = inputValue.toCharArray();
charUtils.toLowerCase(buffer, 0, buffer.length);
CharacterUtils.toLowerCase(buffer, 0, buffer.length);
seenInputValue = buffer.toString();
} else {
seenInputValue = inputValue;

View File

@ -32,102 +32,15 @@ import org.junit.Test;
*/
public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testCodePointAtCharSequenceInt() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
String cpAt3 = "Abc\ud801\udc1c";
String highSurrogateAt3 = "Abc\ud801";
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
expectThrows(IndexOutOfBoundsException.class, () -> {
java4.codePointAt(highSurrogateAt3, 4);
});
CharacterUtils java5 = CharacterUtils.getInstance();
assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
cpAt3, 3));
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
expectThrows(IndexOutOfBoundsException.class, () -> {
java5.codePointAt(highSurrogateAt3, 4);
});
}
@Test
public void testCodePointAtCharArrayIntInt() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0, 2));
assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3, 5));
assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3, 4));
CharacterUtils java5 = CharacterUtils.getInstance();
assertEquals((int) 'A', java5.codePointAt(cpAt3, 0, 2));
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
cpAt3, 3, 5));
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4));
}
@Test
public void testCodePointCount() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance();
final String s = TestUtil.randomUnicodeString(random());
assertEquals(s.length(), java4.codePointCount(s));
assertEquals(Character.codePointCount(s, 0, s.length()), java5.codePointCount(s));
}
@Test
public void testOffsetByCodePoint() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance();
for (int i = 0; i < 10; ++i) {
final char[] s = TestUtil.randomUnicodeString(random()).toCharArray();
final int index = TestUtil.nextInt(random(), 0, s.length);
final int offset = random().nextInt(7) - 3;
try {
final int o = java4.offsetByCodePoints(s, 0, s.length, index, offset);
assertEquals(o, index + offset);
} catch (IndexOutOfBoundsException e) {
assertTrue((index + offset) < 0 || (index + offset) > s.length);
}
int o;
try {
o = java5.offsetByCodePoints(s, 0, s.length, index, offset);
} catch (IndexOutOfBoundsException e) {
try {
Character.offsetByCodePoints(s, 0, s.length, index, offset);
fail();
} catch (IndexOutOfBoundsException e2) {
// OK
}
o = -1;
}
if (o >= 0) {
assertEquals(Character.offsetByCodePoints(s, 0, s.length, index, offset), o);
}
}
}
public void testConversions() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance();
testConversions(java4);
testConversions(java5);
}
private void testConversions(CharacterUtils charUtils) {
final char[] orig = TestUtil.randomUnicodeString(random(), 100).toCharArray();
final int[] buf = new int[orig.length];
final char[] restored = new char[buf.length];
final int o1 = TestUtil.nextInt(random(), 0, Math.min(5, orig.length));
final int o2 = TestUtil.nextInt(random(), 0, o1);
final int o3 = TestUtil.nextInt(random(), 0, o1);
final int codePointCount = charUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
final int charCount = charUtils.toChars(buf, o2, codePointCount, restored, o3);
final int codePointCount = CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3);
assertEquals(orig.length - o1, charCount);
assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
}
@ -152,71 +65,43 @@ public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testFillNoHighSurrogate() throws IOException {
CharacterUtils versions[] = new CharacterUtils[] {
CharacterUtils.getInstance(),
CharacterUtils.getJava4Instance() };
for (CharacterUtils instance : versions) {
Reader reader = new StringReader("helloworld");
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
assertTrue(instance.fill(buffer,reader));
assertEquals(0, buffer.getOffset());
assertEquals(6, buffer.getLength());
assertEquals("hellow", new String(buffer.getBuffer()));
assertFalse(instance.fill(buffer,reader));
assertEquals(4, buffer.getLength());
assertEquals(0, buffer.getOffset());
Reader reader = new StringReader("helloworld");
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
assertTrue(CharacterUtils.fill(buffer,reader));
assertEquals(0, buffer.getOffset());
assertEquals(6, buffer.getLength());
assertEquals("hellow", new String(buffer.getBuffer()));
assertFalse(CharacterUtils.fill(buffer,reader));
assertEquals(4, buffer.getLength());
assertEquals(0, buffer.getOffset());
assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
buffer.getLength()));
assertFalse(instance.fill(buffer,reader));
}
assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
buffer.getLength()));
assertFalse(CharacterUtils.fill(buffer,reader));
}
@Test
public void testFillJava15() throws IOException {
public void testFill() throws IOException {
String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
CharacterUtils instance = CharacterUtils.getInstance();
Reader reader = new StringReader(input);
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
assertTrue(instance.fill(buffer, reader));
assertTrue(CharacterUtils.fill(buffer, reader));
assertEquals(4, buffer.getLength());
assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
buffer.getLength()));
assertTrue(instance.fill(buffer, reader));
assertTrue(CharacterUtils.fill(buffer, reader));
assertEquals(5, buffer.getLength());
assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
assertTrue(instance.fill(buffer, reader));
assertTrue(CharacterUtils.fill(buffer, reader));
assertEquals(4, buffer.getLength());
assertEquals("123\ud801", new String(buffer.getBuffer(),
buffer.getOffset(), buffer.getLength()));
assertFalse(instance.fill(buffer, reader));
assertFalse(CharacterUtils.fill(buffer, reader));
assertEquals(3, buffer.getLength());
assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertFalse(instance.fill(buffer, reader));
assertFalse(CharacterUtils.fill(buffer, reader));
assertEquals(0, buffer.getLength());
}
@Test
public void testFillJava14() throws IOException {
String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
CharacterUtils instance = CharacterUtils.getJava4Instance();
Reader reader = new StringReader(input);
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
assertTrue(instance.fill(buffer, reader));
assertEquals(5, buffer.getLength());
assertEquals("1234\ud801", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertTrue(instance.fill(buffer, reader));
assertEquals(5, buffer.getLength());
assertEquals("\udc1c7891", new String(buffer.getBuffer()));
buffer = CharacterUtils.newCharacterBuffer(6);
assertTrue(instance.fill(buffer, reader));
assertEquals(6, buffer.getLength());
assertEquals("23\ud801\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertFalse(instance.fill(buffer, reader));
}
}

View File

@ -34,7 +34,6 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.CharsRefBuilder;
/**
@ -54,7 +53,6 @@ public class MorfologikFilter extends TokenFilter {
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
private final CharsRefBuilder scratch = new CharsRefBuilder();
private final CharacterUtils charUtils = CharacterUtils.getInstance();
private State current;
private final TokenStream input;
@ -154,7 +152,7 @@ public class MorfologikFilter extends TokenFilter {
char buffer[] = scratch.chars();
for (int i = 0; i < length;) {
i += Character.toChars(
Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i);
Character.toLowerCase(Character.codePointAt(chs, i)), buffer, i);
}
return scratch.get();