mirror of https://github.com/apache/lucene.git
LUCENE-7329: Simplify CharacterUtils.
This commit is contained in:
parent
5e2677e0fb
commit
af2ae05d6e
|
@ -28,7 +28,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
|
|||
* Normalizes token text to lower case.
|
||||
*/
|
||||
public final class LowerCaseFilter extends TokenFilter {
|
||||
private final CharacterUtils charUtils = CharacterUtils.getInstance();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
/**
|
||||
|
@ -43,7 +42,7 @@ public final class LowerCaseFilter extends TokenFilter {
|
|||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
charUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
|
||||
CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
|
|
|
@ -33,7 +33,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
|
|||
* general search matching
|
||||
*/
|
||||
public final class UpperCaseFilter extends TokenFilter {
|
||||
private final CharacterUtils charUtils = CharacterUtils.getInstance();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
/**
|
||||
|
@ -48,7 +47,7 @@ public final class UpperCaseFilter extends TokenFilter {
|
|||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
charUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
|
||||
CharacterUtils.toUpperCase(termAtt.buffer(), 0, termAtt.length());
|
||||
return true;
|
||||
} else
|
||||
return false;
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.io.IOException;
|
|||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||
|
||||
/**
|
||||
* Normalizes token text to lower case, removes some Greek diacritics,
|
||||
|
@ -29,7 +28,6 @@ import org.apache.lucene.analysis.util.CharacterUtils;
|
|||
*/
|
||||
public final class GreekLowerCaseFilter extends TokenFilter {
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final CharacterUtils charUtils = CharacterUtils.getInstance();
|
||||
|
||||
/**
|
||||
* Create a GreekLowerCaseFilter that normalizes Greek token text.
|
||||
|
@ -47,7 +45,7 @@ public final class GreekLowerCaseFilter extends TokenFilter {
|
|||
int chLen = termAtt.length();
|
||||
for (int i = 0; i < chLen;) {
|
||||
i += Character.toChars(
|
||||
lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
|
||||
lowerCase(Character.codePointAt(chArray, i, chLen)), chArray, i);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
|
|
|
@ -25,7 +25,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||
|
||||
/**
|
||||
* Tokenizes the given token into n-grams of given size(s).
|
||||
|
@ -38,7 +37,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
|||
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
|
||||
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
|
||||
|
||||
private final CharacterUtils charUtils;
|
||||
private final int minGram;
|
||||
private final int maxGram;
|
||||
private char[] curTermBuffer;
|
||||
|
@ -73,7 +71,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
|||
throw new IllegalArgumentException("minGram must not be greater than maxGram");
|
||||
}
|
||||
|
||||
this.charUtils = CharacterUtils.getInstance();
|
||||
this.minGram = minGram;
|
||||
this.maxGram = maxGram;
|
||||
}
|
||||
|
@ -87,7 +84,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
|||
} else {
|
||||
curTermBuffer = termAtt.buffer().clone();
|
||||
curTermLength = termAtt.length();
|
||||
curCodePointCount = charUtils.codePointCount(termAtt);
|
||||
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
|
||||
curGramSize = minGram;
|
||||
tokStart = offsetAtt.startOffset();
|
||||
tokEnd = offsetAtt.endOffset();
|
||||
|
@ -108,7 +105,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
|
|||
posIncrAtt.setPositionIncrement(0);
|
||||
}
|
||||
posLenAtt.setPositionLength(savePosLen);
|
||||
final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
|
||||
final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
|
||||
termAtt.copyBuffer(curTermBuffer, 0, charLength);
|
||||
curGramSize++;
|
||||
return true;
|
||||
|
|
|
@ -26,7 +26,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||
|
||||
/**
|
||||
* Tokenizes the input into n-grams of the given size(s).
|
||||
|
@ -56,9 +55,7 @@ public final class NGramTokenFilter extends TokenFilter {
|
|||
private int curPosInc, curPosLen;
|
||||
private int tokStart;
|
||||
private int tokEnd;
|
||||
private boolean hasIllegalOffsets; // only if the length changed before this filter
|
||||
|
||||
private final CharacterUtils charUtils;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt;
|
||||
private final PositionLengthAttribute posLenAtt;
|
||||
|
@ -72,7 +69,6 @@ public final class NGramTokenFilter extends TokenFilter {
|
|||
*/
|
||||
public NGramTokenFilter(TokenStream input, int minGram, int maxGram) {
|
||||
super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));
|
||||
this.charUtils = CharacterUtils.getInstance();
|
||||
if (minGram < 1) {
|
||||
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||
}
|
||||
|
@ -104,16 +100,13 @@ public final class NGramTokenFilter extends TokenFilter {
|
|||
} else {
|
||||
curTermBuffer = termAtt.buffer().clone();
|
||||
curTermLength = termAtt.length();
|
||||
curCodePointCount = charUtils.codePointCount(termAtt);
|
||||
curCodePointCount = Character.codePointCount(termAtt, 0, termAtt.length());
|
||||
curGramSize = minGram;
|
||||
curPos = 0;
|
||||
curPosInc = posIncAtt.getPositionIncrement();
|
||||
curPosLen = posLenAtt.getPositionLength();
|
||||
tokStart = offsetAtt.startOffset();
|
||||
tokEnd = offsetAtt.endOffset();
|
||||
// if length by start + end offsets doesn't match the term text then assume
|
||||
// this is a synonym and don't adjust the offsets.
|
||||
hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -123,8 +116,8 @@ public final class NGramTokenFilter extends TokenFilter {
|
|||
}
|
||||
if ((curPos + curGramSize) <= curCodePointCount) {
|
||||
clearAttributes();
|
||||
final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
|
||||
final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
|
||||
final int start = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
|
||||
final int end = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
|
||||
termAtt.copyBuffer(curTermBuffer, start, end - start);
|
||||
posIncAtt.setPositionIncrement(curPosInc);
|
||||
curPosInc = 0;
|
||||
|
|
|
@ -57,7 +57,6 @@ public class NGramTokenizer extends Tokenizer {
|
|||
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
|
||||
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
|
||||
|
||||
private CharacterUtils charUtils;
|
||||
private CharacterUtils.CharacterBuffer charBuffer;
|
||||
private int[] buffer; // like charBuffer, but converted to code points
|
||||
private int bufferStart, bufferEnd; // remaining slice in buffer
|
||||
|
@ -110,7 +109,6 @@ public class NGramTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
private void init(int minGram, int maxGram, boolean edgesOnly) {
|
||||
charUtils = CharacterUtils.getInstance();
|
||||
if (minGram < 1) {
|
||||
throw new IllegalArgumentException("minGram must be greater than zero");
|
||||
}
|
||||
|
@ -142,9 +140,9 @@ public class NGramTokenizer extends Tokenizer {
|
|||
bufferStart = 0;
|
||||
|
||||
// fill in remaining space
|
||||
exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
|
||||
exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd);
|
||||
// convert to code points
|
||||
bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
|
||||
bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
|
||||
}
|
||||
|
||||
// should we go to the next offset?
|
||||
|
@ -168,7 +166,7 @@ public class NGramTokenizer extends Tokenizer {
|
|||
continue;
|
||||
}
|
||||
|
||||
final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
|
||||
final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
|
||||
termAtt.setLength(length);
|
||||
posIncAtt.setPositionIncrement(1);
|
||||
posLenAtt.setPositionLength(1);
|
||||
|
|
|
@ -40,7 +40,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
|
|||
private static final CharArrayMap<?> EMPTY_MAP = new EmptyCharArrayMap<>();
|
||||
|
||||
private final static int INIT_SIZE = 8;
|
||||
private final CharacterUtils charUtils;
|
||||
private boolean ignoreCase;
|
||||
private int count;
|
||||
char[][] keys; // package private because used in CharArraySet's non Set-conform CharArraySetIterator
|
||||
|
@ -63,7 +62,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
|
|||
size <<= 1;
|
||||
keys = new char[size][];
|
||||
values = (V[]) new Object[size];
|
||||
this.charUtils = CharacterUtils.getInstance();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -86,7 +84,6 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
|
|||
this.values = toCopy.values;
|
||||
this.ignoreCase = toCopy.ignoreCase;
|
||||
this.count = toCopy.count;
|
||||
this.charUtils = toCopy.charUtils;
|
||||
}
|
||||
|
||||
/** Clears all entries in this map. This method is supported for reusing, but not {@link Map#remove}. */
|
||||
|
@ -192,7 +189,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
|
|||
*/
|
||||
public V put(char[] text, V value) {
|
||||
if (ignoreCase) {
|
||||
charUtils.toLowerCase(text, 0, text.length);
|
||||
CharacterUtils.toLowerCase(text, 0, text.length);
|
||||
}
|
||||
int slot = getSlot(text, 0, text.length);
|
||||
if (keys[slot] != null) {
|
||||
|
@ -237,8 +234,8 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
|
|||
final int limit = off+len;
|
||||
if (ignoreCase) {
|
||||
for(int i=0;i<len;) {
|
||||
final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
|
||||
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
|
||||
final int codePointAt = Character.codePointAt(text1, off+i, limit);
|
||||
if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
|
||||
return false;
|
||||
i += Character.charCount(codePointAt);
|
||||
}
|
||||
|
@ -257,8 +254,8 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
|
|||
return false;
|
||||
if (ignoreCase) {
|
||||
for(int i=0;i<len;) {
|
||||
final int codePointAt = charUtils.codePointAt(text1, i);
|
||||
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
|
||||
final int codePointAt = Character.codePointAt(text1, i);
|
||||
if (Character.toLowerCase(codePointAt) != Character.codePointAt(text2, i, text2.length))
|
||||
return false;
|
||||
i += Character.charCount(codePointAt);
|
||||
}
|
||||
|
@ -278,7 +275,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
|
|||
final int stop = offset + len;
|
||||
if (ignoreCase) {
|
||||
for (int i=offset; i<stop;) {
|
||||
final int codePointAt = charUtils.codePointAt(text, i, stop);
|
||||
final int codePointAt = Character.codePointAt(text, i, stop);
|
||||
code = code*31 + Character.toLowerCase(codePointAt);
|
||||
i += Character.charCount(codePointAt);
|
||||
}
|
||||
|
@ -297,7 +294,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
|
|||
int len = text.length();
|
||||
if (ignoreCase) {
|
||||
for (int i=0; i<len;) {
|
||||
int codePointAt = charUtils.codePointAt(text, i);
|
||||
int codePointAt = Character.codePointAt(text, i);
|
||||
code = code*31 + Character.toLowerCase(codePointAt);
|
||||
i += Character.charCount(codePointAt);
|
||||
}
|
||||
|
|
|
@ -199,7 +199,6 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private final CharacterUtils charUtils = CharacterUtils.getInstance();
|
||||
private final CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(IO_BUFFER_SIZE);
|
||||
|
||||
/**
|
||||
|
@ -229,7 +228,7 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
while (true) {
|
||||
if (bufferIndex >= dataLen) {
|
||||
offset += dataLen;
|
||||
charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
|
||||
CharacterUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
|
||||
if (ioBuffer.getLength() == 0) {
|
||||
dataLen = 0; // so next offset += dataLen won't decrement offset
|
||||
if (length > 0) {
|
||||
|
@ -243,7 +242,7 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
bufferIndex = 0;
|
||||
}
|
||||
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
|
||||
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
|
||||
final int c = Character.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
|
||||
final int charCount = Character.charCount(c);
|
||||
bufferIndex += charCount;
|
||||
|
||||
|
|
|
@ -20,76 +20,13 @@ package org.apache.lucene.analysis.util;
|
|||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* {@link CharacterUtils} provides a unified interface to Character-related
|
||||
* operations to implement backwards compatible character operations based on a
|
||||
* {@link Version} instance.
|
||||
*
|
||||
* Utility class to write tokenizers or token filters.
|
||||
* @lucene.internal
|
||||
*/
|
||||
public abstract class CharacterUtils {
|
||||
private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
|
||||
private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
|
||||
public final class CharacterUtils {
|
||||
|
||||
/**
|
||||
* Returns a {@link CharacterUtils} implementation.
|
||||
* @return a {@link CharacterUtils} implementation according to the given
|
||||
* {@link Version} instance.
|
||||
*/
|
||||
public static CharacterUtils getInstance() {
|
||||
return JAVA_5;
|
||||
}
|
||||
|
||||
/**
|
||||
* explicitly returns a version matching java 4 semantics
|
||||
* @deprecated Only for n-gram backwards compat
|
||||
*/
|
||||
@Deprecated
|
||||
public static CharacterUtils getJava4Instance() {
|
||||
return JAVA_4;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the code point at the given index of the {@link CharSequence}.
|
||||
*
|
||||
* @param seq
|
||||
* a character sequence
|
||||
* @param offset
|
||||
* the offset to the char values in the chars array to be converted
|
||||
*
|
||||
* @return the Unicode code point at the given index
|
||||
* @throws NullPointerException
|
||||
* - if the sequence is null.
|
||||
* @throws IndexOutOfBoundsException
|
||||
* - if the value offset is negative or not less than the length of
|
||||
* the character sequence.
|
||||
*/
|
||||
public abstract int codePointAt(final CharSequence seq, final int offset);
|
||||
|
||||
/**
|
||||
* Returns the code point at the given index of the char array where only elements
|
||||
* with index less than the limit are used.
|
||||
*
|
||||
* @param chars
|
||||
* a character array
|
||||
* @param offset
|
||||
* the offset to the char values in the chars array to be converted
|
||||
* @param limit the index after the last element that should be used to calculate
|
||||
* codepoint.
|
||||
*
|
||||
* @return the Unicode code point at the given index
|
||||
* @throws NullPointerException
|
||||
* - if the array is null.
|
||||
* @throws IndexOutOfBoundsException
|
||||
* - if the value offset is negative or not less than the length of
|
||||
* the char array.
|
||||
*/
|
||||
public abstract int codePointAt(final char[] chars, final int offset, final int limit);
|
||||
|
||||
/** Return the number of characters in <code>seq</code>. */
|
||||
public abstract int codePointCount(CharSequence seq);
|
||||
private CharacterUtils() {} // no instantiation
|
||||
|
||||
/**
|
||||
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
|
||||
|
@ -114,13 +51,13 @@ public abstract class CharacterUtils {
|
|||
* @param offset the offset to start at
|
||||
* @param limit the max char in the buffer to lower case
|
||||
*/
|
||||
public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
|
||||
public static void toLowerCase(final char[] buffer, final int offset, final int limit) {
|
||||
assert buffer.length >= limit;
|
||||
assert offset <=0 && offset <= buffer.length;
|
||||
for (int i = offset; i < limit;) {
|
||||
i += Character.toChars(
|
||||
Character.toLowerCase(
|
||||
codePointAt(buffer, i, limit)), buffer, i);
|
||||
Character.codePointAt(buffer, i, limit)), buffer, i);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -131,25 +68,25 @@ public abstract class CharacterUtils {
|
|||
* @param offset the offset to start at
|
||||
* @param limit the max char in the buffer to upper case
|
||||
*/
|
||||
public final void toUpperCase(final char[] buffer, final int offset, final int limit) {
|
||||
public static void toUpperCase(final char[] buffer, final int offset, final int limit) {
|
||||
assert buffer.length >= limit;
|
||||
assert offset <=0 && offset <= buffer.length;
|
||||
for (int i = offset; i < limit;) {
|
||||
i += Character.toChars(
|
||||
Character.toUpperCase(
|
||||
codePointAt(buffer, i, limit)), buffer, i);
|
||||
Character.codePointAt(buffer, i, limit)), buffer, i);
|
||||
}
|
||||
}
|
||||
|
||||
/** Converts a sequence of Java characters to a sequence of unicode code points.
|
||||
* @return the number of code points written to the destination buffer */
|
||||
public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
|
||||
public static int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
|
||||
if (srcLen < 0) {
|
||||
throw new IllegalArgumentException("srcLen must be >= 0");
|
||||
}
|
||||
int codePointCount = 0;
|
||||
for (int i = 0; i < srcLen; ) {
|
||||
final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
|
||||
final int cp = Character.codePointAt(src, srcOff + i, srcOff + srcLen);
|
||||
final int charCount = Character.charCount(cp);
|
||||
dest[destOff + codePointCount++] = cp;
|
||||
i += charCount;
|
||||
|
@ -159,7 +96,7 @@ public abstract class CharacterUtils {
|
|||
|
||||
/** Converts a sequence of unicode code points to a sequence of Java characters.
|
||||
* @return the number of chars written to the destination buffer */
|
||||
public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
|
||||
public static int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
|
||||
if (srcLen < 0) {
|
||||
throw new IllegalArgumentException("srcLen must be >= 0");
|
||||
}
|
||||
|
@ -202,16 +139,44 @@ public abstract class CharacterUtils {
|
|||
* @throws IOException
|
||||
* if the reader throws an {@link IOException}.
|
||||
*/
|
||||
public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
|
||||
public static boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException {
|
||||
assert buffer.buffer.length >= 2;
|
||||
if (numChars < 2 || numChars > buffer.buffer.length) {
|
||||
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
|
||||
}
|
||||
final char[] charBuffer = buffer.buffer;
|
||||
buffer.offset = 0;
|
||||
final int offset;
|
||||
|
||||
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
|
||||
public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
|
||||
return fill(buffer, reader, buffer.buffer.length);
|
||||
// Install the previously saved ending high surrogate:
|
||||
if (buffer.lastTrailingHighSurrogate != 0) {
|
||||
charBuffer[0] = buffer.lastTrailingHighSurrogate;
|
||||
buffer.lastTrailingHighSurrogate = 0;
|
||||
offset = 1;
|
||||
} else {
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
final int read = readFully(reader, charBuffer, offset, numChars - offset);
|
||||
|
||||
buffer.length = offset + read;
|
||||
final boolean result = buffer.length == numChars;
|
||||
if (buffer.length < numChars) {
|
||||
// We failed to fill the buffer. Even if the last char is a high
|
||||
// surrogate, there is nothing we can do
|
||||
return result;
|
||||
}
|
||||
|
||||
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
|
||||
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
|
||||
* code points from <code>index</code>. */
|
||||
public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
|
||||
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
|
||||
public static boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
|
||||
return fill(buffer, reader, buffer.buffer.length);
|
||||
}
|
||||
|
||||
static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
|
||||
int read = 0;
|
||||
|
@ -225,112 +190,6 @@ public abstract class CharacterUtils {
|
|||
return read;
|
||||
}
|
||||
|
||||
private static final class Java5CharacterUtils extends CharacterUtils {
|
||||
Java5CharacterUtils() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointAt(final CharSequence seq, final int offset) {
|
||||
return Character.codePointAt(seq, offset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointAt(final char[] chars, final int offset, final int limit) {
|
||||
return Character.codePointAt(chars, offset, limit);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
|
||||
assert buffer.buffer.length >= 2;
|
||||
if (numChars < 2 || numChars > buffer.buffer.length) {
|
||||
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
|
||||
}
|
||||
final char[] charBuffer = buffer.buffer;
|
||||
buffer.offset = 0;
|
||||
final int offset;
|
||||
|
||||
// Install the previously saved ending high surrogate:
|
||||
if (buffer.lastTrailingHighSurrogate != 0) {
|
||||
charBuffer[0] = buffer.lastTrailingHighSurrogate;
|
||||
buffer.lastTrailingHighSurrogate = 0;
|
||||
offset = 1;
|
||||
} else {
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
final int read = readFully(reader, charBuffer, offset, numChars - offset);
|
||||
|
||||
buffer.length = offset + read;
|
||||
final boolean result = buffer.length == numChars;
|
||||
if (buffer.length < numChars) {
|
||||
// We failed to fill the buffer. Even if the last char is a high
|
||||
// surrogate, there is nothing we can do
|
||||
return result;
|
||||
}
|
||||
|
||||
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
|
||||
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointCount(CharSequence seq) {
|
||||
return Character.codePointCount(seq, 0, seq.length());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
|
||||
return Character.offsetByCodePoints(buf, start, count, index, offset);
|
||||
}
|
||||
}
|
||||
|
||||
private static final class Java4CharacterUtils extends CharacterUtils {
|
||||
Java4CharacterUtils() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointAt(final CharSequence seq, final int offset) {
|
||||
return seq.charAt(offset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointAt(final char[] chars, final int offset, final int limit) {
|
||||
if(offset >= limit)
|
||||
throw new IndexOutOfBoundsException("offset must be less than limit");
|
||||
return chars[offset];
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
|
||||
throws IOException {
|
||||
assert buffer.buffer.length >= 1;
|
||||
if (numChars < 1 || numChars > buffer.buffer.length) {
|
||||
throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
|
||||
}
|
||||
buffer.offset = 0;
|
||||
final int read = readFully(reader, buffer.buffer, 0, numChars);
|
||||
buffer.length = read;
|
||||
buffer.lastTrailingHighSurrogate = 0;
|
||||
return read == numChars;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int codePointCount(CharSequence seq) {
|
||||
return seq.length();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
|
||||
final int result = index + offset;
|
||||
if (result < 0 || result > count) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* A simple IO buffer to use with
|
||||
* {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
|
||||
|
|
|
@ -85,8 +85,6 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
|
|||
int numTerms = atLeast(50);
|
||||
boolean ignoreCase = random().nextBoolean();
|
||||
|
||||
CharacterUtils charUtils = CharacterUtils.getInstance();
|
||||
|
||||
for (int i = 0; i < numTerms; i++) {
|
||||
String randomRealisticUnicodeString = TestUtil
|
||||
.randomRealisticUnicodeString(random());
|
||||
|
@ -107,7 +105,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
|
|||
if (ignoreCase) {
|
||||
// TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)???
|
||||
char[] buffer = inputValue.toCharArray();
|
||||
charUtils.toLowerCase(buffer, 0, buffer.length);
|
||||
CharacterUtils.toLowerCase(buffer, 0, buffer.length);
|
||||
seenInputValue = buffer.toString();
|
||||
} else {
|
||||
seenInputValue = inputValue;
|
||||
|
|
|
@ -32,102 +32,15 @@ import org.junit.Test;
|
|||
*/
|
||||
public class TestCharacterUtils extends LuceneTestCase {
|
||||
|
||||
@Test
|
||||
public void testCodePointAtCharSequenceInt() {
|
||||
CharacterUtils java4 = CharacterUtils.getJava4Instance();
|
||||
String cpAt3 = "Abc\ud801\udc1c";
|
||||
String highSurrogateAt3 = "Abc\ud801";
|
||||
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
|
||||
assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
|
||||
assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
|
||||
expectThrows(IndexOutOfBoundsException.class, () -> {
|
||||
java4.codePointAt(highSurrogateAt3, 4);
|
||||
});
|
||||
|
||||
CharacterUtils java5 = CharacterUtils.getInstance();
|
||||
assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
|
||||
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
|
||||
cpAt3, 3));
|
||||
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
|
||||
expectThrows(IndexOutOfBoundsException.class, () -> {
|
||||
java5.codePointAt(highSurrogateAt3, 4);
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCodePointAtCharArrayIntInt() {
|
||||
CharacterUtils java4 = CharacterUtils.getJava4Instance();
|
||||
char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
|
||||
char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
|
||||
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0, 2));
|
||||
assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3, 5));
|
||||
assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3, 4));
|
||||
|
||||
CharacterUtils java5 = CharacterUtils.getInstance();
|
||||
assertEquals((int) 'A', java5.codePointAt(cpAt3, 0, 2));
|
||||
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
|
||||
cpAt3, 3, 5));
|
||||
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCodePointCount() {
|
||||
CharacterUtils java4 = CharacterUtils.getJava4Instance();
|
||||
CharacterUtils java5 = CharacterUtils.getInstance();
|
||||
final String s = TestUtil.randomUnicodeString(random());
|
||||
assertEquals(s.length(), java4.codePointCount(s));
|
||||
assertEquals(Character.codePointCount(s, 0, s.length()), java5.codePointCount(s));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOffsetByCodePoint() {
|
||||
CharacterUtils java4 = CharacterUtils.getJava4Instance();
|
||||
CharacterUtils java5 = CharacterUtils.getInstance();
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
final char[] s = TestUtil.randomUnicodeString(random()).toCharArray();
|
||||
final int index = TestUtil.nextInt(random(), 0, s.length);
|
||||
final int offset = random().nextInt(7) - 3;
|
||||
try {
|
||||
final int o = java4.offsetByCodePoints(s, 0, s.length, index, offset);
|
||||
assertEquals(o, index + offset);
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
assertTrue((index + offset) < 0 || (index + offset) > s.length);
|
||||
}
|
||||
|
||||
int o;
|
||||
try {
|
||||
o = java5.offsetByCodePoints(s, 0, s.length, index, offset);
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
try {
|
||||
Character.offsetByCodePoints(s, 0, s.length, index, offset);
|
||||
fail();
|
||||
} catch (IndexOutOfBoundsException e2) {
|
||||
// OK
|
||||
}
|
||||
o = -1;
|
||||
}
|
||||
if (o >= 0) {
|
||||
assertEquals(Character.offsetByCodePoints(s, 0, s.length, index, offset), o);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void testConversions() {
|
||||
CharacterUtils java4 = CharacterUtils.getJava4Instance();
|
||||
CharacterUtils java5 = CharacterUtils.getInstance();
|
||||
testConversions(java4);
|
||||
testConversions(java5);
|
||||
}
|
||||
|
||||
private void testConversions(CharacterUtils charUtils) {
|
||||
final char[] orig = TestUtil.randomUnicodeString(random(), 100).toCharArray();
|
||||
final int[] buf = new int[orig.length];
|
||||
final char[] restored = new char[buf.length];
|
||||
final int o1 = TestUtil.nextInt(random(), 0, Math.min(5, orig.length));
|
||||
final int o2 = TestUtil.nextInt(random(), 0, o1);
|
||||
final int o3 = TestUtil.nextInt(random(), 0, o1);
|
||||
final int codePointCount = charUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
|
||||
final int charCount = charUtils.toChars(buf, o2, codePointCount, restored, o3);
|
||||
final int codePointCount = CharacterUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
|
||||
final int charCount = CharacterUtils.toChars(buf, o2, codePointCount, restored, o3);
|
||||
assertEquals(orig.length - o1, charCount);
|
||||
assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
|
||||
}
|
||||
|
@ -152,71 +65,43 @@ public class TestCharacterUtils extends LuceneTestCase {
|
|||
|
||||
@Test
|
||||
public void testFillNoHighSurrogate() throws IOException {
|
||||
CharacterUtils versions[] = new CharacterUtils[] {
|
||||
CharacterUtils.getInstance(),
|
||||
CharacterUtils.getJava4Instance() };
|
||||
for (CharacterUtils instance : versions) {
|
||||
Reader reader = new StringReader("helloworld");
|
||||
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
|
||||
assertTrue(instance.fill(buffer,reader));
|
||||
assertEquals(0, buffer.getOffset());
|
||||
assertEquals(6, buffer.getLength());
|
||||
assertEquals("hellow", new String(buffer.getBuffer()));
|
||||
assertFalse(instance.fill(buffer,reader));
|
||||
assertEquals(4, buffer.getLength());
|
||||
assertEquals(0, buffer.getOffset());
|
||||
Reader reader = new StringReader("helloworld");
|
||||
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
|
||||
assertTrue(CharacterUtils.fill(buffer,reader));
|
||||
assertEquals(0, buffer.getOffset());
|
||||
assertEquals(6, buffer.getLength());
|
||||
assertEquals("hellow", new String(buffer.getBuffer()));
|
||||
assertFalse(CharacterUtils.fill(buffer,reader));
|
||||
assertEquals(4, buffer.getLength());
|
||||
assertEquals(0, buffer.getOffset());
|
||||
|
||||
assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
|
||||
buffer.getLength()));
|
||||
assertFalse(instance.fill(buffer,reader));
|
||||
}
|
||||
assertEquals("orld", new String(buffer.getBuffer(), buffer.getOffset(),
|
||||
buffer.getLength()));
|
||||
assertFalse(CharacterUtils.fill(buffer,reader));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFillJava15() throws IOException {
|
||||
public void testFill() throws IOException {
|
||||
String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
|
||||
CharacterUtils instance = CharacterUtils.getInstance();
|
||||
Reader reader = new StringReader(input);
|
||||
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
|
||||
assertTrue(instance.fill(buffer, reader));
|
||||
assertTrue(CharacterUtils.fill(buffer, reader));
|
||||
assertEquals(4, buffer.getLength());
|
||||
assertEquals("1234", new String(buffer.getBuffer(), buffer.getOffset(),
|
||||
buffer.getLength()));
|
||||
assertTrue(instance.fill(buffer, reader));
|
||||
assertTrue(CharacterUtils.fill(buffer, reader));
|
||||
assertEquals(5, buffer.getLength());
|
||||
assertEquals("\ud801\udc1c789", new String(buffer.getBuffer()));
|
||||
assertTrue(instance.fill(buffer, reader));
|
||||
assertTrue(CharacterUtils.fill(buffer, reader));
|
||||
assertEquals(4, buffer.getLength());
|
||||
assertEquals("123\ud801", new String(buffer.getBuffer(),
|
||||
buffer.getOffset(), buffer.getLength()));
|
||||
assertFalse(instance.fill(buffer, reader));
|
||||
assertFalse(CharacterUtils.fill(buffer, reader));
|
||||
assertEquals(3, buffer.getLength());
|
||||
assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
|
||||
.getOffset(), buffer.getLength()));
|
||||
assertFalse(instance.fill(buffer, reader));
|
||||
assertFalse(CharacterUtils.fill(buffer, reader));
|
||||
assertEquals(0, buffer.getLength());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFillJava14() throws IOException {
|
||||
String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
|
||||
CharacterUtils instance = CharacterUtils.getJava4Instance();
|
||||
Reader reader = new StringReader(input);
|
||||
CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
|
||||
assertTrue(instance.fill(buffer, reader));
|
||||
assertEquals(5, buffer.getLength());
|
||||
assertEquals("1234\ud801", new String(buffer.getBuffer(), buffer
|
||||
.getOffset(), buffer.getLength()));
|
||||
assertTrue(instance.fill(buffer, reader));
|
||||
assertEquals(5, buffer.getLength());
|
||||
assertEquals("\udc1c7891", new String(buffer.getBuffer()));
|
||||
buffer = CharacterUtils.newCharacterBuffer(6);
|
||||
assertTrue(instance.fill(buffer, reader));
|
||||
assertEquals(6, buffer.getLength());
|
||||
assertEquals("23\ud801\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
|
||||
.getOffset(), buffer.getLength()));
|
||||
assertFalse(instance.fill(buffer, reader));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -34,7 +34,6 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||
import org.apache.lucene.util.CharsRefBuilder;
|
||||
|
||||
/**
|
||||
|
@ -54,7 +53,6 @@ public class MorfologikFilter extends TokenFilter {
|
|||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
private final CharsRefBuilder scratch = new CharsRefBuilder();
|
||||
private final CharacterUtils charUtils = CharacterUtils.getInstance();
|
||||
|
||||
private State current;
|
||||
private final TokenStream input;
|
||||
|
@ -154,7 +152,7 @@ public class MorfologikFilter extends TokenFilter {
|
|||
char buffer[] = scratch.chars();
|
||||
for (int i = 0; i < length;) {
|
||||
i += Character.toChars(
|
||||
Character.toLowerCase(charUtils.codePointAt(chs, i)), buffer, i);
|
||||
Character.toLowerCase(Character.codePointAt(chs, i)), buffer, i);
|
||||
}
|
||||
|
||||
return scratch.get();
|
||||
|
|
Loading…
Reference in New Issue