LUCENE-5042: Fix the n-gram tokenizers and filters.

This commit fixes n-gram tokenizers and filters so that they handle
supplementary characters correctly and adds the ability to pre-tokenize the
stream in tokenizers.


git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1492185 13f79535-47bb-0310-9956-ffa450edef68
Adrien Grand 2013-06-12 13:17:49 +00:00
parent a4d58b6f22
commit e021451d6c
21 changed files with 576 additions and 331 deletions

View File

@ -47,6 +47,10 @@ Changes in backwards compatibility policy
(a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
whitespaces. (Adrien Grand)
* LUCENE-5042: The n-gram and edge n-gram tokenizers and filters now correctly
handle supplementary characters, and the tokenizers have the ability to
pre-tokenize the input stream similarly to CharTokenizer. (Adrien Grand)
* LUCENE-4967: NRTManager is replaced by
ControlledRealTimeReopenThread, for controlling which requests must
see which indexing changes, so that it can work with any

View File

@ -57,7 +57,7 @@ public final class GreekLowerCaseFilter extends TokenFilter {
int chLen = termAtt.length();
for (int i = 0; i < chLen;) {
i += Character.toChars(
lowerCase(charUtils.codePointAt(chArray, i)), chArray, i);
lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
}
return true;
} else {

View File

@ -25,21 +25,26 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
* Tokenizes the given token into n-grams of given size(s).
* <p>
* This {@link TokenFilter} creates n-grams from the beginning edge of an input token.
* <p><a name="match_version" />As of Lucene 4.4, this filter correctly handles
* supplementary characters.
*/
public final class EdgeNGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
private final CharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int tokStart;
private int tokEnd; // only used if the length changed before this filter
@ -74,6 +79,9 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
this.minGram = minGram;
this.maxGram = maxGram;
}
@ -87,6 +95,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
@ -95,7 +104,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
clearAttributes();
offsetAtt.setOffset(tokStart, tokEnd);
@ -107,7 +116,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
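
The following is a hedged usage sketch of the fixed filter, not part of this diff; the demo class name, input string, and gram sizes are illustrative only. It shows that edge grams are now counted in code points, so a surrogate pair is never split:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class EdgeNGramSupplementaryDemo {
  public static void main(String[] args) throws Exception {
    // "a" + U+1041C (one supplementary code point, two chars) + "b": 3 code points, 4 chars
    TokenStream ts = new KeywordTokenizer(new StringReader("a\uD801\uDC1Cb"));
    ts = new EdgeNGramTokenFilter(Version.LUCENE_44, ts, 1, 3);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // prints "a", then "a" + U+1041C, then "a" + U+1041C + "b"
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}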

View File

@ -17,37 +17,23 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Version;
/**
* Tokenizes the input from an edge into n-grams of given size(s).
* <p>
* This {@link Tokenizer} creates n-grams from the beginning edge of an input token.
* <p><a name="match_version" />As of Lucene 4.4, this class supports
* {@link #isTokenChar(int) pre-tokenization} and correctly handles
* supplementary characters.
*/
public final class EdgeNGramTokenizer extends Tokenizer {
public class EdgeNGramTokenizer extends NGramTokenizer {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
private int minGram;
private int maxGram;
private int gramSize;
private boolean started;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;
/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*
@ -57,8 +43,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(input);
init(version, minGram, maxGram);
super(version, input, minGram, maxGram, true);
}
/**
@ -71,102 +56,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(version, minGram, maxGram);
super(version, factory, input, minGram, maxGram, true);
}
private void init(Version version, int minGram, int maxGram) {
if (version == null) {
throw new IllegalArgumentException("version must not be null");
}
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.minGram = minGram;
this.maxGram = maxGram;
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
// if we are just starting, read the whole input
if (!started) {
started = true;
gramSize = minGram;
char[] chars = new char[Math.min(1024, maxGram)];
charsRead = 0;
// TODO: refactor to a shared readFully somewhere:
boolean exhausted = false;
while (charsRead < maxGram) {
final int inc = input.read(chars, charsRead, chars.length-charsRead);
if (inc == -1) {
exhausted = true;
break;
}
charsRead += inc;
if (charsRead == chars.length && charsRead < maxGram) {
chars = ArrayUtil.grow(chars);
}
}
inStr = new String(chars, 0, charsRead);
if (!exhausted) {
// Read extra throwaway chars so that on end() we
// report the correct offset:
char[] throwaway = new char[1024];
while(true) {
final int inc = input.read(throwaway, 0, throwaway.length);
if (inc == -1) {
break;
}
charsRead += inc;
}
}
inLen = inStr.length();
if (inLen == 0) {
return false;
}
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(1);
}
// if the remaining input is too short, we can't generate any n-grams
if (gramSize > inLen) {
return false;
}
// if we have hit the end of our n-gram size range, quit
if (gramSize > maxGram || gramSize > inLen) {
return false;
}
// grab gramSize chars from front or back
termAtt.setEmpty().append(inStr, 0, gramSize);
offsetAtt.setOffset(correctOffset(0), correctOffset(gramSize));
gramSize++;
return true;
}
@Override
public void end() {
// set final offset
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}
@Override
public void reset() throws IOException {
super.reset();
started = false;
}
}

View File

@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@ -33,6 +34,7 @@ import org.apache.lucene.util.Version;
* <a name="version"/>
* <p>You must specify the required {@link Version} compatibility when
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
* <li>handles supplementary characters correctly,</li>
* <li>emits all n-grams for the same token at the same position,</li>
* <li>does not modify offsets,</li>
* <li>sorts n-grams by their offset in the original token first, then
@ -42,6 +44,10 @@ import org.apache.lucene.util.Version;
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
* it will lead to broken {@link TokenStream}s that will cause highlighting
* bugs.
* <p>If you were using this {@link TokenFilter} to perform partial highlighting,
* this won't work anymore since this filter doesn't update offsets. You should
* modify your analysis chain to use {@link NGramTokenizer}, and potentially
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class NGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
@ -51,6 +57,7 @@ public final class NGramTokenFilter extends TokenFilter {
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc, curPosLen;
@ -59,6 +66,7 @@ public final class NGramTokenFilter extends TokenFilter {
private boolean hasIllegalOffsets; // only if the length changed before this filter
private final Version version;
private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;
@ -75,6 +83,9 @@ public final class NGramTokenFilter extends TokenFilter {
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
this.charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -126,6 +137,7 @@ public final class NGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();
@ -138,13 +150,15 @@ public final class NGramTokenFilter extends TokenFilter {
}
}
if (version.onOrAfter(Version.LUCENE_44)) {
if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
++curPos;
curGramSize = minGram;
}
if (curPos + curGramSize <= curTermLength) {
if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posLenAtt.setPositionLength(curPosLen);
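
As a hedged illustration of the 4.4 behavior described in the Javadoc above (all grams of a token at the same position, sorted by start offset), here is a small sketch that is not part of the commit; the demo class name and input are made up:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class NGramOrderDemo {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new KeywordTokenizer(new StringReader("abc"));
    ts = new NGramTokenFilter(Version.LUCENE_44, ts, 1, 2);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // emits "a", "ab", "b", "bc", "c": sorted by start offset first, then by length,
      // with a position increment of 1 for the first gram and 0 for the rest
      System.out.println(term + " posInc=" + posInc.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}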

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
@ -40,29 +41,47 @@ import org.apache.lucene.util.Version;
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
* </table>
* <a name="version"/>
* <p>Before Lucene 4.4, this class had a different behavior:<ul>
* <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
* <li>The last whitespaces of the 1024 chars block were trimmed.</li>
* <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
* <p>Although highly discouraged, it is still possible to use the old behavior
* through {@link Lucene43NGramTokenizer}.
* <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
* <li>tokenize in a streaming fashion to support streams which are larger
* than 1024 chars (limit of the previous version),
* <li>count grams based on unicode code points instead of java chars (and
* never split in the middle of surrogate pairs),
* <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
* before computing n-grams.</ul>
* <p>Additionally, this class doesn't trim trailing whitespace and emits
* tokens in a different order: tokens are now emitted by increasing start
* offsets, while they used to be emitted by increasing lengths (which prevented
* supporting large input streams).
* <p>Although <b style="color:red">highly</b> discouraged, it is still possible
* to use the old behavior through {@link Lucene43NGramTokenizer}.
*/
public final class NGramTokenizer extends Tokenizer {
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private char[] buffer;
private int bufferStart, bufferEnd; // remaining slice of the buffer
private CharacterUtils charUtils;
private CharacterUtils.CharacterBuffer charBuffer;
private int[] buffer; // like charBuffer, but converted to code points
private int bufferStart, bufferEnd; // remaining slice in buffer
private int offset;
private int gramSize;
private int minGram, maxGram;
private boolean exhausted;
private int lastCheckedChar; // last offset in the buffer that we checked
private int lastNonTokenChar; // last offset that we found to not be a token char
private boolean edgesOnly; // leading edges n-grams only
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
NGramTokenizer(Version version, Reader input, int minGram, int maxGram, boolean edgesOnly) {
super(input);
init(version, minGram, maxGram, edgesOnly);
}
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>
@ -71,8 +90,12 @@ public final class NGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(input);
init(version, minGram, maxGram);
this(version, input, minGram, maxGram, false);
}
NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram, boolean edgesOnly) {
super(factory, input);
init(version, minGram, maxGram, edgesOnly);
}
/**
@ -84,8 +107,7 @@ public final class NGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(version, minGram, maxGram);
this(version, factory, input, minGram, maxGram, false);
}
/**
@ -97,10 +119,13 @@ public final class NGramTokenizer extends Tokenizer {
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
private void init(Version version, int minGram, int maxGram) {
if (!version.onOrAfter(Version.LUCENE_44)) {
private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
if (!edgesOnly && !version.onOrAfter(Version.LUCENE_44)) {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
}
charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
@ -109,64 +134,104 @@ public final class NGramTokenizer extends Tokenizer {
}
this.minGram = minGram;
this.maxGram = maxGram;
buffer = new char[maxGram + 1024];
this.edgesOnly = edgesOnly;
charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
buffer = new int[charBuffer.getBuffer().length];
// Make the term att large enough
termAtt.resizeBuffer(2 * maxGram);
}
/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
public final boolean incrementToken() throws IOException {
clearAttributes();
// compact
if (bufferStart >= buffer.length - maxGram) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
bufferStart = 0;
// termination of this loop is guaranteed by the fact that every iteration
// either advances the buffer (calls consume()) or increases gramSize
while (true) {
// compact
if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
lastCheckedChar -= bufferStart;
lastNonTokenChar -= bufferStart;
bufferStart = 0;
// fill in remaining space
if (!exhausted) {
// TODO: refactor to a shared readFully
while (bufferEnd < buffer.length) {
final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
if (read == -1) {
exhausted = true;
break;
}
bufferEnd += read;
// fill in remaining space
exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
// convert to code points
bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
}
// should we go to the next offset?
if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
if (bufferStart + 1 + minGram > bufferEnd) {
assert exhausted;
return false;
}
consume();
gramSize = minGram;
}
updateLastNonTokenChar();
// retry if the token to be emitted was going to not only contain token chars
final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
consume();
gramSize = minGram;
continue;
}
final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
termAtt.setLength(length);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
++gramSize;
return true;
}
}
private void updateLastNonTokenChar() {
final int termEnd = bufferStart + gramSize - 1;
if (termEnd > lastCheckedChar) {
for (int i = termEnd; i > lastCheckedChar; --i) {
if (!isTokenChar(buffer[i])) {
lastNonTokenChar = i;
break;
}
}
lastCheckedChar = termEnd;
}
}
// should we go to the next offset?
if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
bufferStart++;
offset++;
gramSize = minGram;
}
/** Consume one code point. */
private void consume() {
offset += Character.charCount(buffer[bufferStart++]);
}
// are there enough chars remaining?
if (bufferStart + gramSize > bufferEnd) {
return false;
}
termAtt.copyBuffer(buffer, bufferStart, gramSize);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
++gramSize;
/** Only collect characters which satisfy this condition. */
protected boolean isTokenChar(int chr) {
return true;
}
@Override
public void end() {
final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
public final void end() {
assert bufferStart <= bufferEnd;
int endOffset = offset;
for (int i = bufferStart; i < bufferEnd; ++i) {
endOffset += Character.charCount(buffer[i]);
}
endOffset = correctOffset(endOffset);
offsetAtt.setOffset(endOffset, endOffset);
}
@Override
public void reset() throws IOException {
public final void reset() throws IOException {
super.reset();
bufferStart = bufferEnd = buffer.length;
lastNonTokenChar = lastCheckedChar = bufferStart - 1;
offset = 0;
gramSize = minGram;
exhausted = false;
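
Because isTokenChar is now a protected hook, pre-tokenization works by subclassing, much like CharTokenizer. A minimal sketch under that assumption (the subclass name is hypothetical and not part of the commit):

import java.io.Reader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical subclass: grams are only built from runs of letters or digits,
// so they never span whitespace or punctuation.
public class AlnumNGramTokenizer extends NGramTokenizer {
  public AlnumNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
    super(version, input, minGram, maxGram);
  }

  @Override
  protected boolean isTokenChar(int chr) {
    return Character.isLetterOrDigit(chr);
  }
}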

View File

@ -57,7 +57,7 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
final char[] buffer = termAtt.buffer();
int length = termAtt.length();
for (int i = 0; i < length;) {
final int ch = Character.codePointAt(buffer, i);
final int ch = Character.codePointAt(buffer, i, length);
iOrAfter = (ch == LATIN_CAPITAL_LETTER_I ||
(iOrAfter && Character.getType(ch) == Character.NON_SPACING_MARK));
@ -100,7 +100,7 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
*/
private boolean isBeforeDot(char s[], int pos, int len) {
for (int i = pos; i < len;) {
final int ch = Character.codePointAt(s, i);
final int ch = Character.codePointAt(s, i, len);
if (Character.getType(ch) != Character.NON_SPACING_MARK)
return false;
if (ch == COMBINING_DOT_ABOVE)

View File

@ -262,7 +262,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
@ -282,7 +282,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, i);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}

View File

@ -100,7 +100,8 @@ public abstract class CharTokenizer extends Tokenizer {
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
if (ioBuffer.getLength() == 0) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0) {
break;
@ -113,7 +114,7 @@ public abstract class CharTokenizer extends Tokenizer {
bufferIndex = 0;
}
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
final int charCount = Character.charCount(c);
bufferIndex += charCount;

View File

@ -51,27 +51,6 @@ public abstract class CharacterUtils {
return JAVA_4;
}
/**
* Returns the code point at the given index of the char array.
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
* of {@link Character#codePointAt(char[], int)} as it would have been
* available on a Java 1.4 JVM or on a later virtual machine version.
*
* @param chars
* a character array
* @param offset
* the offset to the char values in the chars array to be converted
*
* @return the Unicode code point at the given index
* @throws NullPointerException
* - if the array is null.
* @throws IndexOutOfBoundsException
* - if the value offset is negative or not less than the length of
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset);
/**
* Returns the code point at the given index of the {@link CharSequence}.
* Depending on the {@link Version} passed to
@ -116,7 +95,10 @@ public abstract class CharacterUtils {
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset, final int limit);
/** Return the number of code points in <code>seq</code>. */
public abstract int codePointCount(CharSequence seq);
/**
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
* of the given bufferSize.
@ -140,53 +122,101 @@ public abstract class CharacterUtils {
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
public void toLowerCase(final char[] buffer, final int offset, final int limit) {
public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset >= 0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toLowerCase(
codePointAt(buffer, i)), buffer, i);
codePointAt(buffer, i, limit)), buffer, i);
}
}
/** Converts a sequence of Java characters to a sequence of unicode code points.
* @return the number of code points written to the destination buffer */
public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
int codePointCount = 0;
for (int i = 0; i < srcLen; ) {
final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
final int charCount = Character.charCount(cp);
dest[destOff + codePointCount++] = cp;
i += charCount;
}
return codePointCount;
}
/** Converts a sequence of unicode code points to a sequence of Java characters.
* @return the number of chars written to the destination buffer */
public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
int written = 0;
for (int i = 0; i < srcLen; ++i) {
written += Character.toChars(src[srcOff + i], dest, destOff + written);
}
return written;
}
/**
* Fills the {@link CharacterBuffer} with characters read from the given
* reader {@link Reader}. This method tries to read as many characters into
* the {@link CharacterBuffer} as possible, each call to fill will start
* filling the buffer from offset <code>0</code> up to the length of the size
* of the internal character array.
* reader {@link Reader}. This method tries to read <code>numChars</code>
* characters into the {@link CharacterBuffer}, each call to fill will start
* filling the buffer from offset <code>0</code> up to <code>numChars</code>.
* In case code points can span across 2 java characters, this method may
* only fill <code>numChars - 1</code> characters in order not to split in
* the middle of a surrogate pair, even if there are remaining characters in
* the {@link Reader}.
* <p>
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method implements
* supplementary character awareness when filling the given buffer. For all
* {@link Version} &gt; 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees
* {@link Version} &gt; 3.0 {@link #fill(CharacterBuffer, Reader, int)} guarantees
* that the given {@link CharacterBuffer} will never contain a high surrogate
* character as the last element in the buffer unless it is the last available
* character in the reader. In other words, high and low surrogate pairs will
* always be preserved across buffer borders.
* </p>
* <p>
* A return value of <code>false</code> means that this method call exhausted
* the reader, but there may be some chars which have been read, which can be
* verified by checking whether <code>buffer.getLength() &gt; 0</code>.
* </p>
*
* @param buffer
* the buffer to fill.
* @param reader
* the reader to read characters from.
* @return <code>true</code> if and only if no more characters are available
* in the reader, otherwise <code>false</code>.
* @param numChars
* the number of chars to read
* @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
* @throws IOException
* if the reader throws an {@link IOException}.
*/
public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException;
public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;
/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
return fill(buffer, reader, buffer.buffer.length);
}
/** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
* code points from <code>index</code>. */
public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);
static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}
private static final class Java5CharacterUtils extends CharacterUtils {
Java5CharacterUtils() {
}
@Override
public int codePointAt(final char[] chars, final int offset) {
return Character.codePointAt(chars, offset);
}
@Override
public int codePointAt(final CharSequence seq, final int offset) {
return Character.codePointAt(seq, offset);
@ -198,7 +228,11 @@ public abstract class CharacterUtils {
}
@Override
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
assert buffer.buffer.length >= 2;
if (numChars < 2 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
}
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
final int offset;
@ -206,47 +240,36 @@ public abstract class CharacterUtils {
// Install the previously saved ending high surrogate:
if (buffer.lastTrailingHighSurrogate != 0) {
charBuffer[0] = buffer.lastTrailingHighSurrogate;
buffer.lastTrailingHighSurrogate = 0;
offset = 1;
} else {
offset = 0;
}
final int read = reader.read(charBuffer,
offset,
charBuffer.length - offset);
if (read == -1) {
buffer.length = offset;
buffer.lastTrailingHighSurrogate = 0;
return offset != 0;
}
assert read > 0;
buffer.length = read + offset;
final int read = readFully(reader, charBuffer, offset, numChars - offset);
// If we read only a single char, and that char was a
// high surrogate, read again:
if (buffer.length == 1
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
final int read2 = reader.read(charBuffer,
1,
charBuffer.length - 1);
if (read2 == -1) {
// NOTE: mal-formed input (ended on a high
// surrogate)! Consumer must deal with it...
return true;
}
assert read2 > 0;
buffer.length += read2;
buffer.length = offset + read;
final boolean result = buffer.length == numChars;
if (buffer.length < numChars) {
// We failed to fill the buffer. Even if the last char is a high
// surrogate, there is nothing we can do
return result;
}
if (buffer.length > 1
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
} else {
buffer.lastTrailingHighSurrogate = 0;
}
return result;
}
return true;
@Override
public int codePointCount(CharSequence seq) {
return Character.codePointCount(seq, 0, seq.length());
}
@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
return Character.offsetByCodePoints(buf, start, count, index, offset);
}
}
@ -254,11 +277,6 @@ public abstract class CharacterUtils {
Java4CharacterUtils() {
}
@Override
public int codePointAt(final char[] chars, final int offset) {
return chars[offset];
}
@Override
public int codePointAt(final CharSequence seq, final int offset) {
return seq.charAt(offset);
@ -272,13 +290,31 @@ public abstract class CharacterUtils {
}
@Override
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
throws IOException {
assert buffer.buffer.length >= 1;
if (numChars < 1 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
}
buffer.offset = 0;
final int read = reader.read(buffer.buffer);
if(read == -1)
return false;
final int read = readFully(reader, buffer.buffer, 0, numChars);
buffer.length = read;
return true;
buffer.lastTrailingHighSurrogate = 0;
return read == numChars;
}
@Override
public int codePointCount(CharSequence seq) {
return seq.length();
}
@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
final int result = index + offset;
if (result < 0 || result > count) {
throw new IndexOutOfBoundsException();
}
return result;
}
}
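
A hedged sketch of the new CharacterUtils helpers shown above (the demo class and literal values are illustrative, not part of the commit):

import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

public class CharacterUtilsDemo {
  public static void main(String[] args) {
    CharacterUtils utils = CharacterUtils.getInstance(Version.LUCENE_44);
    String s = "a\uD801\uDC1Cb"; // 4 chars, 3 code points
    System.out.println(utils.codePointCount(s)); // 3

    // round-trip through the new int[] code point representation
    char[] chars = s.toCharArray();
    int[] codePoints = new int[chars.length];
    int cpCount = utils.toCodePoints(chars, 0, chars.length, codePoints, 0); // 3
    char[] restored = new char[chars.length];
    int charCount = utils.toChars(codePoints, 0, cpCount, restored, 0); // 4
    System.out.println(cpCount + " code points, " + charCount + " chars");
  }
}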

View File

@ -170,8 +170,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
int length = highSurEndingLower.length();
assertEquals('\ud801', termBuffer[length - 1]);
assertEquals('\udc3e', termBuffer[length]);
}
public void testLowerCaseTokenizer() throws IOException {

View File

@ -78,7 +78,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
char[] charArray = randomRealisticUnicodeString.toCharArray();
StringBuilder builder = new StringBuilder();
for (int j = 0; j < charArray.length;) {
int cp = Character.codePointAt(charArray, j);
int cp = Character.codePointAt(charArray, j, charArray.length);
if (!Character.isWhitespace(cp)) {
builder.appendCodePoint(cp);
}

View File

@ -32,8 +32,10 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
/**
* Tests {@link EdgeNGramTokenFilter} for correctness.
@ -192,9 +194,9 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
}
public void testGraphs() throws IOException {
TokenStream tk = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
tk = new ShingleFilter(tk);
tk = new EdgeNGramTokenFilter(Version.LUCENE_44, tk, 7, 10);
tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
tk.reset();
assertTokenStreamContents(tk,
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
@ -205,4 +207,25 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
23
);
}
public void testSupplementaryCharacters() throws IOException {
final String s = _TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = _TestUtil.nextInt(random(), 1, 3);
final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
TokenStream tk = new KeywordTokenizer(new StringReader(s));
tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, i);
assertEquals(s.substring(0, end), termAtt.toString());
}
assertFalse(tk.incrementToken());
}
}

View File

@ -21,15 +21,15 @@ package org.apache.lucene.analysis.ngram;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util._TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
/**
* Tests {@link EdgeNGramTokenizer} for correctness.
*/
@ -120,25 +120,60 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
false);
}
private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {
final String s = RandomStrings.randomAsciiOfLength(random(), length);
testNGrams(minGram, maxGram, s, nonTokenChars);
}
private static void testNGrams(int minGram, int maxGram, String s, String nonTokenChars) throws IOException {
NGramTokenizerTest.testNGrams(minGram, maxGram, s, nonTokenChars, true);
}
public void testLargeInput() throws IOException {
final String input = _TestUtil.randomSimpleString(random(), 1024 * 5);
final int minGram = _TestUtil.nextInt(random(), 1, 1024);
final int maxGram = _TestUtil.nextInt(random(), minGram, 5 * 1024);
EdgeNGramTokenizer tk = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader(input), minGram, maxGram);
final CharTermAttribute charTermAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
final PositionIncrementAttribute posIncAtt = tk.addAttribute(PositionIncrementAttribute.class);
tk.reset();
for (int i = minGram; i <= maxGram && i <= input.length(); ++i) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(i, offsetAtt.endOffset());
assertEquals(1, posIncAtt.getPositionIncrement());
assertEquals(input.substring(0, i), charTermAtt.toString());
// test sliding
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}
public void testLargeMaxGram() throws IOException {
// test sliding with maxGram > 1024
final int minGram = _TestUtil.nextInt(random(), 1290, 1300);
final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}
public void testPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "a");
}
public void testHeavyPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "abcdef");
}
public void testFewTokenChars() throws IOException {
final char[] chrs = new char[_TestUtil.nextInt(random(), 4000, 5000)];
Arrays.fill(chrs, ' ');
for (int i = 0; i < chrs.length; ++i) {
if (random().nextFloat() < 0.1) {
chrs[i] = 'a';
}
}
assertFalse(tk.incrementToken());
tk.end();
assertEquals(input.length(), offsetAtt.startOffset());
final int minGram = _TestUtil.nextInt(random(), 1, 2);
final int maxGram = _TestUtil.nextInt(random(), minGram, 2);
testNGrams(minGram, maxGram, new String(chrs), " ");
}
public void testFullUTF8Range() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
final String s = _TestUtil.randomUnicodeString(random(), 4 * 1024);
testNGrams(minGram, maxGram, s, "");
testNGrams(minGram, maxGram, s, "abcdef");
}
}

View File

@ -26,7 +26,10 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
import java.io.IOException;
import java.io.Reader;
@ -177,4 +180,27 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
);
}
public void testSupplementaryCharacters() throws IOException {
final String s = _TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = _TestUtil.nextInt(random(), 1, 3);
final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
TokenStream tk = new KeywordTokenizer(new StringReader(s));
tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int start = 0; start < codePointCount; ++start) {
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int startIndex = Character.offsetByCodePoints(s, 0, start);
final int endIndex = Character.offsetByCodePoints(s, 0, end);
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
}
}
assertFalse(tk.incrementToken());
}
}

View File

@ -18,9 +18,12 @@ package org.apache.lucene.analysis.ngram;
*/
import static org.apache.lucene.analysis.ngram.NGramTokenizerTest.isTokenChar;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -115,23 +118,74 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
}
private void testNGrams(int minGram, int maxGram, int length) throws IOException {
private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {
final String s = RandomStrings.randomAsciiOfLength(random(), length);
final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram);
testNGrams(minGram, maxGram, s, nonTokenChars);
}
private static void testNGrams(int minGram, int maxGram, String s, String nonTokenChars) throws IOException {
testNGrams(minGram, maxGram, s, nonTokenChars, false);
}
static int[] toCodePoints(CharSequence s) {
final int[] codePoints = new int[Character.codePointCount(s, 0, s.length())];
for (int i = 0, j = 0; i < s.length(); ++j) {
codePoints[j] = Character.codePointAt(s, i);
i += Character.charCount(codePoints[j]);
}
return codePoints;
}
static boolean isTokenChar(String nonTokenChars, int codePoint) {
for (int i = 0; i < nonTokenChars.length(); ) {
final int cp = nonTokenChars.codePointAt(i);
if (cp == codePoint) {
return false;
}
i += Character.charCount(cp);
}
return true;
}
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
// convert the string to code points
final int[] codePoints = toCodePoints(s);
final int[] offsets = new int[codePoints.length + 1];
for (int i = 0; i < codePoints.length; ++i) {
offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
}
final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram, edgesOnly) {
@Override
protected boolean isTokenChar(int chr) {
return nonTokenChars.indexOf(chr) < 0;
}
};
final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
grams.reset();
for (int start = 0; start < s.length(); ++start) {
for (int end = start + minGram; end <= start + maxGram && end <= s.length(); ++end) {
for (int start = 0; start < codePoints.length; ++start) {
nextGram:
for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
// not on an edge
continue nextGram;
}
for (int j = start; j < end; ++j) {
if (!isTokenChar(nonTokenChars, codePoints[j])) {
continue nextGram;
}
}
assertTrue(grams.incrementToken());
assertEquals(s.substring(start, end), termAtt.toString());
assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
assertEquals(1, posIncAtt.getPositionIncrement());
assertEquals(start, offsetAtt.startOffset());
assertEquals(end, offsetAtt.endOffset());
assertEquals(1, posLenAtt.getPositionLength());
assertEquals(offsets[start], offsetAtt.startOffset());
assertEquals(offsets[end], offsetAtt.endOffset());
}
}
assertFalse(grams.incrementToken());
grams.end();
assertEquals(s.length(), offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
@ -141,14 +195,47 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
// test sliding
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}
public void testLargeMaxGram() throws IOException {
// test sliding with maxGram > 1024
final int minGram = _TestUtil.nextInt(random(), 1200, 1300);
final int minGram = _TestUtil.nextInt(random(), 1290, 1300);
final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}
public void testPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "a");
}
public void testHeavyPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "abcdef");
}
public void testFewTokenChars() throws IOException {
final char[] chrs = new char[_TestUtil.nextInt(random(), 4000, 5000)];
Arrays.fill(chrs, ' ');
for (int i = 0; i < chrs.length; ++i) {
if (random().nextFloat() < 0.1) {
chrs[i] = 'a';
}
}
final int minGram = _TestUtil.nextInt(random(), 1, 2);
final int maxGram = _TestUtil.nextInt(random(), minGram, 2);
testNGrams(minGram, maxGram, new String(chrs), " ");
}
public void testFullUTF8Range() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
final String s = _TestUtil.randomUnicodeString(random(), 4 * 1024);
testNGrams(minGram, maxGram, s, "");
testNGrams(minGram, maxGram, s, "abcdef");
}
}

View File

@ -20,10 +20,13 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
import org.junit.Test;
/**
@ -31,32 +34,6 @@ import org.junit.Test;
*/
public class TestCharacterUtils extends LuceneTestCase {
@Test
public void testCodePointAtCharArrayInt() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
try {
java4.codePointAt(highSurrogateAt3, 4);
fail("array index out of bounds");
} catch (IndexOutOfBoundsException e) {
}
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
cpAt3, 3));
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
try {
java5.codePointAt(highSurrogateAt3, 4);
fail("array index out of bounds");
} catch (IndexOutOfBoundsException e) {
}
}
@Test
public void testCodePointAtCharSequenceInt() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
@ -98,7 +75,68 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
cpAt3, 3, 5));
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4));
}
@Test
public void testCodePointCount() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
final String s = _TestUtil.randomUnicodeString(random());
assertEquals(s.length(), java4.codePointCount(s));
assertEquals(Character.codePointCount(s, 0, s.length()), java5.codePointCount(s));
}
@Test
public void testOffsetByCodePoint() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
for (int i = 0; i < 10; ++i) {
final char[] s = _TestUtil.randomUnicodeString(random()).toCharArray();
final int index = _TestUtil.nextInt(random(), 0, s.length);
final int offset = random().nextInt(7) - 3;
try {
final int o = java4.offsetByCodePoints(s, 0, s.length, index, offset);
assertEquals(o, index + offset);
} catch (IndexOutOfBoundsException e) {
assertTrue((index + offset) < 0 || (index + offset) > s.length);
}
int o;
try {
o = java5.offsetByCodePoints(s, 0, s.length, index, offset);
} catch (IndexOutOfBoundsException e) {
try {
Character.offsetByCodePoints(s, 0, s.length, index, offset);
fail();
} catch (IndexOutOfBoundsException e2) {
// OK
}
o = -1;
}
if (o >= 0) {
assertEquals(Character.offsetByCodePoints(s, 0, s.length, index, offset), o);
}
}
}
public void testConversions() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
testConversions(java4);
testConversions(java5);
}
private void testConversions(CharacterUtils charUtils) {
final char[] orig = _TestUtil.randomUnicodeString(random(), 100).toCharArray();
final int[] buf = new int[orig.length];
final char[] restored = new char[buf.length];
final int o1 = random().nextInt(5);
final int o2 = _TestUtil.nextInt(random(), 0, o1);
final int o3 = _TestUtil.nextInt(random(), 0, o1);
final int codePointCount = charUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
final int charCount = charUtils.toChars(buf, o2, codePointCount, restored, o3);
assertEquals(orig.length - o1, charCount);
assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
}
@Test
@ -132,7 +170,7 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(0, buffer.getOffset());
assertEquals(6, buffer.getLength());
assertEquals("hellow", new String(buffer.getBuffer()));
assertTrue(instance.fill(buffer,reader));
assertFalse(instance.fill(buffer,reader));
assertEquals(4, buffer.getLength());
assertEquals(0, buffer.getOffset());
@ -159,15 +197,12 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(4, buffer.getLength());
assertEquals("123\ud801", new String(buffer.getBuffer(),
buffer.getOffset(), buffer.getLength()));
assertTrue(instance.fill(buffer, reader));
assertEquals(2, buffer.getLength());
assertEquals("\ud801\udc1c", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertTrue(instance.fill(buffer, reader));
assertEquals(1, buffer.getLength());
assertEquals("\ud801", new String(buffer.getBuffer(), buffer
assertFalse(instance.fill(buffer, reader));
assertEquals(3, buffer.getLength());
assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertFalse(instance.fill(buffer, reader));
assertEquals(0, buffer.getLength());
}
@Test

View File

@ -183,7 +183,10 @@
<forbidden-apis internalRuntimeForbidden="true" classpathref="forbidden-apis.classpath">
<bundledSignatures name="jdk-unsafe-${javac.target}"/>
<bundledSignatures name="jdk-deprecated-${javac.target}"/>
<signaturesFileSet file="${common.dir}/tools/forbiddenApis/executors.txt"/>
<signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
<include name="executors.txt" />
<include name="chars.txt" />
</signaturesFileSet>
<fileset dir="${basedir}/build" includes="**/*.class" />
</forbidden-apis>
</target>

View File

@ -808,7 +808,7 @@ public final class Util {
final int charLimit = offset + length;
while(charIdx < charLimit) {
scratch.grow(intIdx+1);
final int utf32 = Character.codePointAt(s, charIdx);
final int utf32 = Character.codePointAt(s, charIdx, charLimit);
scratch.ints[intIdx] = utf32;
charIdx += Character.charCount(utf32);
intIdx++;

View File

@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
java.lang.Character#codePointBefore(char[],int) @ Implicit start offset is error-prone when the char[] is a buffer and the first chars are random chars
java.lang.Character#codePointAt(char[],int) @ Implicit end offset is error-prone when the char[] is a buffer and the last chars are random chars
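
To illustrate why the implicit-end-offset overload is banned, here is a hedged sketch (not part of the commit) of how it can combine a valid high surrogate with stale buffer contents, while the explicit-limit overload stops at the valid length:

char[] buffer = {'a', 'b', '\uD801', '\uDC1C'}; // index 3 holds a stale low surrogate
int validLen = 3;                               // only the first 3 chars are current
// banned: reads past the valid region and returns U+1041C by combining with the stale char
int wrong = Character.codePointAt(buffer, 2);
// safe: stops at validLen and returns the lone high surrogate U+D801
int right = Character.codePointAt(buffer, 2, validLen);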

View File

@ -266,6 +266,7 @@
<bundledSignatures name="commons-io-unsafe-${commons-io.version}"/>
<signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
<include name="executors.txt" />
<include name="chars.txt" />
<include name="servlet-api.txt" />
</signaturesFileSet>
<fileset dir="${basedir}/build">