mirror of https://github.com/apache/lucene.git
LUCENE-5042: Fix the n-gram tokenizers and filters.
This commit fixes n-gram tokenizers and filters so that they handle supplementary characters correctly and adds the ability to pre-tokenize the stream in tokenizers. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1492185 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a4d58b6f22
commit
e021451d6c
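
For a feel of the pre-tokenization ability this commit adds, here is a minimal usage sketch (not part of the commit itself); it relies only on the public NGramTokenizer(Version, Reader, int, int) constructor and the protected isTokenChar(int) hook introduced in the diff below, and the expected output follows from the new incrementToken() loop:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class PreTokenizationSketch {
  public static void main(String[] args) throws IOException {
    // 2- and 3-code-point grams, computed per letter run instead of across the whole input
    TokenStream ts = new NGramTokenizer(Version.LUCENE_44, new StringReader("foo bar"), 2, 3) {
      @Override
      protected boolean isTokenChar(int chr) {
        return Character.isLetter(chr); // pre-tokenize: grams never cross a non-letter
      }
    };
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term); // expected: fo, foo, oo, ba, bar, ar
    }
    ts.end();
    ts.close();
  }
}
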
@@ -47,6 +47,10 @@ Changes in backwards compatibility policy
  (a, ab, b, bc, c) instead of (a, b, c, ab, bc) and doesn't trim trailing
  whitespaces. (Adrien Grand)

* LUCENE-5042: The n-gram and edge n-gram tokenizers and filters now correctly
  handle supplementary characters, and the tokenizers have the ability to
  pre-tokenize the input stream similarly to CharTokenizer. (Adrien Grand)

* LUCENE-4967: NRTManager is replaced by
  ControlledRealTimeReopenThread, for controlling which requests must
  see which indexing changes, so that it can work with any
@@ -57,7 +57,7 @@ public final class GreekLowerCaseFilter extends TokenFilter {
int chLen = termAtt.length();
for (int i = 0; i < chLen;) {
i += Character.toChars(
lowerCase(charUtils.codePointAt(chArray, i)), chArray, i);
lowerCase(charUtils.codePointAt(chArray, i, chLen)), chArray, i);
}
return true;
} else {
@@ -25,21 +25,26 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**
* Tokenizes the given token into n-grams of given size(s).
* <p>
* This {@link TokenFilter} create n-grams from the beginning edge of a input token.
* <p><a name="match_version" />As of Lucene 4.4, this filter handles correctly
* supplementary characters.
*/
public final class EdgeNGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;

private final CharacterUtils charUtils;
private final int minGram;
private final int maxGram;
private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int tokStart;
private int tokEnd; // only used if the length changed before this filter

@@ -74,6 +79,9 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}

this.charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
this.minGram = minGram;
this.maxGram = maxGram;
}

@@ -87,6 +95,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();

@@ -95,7 +104,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
}
}
if (curGramSize <= maxGram) { // if we have hit the end of our n-gram size range, quit
if (curGramSize <= curTermLength) { // if the remaining input is too short, we can't generate any n-grams
if (curGramSize <= curCodePointCount) { // if the remaining input is too short, we can't generate any n-grams
// grab gramSize chars from front or back
clearAttributes();
offsetAtt.setOffset(tokStart, tokEnd);

@@ -107,7 +116,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
posIncrAtt.setPositionIncrement(0);
}
posLenAtt.setPositionLength(savePosLen);
termAtt.copyBuffer(curTermBuffer, 0, curGramSize);
final int charLength = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
termAtt.copyBuffer(curTermBuffer, 0, charLength);
curGramSize++;
return true;
}
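
To illustrate the curCodePointCount / offsetByCodePoints change above: gram sizes are now counted in code points, so a surrogate pair is never split. A hedged sketch (not from the commit), with the expected terms derived from the new copy logic:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class EdgeNGramSupplementarySketch {
  public static void main(String[] args) throws IOException {
    // "\uD801\uDC37ab" is 3 code points but 4 chars; the first code point is supplementary
    TokenStream ts = new KeywordTokenizer(new StringReader("\uD801\uDC37ab"));
    ts = new EdgeNGramTokenFilter(Version.LUCENE_44, ts, 1, 2);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // expected: "\uD801\uDC37" then "\uD801\uDC37a" -- never a lone high surrogate
      System.out.println(term);
    }
    ts.end();
    ts.close();
  }
}
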
@@ -17,37 +17,23 @@ package org.apache.lucene.analysis.ngram;
* limitations under the License.
*/

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Version;

/**
* Tokenizes the input from an edge into n-grams of given size(s).
* <p>
* This {@link Tokenizer} create n-grams from the beginning edge of a input token.
* <p><a name="match_version" />As of Lucene 4.4, this class supports
* {@link #isTokenChar(int) pre-tokenization} and correctly handles
* supplementary characters.
*/
public final class EdgeNGramTokenizer extends Tokenizer {
public class EdgeNGramTokenizer extends NGramTokenizer {
public static final int DEFAULT_MAX_GRAM_SIZE = 1;
public static final int DEFAULT_MIN_GRAM_SIZE = 1;

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

private int minGram;
private int maxGram;
private int gramSize;
private boolean started;
private int inLen; // length of the input AFTER trim()
private int charsRead; // length of the input
private String inStr;

/**
* Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
*

@@ -57,8 +43,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(input);
init(version, minGram, maxGram);
super(version, input, minGram, maxGram, true);
}

/**

@@ -71,102 +56,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(version, minGram, maxGram);
super(version, factory, input, minGram, maxGram, true);
}

private void init(Version version, int minGram, int maxGram) {
if (version == null) {
throw new IllegalArgumentException("version must not be null");
}

if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}

if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}

this.minGram = minGram;
this.maxGram = maxGram;
}

/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
// if we are just starting, read the whole input
if (!started) {
started = true;
gramSize = minGram;
char[] chars = new char[Math.min(1024, maxGram)];
charsRead = 0;
// TODO: refactor to a shared readFully somewhere:
boolean exhausted = false;
while (charsRead < maxGram) {
final int inc = input.read(chars, charsRead, chars.length-charsRead);
if (inc == -1) {
exhausted = true;
break;
}
charsRead += inc;
if (charsRead == chars.length && charsRead < maxGram) {
chars = ArrayUtil.grow(chars);
}
}

inStr = new String(chars, 0, charsRead);

if (!exhausted) {
// Read extra throwaway chars so that on end() we
// report the correct offset:
char[] throwaway = new char[1024];
while(true) {
final int inc = input.read(throwaway, 0, throwaway.length);
if (inc == -1) {
break;
}
charsRead += inc;
}
}

inLen = inStr.length();
if (inLen == 0) {
return false;
}
posIncrAtt.setPositionIncrement(1);
} else {
posIncrAtt.setPositionIncrement(1);
}

// if the remaining input is too short, we can't generate any n-grams
if (gramSize > inLen) {
return false;
}

// if we have hit the end of our n-gram size range, quit
if (gramSize > maxGram || gramSize > inLen) {
return false;
}

// grab gramSize chars from front or back
termAtt.setEmpty().append(inStr, 0, gramSize);
offsetAtt.setOffset(correctOffset(0), correctOffset(gramSize));
gramSize++;
return true;
}

@Override
public void end() {
// set final offset
final int finalOffset = correctOffset(charsRead);
this.offsetAtt.setOffset(finalOffset, finalOffset);
}

@Override
public void reset() throws IOException {
super.reset();
started = false;
}
}
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**

@@ -33,6 +34,7 @@ import org.apache.lucene.util.Version;
* <a name="version"/>
* <p>You must specify the required {@link Version} compatibility when
* creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filters:<ul>
* <li>handles supplementary characters correctly,</li>
* <li>emits all n-grams for the same token at the same position,</li>
* <li>does not modify offsets,</li>
* <li>sorts n-grams by their offset in the original token first, then

@@ -42,6 +44,10 @@ import org.apache.lucene.util.Version;
* {@link Version#LUCENE_44} in the constructor but this is not recommended as
* it will lead to broken {@link TokenStream}s that will cause highlighting
* bugs.
* <p>If you were using this {@link TokenFilter} to perform partial highlighting,
* this won't work anymore since this filter doesn't update offsets. You should
* modify your analysis chain to use {@link NGramTokenizer}, and potentially
* override {@link NGramTokenizer#isTokenChar(int)} to perform pre-tokenization.
*/
public final class NGramTokenFilter extends TokenFilter {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;

@@ -51,6 +57,7 @@ public final class NGramTokenFilter extends TokenFilter {

private char[] curTermBuffer;
private int curTermLength;
private int curCodePointCount;
private int curGramSize;
private int curPos;
private int curPosInc, curPosLen;

@@ -59,6 +66,7 @@ public final class NGramTokenFilter extends TokenFilter {
private boolean hasIllegalOffsets; // only if the length changed before this filter

private final Version version;
private final CharacterUtils charUtils;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt;
private final PositionLengthAttribute posLenAtt;

@@ -75,6 +83,9 @@ public final class NGramTokenFilter extends TokenFilter {
public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
super(new LengthFilter(version, input, minGram, Integer.MAX_VALUE));
this.version = version;
this.charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}

@@ -126,6 +137,7 @@ public final class NGramTokenFilter extends TokenFilter {
} else {
curTermBuffer = termAtt.buffer().clone();
curTermLength = termAtt.length();
curCodePointCount = charUtils.codePointCount(termAtt);
curGramSize = minGram;
curPos = 0;
curPosInc = posIncAtt.getPositionIncrement();

@@ -138,13 +150,15 @@ public final class NGramTokenFilter extends TokenFilter {
}
}
if (version.onOrAfter(Version.LUCENE_44)) {
if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) {
++curPos;
curGramSize = minGram;
}
if (curPos + curGramSize <= curTermLength) {
if ((curPos + curGramSize) <= curCodePointCount) {
clearAttributes();
termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
termAtt.copyBuffer(curTermBuffer, start, end - start);
posIncAtt.setPositionIncrement(curPosInc);
curPosInc = 0;
posLenAtt.setPositionLength(curPosLen);
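
The start/end computation introduced above can be mirrored with plain java.lang.Character calls; a small sketch with hypothetical values, showing how a gram of curGramSize code points at code-point position curPos maps back to char offsets without splitting a surrogate pair:

char[] buf = "a\uD801\uDC37b".toCharArray();  // a, U+10437, b -> 3 code points, 4 chars
int curPos = 1, curGramSize = 2;              // the gram covering code points 1..2
int start = Character.offsetByCodePoints(buf, 0, buf.length, 0, curPos);        // 1
int end = Character.offsetByCodePoints(buf, 0, buf.length, start, curGramSize); // 4
System.out.println(new String(buf, start, end - start));                        // prints the 2-code-point gram
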
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

/**

@@ -40,29 +41,47 @@ import org.apache.lucene.util.Version;
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
* </table>
* <a name="version"/>
* <p>Before Lucene 4.4, this class had a different behavior:<ul>
* <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
* <li>The last whitespaces of the 1024 chars block were trimmed.</li>
* <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
* <p>Although highly discouraged, it is still possible to use the old behavior
* through {@link Lucene43NGramTokenizer}.
* <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
* <li>tokenize in a streaming fashion to support streams which are larger
* than 1024 chars (limit of the previous version),
* <li>count grams based on unicode code points instead of java chars (and
* never split in the middle of surrogate pairs),
* <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
* before computing n-grams.</ul>
* <p>Additionally, this class doesn't trim trailing whitespaces and emits
* tokens in a different order, tokens are now emitted by increasing start
* offsets while they used to be emitted by increasing lengths (which prevented
* from supporting large input streams).
* <p>Although <b style="color:red">highly</b> discouraged, it is still possible
* to use the old behavior through {@link Lucene43NGramTokenizer}.
*/
public final class NGramTokenizer extends Tokenizer {
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

private char[] buffer;
private int bufferStart, bufferEnd; // remaining slice of the buffer
private CharacterUtils charUtils;
private CharacterUtils.CharacterBuffer charBuffer;
private int[] buffer; // like charBuffer, but converted to code points
private int bufferStart, bufferEnd; // remaining slice in buffer
private int offset;
private int gramSize;
private int minGram, maxGram;
private boolean exhausted;
private int lastCheckedChar; // last offset in the buffer that we checked
private int lastNonTokenChar; // last offset that we found to not be a token char
private boolean edgesOnly; // leading edges n-grams only

private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

NGramTokenizer(Version version, Reader input, int minGram, int maxGram, boolean edgesOnly) {
super(input);
init(version, minGram, maxGram, edgesOnly);
}

/**
* Creates NGramTokenizer with given min and max n-grams.
* @param version the lucene compatibility <a href="#version">version</a>

@@ -71,8 +90,12 @@ public final class NGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(input);
init(version, minGram, maxGram);
this(version, input, minGram, maxGram, false);
}

NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram, boolean edgesOnly) {
super(factory, input);
init(version, minGram, maxGram, edgesOnly);
}

/**

@@ -84,8 +107,7 @@ public final class NGramTokenizer extends Tokenizer {
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
super(factory, input);
init(version, minGram, maxGram);
this(version, factory, input, minGram, maxGram, false);
}

/**

@@ -97,10 +119,13 @@ public final class NGramTokenizer extends Tokenizer {
this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}

private void init(Version version, int minGram, int maxGram) {
if (!version.onOrAfter(Version.LUCENE_44)) {
private void init(Version version, int minGram, int maxGram, boolean edgesOnly) {
if (!edgesOnly && !version.onOrAfter(Version.LUCENE_44)) {
throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
}
charUtils = version.onOrAfter(Version.LUCENE_44)
? CharacterUtils.getInstance(version)
: CharacterUtils.getJava4Instance();
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}

@@ -109,64 +134,104 @@ public final class NGramTokenizer extends Tokenizer {
}
this.minGram = minGram;
this.maxGram = maxGram;
buffer = new char[maxGram + 1024];
this.edgesOnly = edgesOnly;
charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
buffer = new int[charBuffer.getBuffer().length];
// Make the term att large enough
termAtt.resizeBuffer(2 * maxGram);
}

/** Returns the next token in the stream, or null at EOS. */
@Override
public boolean incrementToken() throws IOException {
public final boolean incrementToken() throws IOException {
clearAttributes();

// compact
if (bufferStart >= buffer.length - maxGram) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
bufferStart = 0;
// termination of this loop is guaranteed by the fact that every iteration
// either advances the buffer (calls consumes()) or increases gramSize
while (true) {
// compact
if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
lastCheckedChar -= bufferStart;
lastNonTokenChar -= bufferStart;
bufferStart = 0;

// fill in remaining space
if (!exhausted) {
// TODO: refactor to a shared readFully
while (bufferEnd < buffer.length) {
final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
if (read == -1) {
exhausted = true;
break;
}
bufferEnd += read;
// fill in remaining space
exhausted = !charUtils.fill(charBuffer, input, buffer.length - bufferEnd);
// convert to code points
bufferEnd += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
}

// should we go to the next offset?
if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
if (bufferStart + 1 + minGram > bufferEnd) {
assert exhausted;
return false;
}
consume();
gramSize = minGram;
}

updateLastNonTokenChar();

// retry if the token to be emitted was going to not only contain token chars
final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
consume();
gramSize = minGram;
continue;
}

final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
termAtt.setLength(length);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
++gramSize;
return true;
}
}

private void updateLastNonTokenChar() {
final int termEnd = bufferStart + gramSize - 1;
if (termEnd > lastCheckedChar) {
for (int i = termEnd; i > lastCheckedChar; --i) {
if (!isTokenChar(buffer[i])) {
lastNonTokenChar = i;
break;
}
}
lastCheckedChar = termEnd;
}
}

// should we go to the next offset?
if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
bufferStart++;
offset++;
gramSize = minGram;
}
/** Consume one code point. */
private void consume() {
offset += Character.charCount(buffer[bufferStart++]);
}

// are there enough chars remaining?
if (bufferStart + gramSize > bufferEnd) {
return false;
}

termAtt.copyBuffer(buffer, bufferStart, gramSize);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
++gramSize;
/** Only collect characters which satisfy this condition. */
protected boolean isTokenChar(int chr) {
return true;
}

@Override
public void end() {
final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
public final void end() {
assert bufferStart <= bufferEnd;
int endOffset = offset;
for (int i = bufferStart; i < bufferEnd; ++i) {
endOffset += Character.charCount(buffer[i]);
}
endOffset = correctOffset(endOffset);
offsetAtt.setOffset(endOffset, endOffset);
}

@Override
public void reset() throws IOException {
public final void reset() throws IOException {
super.reset();
bufferStart = bufferEnd = buffer.length;
lastNonTokenChar = lastCheckedChar = bufferStart - 1;
offset = 0;
gramSize = minGram;
exhausted = false;
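
The loop above emits grams by increasing start offset and, within one start offset, by increasing size, which is exactly the ordering change recorded in the CHANGES.txt entry at the top of this commit ((a, ab, b, bc, c) instead of (a, b, c, ab, bc)). A minimal sketch (not part of the commit) that makes the order and offsets visible:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class NGramOrderSketch {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new NGramTokenizer(Version.LUCENE_44, new StringReader("abc"), 1, 2);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // expected, per the CHANGES.txt entry: a[0,1] ab[0,2] b[1,2] bc[1,3] c[2,3]
      System.out.println(term + "[" + offset.startOffset() + "," + offset.endOffset() + "]");
    }
    ts.end();
    ts.close();
  }
}
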
@@ -57,7 +57,7 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
final char[] buffer = termAtt.buffer();
int length = termAtt.length();
for (int i = 0; i < length;) {
final int ch = Character.codePointAt(buffer, i);
final int ch = Character.codePointAt(buffer, i, length);

iOrAfter = (ch == LATIN_CAPITAL_LETTER_I ||
(iOrAfter && Character.getType(ch) == Character.NON_SPACING_MARK));

@@ -100,7 +100,7 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
*/
private boolean isBeforeDot(char s[], int pos, int len) {
for (int i = pos; i < len;) {
final int ch = Character.codePointAt(s, i);
final int ch = Character.codePointAt(s, i, len);
if (Character.getType(ch) != Character.NON_SPACING_MARK)
return false;
if (ch == COMBINING_DOT_ABOVE)
@@ -262,7 +262,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, off+i, limit);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}

@@ -282,7 +282,7 @@ public class CharArrayMap<V> extends AbstractMap<Object,V> {
if (ignoreCase) {
for(int i=0;i<len;) {
final int codePointAt = charUtils.codePointAt(text1, i);
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i))
if (Character.toLowerCase(codePointAt) != charUtils.codePointAt(text2, i, text2.length))
return false;
i += Character.charCount(codePointAt);
}
@@ -100,7 +100,8 @@ public abstract class CharTokenizer extends Tokenizer {
while (true) {
if (bufferIndex >= dataLen) {
offset += dataLen;
if(!charUtils.fill(ioBuffer, input)) { // read supplementary char aware with CharacterUtils
charUtils.fill(ioBuffer, input); // read supplementary char aware with CharacterUtils
if (ioBuffer.getLength() == 0) {
dataLen = 0; // so next offset += dataLen won't decrement offset
if (length > 0) {
break;

@@ -113,7 +114,7 @@ public abstract class CharTokenizer extends Tokenizer {
bufferIndex = 0;
}
// use CharacterUtils here to support < 3.1 UTF-16 code unit behavior if the char based methods are gone
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex);
final int c = charUtils.codePointAt(ioBuffer.getBuffer(), bufferIndex, ioBuffer.getLength());
final int charCount = Character.charCount(c);
bufferIndex += charCount;
@@ -51,27 +51,6 @@ public abstract class CharacterUtils {
return JAVA_4;
}

/**
* Returns the code point at the given index of the char array.
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
* of {@link Character#codePointAt(char[], int)} as it would have been
* available on a Java 1.4 JVM or on a later virtual machine version.
*
* @param chars
* a character array
* @param offset
* the offset to the char values in the chars array to be converted
*
* @return the Unicode code point at the given index
* @throws NullPointerException
* - if the array is null.
* @throws IndexOutOfBoundsException
* - if the value offset is negative or not less than the length of
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset);

/**
* Returns the code point at the given index of the {@link CharSequence}.
* Depending on the {@link Version} passed to

@@ -116,7 +95,10 @@ public abstract class CharacterUtils {
* the char array.
*/
public abstract int codePointAt(final char[] chars, final int offset, final int limit);

/** Return the number of characters in <code>seq</code>. */
public abstract int codePointCount(CharSequence seq);

/**
* Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
* of the given bufferSize.

@@ -140,53 +122,101 @@ public abstract class CharacterUtils {
* @param offset the offset to start at
* @param limit the max char in the buffer to lower case
*/
public void toLowerCase(final char[] buffer, final int offset, final int limit) {
public final void toLowerCase(final char[] buffer, final int offset, final int limit) {
assert buffer.length >= limit;
assert offset <=0 && offset <= buffer.length;
for (int i = offset; i < limit;) {
i += Character.toChars(
Character.toLowerCase(
codePointAt(buffer, i)), buffer, i);
codePointAt(buffer, i, limit)), buffer, i);
}
}

/** Converts a sequence of Java characters to a sequence of unicode code points.
* @return the number of code points written to the destination buffer */
public final int toCodePoints(char[] src, int srcOff, int srcLen, int[] dest, int destOff) {
int codePointCount = 0;
for (int i = 0; i < srcLen; ) {
final int cp = codePointAt(src, srcOff + i, srcOff + srcLen);
final int charCount = Character.charCount(cp);
dest[destOff + codePointCount++] = cp;
i += charCount;
}
return codePointCount;
}

/** Converts a sequence of unicode code points to a sequence of Java characters.
* @return the number of chars written to the destination buffer */
public final int toChars(int[] src, int srcOff, int srcLen, char[] dest, int destOff) {
int written = 0;
for (int i = 0; i < srcLen; ++i) {
written += Character.toChars(src[srcOff + i], dest, destOff + written);
}
return written;
}

/**
* Fills the {@link CharacterBuffer} with characters read from the given
* reader {@link Reader}. This method tries to read as many characters into
* the {@link CharacterBuffer} as possible, each call to fill will start
* filling the buffer from offset <code>0</code> up to the length of the size
* of the internal character array.
* reader {@link Reader}. This method tries to read <code>numChars</code>
* characters into the {@link CharacterBuffer}, each call to fill will start
* filling the buffer from offset <code>0</code> up to <code>numChars</code>.
* In case code points can span across 2 java characters, this method may
* only fill <code>numChars - 1</code> characters in order not to split in
* the middle of a surrogate pair, even if there are remaining characters in
* the {@link Reader}.
* <p>
* Depending on the {@link Version} passed to
* {@link CharacterUtils#getInstance(Version)} this method implements
* supplementary character awareness when filling the given buffer. For all
* {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees
* {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader, int)} guarantees
* that the given {@link CharacterBuffer} will never contain a high surrogate
* character as the last element in the buffer unless it is the last available
* character in the reader. In other words, high and low surrogate pairs will
* always be preserved across buffer boarders.
* </p>
* <p>
* A return value of <code>false</code> means that this method call exhausted
* the reader, but there may be some bytes which have been read, which can be
* verified by checking whether <code>buffer.getLength() > 0</code>.
* </p>
*
* @param buffer
* the buffer to fill.
* @param reader
* the reader to read characters from.
* @return <code>true</code> if and only if no more characters are available
* in the reader, otherwise <code>false</code>.
* @param numChars
* the number of chars to read
* @return <code>false</code> if and only if reader.read returned -1 while trying to fill the buffer
* @throws IOException
* if the reader throws an {@link IOException}.
*/
public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException;
public abstract boolean fill(CharacterBuffer buffer, Reader reader, int numChars) throws IOException;

/** Convenience method which calls <code>fill(buffer, reader, buffer.buffer.length)</code>. */
public final boolean fill(CharacterBuffer buffer, Reader reader) throws IOException {
return fill(buffer, reader, buffer.buffer.length);
}

/** Return the index within <code>buf[start:start+count]</code> which is by <code>offset</code>
* code points from <code>index</code>. */
public abstract int offsetByCodePoints(char[] buf, int start, int count, int index, int offset);

static int readFully(Reader reader, char[] dest, int offset, int len) throws IOException {
int read = 0;
while (read < len) {
final int r = reader.read(dest, offset + read, len - read);
if (r == -1) {
break;
}
read += r;
}
return read;
}

private static final class Java5CharacterUtils extends CharacterUtils {
Java5CharacterUtils() {
}

@Override
public int codePointAt(final char[] chars, final int offset) {
return Character.codePointAt(chars, offset);
}

@Override
public int codePointAt(final CharSequence seq, final int offset) {
return Character.codePointAt(seq, offset);

@@ -198,7 +228,11 @@ public abstract class CharacterUtils {
}

@Override
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
public boolean fill(final CharacterBuffer buffer, final Reader reader, int numChars) throws IOException {
assert buffer.buffer.length >= 2;
if (numChars < 2 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 2 and <= the buffer size");
}
final char[] charBuffer = buffer.buffer;
buffer.offset = 0;
final int offset;

@@ -206,47 +240,36 @@ public abstract class CharacterUtils {
// Install the previously saved ending high surrogate:
if (buffer.lastTrailingHighSurrogate != 0) {
charBuffer[0] = buffer.lastTrailingHighSurrogate;
buffer.lastTrailingHighSurrogate = 0;
offset = 1;
} else {
offset = 0;
}

final int read = reader.read(charBuffer,
offset,
charBuffer.length - offset);
if (read == -1) {
buffer.length = offset;
buffer.lastTrailingHighSurrogate = 0;
return offset != 0;
}
assert read > 0;
buffer.length = read + offset;
final int read = readFully(reader, charBuffer, offset, numChars - offset);

// If we read only a single char, and that char was a
// high surrogate, read again:
if (buffer.length == 1
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
final int read2 = reader.read(charBuffer,
1,
charBuffer.length - 1);
if (read2 == -1) {
// NOTE: mal-formed input (ended on a high
// surrogate)! Consumer must deal with it...
return true;
}
assert read2 > 0;

buffer.length += read2;
buffer.length = offset + read;
final boolean result = buffer.length == numChars;
if (buffer.length < numChars) {
// We failed to fill the buffer. Even if the last char is a high
// surrogate, there is nothing we can do
return result;
}

if (buffer.length > 1
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
if (Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
} else {
buffer.lastTrailingHighSurrogate = 0;
}
return result;
}

return true;
@Override
public int codePointCount(CharSequence seq) {
return Character.codePointCount(seq, 0, seq.length());
}

@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
return Character.offsetByCodePoints(buf, start, count, index, offset);
}
}

@@ -254,11 +277,6 @@ public abstract class CharacterUtils {
Java4CharacterUtils() {
}

@Override
public int codePointAt(final char[] chars, final int offset) {
return chars[offset];
}

@Override
public int codePointAt(final CharSequence seq, final int offset) {
return seq.charAt(offset);

@@ -272,13 +290,31 @@ public abstract class CharacterUtils {
}

@Override
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
public boolean fill(CharacterBuffer buffer, Reader reader, int numChars)
throws IOException {
assert buffer.buffer.length >= 1;
if (numChars < 1 || numChars > buffer.buffer.length) {
throw new IllegalArgumentException("numChars must be >= 1 and <= the buffer size");
}
buffer.offset = 0;
final int read = reader.read(buffer.buffer);
if(read == -1)
return false;
final int read = readFully(reader, buffer.buffer, 0, numChars);
buffer.length = read;
return true;
buffer.lastTrailingHighSurrogate = 0;
return read == numChars;
}

@Override
public int codePointCount(CharSequence seq) {
return seq.length();
}

@Override
public int offsetByCodePoints(char[] buf, int start, int count, int index, int offset) {
final int result = index + offset;
if (result < 0 || result > count) {
throw new IndexOutOfBoundsException();
}
return result;
}

}
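
A sketch (not from the commit) of how the new fill(CharacterBuffer, Reader, int) / toCodePoints(...) pair is meant to be driven, mirroring what NGramTokenizer does above; the buffer sizes and input are arbitrary:

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

public class FillSketch {
  public static void main(String[] args) throws IOException {
    CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_44);
    CharacterUtils.CharacterBuffer charBuffer = CharacterUtils.newCharacterBuffer(64);
    int[] codePoints = new int[64];
    Reader reader = new StringReader("ab\uD801\uDC37cd"); // 6 chars, 5 code points

    int len = 0;
    boolean exhausted = false;
    while (!exhausted) {
      // fill() never leaves a dangling high surrogate at the end of the buffer,
      // so toCodePoints() only ever sees complete code points
      exhausted = !charUtils.fill(charBuffer, reader, codePoints.length - len);
      len += charUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), codePoints, len);
    }
    System.out.println(len); // expected: 5
  }
}
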
@@ -170,8 +170,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
int length = highSurEndingLower.length();
assertEquals('\ud801', termBuffer[length - 1]);
assertEquals('\udc3e', termBuffer[length]);

}

public void testLowerCaseTokenizer() throws IOException {
@@ -78,7 +78,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
char[] charArray = randomRealisticUnicodeString.toCharArray();
StringBuilder builder = new StringBuilder();
for (int j = 0; j < charArray.length;) {
int cp = Character.codePointAt(charArray, j);
int cp = Character.codePointAt(charArray, j, charArray.length);
if (!Character.isWhitespace(cp)) {
builder.appendCodePoint(cp);
}
@@ -32,8 +32,10 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;

/**
* Tests {@link EdgeNGramTokenFilter} for correctness.

@@ -192,9 +194,9 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
}

public void testGraphs() throws IOException {
TokenStream tk = new LetterTokenizer(Version.LUCENE_44, new StringReader("abc d efgh ij klmno p q"));
TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
tk = new ShingleFilter(tk);
tk = new EdgeNGramTokenFilter(Version.LUCENE_44, tk, 7, 10);
tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
tk.reset();
assertTokenStreamContents(tk,
new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },

@@ -205,4 +207,25 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
23
);
}

public void testSupplementaryCharacters() throws IOException {
final String s = _TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = _TestUtil.nextInt(random(), 1, 3);
final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
TokenStream tk = new KeywordTokenizer(new StringReader(s));
tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int end = Character.offsetByCodePoints(s, 0, i);
assertEquals(s.substring(0, end), termAtt.toString());
}
assertFalse(tk.incrementToken());
}

}
@@ -21,15 +21,15 @@ package org.apache.lucene.analysis.ngram;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util._TestUtil;

import com.carrotsearch.randomizedtesting.generators.RandomStrings;

/**
* Tests {@link EdgeNGramTokenizer} for correctness.
*/

@@ -120,25 +120,60 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
false);
}

private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {
final String s = RandomStrings.randomAsciiOfLength(random(), length);
testNGrams(minGram, maxGram, s, nonTokenChars);
}

private static void testNGrams(int minGram, int maxGram, String s, String nonTokenChars) throws IOException {
NGramTokenizerTest.testNGrams(minGram, maxGram, s, nonTokenChars, true);
}

public void testLargeInput() throws IOException {
final String input = _TestUtil.randomSimpleString(random(), 1024 * 5);
final int minGram = _TestUtil.nextInt(random(), 1, 1024);
final int maxGram = _TestUtil.nextInt(random(), minGram, 5 * 1024);
EdgeNGramTokenizer tk = new EdgeNGramTokenizer(TEST_VERSION_CURRENT, new StringReader(input), minGram, maxGram);
final CharTermAttribute charTermAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
final PositionIncrementAttribute posIncAtt = tk.addAttribute(PositionIncrementAttribute.class);
tk.reset();
for (int i = minGram; i <= maxGram && i <= input.length(); ++i) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(i, offsetAtt.endOffset());
assertEquals(1, posIncAtt.getPositionIncrement());
assertEquals(input.substring(0, i), charTermAtt.toString());
// test sliding
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}

public void testLargeMaxGram() throws IOException {
// test sliding with maxGram > 1024
final int minGram = _TestUtil.nextInt(random(), 1290, 1300);
final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}

public void testPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "a");
}

public void testHeavyPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "abcdef");
}

public void testFewTokenChars() throws IOException {
final char[] chrs = new char[_TestUtil.nextInt(random(), 4000, 5000)];
Arrays.fill(chrs, ' ');
for (int i = 0; i < chrs.length; ++i) {
if (random().nextFloat() < 0.1) {
chrs[i] = 'a';
}
}
assertFalse(tk.incrementToken());
tk.end();
assertEquals(input.length(), offsetAtt.startOffset());
final int minGram = _TestUtil.nextInt(random(), 1, 2);
final int maxGram = _TestUtil.nextInt(random(), minGram, 2);
testNGrams(minGram, maxGram, new String(chrs), " ");
}

public void testFullUTF8Range() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
final String s = _TestUtil.randomUnicodeString(random(), 4 * 1024);
testNGrams(minGram, maxGram, s, "");
testNGrams(minGram, maxGram, s, "abcdef");
}

}
@@ -26,7 +26,10 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;

import java.io.IOException;
import java.io.Reader;

@@ -177,4 +180,27 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
);
}

public void testSupplementaryCharacters() throws IOException {
final String s = _TestUtil.randomUnicodeString(random(), 10);
final int codePointCount = s.codePointCount(0, s.length());
final int minGram = _TestUtil.nextInt(random(), 1, 3);
final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
TokenStream tk = new KeywordTokenizer(new StringReader(s));
tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
tk.reset();
for (int start = 0; start < codePointCount; ++start) {
for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
assertTrue(tk.incrementToken());
assertEquals(0, offsetAtt.startOffset());
assertEquals(s.length(), offsetAtt.endOffset());
final int startIndex = Character.offsetByCodePoints(s, 0, start);
final int endIndex = Character.offsetByCodePoints(s, 0, end);
assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
}
}
assertFalse(tk.incrementToken());
}

}
@ -18,9 +18,12 @@ package org.apache.lucene.analysis.ngram;
|
|||
*/
|
||||
|
||||
|
||||
import static org.apache.lucene.analysis.ngram.NGramTokenizerTest.isTokenChar;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -115,23 +118,74 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
|
|||
checkRandomData(random(), a, 50*RANDOM_MULTIPLIER, 1027, false, false);
|
||||
}
|
||||
|
||||
private void testNGrams(int minGram, int maxGram, int length) throws IOException {
|
||||
private static void testNGrams(int minGram, int maxGram, int length, final String nonTokenChars) throws IOException {
|
||||
final String s = RandomStrings.randomAsciiOfLength(random(), length);
|
||||
final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram);
|
||||
testNGrams(minGram, maxGram, s, nonTokenChars);
|
||||
}
|
||||
|
||||
private static void testNGrams(int minGram, int maxGram, String s, String nonTokenChars) throws IOException {
|
||||
testNGrams(minGram, maxGram, s, nonTokenChars, false);
|
||||
}
|
||||
|
||||
static int[] toCodePoints(CharSequence s) {
|
||||
final int[] codePoints = new int[Character.codePointCount(s, 0, s.length())];
|
||||
for (int i = 0, j = 0; i < s.length(); ++j) {
|
||||
codePoints[j] = Character.codePointAt(s, i);
|
||||
i += Character.charCount(codePoints[j]);
|
||||
}
|
||||
return codePoints;
|
||||
}
|
||||
|
||||
static boolean isTokenChar(String nonTokenChars, int codePoint) {
|
||||
for (int i = 0; i < nonTokenChars.length(); ) {
|
||||
final int cp = nonTokenChars.codePointAt(i);
|
||||
if (cp == codePoint) {
|
||||
return false;
|
||||
}
|
||||
i += Character.charCount(cp);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
|
||||
// convert the string to code points
|
||||
final int[] codePoints = toCodePoints(s);
|
||||
final int[] offsets = new int[codePoints.length + 1];
|
||||
for (int i = 0; i < codePoints.length; ++i) {
|
||||
offsets[i+1] = offsets[i] + Character.charCount(codePoints[i]);
|
||||
}
|
||||
final TokenStream grams = new NGramTokenizer(TEST_VERSION_CURRENT, new StringReader(s), minGram, maxGram, edgesOnly) {
|
||||
@Override
|
||||
protected boolean isTokenChar(int chr) {
|
||||
return nonTokenChars.indexOf(chr) < 0;
|
||||
}
|
||||
};
|
||||
final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
|
||||
final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
|
||||
final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
|
||||
final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
|
||||
grams.reset();
|
||||
for (int start = 0; start < s.length(); ++start) {
|
||||
for (int end = start + minGram; end <= start + maxGram && end <= s.length(); ++end) {
|
||||
for (int start = 0; start < codePoints.length; ++start) {
|
||||
nextGram:
|
||||
for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
|
||||
if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
|
||||
// not on an edge
|
||||
continue nextGram;
|
||||
}
|
||||
for (int j = start; j < end; ++j) {
|
||||
if (!isTokenChar(nonTokenChars, codePoints[j])) {
|
||||
continue nextGram;
|
||||
}
|
||||
}
|
||||
assertTrue(grams.incrementToken());
|
||||
assertEquals(s.substring(start, end), termAtt.toString());
|
||||
assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
|
||||
assertEquals(1, posIncAtt.getPositionIncrement());
|
||||
assertEquals(start, offsetAtt.startOffset());
|
||||
assertEquals(end, offsetAtt.endOffset());
|
||||
assertEquals(1, posLenAtt.getPositionLength());
|
||||
assertEquals(offsets[start], offsetAtt.startOffset());
|
||||
assertEquals(offsets[end], offsetAtt.endOffset());
|
||||
}
|
||||
}
|
||||
assertFalse(grams.incrementToken());
|
||||
grams.end();
|
||||
assertEquals(s.length(), offsetAtt.startOffset());
|
||||
assertEquals(s.length(), offsetAtt.endOffset());
|
||||
|
@ -141,14 +195,47 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
// test sliding
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}

public void testLargeMaxGram() throws IOException {
// test sliding with maxGram > 1024
final int minGram = _TestUtil.nextInt(random(), 1200, 1300);
final int minGram = _TestUtil.nextInt(random(), 1290, 1300);
final int maxGram = _TestUtil.nextInt(random(), minGram, 1300);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024));
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 3 * 1024, 4 * 1024), "");
}

public void testPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "a");
}

public void testHeavyPreTokenization() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
testNGrams(minGram, maxGram, _TestUtil.nextInt(random(), 0, 4 * 1024), "abcdef");
}

public void testFewTokenChars() throws IOException {
final char[] chrs = new char[_TestUtil.nextInt(random(), 4000, 5000)];
Arrays.fill(chrs, ' ');
for (int i = 0; i < chrs.length; ++i) {
if (random().nextFloat() < 0.1) {
chrs[i] = 'a';
}
}
final int minGram = _TestUtil.nextInt(random(), 1, 2);
final int maxGram = _TestUtil.nextInt(random(), minGram, 2);
testNGrams(minGram, maxGram, new String(chrs), " ");
}

public void testFullUTF8Range() throws IOException {
final int minGram = _TestUtil.nextInt(random(), 1, 100);
final int maxGram = _TestUtil.nextInt(random(), minGram, 100);
final String s = _TestUtil.randomUnicodeString(random(), 4 * 1024);
testNGrams(minGram, maxGram, s, "");
testNGrams(minGram, maxGram, s, "abcdef");
}

}
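A minimal usage sketch (not part of the patch) of the pre-tokenization hook that these tests exercise, assuming the Lucene 4.4 NGramTokenizer(Version, Reader, int, int) constructor and its protected isTokenChar(int) override point; the class name, the sample input and the expected token list are hypothetical:

// Sketch only: grams are emitted per run of token chars, so none of them spans the space.
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class PreTokenizedNGramSketch {
  public static void main(String[] args) throws Exception {
    final String nonTokenChars = " "; // hypothetical: break grams on spaces
    Tokenizer grams = new NGramTokenizer(Version.LUCENE_44, new StringReader("ab cd"), 1, 2) {
      @Override
      protected boolean isTokenChar(int chr) {
        // a gram may only contain characters for which this returns true
        return nonTokenChars.indexOf(chr) < 0;
      }
    };
    CharTermAttribute term = grams.addAttribute(CharTermAttribute.class);
    grams.reset();
    while (grams.incrementToken()) {
      System.out.println(term); // expected roughly: a, ab, b, c, cd, d
    }
    grams.end();
    grams.close();
  }
}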
@ -20,10 +20,13 @@ package org.apache.lucene.analysis.util;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
import org.junit.Test;

/**

@ -31,32 +34,6 @@ import org.junit.Test;
*/
public class TestCharacterUtils extends LuceneTestCase {

@Test
public void testCodePointAtCharArrayInt() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));
assertEquals((int) '\ud801', java4.codePointAt(cpAt3, 3));
assertEquals((int) '\ud801', java4.codePointAt(highSurrogateAt3, 3));
try {
java4.codePointAt(highSurrogateAt3, 4);
fail("array index out of bounds");
} catch (IndexOutOfBoundsException e) {
}

CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
assertEquals((int) 'A', java5.codePointAt(cpAt3, 0));
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
cpAt3, 3));
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3));
try {
java5.codePointAt(highSurrogateAt3, 4);
fail("array index out of bounds");
} catch (IndexOutOfBoundsException e) {
}
}

@Test
public void testCodePointAtCharSequenceInt() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();

@ -98,7 +75,68 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(Character.toCodePoint('\ud801', '\udc1c'), java5.codePointAt(
cpAt3, 3, 5));
assertEquals((int) '\ud801', java5.codePointAt(highSurrogateAt3, 3, 4));
}

@Test
public void testCodePointCount() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
final String s = _TestUtil.randomUnicodeString(random());
assertEquals(s.length(), java4.codePointCount(s));
assertEquals(Character.codePointCount(s, 0, s.length()), java5.codePointCount(s));
}

@Test
public void testOffsetByCodePoint() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
for (int i = 0; i < 10; ++i) {
final char[] s = _TestUtil.randomUnicodeString(random()).toCharArray();
final int index = _TestUtil.nextInt(random(), 0, s.length);
final int offset = random().nextInt(7) - 3;
try {
final int o = java4.offsetByCodePoints(s, 0, s.length, index, offset);
assertEquals(o, index + offset);
} catch (IndexOutOfBoundsException e) {
assertTrue((index + offset) < 0 || (index + offset) > s.length);
}

int o;
try {
o = java5.offsetByCodePoints(s, 0, s.length, index, offset);
} catch (IndexOutOfBoundsException e) {
try {
Character.offsetByCodePoints(s, 0, s.length, index, offset);
fail();
} catch (IndexOutOfBoundsException e2) {
// OK
}
o = -1;
}
if (o >= 0) {
assertEquals(Character.offsetByCodePoints(s, 0, s.length, index, offset), o);
}
}
}

public void testConversions() {
CharacterUtils java4 = CharacterUtils.getJava4Instance();
CharacterUtils java5 = CharacterUtils.getInstance(TEST_VERSION_CURRENT);
testConversions(java4);
testConversions(java5);
}

private void testConversions(CharacterUtils charUtils) {
final char[] orig = _TestUtil.randomUnicodeString(random(), 100).toCharArray();
final int[] buf = new int[orig.length];
final char[] restored = new char[buf.length];
final int o1 = random().nextInt(5);
final int o2 = _TestUtil.nextInt(random(), 0, o1);
final int o3 = _TestUtil.nextInt(random(), 0, o1);
final int codePointCount = charUtils.toCodePoints(orig, o1, orig.length - o1, buf, o2);
final int charCount = charUtils.toChars(buf, o2, codePointCount, restored, o3);
assertEquals(orig.length - o1, charCount);
assertArrayEquals(Arrays.copyOfRange(orig, o1, o1 + charCount), Arrays.copyOfRange(restored, o3, o3 + charCount));
}

@Test

@ -132,7 +170,7 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(0, buffer.getOffset());
assertEquals(6, buffer.getLength());
assertEquals("hellow", new String(buffer.getBuffer()));
assertTrue(instance.fill(buffer,reader));
assertFalse(instance.fill(buffer,reader));
assertEquals(4, buffer.getLength());
assertEquals(0, buffer.getOffset());

@ -159,15 +197,12 @@ public class TestCharacterUtils extends LuceneTestCase {
assertEquals(4, buffer.getLength());
assertEquals("123\ud801", new String(buffer.getBuffer(),
buffer.getOffset(), buffer.getLength()));
assertTrue(instance.fill(buffer, reader));
assertEquals(2, buffer.getLength());
assertEquals("\ud801\udc1c", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertTrue(instance.fill(buffer, reader));
assertEquals(1, buffer.getLength());
assertEquals("\ud801", new String(buffer.getBuffer(), buffer
assertFalse(instance.fill(buffer, reader));
assertEquals(3, buffer.getLength());
assertEquals("\ud801\udc1c\ud801", new String(buffer.getBuffer(), buffer
.getOffset(), buffer.getLength()));
assertFalse(instance.fill(buffer, reader));
assertEquals(0, buffer.getLength());
}

@Test
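For orientation (an illustrative sketch, not part of the patch), the supplementary-aware helpers exercised above could be combined roughly as follows; it assumes only the CharacterUtils methods that appear in these tests (getInstance, codePointCount, offsetByCodePoints, toCodePoints, toChars), and the class name and sample string are hypothetical:

// Sketch only: count, step over and round-trip code points in a buffer that
// contains a surrogate pair (U+1041C is encoded as \uD801\uDC1C).
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

public class CharacterUtilsSketch {
  public static void main(String[] args) {
    CharacterUtils cu = CharacterUtils.getInstance(Version.LUCENE_44); // supplementary-aware instance
    String s = "ab\uD801\uDC1Cc"; // 5 chars, 4 code points
    System.out.println(cu.codePointCount(s)); // 4

    char[] buf = s.toCharArray();
    // stepping one code point forward from index 2 jumps over both surrogates
    System.out.println(cu.offsetByCodePoints(buf, 0, buf.length, 2, 1)); // 4

    // UTF-16 -> UTF-32 -> UTF-16 round trip, as testConversions does
    int[] codePoints = new int[buf.length];
    int cpCount = cu.toCodePoints(buf, 0, buf.length, codePoints, 0);
    char[] restored = new char[buf.length];
    int charCount = cu.toChars(codePoints, 0, cpCount, restored, 0);
    System.out.println(charCount == buf.length && s.equals(new String(restored, 0, charCount))); // true
  }
}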
@ -183,7 +183,10 @@
<forbidden-apis internalRuntimeForbidden="true" classpathref="forbidden-apis.classpath">
<bundledSignatures name="jdk-unsafe-${javac.target}"/>
<bundledSignatures name="jdk-deprecated-${javac.target}"/>
<signaturesFileSet file="${common.dir}/tools/forbiddenApis/executors.txt"/>
<signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
<include name="executors.txt" />
<include name="chars.txt" />
</signaturesFileSet>
<fileset dir="${basedir}/build" includes="**/*.class" />
</forbidden-apis>
</target>
@ -808,7 +808,7 @@ public final class Util {
final int charLimit = offset + length;
while(charIdx < charLimit) {
scratch.grow(intIdx+1);
final int utf32 = Character.codePointAt(s, charIdx);
final int utf32 = Character.codePointAt(s, charIdx, charLimit);
scratch.ints[intIdx] = utf32;
charIdx += Character.charCount(utf32);
intIdx++;
@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

java.lang.Character#codePointBefore(char[],int) @ Implicit start offset is error-prone when the char[] is a buffer and the first chars are random chars
java.lang.Character#codePointAt(char[],int) @ Implicit end offset is error-prone when the char[] is a buffer and the last chars are random chars
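The rationale behind these two signatures can be illustrated with plain java.lang.Character calls; the buffer below is hypothetical, but it mirrors the "reused buffer with stale trailing chars" situation the comments describe:

// Sketch only: why the unbounded overloads are banned for buffer-oriented code.
public class CodePointAtPitfallSketch {
  public static void main(String[] args) {
    char[] buf = new char[8]; // reused buffer: only the first 4 chars are valid
    "abc\uD801".getChars(0, 4, buf, 0);
    buf[4] = '\uDC1C'; // stale low surrogate left over from a previous fill
    int validLen = 4;

    // Unbounded overload: silently pairs the valid high surrogate with stale data.
    System.out.println(Character.codePointAt(buf, 3)); // 66588 (U+1041C) -- wrong
    // Bounded overload: respects the valid length and returns the lone surrogate.
    System.out.println(Character.codePointAt(buf, 3, validLen)); // 55297 (0xD801)
  }
}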
@ -266,6 +266,7 @@
<bundledSignatures name="commons-io-unsafe-${commons-io.version}"/>
<signaturesFileSet dir="${common.dir}/tools/forbiddenApis">
<include name="executors.txt" />
<include name="chars.txt" />
<include name="servlet-api.txt" />
</signaturesFileSet>
<fileset dir="${basedir}/build">