mirror of https://github.com/apache/lucene.git
LUCENE-10059: Fix an AssertionError when JapaneseTokenizer tries to backtrace from and to the same position (#254)
Co-authored-by: Anh Dung Bui <buidun@amazon.com>
This commit is contained in:
parent
5896e5389a
commit
0c3c8ec09a
|
@ -1765,6 +1765,15 @@ public final class JapaneseTokenizer extends Tokenizer {
|
|||
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
|
||||
final int endPos = endPosData.pos;
|
||||
|
||||
/**
|
||||
* LUCENE-10059: If the endPos is the same as lastBackTracePos, we don't want to backtrace to
|
||||
* avoid an assertion error in {@link RollingCharBuffer#get(int)} when it tries to generate an
|
||||
* empty buffer
|
||||
*/
|
||||
if (endPos == lastBackTracePos) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(
|
||||
"\n backtrace: endPos="
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.io.Reader;
|
|||
import java.io.StringReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -886,4 +887,28 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
|||
|
||||
assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ", new String[] {"北海道", "日本", "ハムファイターズ"});
|
||||
}
|
||||
|
||||
public void testEmptyBacktrace() throws IOException {
|
||||
String text = "";
|
||||
|
||||
// since the max backtrace gap ({@link JapaneseTokenizer#MAX_BACKTRACE_GAP)
|
||||
// is set to 1024, we want the first 1023 characters to generate multiple paths
|
||||
// so that the regular backtrace is not executed.
|
||||
for (int i = 0; i < 1023; i++) {
|
||||
text += "あ";
|
||||
}
|
||||
|
||||
// and the last 2 characters to be a valid word so that they
|
||||
// will end-up together
|
||||
text += "手紙";
|
||||
|
||||
List<String> outputs = new ArrayList<>();
|
||||
for (int i = 0; i < 511; i++) {
|
||||
outputs.add("ああ");
|
||||
}
|
||||
outputs.add("あ");
|
||||
outputs.add("手紙");
|
||||
|
||||
assertAnalyzesTo(analyzer, text, outputs.toArray(new String[0]));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue