LUCENE-10059: Fix an AssertionError when JapaneseTokenizer tries to backtrace from and to the same position (#254)

Co-authored-by: Anh Dung Bui <buidun@amazon.com>
This commit is contained in:
Dzung Bui 2021-08-20 21:21:58 +09:00 committed by GitHub
parent 5896e5389a
commit 0c3c8ec09a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 0 deletions

View File

@ -1765,6 +1765,15 @@ public final class JapaneseTokenizer extends Tokenizer {
private void backtrace(final Position endPosData, final int fromIDX) throws IOException { private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
final int endPos = endPosData.pos; final int endPos = endPosData.pos;
/**
 * LUCENE-10059: If endPos is the same as lastBackTracePos, skip the backtrace entirely. This
 * avoids an assertion error in {@link RollingCharBuffer#get(int)}, which is triggered when it
 * is asked to generate an empty buffer.
 */
if (endPos == lastBackTracePos) {
return;
}
if (VERBOSE) { if (VERBOSE) {
System.out.println( System.out.println(
"\n backtrace: endPos=" "\n backtrace: endPos="

View File

@ -24,6 +24,7 @@ import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List;
import java.util.Random; import java.util.Random;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -886,4 +887,28 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ", new String[] {"北海道", "日本", "ハムファイターズ"}); assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ", new String[] {"北海道", "日本", "ハムファイターズ"});
} }
/**
 * Regression test for LUCENE-10059: the tokenizer must not fail when a backtrace would start and
 * end at the same position.
 *
 * <p>The input is 1023 repetitions of "あ" followed by the word "手紙". The expected output is 511
 * "ああ" tokens (1022 characters), one trailing "あ" token, and finally "手紙".
 *
 * @throws IOException if the analysis chain fails while consuming the input
 */
public void testEmptyBacktrace() throws IOException {
  // Since the max backtrace gap ({@link JapaneseTokenizer#MAX_BACKTRACE_GAP}) is set to 1024,
  // we want the first 1023 characters to generate multiple paths so that the regular
  // backtrace is not executed.
  final int prefixLength = 1023;
  StringBuilder text = new StringBuilder(prefixLength + 2);
  for (int i = 0; i < prefixLength; i++) {
    text.append("あ");
  }
  // ...and the last 2 characters form a valid word so that they end up together in one token.
  text.append("手紙");

  // 1023 = 2 * 511 + 1: the prefix is expected to split into 511 pairs plus one single character.
  List<String> outputs = new ArrayList<>();
  for (int i = 0; i < prefixLength / 2; i++) {
    outputs.add("ああ");
  }
  outputs.add("あ");
  outputs.add("手紙");

  assertAnalyzesTo(analyzer, text.toString(), outputs.toArray(new String[0]));
}
} }