mirror of https://github.com/apache/lucene.git
LUCENE-10059: Fix an AssertionError when JapaneseTokenizer tries to backtrace from and to the same position (#254)
Co-authored-by: Anh Dung Bui <buidun@amazon.com>
This commit is contained in:
parent
5896e5389a
commit
0c3c8ec09a
|
@ -1765,6 +1765,15 @@ public final class JapaneseTokenizer extends Tokenizer {
|
||||||
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
|
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
|
||||||
final int endPos = endPosData.pos;
|
final int endPos = endPosData.pos;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* LUCENE-10059: If endPos is the same as lastBackTracePos there is nothing to backtrace, so
|
||||||
|
* return early. This avoids an assertion error in {@link RollingCharBuffer#get(int)} when it
|
||||||
|
* tries to generate an empty buffer.
|
||||||
|
*/
|
||||||
|
if (endPos == lastBackTracePos) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(
|
System.out.println(
|
||||||
"\n backtrace: endPos="
|
"\n backtrace: endPos="
|
||||||
|
|
|
@ -24,6 +24,7 @@ import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
@ -886,4 +887,28 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ", new String[] {"北海道", "日本", "ハムファイターズ"});
|
assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ", new String[] {"北海道", "日本", "ハムファイターズ"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testEmptyBacktrace() throws IOException {
|
||||||
|
String text = "";
|
||||||
|
|
||||||
|
// since the max backtrace gap ({@link JapaneseTokenizer#MAX_BACKTRACE_GAP)
|
||||||
|
// is set to 1024, we want the first 1023 characters to generate multiple paths
|
||||||
|
// so that the regular backtrace is not executed.
|
||||||
|
for (int i = 0; i < 1023; i++) {
|
||||||
|
text += "あ";
|
||||||
|
}
|
||||||
|
|
||||||
|
// and the last 2 characters to be a valid word so that they
|
||||||
|
// will end-up together
|
||||||
|
text += "手紙";
|
||||||
|
|
||||||
|
List<String> outputs = new ArrayList<>();
|
||||||
|
for (int i = 0; i < 511; i++) {
|
||||||
|
outputs.add("ああ");
|
||||||
|
}
|
||||||
|
outputs.add("あ");
|
||||||
|
outputs.add("手紙");
|
||||||
|
|
||||||
|
assertAnalyzesTo(analyzer, text, outputs.toArray(new String[0]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue