LUCENE-9581: Japanese tokenizer should discard the compound token instead of disabling the decomposition of long tokens when discardCompoundToken is activated.

jimczi 2020-11-23 08:55:36 +01:00
parent 77a205387f
commit a5d0654a24
5 changed files with 47 additions and 38 deletions
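For orientation: discardCompoundToken (added in LUCENE-9123) controls whether search mode emits the original compound token alongside its decomposed parts. Before this fix, enabling it effectively skipped the decomposition of long tokens altogether; with this fix, decomposition always runs in search mode and only the compound itself is dropped. A minimal sketch of the user-visible difference (the demo class is hypothetical; it assumes the 4-argument JapaneseTokenizer constructor that LUCENE-9123 introduced):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DiscardCompoundTokenDemo {
  public static void main(String[] args) throws Exception {
    // userDictionary=null, discardPunctuation=true, discardCompoundToken=true
    Tokenizer tok = new JapaneseTokenizer(null, true, true, Mode.SEARCH);
    tok.setReader(new StringReader("京都大学硬式野球部"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.print(term + " ");
    }
    tok.end();
    tok.close();
    // Before this commit: 京都大学 硬式 野球 部 (the long token was left whole)
    // After this commit:  京都 大学 硬式 野球 部 (decomposed; the compound is discarded)
  }
}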

lucene/CHANGES.txt

@@ -247,11 +247,18 @@ Optimizations
 * LUCENE-9536: Reduced memory usage for OrdinalMap when a segment has all
   values. (Julie Tibshirani via Adrien Grand)
 
+Bug Fixes
+---------------------
+
+* LUCENE-9581: Japanese tokenizer should discard the compound token instead of disabling the decomposition
+  of long tokens when discardCompoundToken is activated. (Jim Ferenczi)
+
 Other
 ---------------------
 
 * SOLR-14995: Update Jetty to 9.4.34 (Mike Drob)
 
 ======================= Lucene 8.7.0 =======================
 
 API Changes

lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseTokenizer.java

@@ -507,7 +507,7 @@ public final class JapaneseTokenizer extends Tokenizer {
           System.out.println("      + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.get(endPos).count);
         }
 
-        if ((addPenalty || (!outputCompounds && searchMode)) && type != Type.USER) {
+        if (addPenalty && type != Type.USER) {
           final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
           if (VERBOSE) {
             if (penalty > 0) {
@@ -1670,7 +1670,7 @@ public final class JapaneseTokenizer extends Tokenizer {
       int backID = posData.backID[bestIDX];
       int nextBestIDX = posData.backIndex[bestIDX];
 
-      if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {
+      if (searchMode && altToken == null && backType != Type.USER) {
 
         // In searchMode, if best path had picked a too-long
         // token, we use the "penalty" to compute the allowed
@@ -1764,34 +1764,35 @@ public final class JapaneseTokenizer extends Tokenizer {
       assert offset >= 0;
 
       if (altToken != null && altToken.getPosition() >= backPos) {
-
-        // We've backtraced to the position where the
-        // compound token starts; add it now:
+        if (outputCompounds) {
+          // We've backtraced to the position where the
+          // compound token starts; add it now:
 
-        // The pruning we did when we created the altToken
-        // ensures that the back trace will align back with
-        // the start of the altToken:
-        assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
+          // The pruning we did when we created the altToken
+          // ensures that the back trace will align back with
+          // the start of the altToken:
+          assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;
 
-        // NOTE: not quite right: the compound token may
-        // have had all punctuation back traced so far, but
-        // then the decompounded token at this position is
-        // not punctuation. In this case backCount is 0,
-        // but we should maybe add the altToken anyway...?
+          // NOTE: not quite right: the compound token may
+          // have had all punctuation back traced so far, but
+          // then the decompounded token at this position is
+          // not punctuation. In this case backCount is 0,
+          // but we should maybe add the altToken anyway...?
 
-        if (backCount > 0) {
-          backCount++;
-          altToken.setPositionLength(backCount);
-          if (VERBOSE) {
-            System.out.println("    add altToken=" + altToken);
-          }
-          pending.add(altToken);
-        } else {
-          // This means alt token was all punct tokens:
-          if (VERBOSE) {
-            System.out.println("    discard all-punctuation altToken=" + altToken);
-          }
-          assert discardPunctuation;
-        }
+          if (backCount > 0) {
+            backCount++;
+            altToken.setPositionLength(backCount);
+            if (VERBOSE) {
+              System.out.println("    add altToken=" + altToken);
+            }
+            pending.add(altToken);
+          } else {
+            // This means alt token was all punct tokens:
+            if (VERBOSE) {
+              System.out.println("    discard all-punctuation altToken=" + altToken);
+            }
+            assert discardPunctuation;
+          }
+        }
         altToken = null;
       }
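Taken together, the three hunks move the decision point: the compound candidate (altToken) and the decomposition penalty are now always computed in search mode, and the new if (outputCompounds) guard in the backtrace decides whether the compound is actually emitted. When compounds are kept (discardCompoundToken=false), the emitted compound spans its decomposed parts via the position-length attribute set by altToken.setPositionLength above. A small inspection sketch using only stock Lucene token attributes (the class name is ours; the exact emission order of compound vs. parts is an implementation detail, so the comment hedges on it):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

public class CompoundSpanDemo {
  public static void main(String[] args) throws Exception {
    // discardCompoundToken=false: keep both the compound and its parts.
    Tokenizer tok = new JapaneseTokenizer(null, true, false, Mode.SEARCH);
    tok.setReader(new StringReader("京都大学"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = tok.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLen = tok.addAttribute(PositionLengthAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      // Expect the compound 京都大学 to carry posLen=2 (spanning 京都 and 大学),
      // while 京都 and 大学 each carry posLen=1; ordering may vary.
      System.out.println(term + " posInc=" + posInc.getPositionIncrement()
          + " posLen=" + posLen.getPositionLength());
    }
    tok.end();
    tok.close();
  }
}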

lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java

@@ -108,13 +108,13 @@ public class TestJapaneseAnalyzer extends BaseTokenStreamTestCase {
     a.close();
 
     a = new JapaneseAnalyzer();
     assertAnalyzesToPositions(a, "京都大学硬式野球部",
-                              new String[] { "京都大学",
-                                             "硬式",
-                                             "野球",
-                                             "部" },
-                              new int[] {1, 1, 1, 1},
-                              new int[] {1, 1, 1, 1});
+                              new String[] { "京都",
+                                             "大学",
+                                             "硬式",
+                                             "野球",
+                                             "部" },
+                              new int[] {1, 1, 1, 1, 1},
+                              new int[] {1, 1, 1, 1, 1});
 
     // toDotFile(a, "成田空港", "/mnt/scratch/out.dot");
     a.close();
   }

lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseTokenizer.java

@@ -917,5 +917,11 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
 
     assertAnalyzesTo(extendedModeAnalyzerNoCompound, "株式会社とアカデミア",
         new String[]{"株式", "会社", "と", "ア", "カ", "デ", "ミ", "ア"});
+
+    assertAnalyzesTo(analyzer, "北海道日本ハムファイターズ",
+        new String[]{"北海道", "日本", "ハムファイターズ"});
+
+    assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ",
+        new String[]{"北海道", "日本", "ハムファイターズ"});
   }
 }
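The fixtures analyzer, analyzerNoCompound, and extendedModeAnalyzerNoCompound are defined earlier in TestJapaneseTokenizer.java and are not part of this hunk. A plausible reconstruction of one of them, for orientation only (it assumes the same 4-argument constructor as the sketch above; the real definitions may differ in details):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;

// Hypothetical fixture: discardCompoundToken=true decomposes long tokens
// and drops the compound itself.
Analyzer analyzerNoCompound =
    new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new JapaneseTokenizer(null, true, true, Mode.SEARCH);
        return new TokenStreamComponents(tokenizer);
      }
    };

Note that both added assertions expect the same three tokens for 北海道日本ハムファイターズ: when no compound is actually formed, keeping or discarding compounds should no longer change the output.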

lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestSearchMode.java

@@ -122,11 +122,6 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
       String[] fields = line.split("\t", 2);
       String sourceText = fields[0];
       String[] tmpExpectedTokens = fields[1].split("\\s+");
-      if (sourceText.equals("京都大学硬式野球部")) {
-        // This is the only case that tokenization result is different from discardCompoundToken=false
-        tmpExpectedTokens[0] = "京都";
-        tmpExpectedTokens[1] = "大学";
-      }
 
       List<String> expectedTokenList = new ArrayList<>();
       for(int tokIDX=0;tokIDX<tmpExpectedTokens.length;tokIDX++) {