mirror of https://github.com/apache/lucene.git
LUCENE-9581: Japanese tokenizer should discard the compound token instead of disabling the decomposition of long tokens when discardCompoundToken is activated.
This commit is contained in:
parent
77a205387f
commit
a5d0654a24
|
@@ -247,11 +247,18 @@ Optimizations
|
|||
* LUCENE-9536: Reduced memory usage for OrdinalMap when a segment has all
|
||||
values. (Julie Tibshirani via Adrien Grand)
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
* LUCENE-9581: Japanese tokenizer should discard the compound token instead of disabling the decomposition
|
||||
of long tokens when discardCompoundToken is activated. (Jim Ferenczi)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
* SOLR-14995: Update Jetty to 9.4.34 (Mike Drob)
|
||||
|
||||
|
||||
======================= Lucene 8.7.0 =======================
|
||||
|
||||
API Changes
|
||||
|
|
|
@@ -507,7 +507,7 @@ public final class JapaneseTokenizer extends Tokenizer {
|
|||
System.out.println(" + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.get(endPos).count);
|
||||
}
|
||||
|
||||
if ((addPenalty || (!outputCompounds && searchMode)) && type != Type.USER) {
|
||||
if (addPenalty && type != Type.USER) {
|
||||
final int penalty = computePenalty(fromPosData.pos, endPos - fromPosData.pos);
|
||||
if (VERBOSE) {
|
||||
if (penalty > 0) {
|
||||
|
@@ -1670,7 +1670,7 @@ public final class JapaneseTokenizer extends Tokenizer {
|
|||
int backID = posData.backID[bestIDX];
|
||||
int nextBestIDX = posData.backIndex[bestIDX];
|
||||
|
||||
if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {
|
||||
if (searchMode && altToken == null && backType != Type.USER) {
|
||||
|
||||
// In searchMode, if best path had picked a too-long
|
||||
// token, we use the "penalty" to compute the allowed
|
||||
|
@@ -1764,34 +1764,35 @@ public final class JapaneseTokenizer extends Tokenizer {
|
|||
assert offset >= 0;
|
||||
|
||||
if (altToken != null && altToken.getPosition() >= backPos) {
|
||||
if (outputCompounds) {
|
||||
// We've backtraced to the position where the
|
||||
// compound token starts; add it now:
|
||||
|
||||
// We've backtraced to the position where the
|
||||
// compound token starts; add it now:
|
||||
// The pruning we did when we created the altToken
|
||||
// ensures that the back trace will align back with
|
||||
// the start of the altToken:
|
||||
assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;
|
||||
|
||||
// The pruning we did when we created the altToken
|
||||
// ensures that the back trace will align back with
|
||||
// the start of the altToken:
|
||||
assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
|
||||
// NOTE: not quite right: the compound token may
|
||||
// have had all punctuation back traced so far, but
|
||||
// then the decompounded token at this position is
|
||||
// not punctuation. In this case backCount is 0,
|
||||
// but we should maybe add the altToken anyway...?
|
||||
|
||||
// NOTE: not quite right: the compound token may
|
||||
// have had all punctuation back traced so far, but
|
||||
// then the decompounded token at this position is
|
||||
// not punctuation. In this case backCount is 0,
|
||||
// but we should maybe add the altToken anyway...?
|
||||
|
||||
if (backCount > 0) {
|
||||
backCount++;
|
||||
altToken.setPositionLength(backCount);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add altToken=" + altToken);
|
||||
if (backCount > 0) {
|
||||
backCount++;
|
||||
altToken.setPositionLength(backCount);
|
||||
if (VERBOSE) {
|
||||
System.out.println(" add altToken=" + altToken);
|
||||
}
|
||||
pending.add(altToken);
|
||||
} else {
|
||||
// This means alt token was all punct tokens:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" discard all-punctuation altToken=" + altToken);
|
||||
}
|
||||
assert discardPunctuation;
|
||||
}
|
||||
pending.add(altToken);
|
||||
} else {
|
||||
// This means alt token was all punct tokens:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" discard all-punctuation altToken=" + altToken);
|
||||
}
|
||||
assert discardPunctuation;
|
||||
}
|
||||
altToken = null;
|
||||
}
|
||||
|
|
|
@@ -108,13 +108,13 @@ public class TestJapaneseAnalyzer extends BaseTokenStreamTestCase {
|
|||
a.close();
|
||||
a = new JapaneseAnalyzer();
|
||||
assertAnalyzesToPositions(a, "京都大学硬式野球部",
|
||||
new String[] { "京都",
|
||||
"大学",
|
||||
"硬式",
|
||||
"野球",
|
||||
"部" },
|
||||
new int[] {1, 1, 1, 1, 1},
|
||||
new int[] {1, 1, 1, 1, 1});
|
||||
new String[] { "京都大",
|
||||
"学",
|
||||
"硬式",
|
||||
"野球",
|
||||
"部" },
|
||||
new int[] {1, 1, 1, 1, 1},
|
||||
new int[] {1, 1, 1, 1, 1});
|
||||
// toDotFile(a, "成田空港", "/mnt/scratch/out.dot");
|
||||
a.close();
|
||||
}
|
||||
|
|
|
@@ -917,5 +917,11 @@ public class
|
|||
|
||||
assertAnalyzesTo(extendedModeAnalyzerNoCompound, "株式会社とアカデミア",
|
||||
new String[]{"株式", "会社", "と", "ア", "カ", "デ", "ミ", "ア"});
|
||||
|
||||
assertAnalyzesTo(analyzer, "北海道日本ハムファイターズ",
|
||||
new String[]{"北海道", "日本", "ハムファイターズ"});
|
||||
|
||||
assertAnalyzesTo(analyzerNoCompound, "北海道日本ハムファイターズ",
|
||||
new String[]{"北海道", "日本", "ハムファイターズ"});
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -122,11 +122,6 @@ public class TestSearchMode extends BaseTokenStreamTestCase {
|
|||
String[] fields = line.split("\t", 2);
|
||||
String sourceText = fields[0];
|
||||
String[] tmpExpectedTokens = fields[1].split("\\s+");
|
||||
if (sourceText.equals("京都大学硬式野球部")) {
|
||||
// This is the only case that tokenization result is different from discardCompoundToken=false
|
||||
tmpExpectedTokens[0] = "京都";
|
||||
tmpExpectedTokens[1] = "大学";
|
||||
}
|
||||
|
||||
List<String> expectedTokenList = new ArrayList<>();
|
||||
for(int tokIDX=0;tokIDX<tmpExpectedTokens.length;tokIDX++) {
|
||||
|
|
Loading…
Reference in New Issue