LUCENE-10493: add 'backWordPos' array to JapaneseTokenizer.Position (#793)

Tomoko Uchida 2022-04-07 21:29:07 +09:00 committed by GitHub
parent 94fe7e314f
commit 4d2b08554a
1 changed file with 18 additions and 4 deletions
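
This change threads a word start position through the Viterbi lattice bookkeeping: Position gains a backWordPos array that is allocated, grown, and written alongside the existing parallel arrays, Position.add() and the tokenizer's private add() helper each take one extra position argument, and every caller passes the current lattice position (or -1 where no word start applies, as for the BOS entry). As a reading aid, here is a commented restatement of the widened Position.add() signature; the parameter descriptions are inferred from how the callers in this diff use it, not taken from Javadoc, and the enclosing interface and nested Type enum are placeholders, not Lucene types.

```java
// Reading aid only: a commented restatement of the widened signature.
// Parameter roles are inferred from this diff's callers, not from Javadoc.
interface PositionAddSketch {
  enum Type { KNOWN, UNKNOWN, USER } // stand-in for JapaneseTokenizer.Type

  void add(
      int cost,        // accumulated path cost arriving at this lattice position
      int lastRightID, // right context ID of the last word on the path
      int backPos,     // lattice position this path came from
      int backRPos,    // start position of the word ending here; stored as backWordPos, -1 if not recorded
      int backIndex,   // index of the chosen entry at backPos
      int backID,      // word ID of the word ending here
      Type backType);  // dictionary type of that word
}
```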

@@ -360,7 +360,7 @@ public final class JapaneseTokenizer extends Tokenizer {
pending.clear();
// Add BOS:
-positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN);
+positions.get(0).add(0, 0, -1, -1, -1, -1, Type.KNOWN);
}
@Override
@@ -415,6 +415,7 @@ public final class JapaneseTokenizer extends Tokenizer {
int[] costs = new int[8];
int[] lastRightID = new int[8];
int[] backPos = new int[8];
+int[] backWordPos = new int[8];
int[] backIndex = new int[8];
int[] backID = new int[8];
Type[] backType = new Type[8];
@@ -431,6 +432,7 @@ public final class JapaneseTokenizer extends Tokenizer {
costs = ArrayUtil.grow(costs, 1 + count);
lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
backPos = ArrayUtil.grow(backPos, 1 + count);
+backWordPos = ArrayUtil.grow(backWordPos, 1 + count);
backIndex = ArrayUtil.grow(backIndex, 1 + count);
backID = ArrayUtil.grow(backID, 1 + count);
@@ -456,7 +458,13 @@ public final class JapaneseTokenizer extends Tokenizer {
}
public void add(
-int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) {
+int cost,
+int lastRightID,
+int backPos,
+int backRPos,
+int backIndex,
+int backID,
+Type backType) {
// NOTE: this isn't quite a true Viterbi search,
// because we should check if lastRightID is
// already present here, and only update if the new
@@ -471,6 +479,7 @@ public final class JapaneseTokenizer extends Tokenizer {
this.costs[count] = cost;
this.lastRightID[count] = lastRightID;
this.backPos[count] = backPos;
+this.backWordPos[count] = backRPos;
this.backIndex[count] = backIndex;
this.backID[count] = backID;
this.backType[count] = backType;
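
Taken together, the hunks above treat backWordPos as one more parallel array that must be allocated, grown, and written in lockstep with the others. A minimal, self-contained sketch of that pattern is below; it is a simplification, not the actual Position class, and plain Arrays.copyOf stands in for Lucene's ArrayUtil.grow.

```java
import java.util.Arrays;

// Simplified sketch of Position's parallel-array bookkeeping with the new
// backWordPos slot (not the real Lucene class; fields and growth are reduced
// to the essentials, and Arrays.copyOf stands in for ArrayUtil.grow).
final class PositionSketch {
  enum Type { KNOWN, UNKNOWN, USER } // stand-in for JapaneseTokenizer.Type

  int count;
  int[] costs = new int[8];
  int[] lastRightID = new int[8];
  int[] backPos = new int[8];     // lattice position each path came from
  int[] backWordPos = new int[8]; // start position of the word ending here (new)
  int[] backIndex = new int[8];
  int[] backID = new int[8];
  Type[] backType = new Type[8];

  void add(int cost, int lastRightID, int backPos, int backRPos,
           int backIndex, int backID, Type backType) {
    if (count == costs.length) {
      grow(count + 1);
    }
    this.costs[count] = cost;
    this.lastRightID[count] = lastRightID;
    this.backPos[count] = backPos;
    this.backWordPos[count] = backRPos; // the new slot, written with its siblings
    this.backIndex[count] = backIndex;
    this.backID[count] = backID;
    this.backType[count] = backType;
    count++;
  }

  private void grow(int minSize) {
    int newSize = Math.max(minSize, costs.length * 2);
    costs = Arrays.copyOf(costs, newSize);
    lastRightID = Arrays.copyOf(lastRightID, newSize);
    backPos = Arrays.copyOf(backPos, newSize);
    backWordPos = Arrays.copyOf(backWordPos, newSize); // must grow together
    backIndex = Arrays.copyOf(backIndex, newSize);
    backID = Arrays.copyOf(backID, newSize);
    backType = Arrays.copyOf(backType, newSize);
  }
}
```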
@@ -498,6 +507,7 @@ public final class JapaneseTokenizer extends Tokenizer {
private void add(
JaMorphData morphAtts,
Position fromPosData,
+int wordPos,
int endPos,
int wordID,
Type type,
@@ -568,7 +578,7 @@ public final class JapaneseTokenizer extends Tokenizer {
// positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX,
// wordID, type);
assert leftID == morphAtts.getRightId(wordID);
-positions.get(endPos).add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type);
+positions.get(endPos).add(leastCost, leftID, fromPosData.pos, wordPos, leastIDX, wordID, type);
}
@Override
@@ -903,6 +913,7 @@ public final class JapaneseTokenizer extends Tokenizer {
add(
userDictionary.getMorphAttributes(),
posData,
+pos,
posAhead + 1,
output + arc.nextFinalOutput().intValue(),
Type.USER,
@@ -956,6 +967,7 @@ public final class JapaneseTokenizer extends Tokenizer {
add(
dictionary.getMorphAttributes(),
posData,
+pos,
posAhead + 1,
wordIdRef.ints[wordIdRef.offset + ofs],
Type.KNOWN,
@@ -1012,6 +1024,7 @@ public final class JapaneseTokenizer extends Tokenizer {
add(
unkDictionary.getMorphAttributes(),
posData,
+pos,
posData.pos + unknownWordLength,
wordIdRef.ints[wordIdRef.offset + ofs],
Type.UNKNOWN,
@@ -1157,7 +1170,7 @@ public final class JapaneseTokenizer extends Tokenizer {
}
positions
.get(toPos)
-.add(newCost, dict2.getRightId(wordID), pos, bestStartIDX, wordID, forwardType);
+.add(newCost, dict2.getRightId(wordID), pos, -1, bestStartIDX, wordID, forwardType);
}
} else {
// On non-initial positions, we maximize score
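
The point of recording backWordPos shows up at backtrace time: the start position of the word that ended at a lattice position can be read straight from the winning entry, with -1 (as in the BOS entry and in the call just above) apparently serving as a "not recorded" sentinel. A tiny, self-contained illustration of that idea over hand-filled arrays follows; the data and control flow are toy examples, not the tokenizer's actual backtrace.

```java
import java.util.ArrayDeque;
import java.util.Deque;

// Toy illustration: walk back pointers from the end of a 5-character input and
// read each token's start position from backWordPos. Hand-filled data; not the
// tokenizer's real backtrace.
public class BacktraceSketch {
  public static void main(String[] args) {
    // Indexed by end position; -1 means "no entry here".
    int[] bestBackPos     = {-1, -1, 0, -1, -1, 2};  // previous lattice position
    int[] bestBackWordPos = {-1, -1, 0, -1, -1, 2};  // start of the word ending here

    Deque<int[]> tokens = new ArrayDeque<>();
    int pos = 5; // end of input
    while (pos > 0) {
      int start = bestBackWordPos[pos]; // read the word start directly
      tokens.push(new int[] {start, pos});
      pos = bestBackPos[pos];
    }
    for (int[] t : tokens) {
      System.out.println("token [" + t[0] + ", " + t[1] + ")"); // [0, 2) then [2, 5)
    }
  }
}
```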
@@ -1177,6 +1190,7 @@ public final class JapaneseTokenizer extends Tokenizer {
add(
getDict(forwardType).getMorphAttributes(),
posData,
+pos,
toPos,
posData.forwardID[forwardArcIDX],
forwardType,