LUCENE-10493: add 'backWordPos' array to JapaneseTokenizer.Position (#793)

This commit is contained in:
Tomoko Uchida 2022-04-07 21:29:07 +09:00 committed by GitHub
parent 94fe7e314f
commit 4d2b08554a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 18 additions and 4 deletions

View File

@ -360,7 +360,7 @@ public final class JapaneseTokenizer extends Tokenizer {
pending.clear(); pending.clear();
// Add BOS: // Add BOS:
positions.get(0).add(0, 0, -1, -1, -1, Type.KNOWN); positions.get(0).add(0, 0, -1, -1, -1, -1, Type.KNOWN);
} }
@Override @Override
@ -415,6 +415,7 @@ public final class JapaneseTokenizer extends Tokenizer {
int[] costs = new int[8]; int[] costs = new int[8];
int[] lastRightID = new int[8]; int[] lastRightID = new int[8];
int[] backPos = new int[8]; int[] backPos = new int[8];
int[] backWordPos = new int[8];
int[] backIndex = new int[8]; int[] backIndex = new int[8];
int[] backID = new int[8]; int[] backID = new int[8];
Type[] backType = new Type[8]; Type[] backType = new Type[8];
@ -431,6 +432,7 @@ public final class JapaneseTokenizer extends Tokenizer {
costs = ArrayUtil.grow(costs, 1 + count); costs = ArrayUtil.grow(costs, 1 + count);
lastRightID = ArrayUtil.grow(lastRightID, 1 + count); lastRightID = ArrayUtil.grow(lastRightID, 1 + count);
backPos = ArrayUtil.grow(backPos, 1 + count); backPos = ArrayUtil.grow(backPos, 1 + count);
backWordPos = ArrayUtil.grow(backWordPos, 1 + count);
backIndex = ArrayUtil.grow(backIndex, 1 + count); backIndex = ArrayUtil.grow(backIndex, 1 + count);
backID = ArrayUtil.grow(backID, 1 + count); backID = ArrayUtil.grow(backID, 1 + count);
@ -456,7 +458,13 @@ public final class JapaneseTokenizer extends Tokenizer {
} }
public void add( public void add(
int cost, int lastRightID, int backPos, int backIndex, int backID, Type backType) { int cost,
int lastRightID,
int backPos,
int backRPos,
int backIndex,
int backID,
Type backType) {
// NOTE: this isn't quite a true Viterbi search, // NOTE: this isn't quite a true Viterbi search,
// because we should check if lastRightID is // because we should check if lastRightID is
// already present here, and only update if the new // already present here, and only update if the new
@ -471,6 +479,7 @@ public final class JapaneseTokenizer extends Tokenizer {
this.costs[count] = cost; this.costs[count] = cost;
this.lastRightID[count] = lastRightID; this.lastRightID[count] = lastRightID;
this.backPos[count] = backPos; this.backPos[count] = backPos;
this.backWordPos[count] = backRPos;
this.backIndex[count] = backIndex; this.backIndex[count] = backIndex;
this.backID[count] = backID; this.backID[count] = backID;
this.backType[count] = backType; this.backType[count] = backType;
@ -498,6 +507,7 @@ public final class JapaneseTokenizer extends Tokenizer {
private void add( private void add(
JaMorphData morphAtts, JaMorphData morphAtts,
Position fromPosData, Position fromPosData,
int wordPos,
int endPos, int endPos,
int wordID, int wordID,
Type type, Type type,
@ -568,7 +578,7 @@ public final class JapaneseTokenizer extends Tokenizer {
// positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX, // positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, leastIDX,
// wordID, type); // wordID, type);
assert leftID == morphAtts.getRightId(wordID); assert leftID == morphAtts.getRightId(wordID);
positions.get(endPos).add(leastCost, leftID, fromPosData.pos, leastIDX, wordID, type); positions.get(endPos).add(leastCost, leftID, fromPosData.pos, wordPos, leastIDX, wordID, type);
} }
@Override @Override
@ -903,6 +913,7 @@ public final class JapaneseTokenizer extends Tokenizer {
add( add(
userDictionary.getMorphAttributes(), userDictionary.getMorphAttributes(),
posData, posData,
pos,
posAhead + 1, posAhead + 1,
output + arc.nextFinalOutput().intValue(), output + arc.nextFinalOutput().intValue(),
Type.USER, Type.USER,
@ -956,6 +967,7 @@ public final class JapaneseTokenizer extends Tokenizer {
add( add(
dictionary.getMorphAttributes(), dictionary.getMorphAttributes(),
posData, posData,
pos,
posAhead + 1, posAhead + 1,
wordIdRef.ints[wordIdRef.offset + ofs], wordIdRef.ints[wordIdRef.offset + ofs],
Type.KNOWN, Type.KNOWN,
@ -1012,6 +1024,7 @@ public final class JapaneseTokenizer extends Tokenizer {
add( add(
unkDictionary.getMorphAttributes(), unkDictionary.getMorphAttributes(),
posData, posData,
pos,
posData.pos + unknownWordLength, posData.pos + unknownWordLength,
wordIdRef.ints[wordIdRef.offset + ofs], wordIdRef.ints[wordIdRef.offset + ofs],
Type.UNKNOWN, Type.UNKNOWN,
@ -1157,7 +1170,7 @@ public final class JapaneseTokenizer extends Tokenizer {
} }
positions positions
.get(toPos) .get(toPos)
.add(newCost, dict2.getRightId(wordID), pos, bestStartIDX, wordID, forwardType); .add(newCost, dict2.getRightId(wordID), pos, -1, bestStartIDX, wordID, forwardType);
} }
} else { } else {
// On non-initial positions, we maximize score // On non-initial positions, we maximize score
@ -1177,6 +1190,7 @@ public final class JapaneseTokenizer extends Tokenizer {
add( add(
getDict(forwardType).getMorphAttributes(), getDict(forwardType).getMorphAttributes(),
posData, posData,
pos,
toPos, toPos,
posData.forwardID[forwardArcIDX], posData.forwardID[forwardArcIDX],
forwardType, forwardType,