Fix for LUCENE-3897 (KuromojiTokenizer fails with large docs)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303739 13f79535-47bb-0310-9956-ffa450edef68
Christian Moen 2012-03-22 11:41:54 +00:00
parent a6fd306dfb
commit d2eebf9330
2 changed files with 53 additions and 10 deletions


@@ -588,27 +588,71 @@ public final class KuromojiTokenizer extends Tokenizer {
       if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
         // Safety: if we've buffered too much, force a
-        // backtrace now:
+        // backtrace now.  We find the least-cost partial
+        // path, across all paths, backtrace from it, and
+        // then prune all others.  Note that this, in
+        // general, can produce the wrong result, if the
+        // total best path did not in fact back trace
+        // through this partial best path.  But it's the
+        // best we can do... (short of not having a
+        // safety!).
+
+        // First pass: find least cost partial path so far,
+        // including ending at future positions:
         int leastIDX = -1;
         int leastCost = Integer.MAX_VALUE;
-        for(int idx=0;idx<posData.count;idx++) {
-          //System.out.println(" idx=" + idx + " cost=" + cost);
-          final int cost = posData.costs[idx];
-          if (cost < leastCost) {
-            leastCost = cost;
-            leastIDX = idx;
+        Position leastPosData = null;
+        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+          final Position posData2 = positions.get(pos2);
+          for(int idx=0;idx<posData2.count;idx++) {
+            //System.out.println(" idx=" + idx + " cost=" + cost);
+            final int cost = posData2.costs[idx];
+            if (cost < leastCost) {
+              leastCost = cost;
+              leastIDX = idx;
+              leastPosData = posData2;
+            }
           }
         }
-        backtrace(posData, leastIDX);
+
+        // We will always have at least one live path:
+        assert leastIDX != -1;
+
+        // Second pass: prune all but the best path:
+        for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
+          final Position posData2 = positions.get(pos2);
+          if (posData2 != leastPosData) {
+            posData2.reset();
+          } else {
+            if (leastIDX != 0) {
+              posData2.costs[0] = posData2.costs[leastIDX];
+              posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
+              posData2.backPos[0] = posData2.backPos[leastIDX];
+              posData2.backIndex[0] = posData2.backIndex[leastIDX];
+              posData2.backID[0] = posData2.backID[leastIDX];
+              posData2.backType[0] = posData2.backType[leastIDX];
+            }
+            posData2.count = 1;
+          }
+        }
+
+        backtrace(leastPosData, 0);
 
         // Re-base cost so we don't risk int overflow:
-        Arrays.fill(posData.costs, 0, posData.count, 0);
+        Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
 
         if (pending.size() != 0) {
           return;
         } else {
           // This means the backtrace only produced
           // punctuation tokens, so we must keep parsing.
+          if (pos != leastPosData.pos) {
+            // We jumped into a future position; continue to
+            // the top of the loop to skip until we get
+            // there:
+            assert pos < leastPosData.pos;
+            continue;
+          }
         }
       }
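For context, the two-pass safety backtrace added above can be restated outside the tokenizer. The sketch below is a minimal illustration, not the Lucene code: the Position class, its forcePrune helper, and the backPos/backIndex fields are hypothetical simplifications of the tokenizer's per-position lattice state.

// Simplified, self-contained sketch of the pruning strategy described in the
// commit's comment block.  All names here are illustrative stand-ins, not the
// actual Lucene internals.
import java.util.Arrays;

class SafetyBacktraceSketch {

  // Hypothetical per-position lattice data: each position holds `count` live
  // partial paths, each with an accumulated cost and a back-pointer.
  // Capacity is fixed at 8 purely for brevity.
  static final class Position {
    int count;
    int[] costs = new int[8];
    int[] backPos = new int[8];
    int[] backIndex = new int[8];

    void reset() {
      count = 0;
    }
  }

  // Finds the single cheapest partial path across positions[from..to), prunes
  // every other path, and returns the position holding the survivor (now
  // collapsed into slot 0).  Mirrors the two-pass structure of the fix.
  static Position forcePrune(Position[] positions, int from, int to) {
    // First pass: find the least-cost partial path, including paths that end
    // at future positions.
    int leastIDX = -1;
    int leastCost = Integer.MAX_VALUE;
    Position leastPosData = null;
    for (int pos2 = from; pos2 < to; pos2++) {
      final Position posData2 = positions[pos2];
      for (int idx = 0; idx < posData2.count; idx++) {
        if (posData2.costs[idx] < leastCost) {
          leastCost = posData2.costs[idx];
          leastIDX = idx;
          leastPosData = posData2;
        }
      }
    }
    assert leastIDX != -1 : "there is always at least one live path";

    // Second pass: prune all but the best path, moving the winner to slot 0.
    for (int pos2 = from; pos2 < to; pos2++) {
      final Position posData2 = positions[pos2];
      if (posData2 != leastPosData) {
        posData2.reset();
      } else {
        if (leastIDX != 0) {
          posData2.costs[0] = posData2.costs[leastIDX];
          posData2.backPos[0] = posData2.backPos[leastIDX];
          posData2.backIndex[0] = posData2.backIndex[leastIDX];
        }
        posData2.count = 1;
      }
    }

    // Re-base the surviving cost so very long documents cannot overflow int:
    Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
    return leastPosData;
  }
}

As the commit comment concedes, this can in principle commit to a partial path that the true global best path would not have passed through; accepting that is the cost of bounding how much lattice the tokenizer buffers on large documents.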


@@ -192,7 +192,6 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
   }
 
   /** blast some random large strings through the analyzer */
-  @Ignore("FIXME: see LUCENE-3897")
   public void testRandomHugeStrings() throws Exception {
     checkRandomData(random, analyzer, 200*RANDOM_MULTIPLIER, 8192);
     checkRandomData(random, analyzerNoPunct, 200*RANDOM_MULTIPLIER, 8192);
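Removing the @Ignore re-enables the random huge-string test that originally exposed the failure in LUCENE-3897. A rough manual equivalent is to push one large mixed-script string through the analyzer and drain the token stream, as sketched below; the KuromojiAnalyzer class and Version constant are assumed from the 3.x-era Kuromoji module and are not part of this commit, and checkRandomData performs far more validation than this.

// Manual smoke test for large documents: build an ~8 KB mixed ASCII/Hiragana
// string and tokenize it end to end.
import java.io.StringReader;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.kuromoji.KuromojiAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class HugeDocSmokeTest {
  public static void main(String[] args) throws Exception {
    Random random = new Random(42);
    StringBuilder sb = new StringBuilder();
    while (sb.length() < 8192) {
      // Mix ASCII and Hiragana so the tokenizer crosses many segment types:
      sb.append((char) ('a' + random.nextInt(26)));
      sb.append((char) (0x3041 + random.nextInt(0x3096 - 0x3041)));
    }

    Analyzer analyzer = new KuromojiAnalyzer(Version.LUCENE_36);
    TokenStream ts = analyzer.tokenStream("field", new StringReader(sb.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int tokens = 0;
    int chars = 0;
    while (ts.incrementToken()) {
      tokens++;
      chars += termAtt.length();
    }
    ts.end();
    ts.close();
    System.out.println("tokenized " + tokens + " tokens (" + chars + " chars) from a large doc");
  }
}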