mirror of https://github.com/apache/lucene.git
Fix for LUCENE-3897 (KuromojiTokenizer fails with large docs)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303739 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a6fd306dfb
commit
d2eebf9330
|
@ -588,27 +588,71 @@ public final class KuromojiTokenizer extends Tokenizer {
|
|||
|
||||
if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
|
||||
// Safety: if we've buffered too much, force a
|
||||
// backtrace now:
|
||||
// backtrace now. We find the least-cost partial
|
||||
// path, across all paths, backtrace from it, and
|
||||
// then prune all others. Note that this, in
|
||||
// general, can produce the wrong result, if the
|
||||
// total best path did not in fact back trace
|
||||
// through this partial best path. But it's the
|
||||
// best we can do... (short of not having a
|
||||
// safety!).
|
||||
|
||||
// First pass: find least cost partial path so far,
|
||||
// including ending at future positions:
|
||||
int leastIDX = -1;
|
||||
int leastCost = Integer.MAX_VALUE;
|
||||
for(int idx=0;idx<posData.count;idx++) {
|
||||
//System.out.println(" idx=" + idx + " cost=" + cost);
|
||||
final int cost = posData.costs[idx];
|
||||
if (cost < leastCost) {
|
||||
leastCost = cost;
|
||||
leastIDX = idx;
|
||||
Position leastPosData = null;
|
||||
for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
|
||||
final Position posData2 = positions.get(pos2);
|
||||
for(int idx=0;idx<posData2.count;idx++) {
|
||||
//System.out.println(" idx=" + idx + " cost=" + cost);
|
||||
final int cost = posData.costs[idx];
|
||||
if (cost < leastCost) {
|
||||
leastCost = cost;
|
||||
leastIDX = idx;
|
||||
leastPosData = posData2;
|
||||
}
|
||||
}
|
||||
}
|
||||
backtrace(posData, leastIDX);
|
||||
|
||||
// We will always have at least one live path:
|
||||
assert leastIDX != -1;
|
||||
|
||||
// Second pass: prune all but the best path:
|
||||
for(int pos2=pos;pos2<positions.getNextPos();pos2++) {
|
||||
final Position posData2 = positions.get(pos2);
|
||||
if (posData2 != leastPosData) {
|
||||
posData2.reset();
|
||||
} else {
|
||||
if (leastIDX != 0) {
|
||||
posData2.costs[0] = posData2.costs[leastIDX];
|
||||
posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
|
||||
posData2.backPos[0] = posData2.backPos[leastIDX];
|
||||
posData2.backIndex[0] = posData2.backIndex[leastIDX];
|
||||
posData2.backID[0] = posData2.backID[leastIDX];
|
||||
posData2.backType[0] = posData2.backType[leastIDX];
|
||||
}
|
||||
posData2.count = 1;
|
||||
}
|
||||
}
|
||||
|
||||
backtrace(leastPosData, 0);
|
||||
|
||||
// Re-base cost so we don't risk int overflow:
|
||||
Arrays.fill(posData.costs, 0, posData.count, 0);
|
||||
Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);
|
||||
|
||||
if (pending.size() != 0) {
|
||||
return;
|
||||
} else {
|
||||
// This means the backtrace only produced
|
||||
// punctuation tokens, so we must keep parsing.
|
||||
if (pos != leastPosData.pos) {
|
||||
// We jumped into a future position; continue to
|
||||
// the top of the loop to skip until we get
|
||||
// there:
|
||||
assert pos < leastPosData.pos;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -192,7 +192,6 @@ public class TestKuromojiTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
/** blast some random large strings through the analyzer */
|
||||
@Ignore("FIXME: see LUCENE-3897")
|
||||
public void testRandomHugeStrings() throws Exception {
|
||||
checkRandomData(random, analyzer, 200*RANDOM_MULTIPLIER, 8192);
|
||||
checkRandomData(random, analyzerNoPunct, 200*RANDOM_MULTIPLIER, 8192);
|
||||
|
|
Loading…
Reference in New Issue