LUCENE-3940: fix Kuromoji to not produce invalid token graph due to UNK with punctuation being decompounded

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1311072 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-04-08 19:17:17 +00:00
parent 879e825083
commit 78b4be5dc6
8 changed files with 235 additions and 90 deletions
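The test-framework change below (BaseTokenStreamTestCase) enforces the graph invariant that this commit restores in Kuromoji: every token leaving a given position must report the same startOffset, and every token arriving at a given position must report the same endOffset. A minimal standalone sketch of that check, with a hypothetical class name and made-up token values, not code from this commit:

import java.util.HashMap;
import java.util.Map;

// Standalone sketch of the graph-offset invariant; all names and values here
// are hypothetical, only the rule itself mirrors the test framework change.
public class TokenGraphOffsetCheck {

  static void check(int[] posIncs, int[] posLens, int[] starts, int[] ends) {
    final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
    final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
    int pos = -1;
    for (int i = 0; i < posIncs.length; i++) {
      pos += posIncs[i];
      // All tokens leaving this position must share one startOffset:
      final Integer start = posToStartOffset.get(pos);
      if (start == null) {
        posToStartOffset.put(pos, starts[i]);
      } else if (start.intValue() != starts[i]) {
        throw new AssertionError("startOffset mismatch at pos=" + pos);
      }
      // All tokens arriving at pos + posLen must share one endOffset:
      final int endPos = pos + posLens[i];
      final Integer end = posToEndOffset.get(endPos);
      if (end == null) {
        posToEndOffset.put(endPos, ends[i]);
      } else if (end.intValue() != ends[i]) {
        throw new AssertionError("endOffset mismatch at pos=" + endPos);
      }
    }
  }

  public static void main(String[] args) {
    // A compound token (posLen=2) ending at offset 3, while the decompounded
    // token arriving at the same position ends at offset 4 -> invalid graph,
    // which is the kind of output the unpatched tokenizer could produce:
    check(new int[] {1, 0, 1},   // position increments
          new int[] {2, 1, 1},   // position lengths
          new int[] {0, 0, 2},   // start offsets
          new int[] {3, 2, 4});  // end offsets
  }
}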

BaseTokenStreamTestCase.java

@@ -28,6 +28,8 @@ import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.Map;
import java.util.HashMap;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.Attribute;
@@ -129,7 +131,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
}
// Maps position to the start/end offset:
final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
ts.reset();
int pos = -1;
for (int i = 0; i < output.length; i++) {
// extra safety to enforce that the state is not preserved, and also to assign bogus values
ts.clearAttributes();
@@ -157,14 +164,51 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// we can enforce some basic things about a few attributes even if the caller doesn't check:
if (offsetAtt != null) {
assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
assertTrue("endOffset must be >= startOffset, got startOffset=" + offsetAtt.startOffset() + ",endOffset=" + offsetAtt.endOffset(),
offsetAtt.endOffset() >= offsetAtt.startOffset());
final int startOffset = offsetAtt.startOffset();
final int endOffset = offsetAtt.endOffset();
assertTrue("startOffset must be >= 0", startOffset >= 0);
assertTrue("endOffset must be >= 0", endOffset >= 0);
assertTrue("endOffset must be >= startOffset, got startOffset=" + startOffset + ",endOffset=" + endOffset,
endOffset >= startOffset);
if (finalOffset != null) {
assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
offsetAtt.endOffset() <= finalOffset.intValue());
assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(),
endOffset <= finalOffset.intValue());
}
if (posLengthAtt != null && posIncrAtt != null) {
// Validate offset consistency in the graph, ie
// all tokens leaving from a certain pos have the
// same startOffset, and all tokens arriving to a
// certain pos have the same endOffset:
final int posInc = posIncrAtt.getPositionIncrement();
pos += posInc;
final int posLength = posLengthAtt.getPositionLength();
if (!posToStartOffset.containsKey(pos)) {
// First time we've seen a token leaving from this position:
posToStartOffset.put(pos, startOffset);
//System.out.println(" + s " + pos + " -> " + startOffset);
} else {
// We've seen a token leaving from this position
// before; verify the startOffset is the same:
//System.out.println(" + vs " + pos + " -> " + startOffset);
assertEquals(posToStartOffset.get(pos).intValue(), startOffset);
}
final int endPos = pos + posLength;
if (!posToEndOffset.containsKey(endPos)) {
// First time we've seen a token arriving to this position:
posToEndOffset.put(endPos, endOffset);
//System.out.println(" + e " + endPos + " -> " + endOffset);
} else {
// We've seen a token arriving to this position
// before; verify the endOffset is the same:
//System.out.println(" + ve " + endPos + " -> " + endOffset);
assertEquals(posToEndOffset.get(endPos).intValue(), endOffset);
}
}
}
if (posIncrAtt != null) {
@@ -395,12 +439,41 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
try {
checkAnalysisConsistency(random, a, useCharFilter, text);
} catch (Throwable t) {
System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + text + "'");
System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + escape(text) + "'");
Rethrow.rethrow(t);
}
}
}
public static String escape(String s) {
int charUpto = 0;
final StringBuilder sb = new StringBuilder();
while (charUpto < s.length()) {
final int c = s.codePointAt(charUpto);
if (c == 0xa) {
// Strangely, you cannot put \ u000A into Java
// sources (not in a comment nor a string
// constant)...:
sb.append("\\n");
} else if (c == 0xd) {
// ... nor \ u000D:
sb.append("\\r");
} else if (c == '"') {
sb.append("\\\"");
} else if (c == '\\') {
sb.append("\\\\");
} else if (c >= 0x20 && c < 0x80) {
sb.append((char) c);
} else {
// TODO: we can make ascii easier to read if we
// don't escape...
sb.append(String.format("\\u%04x", c));
}
charUpto += Character.charCount(c);
}
return sb.toString();
}
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
if (VERBOSE) {
@@ -513,79 +586,79 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
ts.close();
}
}
}
// Final pass: verify clean tokenization matches
// results from first pass:
// Final pass: verify clean tokenization matches
// results from first pass:
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
if (random.nextInt(30) == 7) {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
if (random.nextInt(30) == 7) {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
}
reader = new MockReaderWrapper(random, reader);
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
}
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
text.length());
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
null,
text.length());
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
null,
null,
text.length());
} else {
// terms only
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
reader = new MockReaderWrapper(random, reader);
}
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
text.length());
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
null,
text.length());
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
null,
null,
text.length());
} else {
// terms only
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
}
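A small usage sketch for the new escape() helper above; the failing text is hypothetical, and the snippet assumes the Lucene test framework (org.apache.lucene.analysis.BaseTokenStreamTestCase) is on the classpath:

import org.apache.lucene.analysis.BaseTokenStreamTestCase;

public class EscapeDemo {
  public static void main(String[] args) {
    // Hypothetical failing random text containing Japanese characters and a newline:
    final String text = "羽田。\n空港";
    // escape() makes it copy-pasteable back into a Java test source file:
    System.out.println(BaseTokenStreamTestCase.escape(text));
    // prints: \u7fbd\u7530\u3002\n\u7a7a\u6e2f
  }
}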

LookaheadTokenFilter.java

@@ -237,6 +237,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
if (DEBUG) {
System.out.println(" return inserted token");
}
assert insertedTokenConsistent();
insertPending = false;
return true;
}
@@ -253,6 +254,16 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
}
}
// If subclass inserted a token, make sure it had in fact
// looked ahead enough:
private boolean insertedTokenConsistent() {
final int posLen = posLenAtt.getPositionLength();
final Position endPosData = positions.get(outputPos + posLen);
assert endPosData.endOffset != -1;
assert offsetAtt.endOffset() == endPosData.endOffset;
return true;
}
// TODO: end()?
// TODO: close()?
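What the new insertedTokenConsistent() assertion above checks, as a standalone sketch; Position here is a simplified stand-in for LookaheadTokenFilter.Position and the numbers are made up:

// An inserted token is only consistent if the filter has already buffered
// ahead to the position the token ends at, and agrees on its end offset.
public class InsertedTokenCheck {

  static class Position {
    int endOffset = -1;  // -1 until some buffered token has arrived here
  }

  static boolean insertedTokenConsistent(Position[] positions, int outputPos,
                                         int posLen, int endOffset) {
    final Position endPosData = positions[outputPos + posLen];
    return endPosData.endOffset != -1 && endPosData.endOffset == endOffset;
  }

  public static void main(String[] args) {
    final Position[] positions = {new Position(), new Position(), new Position()};
    positions[2].endOffset = 5;  // a buffered token already ends at position 2
    System.out.println(insertedTokenConsistent(positions, 0, 2, 5));  // true
    System.out.println(insertedTokenConsistent(positions, 0, 2, 7));  // false
  }
}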

MockCharFilter.java

@@ -31,6 +31,8 @@ public class MockCharFilter extends CharStream {
// for testing only
public MockCharFilter(Reader in, int remainder) {
this.in = CharReader.get(in);
// TODO: instead of fixed remainder... maybe a fixed
// random seed?
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
}

MockHoleInjectingTokenFilter.java

@@ -22,14 +22,22 @@ import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util._TestUtil;
// TODO: maybe, instead to be more "natural", we should make
// a MockRemovesTokensTF, ideally subclassing FilteringTF
// (in modules/analysis)
// Randomly injects holes:
public final class MockHoleInjectingTokenFilter extends TokenFilter {
private final long randomSeed;
private Random random;
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private int maxPos;
private int pos;
public MockHoleInjectingTokenFilter(Random random, TokenStream in) {
super(in);
@@ -40,16 +48,28 @@ public final class MockHoleInjectingTokenFilter extends TokenFilter {
public void reset() throws IOException {
super.reset();
random = new Random(randomSeed);
maxPos = -1;
pos = -1;
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
final int posInc = posIncAtt.getPositionIncrement();
if (posInc > 0 && random.nextInt(5) == 3) {
posIncAtt.setPositionIncrement(posInc + _TestUtil.nextInt(random, 1, 5));
// TODO: should we tweak offsets...?
int nextPos = pos + posInc;
// Carefully inject a hole only where it won't mess up
// the graph:
if (posInc > 0 && maxPos <= nextPos && random.nextInt(5) == 3) {
final int holeSize = _TestUtil.nextInt(random, 1, 5);
posIncAtt.setPositionIncrement(posInc + holeSize);
nextPos += holeSize;
}
pos = nextPos;
maxPos = Math.max(maxPos, pos + posLenAtt.getPositionLength());
return true;
} else {
return false;
@@ -58,5 +78,3 @@ public final class MockHoleInjectingTokenFilter extends TokenFilter {
// TODO: end?
}
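Why the filter above now tracks maxPos: a hole may only widen a position increment when no earlier token's position length reaches past the new position, otherwise an enclosing compound token would end in the middle of the hole. A standalone sketch over a hypothetical stream (a compound "ab" over the parts "a" and "b", then "c"):

// Standalone illustration of the hole-injection rule; the token stream here is
// hypothetical, only the maxPos bookkeeping mirrors the filter change above.
public class HoleInjectionRule {
  public static void main(String[] args) {
    final String[] terms = {"ab", "a", "b", "c"};  // "ab" is a compound over "a" + "b"
    final int[] posIncs  = {1, 0, 1, 1};
    final int[] posLens  = {2, 1, 1, 1};
    int pos = -1;
    int maxPos = -1;
    for (int i = 0; i < terms.length; i++) {
      final int nextPos = pos + posIncs[i];
      // A hole is only safe if no previously seen token spans past nextPos:
      final boolean holeSafe = posIncs[i] > 0 && maxPos <= nextPos;
      System.out.println(terms[i] + ": nextPos=" + nextPos + " holeSafe=" + holeSafe);
      pos = nextPos;
      maxPos = Math.max(maxPos, pos + posLens[i]);
    }
    // Output: a hole before "b" would push it past the end of the enclosing
    // "ab" token (maxPos=2 > nextPos=1), so only "ab" and "c" report holeSafe=true.
  }
}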

JapaneseTokenizer.java

@@ -603,10 +603,10 @@ public final class JapaneseTokenizer extends Tokenizer {
if (posData.count == 0) {
// No arcs arrive here; move to next position:
pos++;
if (VERBOSE) {
System.out.println(" no arcs in; skip");
System.out.println(" no arcs in; skip pos=" + pos);
}
pos++;
continue;
}
@@ -785,6 +785,7 @@ public final class JapaneseTokenizer extends Tokenizer {
// Find unknown match:
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
@@ -798,7 +799,8 @@ public final class JapaneseTokenizer extends Tokenizer {
if (ch == -1) {
break;
}
if (characterId == characterDefinition.getCharacterClass((char) ch)) {
if (characterId == characterDefinition.getCharacterClass((char) ch) &&
isPunctuation((char) ch) == isPunct) {
unknownWordLength++;
} else {
break;
@@ -1099,18 +1101,26 @@ public final class JapaneseTokenizer extends Tokenizer {
// The pruning we did when we created the altToken
// ensures that the back trace will align back with
// the start of the altToken:
// cannot assert...
//assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
// NOTE: not quite right: the compound token may
// have had all punctuation back traced so far, but
// then the decompounded token at this position is
// not punctuation. In this case backCount is 0,
// but we should maybe add the altToken anyway...?
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
if (backCount > 0) {
backCount++;
altToken.setPositionLength(backCount);
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
pending.add(altToken);
} else {
// This means alt token was all punct tokens:
if (VERBOSE) {
System.out.println(" discard all-punctuation altToken=" + altToken);
}
assert discardPunctuation;
}
altToken = null;
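The lookup change above is the heart of the fix: an unknown (UNK) run is now extended only while both the character class and the punctuation status match its first character, so punctuation no longer gets folded into a UNK token that is later decompounded into an inconsistent graph. A simplified standalone sketch of that rule; characterClass() and isPunctuation() are rough stand-ins, not Kuromoji's CharacterDefinition:

// Simplified sketch of the new UNK grouping rule; characterClass() and
// isPunctuation() are hypothetical stand-ins for the Kuromoji implementations.
public class UnknownRunLength {

  static Character.UnicodeBlock characterClass(char ch) {
    return Character.UnicodeBlock.of(ch);
  }

  static boolean isPunctuation(char ch) {
    final int type = Character.getType(ch);
    return type == Character.OTHER_PUNCTUATION
        || type == Character.MATH_SYMBOL
        || type == Character.SPACE_SEPARATOR;
  }

  // Length of the unknown run starting at 'start': stop as soon as either the
  // character class or the punctuation status differs from the first character.
  static int unknownWordLength(String text, int start) {
    final Character.UnicodeBlock firstClass = characterClass(text.charAt(start));
    final boolean firstIsPunct = isPunctuation(text.charAt(start));
    int length = 1;
    for (int i = start + 1; i < text.length(); i++) {
      final char ch = text.charAt(i);
      if (characterClass(ch) == firstClass && isPunctuation(ch) == firstIsPunct) {
        length++;
      } else {
        break;
      }
    }
    return length;
  }

  public static void main(String[] args) {
    // "϶" (U+03F6) lives in the same Greek block as "ε" but is a math symbol,
    // so the run now stops before it instead of swallowing it:
    System.out.println(unknownWordLength("εε϶Ϣ", 0));  // prints 2
  }
}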

Token.java

@@ -50,7 +50,9 @@ public class Token {
@Override
public String toString() {
return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " type=" + type + " wordId=" + wordId + " leftID=" + dictionary.getLeftId(wordId) + ")";
return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " length=" + length +
" posLen=" + positionLength + " type=" + type + " wordId=" + wordId +
" leftID=" + dictionary.getLeftId(wordId) + ")";
}
/**

File diff suppressed because one or more lines are too long

TestJapaneseTokenizer.java

@@ -644,4 +644,17 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
}
}
public void testWithPunctuation() throws Exception {
assertAnalyzesTo(analyzerNoPunct, "羽田。空港",
new String[] { "羽田", "空港" },
new int[] { 1, 1 });
}
public void testCompoundOverPunctuation() throws Exception {
assertAnalyzesToPositions(analyzerNoPunct, "dεε϶ϢϏΎϷΞͺ羽田",
new String[] { "d", "ε", "ε", "ϢϏΎϷΞͺ", "羽田" },
new int[] { 1, 1, 1, 1, 1},
new int[] { 1, 1, 1, 1, 1});
}
}