mirror of https://github.com/apache/lucene.git
LUCENE-3940: fix Kuromoji to not produce an invalid token graph when an UNK token containing punctuation is decompounded
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1311072 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 879e825083
commit 78b4be5dc6
@@ -28,6 +28,8 @@ import java.io.Writer;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Random;
+import java.util.Map;
+import java.util.HashMap;
 
 import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.util.Attribute;
@@ -129,7 +131,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
     }
+
+    // Maps position to the start/end offset:
+    final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
+    final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
 
     ts.reset();
+    int pos = -1;
     for (int i = 0; i < output.length; i++) {
       // extra safety to enforce, that the state is not preserved and also assign bogus values
       ts.clearAttributes();
@@ -157,14 +164,51 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
 
       // we can enforce some basic things about a few attributes even if the caller doesn't check:
       if (offsetAtt != null) {
-        assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
-        assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
-        assertTrue("endOffset must be >= startOffset, got startOffset=" + offsetAtt.startOffset() + ",endOffset=" + offsetAtt.endOffset(),
-                   offsetAtt.endOffset() >= offsetAtt.startOffset());
+        final int startOffset = offsetAtt.startOffset();
+        final int endOffset = offsetAtt.endOffset();
+        assertTrue("startOffset must be >= 0", startOffset >= 0);
+        assertTrue("endOffset must be >= 0", endOffset >= 0);
+        assertTrue("endOffset must be >= startOffset, got startOffset=" + startOffset + ",endOffset=" + endOffset,
+                   endOffset >= startOffset);
         if (finalOffset != null) {
-          assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
-          assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
-                     offsetAtt.endOffset() <= finalOffset.intValue());
+          assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
+          assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(),
+                     endOffset <= finalOffset.intValue());
         }
+
+        if (posLengthAtt != null && posIncrAtt != null) {
+          // Validate offset consistency in the graph, ie
+          // all tokens leaving from a certain pos have the
+          // same startOffset, and all tokens arriving to a
+          // certain pos have the same endOffset:
+          final int posInc = posIncrAtt.getPositionIncrement();
+          pos += posInc;
+
+          final int posLength = posLengthAtt.getPositionLength();
+
+          if (!posToStartOffset.containsKey(pos)) {
+            // First time we've seen a token leaving from this position:
+            posToStartOffset.put(pos, startOffset);
+            //System.out.println("  + s " + pos + " -> " + startOffset);
+          } else {
+            // We've seen a token leaving from this position
+            // before; verify the startOffset is the same:
+            //System.out.println("  + vs " + pos + " -> " + startOffset);
+            assertEquals(posToStartOffset.get(pos).intValue(), startOffset);
+          }
+
+          final int endPos = pos + posLength;
+
+          if (!posToEndOffset.containsKey(endPos)) {
+            // First time we've seen a token arriving to this position:
+            posToEndOffset.put(endPos, endOffset);
+            //System.out.println("  + e " + endPos + " -> " + endOffset);
+          } else {
+            // We've seen a token arriving to this position
+            // before; verify the endOffset is the same:
+            //System.out.println("  + ve " + endPos + " -> " + endOffset);
+            assertEquals(posToEndOffset.get(endPos).intValue(), endOffset);
+          }
+        }
       }
       if (posIncrAtt != null) {
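In graph terms, the two maps assert that every token leaving a given position shares one startOffset and every token arriving at a given position shares one endOffset, which is exactly the invariant the Kuromoji fix restores. A minimal standalone sketch of the same check, using a hypothetical Tok record (Java 16+) instead of Lucene's attribute API:

import java.util.HashMap;
import java.util.Map;

public class GraphOffsetCheck {
  // Hypothetical token holder; Lucene reads the same four values
  // through attributes on the TokenStream instead.
  record Tok(int posInc, int posLen, int startOffset, int endOffset) {}

  static void check(Tok[] stream) {
    Map<Integer,Integer> posToStart = new HashMap<>();
    Map<Integer,Integer> posToEnd = new HashMap<>();
    int pos = -1;
    for (Tok t : stream) {
      pos += t.posInc();
      Integer start = posToStart.putIfAbsent(pos, t.startOffset());
      if (start != null && start != t.startOffset()) {
        throw new AssertionError("two tokens leave pos=" + pos + " with different startOffsets");
      }
      int endPos = pos + t.posLen();
      Integer end = posToEnd.putIfAbsent(endPos, t.endOffset());
      if (end != null && end != t.endOffset()) {
        throw new AssertionError("two tokens arrive at pos=" + endPos + " with different endOffsets");
      }
    }
  }

  public static void main(String[] args) {
    // 羽田空港 decompounded to 羽田 + 空港, keeping the compound as an
    // alternate path with posLen=2; this graph passes the check:
    check(new Tok[] {
      new Tok(1, 2, 0, 4), // 羽田空港 (compound, spans two positions)
      new Tok(0, 1, 0, 2), // 羽田 leaves the same position: same startOffset
      new Tok(1, 1, 2, 4), // 空港 arrives where the compound ends: same endOffset
    });
  }
}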
@@ -395,12 +439,41 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
       try {
         checkAnalysisConsistency(random, a, useCharFilter, text);
       } catch (Throwable t) {
-        System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + text + "'");
+        System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + escape(text) + "'");
         Rethrow.rethrow(t);
       }
     }
   }
 
+  public static String escape(String s) {
+    int charUpto = 0;
+    final StringBuilder sb = new StringBuilder();
+    while (charUpto < s.length()) {
+      final int c = s.codePointAt(charUpto);
+      if (c == 0xa) {
+        // Strangely, you cannot put \ u000A into Java
+        // sources (not in a comment nor a string
+        // constant)...:
+        sb.append("\\n");
+      } else if (c == 0xd) {
+        // ... nor \ u000D:
+        sb.append("\\r");
+      } else if (c == '"') {
+        sb.append("\\\"");
+      } else if (c == '\\') {
+        sb.append("\\\\");
+      } else if (c >= 0x20 && c < 0x80) {
+        sb.append((char) c);
+      } else {
+        // TODO: we can make ascii easier to read if we
+        // don't escape...
+        sb.append(String.format("\\u%04x", c));
+      }
+      charUpto += Character.charCount(c);
+    }
+    return sb.toString();
+  }
+
   public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
 
     if (VERBOSE) {
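The new escape helper makes failing random-text seeds reproducible by printing control and non-ASCII characters as Java escapes instead of raw bytes. An illustrative call, with the output derived from the code above:

// Illustrative only; escape is the public static helper added above.
String s = "羽田\nfoo\"bar\\";
System.out.println(BaseTokenStreamTestCase.escape(s));
// prints: \u7fbd\u7530\nfoo\"bar\\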
@@ -513,79 +586,79 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
         ts.close();
       }
     }
   }
 
   // Final pass: verify clean tokenization matches
   // results from first pass:
 
   if (VERBOSE) {
     System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
   }
   reader = new StringReader(text);
 
   if (random.nextInt(30) == 7) {
     if (VERBOSE) {
       System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
     }
     reader = new MockReaderWrapper(random, reader);
   }
 
   ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
   if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
     // offset + pos + posLength + type
     assertTokenStreamContents(ts,
                               tokens.toArray(new String[tokens.size()]),
                               toIntArray(startOffsets),
                               toIntArray(endOffsets),
                               types.toArray(new String[types.size()]),
                               toIntArray(positions),
                               toIntArray(positionLengths),
                               text.length());
   } else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
     // offset + pos + type
     assertTokenStreamContents(ts,
                               tokens.toArray(new String[tokens.size()]),
                               toIntArray(startOffsets),
                               toIntArray(endOffsets),
                               types.toArray(new String[types.size()]),
                               toIntArray(positions),
                               null,
                               text.length());
   } else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
     // offset + pos + posLength
     assertTokenStreamContents(ts,
                               tokens.toArray(new String[tokens.size()]),
                               toIntArray(startOffsets),
                               toIntArray(endOffsets),
                               null,
                               toIntArray(positions),
                               toIntArray(positionLengths),
                               text.length());
   } else if (posIncAtt != null && offsetAtt != null) {
     // offset + pos
     assertTokenStreamContents(ts,
                               tokens.toArray(new String[tokens.size()]),
                               toIntArray(startOffsets),
                               toIntArray(endOffsets),
                               null,
                               toIntArray(positions),
                               null,
                               text.length());
   } else if (offsetAtt != null) {
     // offset
     assertTokenStreamContents(ts,
                               tokens.toArray(new String[tokens.size()]),
                               toIntArray(startOffsets),
                               toIntArray(endOffsets),
                               null,
                               null,
                               null,
                               text.length());
   } else {
     // terms only
     assertTokenStreamContents(ts,
                               tokens.toArray(new String[tokens.size()]));
   }
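toIntArray is used throughout this block but its body lies outside the hunk. A plausible shape for it, an assumption rather than part of this diff, is just an unboxing copy:

// Assumed shape of the helper used above; not shown in this diff.
private static int[] toIntArray(java.util.List<Integer> list) {
  int[] ret = new int[list.size()];
  int upto = 0;
  for (Integer i : list) {
    ret[upto++] = i;  // unbox each recorded value into the primitive array
  }
  return ret;
}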

@@ -237,6 +237,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Position> extends TokenFilter {
       if (DEBUG) {
         System.out.println("  return inserted token");
       }
+      assert insertedTokenConsistent();
       insertPending = false;
       return true;
     }
@@ -253,6 +254,16 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Position> extends TokenFilter {
     }
   }
 
+  // If subclass inserted a token, make sure it had in fact
+  // looked ahead enough:
+  private boolean insertedTokenConsistent() {
+    final int posLen = posLenAtt.getPositionLength();
+    final Position endPosData = positions.get(outputPos + posLen);
+    assert endPosData.endOffset != -1;
+    assert offsetAtt.endOffset() == endPosData.endOffset;
+    return true;
+  }
+
   // TODO: end()?
   // TODO: close()?
 
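insertedTokenConsistent always returns true and is only invoked behind assert, so the lookahead bookkeeping check costs nothing when assertions are disabled. The same idiom in isolation, with hypothetical names:

// Hypothetical example of the assert-only validation idiom used above.
private boolean stateIsConsistent() {
  // Arbitrarily expensive checks are fine here; the method only runs
  // when assertions are enabled (java -ea).
  return true;
}

void step() {
  assert stateIsConsistent(); // compiles to a no-op without -ea
}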

@@ -31,6 +31,8 @@ public class MockCharFilter extends CharStream {
   // for testing only
   public MockCharFilter(Reader in, int remainder) {
     this.in = CharReader.get(in);
+    // TODO: instead of fixed remainder... maybe a fixed
+    // random seed?
     this.remainder = remainder;
     assert remainder >= 0 && remainder < 10 : "invalid parameter";
   }

@@ -22,14 +22,22 @@ import java.io.IOException;
 import java.util.Random;
 
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.util._TestUtil;
 
+// TODO: maybe, instead to be more "natural", we should make
+// a MockRemovesTokensTF, ideally subclassing FilteringTF
+// (in modules/analysis)
+
 // Randomly injects holes:
 public final class MockHoleInjectingTokenFilter extends TokenFilter {
 
   private final long randomSeed;
   private Random random;
   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+  private int maxPos;
+  private int pos;
 
   public MockHoleInjectingTokenFilter(Random random, TokenStream in) {
     super(in);
@@ -40,16 +48,28 @@ public final class MockHoleInjectingTokenFilter extends TokenFilter {
   public void reset() throws IOException {
     super.reset();
     random = new Random(randomSeed);
+    maxPos = -1;
+    pos = -1;
   }
 
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       final int posInc = posIncAtt.getPositionIncrement();
-      if (posInc > 0 && random.nextInt(5) == 3) {
-        posIncAtt.setPositionIncrement(posInc + _TestUtil.nextInt(random, 1, 5));
-        // TODO: should we tweak offsets...?
-      }
+
+      int nextPos = pos + posInc;
+
+      // Carefully inject a hole only where it won't mess up
+      // the graph:
+      if (posInc > 0 && maxPos <= nextPos && random.nextInt(5) == 3) {
+        final int holeSize = _TestUtil.nextInt(random, 1, 5);
+        posIncAtt.setPositionIncrement(posInc + holeSize);
+        nextPos += holeSize;
+      }
+
+      pos = nextPos;
+      maxPos = Math.max(maxPos, pos + posLenAtt.getPositionLength());
+
       return true;
     } else {
       return false;
@@ -58,5 +78,3 @@ public final class MockHoleInjectingTokenFilter extends TokenFilter {
 
   // TODO: end?
 }
-
-
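The filter is meant to wrap any test TokenStream; tracking maxPos (the furthest position any emitted token reaches via its position length) lets it enlarge a position increment only where no earlier token's arc would be left dangling. A minimal usage sketch, assuming the Lucene 4.x-era test-framework API; the analyzer wiring here is an assumption, only the filter's (Random, TokenStream) constructor comes from the diff above:

import java.io.Reader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

// Sketch: a throwaway test Analyzer that layers the hole injector
// over MockTokenizer.
class HoleInjectingAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tok = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
    TokenStream ts = new MockHoleInjectingTokenFilter(new Random(42), tok);
    return new TokenStreamComponents(tok, ts);
  }
}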

@@ -603,10 +603,10 @@ public final class JapaneseTokenizer extends Tokenizer {
 
       if (posData.count == 0) {
         // No arcs arrive here; move to next position:
-        pos++;
         if (VERBOSE) {
-          System.out.println("    no arcs in; skip");
+          System.out.println("    no arcs in; skip pos=" + pos);
         }
+        pos++;
         continue;
       }
 
@@ -785,6 +785,7 @@ public final class JapaneseTokenizer extends Tokenizer {
 
     // Find unknown match:
     final int characterId = characterDefinition.getCharacterClass(firstCharacter);
+    final boolean isPunct = isPunctuation(firstCharacter);
 
     // NOTE: copied from UnknownDictionary.lookup:
     int unknownWordLength;
@@ -798,7 +799,8 @@ public final class JapaneseTokenizer extends Tokenizer {
       if (ch == -1) {
         break;
       }
-      if (characterId == characterDefinition.getCharacterClass((char) ch)) {
+      if (characterId == characterDefinition.getCharacterClass((char) ch) &&
+          isPunctuation((char) ch) == isPunct) {
         unknownWordLength++;
       } else {
         break;
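This is the core of the fix: the unknown-word loop now refuses to mix punctuation and non-punctuation characters inside a single UNK token, so a run like "d." can no longer become one compound that later decompounds into a graph-breaking shape. The isPunctuation helper itself is outside this hunk; a plausible sketch based on Unicode general categories follows, an assumption rather than the verbatim method:

// Assumed shape of the helper: treat separators, control/format chars,
// and Unicode punctuation/symbol categories as "punctuation".
private static boolean isPunctuation(char ch) {
  switch (Character.getType(ch)) {
    case Character.SPACE_SEPARATOR:
    case Character.LINE_SEPARATOR:
    case Character.PARAGRAPH_SEPARATOR:
    case Character.CONTROL:
    case Character.FORMAT:
    case Character.DASH_PUNCTUATION:
    case Character.START_PUNCTUATION:
    case Character.END_PUNCTUATION:
    case Character.CONNECTOR_PUNCTUATION:
    case Character.OTHER_PUNCTUATION:
    case Character.MATH_SYMBOL:
    case Character.CURRENCY_SYMBOL:
    case Character.MODIFIER_SYMBOL:
    case Character.OTHER_SYMBOL:
    case Character.INITIAL_QUOTE_PUNCTUATION:
    case Character.FINAL_QUOTE_PUNCTUATION:
      return true;
    default:
      return false;
  }
}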
@@ -1099,18 +1101,26 @@ public final class JapaneseTokenizer extends Tokenizer {
           // The pruning we did when we created the altToken
           // ensures that the back trace will align back with
           // the start of the altToken:
-          // cannot assert...
-          //assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
+          assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
 
+          // NOTE: not quite right: the compound token may
+          // have had all punctuation back traced so far, but
+          // then the decompounded token at this position is
+          // not punctuation.  In this case backCount is 0,
+          // but we should maybe add the altToken anyway...?
+
-          if (VERBOSE) {
-            System.out.println("    add altToken=" + altToken);
-          }
-          backCount++;
-          altToken.setPositionLength(backCount);
-          pending.add(altToken);
+          if (backCount > 0) {
+            backCount++;
+            altToken.setPositionLength(backCount);
+            if (VERBOSE) {
+              System.out.println("    add altToken=" + altToken);
+            }
+            pending.add(altToken);
+          } else {
+            // This means alt token was all punct tokens:
+            if (VERBOSE) {
+              System.out.println("    discard all-punctuation altToken=" + altToken);
+            }
+            assert discardPunctuation;
+          }
           altToken = null;

@@ -50,7 +50,9 @@ public class Token {
 
   @Override
   public String toString() {
-    return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " type=" + type + " wordId=" + wordId + " leftID=" + dictionary.getLeftId(wordId) + ")";
+    return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " length=" + length +
+      " posLen=" + positionLength + " type=" + type + " wordId=" + wordId +
+      " leftID=" + dictionary.getLeftId(wordId) + ")";
   }
 
   /**
File diff suppressed because one or more lines are too long
@@ -644,4 +644,17 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
       System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
     }
   }
+
+  public void testWithPunctuation() throws Exception {
+    assertAnalyzesTo(analyzerNoPunct, "羽田。空港",
+                     new String[] { "羽田", "空港" },
+                     new int[] { 1, 1 });
+  }
+
+  public void testCompoundOverPunctuation() throws Exception {
+    assertAnalyzesToPositions(analyzerNoPunct, "dεε϶ϢϏΎϷΞͺ羽田",
+                              new String[] { "d", "ε", "ε", "ϢϏΎϷΞͺ", "羽田" },
+                              new int[] { 1, 1, 1, 1, 1 },
+                              new int[] { 1, 1, 1, 1, 1 });
+  }
 }