LUCENE-3940: fix Kuromoji to not produce invalid token graph due to UNK with punctuation being decompounded

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1311072 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2012-04-08 19:17:17 +00:00
parent 879e825083
commit 78b4be5dc6
8 changed files with 235 additions and 90 deletions
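The test-framework change below (BaseTokenStreamTestCase) enforces the graph invariant that this commit restores in Kuromoji: every token leaving a given position must report the same startOffset, and every token arriving at a given position must report the same endOffset. A minimal standalone sketch of that check, with a hypothetical class name and made-up token values, not code from this commit:

import java.util.HashMap;
import java.util.Map;

// Standalone sketch of the graph-offset invariant; all names and values here
// are hypothetical, only the rule itself mirrors the test framework change.
public class TokenGraphOffsetCheck {

  static void check(int[] posIncs, int[] posLens, int[] starts, int[] ends) {
    final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
    final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
    int pos = -1;
    for (int i = 0; i < posIncs.length; i++) {
      pos += posIncs[i];
      // All tokens leaving this position must share one startOffset:
      final Integer start = posToStartOffset.get(pos);
      if (start == null) {
        posToStartOffset.put(pos, starts[i]);
      } else if (start.intValue() != starts[i]) {
        throw new AssertionError("startOffset mismatch at pos=" + pos);
      }
      // All tokens arriving at pos + posLen must share one endOffset:
      final int endPos = pos + posLens[i];
      final Integer end = posToEndOffset.get(endPos);
      if (end == null) {
        posToEndOffset.put(endPos, ends[i]);
      } else if (end.intValue() != ends[i]) {
        throw new AssertionError("endOffset mismatch at pos=" + endPos);
      }
    }
  }

  public static void main(String[] args) {
    // A compound token (posLen=2) ending at offset 3, while the decompounded
    // token arriving at the same position ends at offset 4 -> invalid graph,
    // which is the kind of output the unpatched tokenizer could produce:
    check(new int[] {1, 0, 1},   // position increments
          new int[] {2, 1, 1},   // position lengths
          new int[] {0, 0, 2},   // start offsets
          new int[] {3, 2, 4});  // end offsets
  }
}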

BaseTokenStreamTestCase.java

@@ -28,6 +28,8 @@ import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.Map;
import java.util.HashMap;
import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.util.Attribute;
@@ -129,7 +131,12 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
posLengthAtt = ts.getAttribute(PositionLengthAttribute.class);
}
// Maps position to the start/end offset:
final Map<Integer,Integer> posToStartOffset = new HashMap<Integer,Integer>();
final Map<Integer,Integer> posToEndOffset = new HashMap<Integer,Integer>();
ts.reset();
int pos = -1;
for (int i = 0; i < output.length; i++) {
// extra safety to enforce that the state is not preserved, and also to assign bogus values
ts.clearAttributes();
@@ -157,14 +164,51 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
// we can enforce some basic things about a few attributes even if the caller doesn't check:
if (offsetAtt != null) {
assertTrue("startOffset must be >= 0", offsetAtt.startOffset() >= 0);
assertTrue("endOffset must be >= 0", offsetAtt.endOffset() >= 0);
assertTrue("endOffset must be >= startOffset, got startOffset=" + offsetAtt.startOffset() + ",endOffset=" + offsetAtt.endOffset(),
offsetAtt.endOffset() >= offsetAtt.startOffset());
final int startOffset = offsetAtt.startOffset();
final int endOffset = offsetAtt.endOffset();
assertTrue("startOffset must be >= 0", startOffset >= 0);
assertTrue("endOffset must be >= 0", endOffset >= 0);
assertTrue("endOffset must be >= startOffset, got startOffset=" + startOffset + ",endOffset=" + endOffset,
endOffset >= startOffset);
if (finalOffset != null) {
assertTrue("startOffset must be <= finalOffset", offsetAtt.startOffset() <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset: got endOffset=" + offsetAtt.endOffset() + " vs finalOffset=" + finalOffset.intValue(),
offsetAtt.endOffset() <= finalOffset.intValue());
assertTrue("startOffset must be <= finalOffset", startOffset <= finalOffset.intValue());
assertTrue("endOffset must be <= finalOffset: got endOffset=" + endOffset + " vs finalOffset=" + finalOffset.intValue(),
endOffset <= finalOffset.intValue());
}
if (posLengthAtt != null && posIncrAtt != null) {
// Validate offset consistency in the graph, ie
// all tokens leaving from a certain pos have the
// same startOffset, and all tokens arriving to a
// certain pos have the same endOffset:
final int posInc = posIncrAtt.getPositionIncrement();
pos += posInc;
final int posLength = posLengthAtt.getPositionLength();
if (!posToStartOffset.containsKey(pos)) {
// First time we've seen a token leaving from this position:
posToStartOffset.put(pos, startOffset);
//System.out.println(" + s " + pos + " -> " + startOffset);
} else {
// We've seen a token leaving from this position
// before; verify the startOffset is the same:
//System.out.println(" + vs " + pos + " -> " + startOffset);
assertEquals(posToStartOffset.get(pos).intValue(), startOffset);
}
final int endPos = pos + posLength;
if (!posToEndOffset.containsKey(endPos)) {
// First time we've seen a token arriving to this position:
posToEndOffset.put(endPos, endOffset);
//System.out.println(" + e " + endPos + " -> " + endOffset);
} else {
// We've seen a token arriving to this position
// before; verify the endOffset is the same:
//System.out.println(" + ve " + endPos + " -> " + endOffset);
assertEquals(posToEndOffset.get(endPos).intValue(), endOffset);
}
}
}
if (posIncrAtt != null) {
@@ -395,12 +439,41 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
try {
checkAnalysisConsistency(random, a, useCharFilter, text);
} catch (Throwable t) {
System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + text + "'");
System.err.println("TEST FAIL: useCharFilter=" + useCharFilter + " text='" + escape(text) + "'");
Rethrow.rethrow(t);
}
}
}
public static String escape(String s) {
int charUpto = 0;
final StringBuilder sb = new StringBuilder();
while (charUpto < s.length()) {
final int c = s.codePointAt(charUpto);
if (c == 0xa) {
// Strangely, you cannot put \ u000A into Java
// sources (not in a comment nor a string
// constant)...:
sb.append("\\n");
} else if (c == 0xd) {
// ... nor \ u000D:
sb.append("\\r");
} else if (c == '"') {
sb.append("\\\"");
} else if (c == '\\') {
sb.append("\\\\");
} else if (c >= 0x20 && c < 0x80) {
sb.append((char) c);
} else {
// TODO: we can make ascii easier to read if we
// don't escape...
sb.append(String.format("\\u%04x", c));
}
charUpto += Character.charCount(c);
}
return sb.toString();
}
public static void checkAnalysisConsistency(Random random, Analyzer a, boolean useCharFilter, String text) throws IOException {
if (VERBOSE) {
@@ -513,79 +586,79 @@ public abstract class BaseTokenStreamTestCase extends LuceneTestCase {
ts.close();
}
}
}
// Final pass: verify clean tokenization matches
// results from first pass:
// Final pass: verify clean tokenization matches
// results from first pass:
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
if (random.nextInt(30) == 7) {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: re-run analysis; " + tokens.size() + " tokens");
}
reader = new StringReader(text);
if (random.nextInt(30) == 7) {
if (VERBOSE) {
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
}
reader = new MockReaderWrapper(random, reader);
System.out.println(Thread.currentThread().getName() + ": NOTE: BaseTokenStreamTestCase: using spoon-feed reader");
}
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
text.length());
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
null,
text.length());
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
null,
null,
text.length());
} else {
// terms only
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
reader = new MockReaderWrapper(random, reader);
}
ts = a.tokenStream("dummy", useCharFilter ? new MockCharFilter(reader, remainder) : reader);
if (typeAtt != null && posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (typeAtt != null && posIncAtt != null && offsetAtt != null) {
// offset + pos + type
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
types.toArray(new String[types.size()]),
toIntArray(positions),
null,
text.length());
} else if (posIncAtt != null && posLengthAtt != null && offsetAtt != null) {
// offset + pos + posLength
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
toIntArray(positionLengths),
text.length());
} else if (posIncAtt != null && offsetAtt != null) {
// offset + pos
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
toIntArray(positions),
null,
text.length());
} else if (offsetAtt != null) {
// offset
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]),
toIntArray(startOffsets),
toIntArray(endOffsets),
null,
null,
null,
text.length());
} else {
// terms only
assertTokenStreamContents(ts,
tokens.toArray(new String[tokens.size()]));
}
}
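A small usage sketch for the new escape() helper above; the failing text is hypothetical, and the snippet assumes the Lucene test framework (org.apache.lucene.analysis.BaseTokenStreamTestCase) is on the classpath:

import org.apache.lucene.analysis.BaseTokenStreamTestCase;

public class EscapeDemo {
  public static void main(String[] args) {
    // Hypothetical failing random text containing Japanese characters and a newline:
    final String text = "羽田。\n空港";
    // escape() makes it copy-pasteable back into a Java test source file:
    System.out.println(BaseTokenStreamTestCase.escape(text));
    // prints: \u7fbd\u7530\u3002\n\u7a7a\u6e2f
  }
}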

LookaheadTokenFilter.java

@@ -237,6 +237,7 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
if (DEBUG) {
System.out.println(" return inserted token");
}
assert insertedTokenConsistent();
insertPending = false;
return true;
}
@@ -253,6 +254,16 @@ public abstract class LookaheadTokenFilter<T extends LookaheadTokenFilter.Positi
}
}
// If subclass inserted a token, make sure it had in fact
// looked ahead enough:
private boolean insertedTokenConsistent() {
final int posLen = posLenAtt.getPositionLength();
final Position endPosData = positions.get(outputPos + posLen);
assert endPosData.endOffset != -1;
assert offsetAtt.endOffset() == endPosData.endOffset;
return true;
}
// TODO: end()?
// TODO: close()?
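What the new insertedTokenConsistent() assertion above checks, as a standalone sketch; Position here is a simplified stand-in for LookaheadTokenFilter.Position and the numbers are made up:

// An inserted token is only consistent if the filter has already buffered
// ahead to the position the token ends at, and agrees on its end offset.
public class InsertedTokenCheck {

  static class Position {
    int endOffset = -1;  // -1 until some buffered token has arrived here
  }

  static boolean insertedTokenConsistent(Position[] positions, int outputPos,
                                         int posLen, int endOffset) {
    final Position endPosData = positions[outputPos + posLen];
    return endPosData.endOffset != -1 && endPosData.endOffset == endOffset;
  }

  public static void main(String[] args) {
    final Position[] positions = {new Position(), new Position(), new Position()};
    positions[2].endOffset = 5;  // a buffered token already ends at position 2
    System.out.println(insertedTokenConsistent(positions, 0, 2, 5));  // true
    System.out.println(insertedTokenConsistent(positions, 0, 2, 7));  // false
  }
}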

MockCharFilter.java

@@ -31,6 +31,8 @@ public class MockCharFilter extends CharStream {
// for testing only
public MockCharFilter(Reader in, int remainder) {
this.in = CharReader.get(in);
// TODO: instead of fixed remainder... maybe a fixed
// random seed?
this.remainder = remainder;
assert remainder >= 0 && remainder < 10 : "invalid parameter";
}

MockHoleInjectingTokenFilter.java

@@ -22,14 +22,22 @@ import java.io.IOException;
import java.util.Random;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util._TestUtil;
// TODO: maybe, instead to be more "natural", we should make
// a MockRemovesTokensTF, ideally subclassing FilteringTF
// (in modules/analysis)
// Randomly injects holes:
public final class MockHoleInjectingTokenFilter extends TokenFilter {
private final long randomSeed;
private Random random;
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private int maxPos;
private int pos;
public MockHoleInjectingTokenFilter(Random random, TokenStream in) {
super(in);
@@ -40,16 +48,28 @@ public final class MockHoleInjectingTokenFilter extends TokenFilter {
public void reset() throws IOException {
super.reset();
random = new Random(randomSeed);
maxPos = -1;
pos = -1;
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
final int posInc = posIncAtt.getPositionIncrement();
if (posInc > 0 && random.nextInt(5) == 3) {
posIncAtt.setPositionIncrement(posInc + _TestUtil.nextInt(random, 1, 5));
// TODO: should we tweak offsets...?
int nextPos = pos + posInc;
// Carefully inject a hole only where it won't mess up
// the graph:
if (posInc > 0 && maxPos <= nextPos && random.nextInt(5) == 3) {
final int holeSize = _TestUtil.nextInt(random, 1, 5);
posIncAtt.setPositionIncrement(posInc + holeSize);
nextPos += holeSize;
}
pos = nextPos;
maxPos = Math.max(maxPos, pos + posLenAtt.getPositionLength());
return true;
} else {
return false;
@@ -58,5 +78,3 @@ public final class MockHoleInjectingTokenFilter extends TokenFilter {
// TODO: end?
}
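Why the filter above now tracks maxPos: a hole may only widen a position increment when no earlier token's position length reaches past the new position, otherwise an enclosing compound token would end in the middle of the hole. A standalone sketch over a hypothetical stream (a compound "ab" over the parts "a" and "b", then "c"):

// Standalone illustration of the hole-injection rule; the token stream here is
// hypothetical, only the maxPos bookkeeping mirrors the filter change above.
public class HoleInjectionRule {
  public static void main(String[] args) {
    final String[] terms = {"ab", "a", "b", "c"};  // "ab" is a compound over "a" + "b"
    final int[] posIncs  = {1, 0, 1, 1};
    final int[] posLens  = {2, 1, 1, 1};
    int pos = -1;
    int maxPos = -1;
    for (int i = 0; i < terms.length; i++) {
      final int nextPos = pos + posIncs[i];
      // A hole is only safe if no previously seen token spans past nextPos:
      final boolean holeSafe = posIncs[i] > 0 && maxPos <= nextPos;
      System.out.println(terms[i] + ": nextPos=" + nextPos + " holeSafe=" + holeSafe);
      pos = nextPos;
      maxPos = Math.max(maxPos, pos + posLens[i]);
    }
    // Output: a hole before "b" would push it past the end of the enclosing
    // "ab" token (maxPos=2 > nextPos=1), so only "ab" and "c" report holeSafe=true.
  }
}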

JapaneseTokenizer.java

@@ -603,10 +603,10 @@ public final class JapaneseTokenizer extends Tokenizer {
if (posData.count == 0) {
// No arcs arrive here; move to next position:
pos++;
if (VERBOSE) {
System.out.println(" no arcs in; skip");
System.out.println(" no arcs in; skip pos=" + pos);
}
pos++;
continue;
}
@@ -785,6 +785,7 @@ public final class JapaneseTokenizer extends Tokenizer {
// Find unknown match:
final int characterId = characterDefinition.getCharacterClass(firstCharacter);
final boolean isPunct = isPunctuation(firstCharacter);
// NOTE: copied from UnknownDictionary.lookup:
int unknownWordLength;
@@ -798,7 +799,8 @@ public final class JapaneseTokenizer extends Tokenizer {
if (ch == -1) {
break;
}
if (characterId == characterDefinition.getCharacterClass((char) ch)) {
if (characterId == characterDefinition.getCharacterClass((char) ch) &&
isPunctuation((char) ch) == isPunct) {
unknownWordLength++;
} else {
break;
@@ -1099,18 +1101,26 @@ public final class JapaneseTokenizer extends Tokenizer {
// The pruning we did when we created the altToken
// ensures that the back trace will align back with
// the start of the altToken:
// cannot assert...
//assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
assert altToken.getPosition() == backPos: altToken.getPosition() + " vs " + backPos;
// NOTE: not quite right: the compound token may
// have had all punctuation back traced so far, but
// then the decompounded token at this position is
// not punctuation. In this case backCount is 0,
// but we should maybe add the altToken anyway...?
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
if (backCount > 0) {
backCount++;
altToken.setPositionLength(backCount);
if (VERBOSE) {
System.out.println(" add altToken=" + altToken);
}
pending.add(altToken);
} else {
// This means alt token was all punct tokens:
if (VERBOSE) {
System.out.println(" discard all-punctuation altToken=" + altToken);
}
assert discardPunctuation;
}
altToken = null;
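The lookup change above is the heart of the fix: an unknown (UNK) run is now extended only while both the character class and the punctuation status match its first character, so punctuation no longer gets folded into a UNK token that is later decompounded into an inconsistent graph. A simplified standalone sketch of that rule; characterClass() and isPunctuation() are rough stand-ins, not Kuromoji's CharacterDefinition:

// Simplified sketch of the new UNK grouping rule; characterClass() and
// isPunctuation() are hypothetical stand-ins for the Kuromoji implementations.
public class UnknownRunLength {

  static Character.UnicodeBlock characterClass(char ch) {
    return Character.UnicodeBlock.of(ch);
  }

  static boolean isPunctuation(char ch) {
    final int type = Character.getType(ch);
    return type == Character.OTHER_PUNCTUATION
        || type == Character.MATH_SYMBOL
        || type == Character.SPACE_SEPARATOR;
  }

  // Length of the unknown run starting at 'start': stop as soon as either the
  // character class or the punctuation status differs from the first character.
  static int unknownWordLength(String text, int start) {
    final Character.UnicodeBlock firstClass = characterClass(text.charAt(start));
    final boolean firstIsPunct = isPunctuation(text.charAt(start));
    int length = 1;
    for (int i = start + 1; i < text.length(); i++) {
      final char ch = text.charAt(i);
      if (characterClass(ch) == firstClass && isPunctuation(ch) == firstIsPunct) {
        length++;
      } else {
        break;
      }
    }
    return length;
  }

  public static void main(String[] args) {
    // "϶" (U+03F6) lives in the same Greek block as "ε" but is a math symbol,
    // so the run now stops before it instead of swallowing it:
    System.out.println(unknownWordLength("εε϶Ϣ", 0));  // prints 2
  }
}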

Token.java

@@ -50,7 +50,9 @@ public class Token {
@Override
public String toString() {
return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " type=" + type + " wordId=" + wordId + " leftID=" + dictionary.getLeftId(wordId) + ")";
return "Token(\"" + new String(surfaceForm, offset, length) + "\" pos=" + position + " length=" + length +
" posLen=" + positionLength + " type=" + type + " wordId=" + wordId +
" leftID=" + dictionary.getLeftId(wordId) + ")";
}
/**

File diff suppressed because one or more lines are too long

TestJapaneseTokenizer.java

@@ -644,4 +644,17 @@ public class TestJapaneseTokenizer extends BaseTokenStreamTestCase {
System.out.println("Total time : " + (System.currentTimeMillis() - totalStart));
}
}
public void testWithPunctuation() throws Exception {
assertAnalyzesTo(analyzerNoPunct, "羽田。空港",
new String[] { "羽田", "空港" },
new int[] { 1, 1 });
}
public void testCompoundOverPunctuation() throws Exception {
assertAnalyzesToPositions(analyzerNoPunct, "dεε϶ϢϏΎϷΞͺ羽田",
new String[] { "d", "ε", "ε", "ϢϏΎϷΞͺ", "羽田" },
new int[] { 1, 1, 1, 1, 1},
new int[] { 1, 1, 1, 1, 1});
}
}