LUCENE-7668: add new test case; remove dead code; improve CannedTokenStream to copy all Token attributes

2017-01-31 11:56:07 -05:00 · 2017-01-31 11:56:07 -05:00 · 72eaeab715
parent a43ef8f480
commit 72eaeab715
3 changed files with 22 additions and 23 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterGraphFilter.java
@ -27,7 +27,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.AttributeSource;
@ -173,7 +172,6 @@ public final class WordDelimiterGraphFilter extends TokenFilter {
  private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAttribute = addAttribute(PositionLengthAttribute.class);
-  private final TypeAttribute typeAttribute = addAttribute(TypeAttribute.class);

  // used for iterating word delimiter breaks
  private final WordDelimiterIterator iterator;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterGraphFilter.java
@ -155,6 +155,19 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
    doSplitPossessive(0, "ra's", "ra", "s");
  }
  
+  public void testTokenType() throws Exception {
+    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
+    // test that subwords and catenated subwords keep
+    // the token type of the original token.
+    Token token = new Token("foo-bar", 5, 12);
+    token.setType("mytype");
+    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(new CannedTokenStream(token), DEFAULT_WORD_DELIM_TABLE, flags, null);
+
+    assertTokenStreamContents(wdf, 
+                              new String[] {"foobar", "foo", "bar"},
+                              new String[] {"mytype", "mytype", "mytype"});
+  }
+  
  /*
   * Set a large position increment gap of 10 if the token is "largegap" or "/"
   */
@ -177,7 +190,7 @@ public class TestWordDelimiterGraphFilter extends BaseTokenStreamTestCase {
      }
    }  
  }
-  
+
  public void testPositionIncrements() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/CannedTokenStream.java
@ -18,11 +18,9 @@ package org.apache.lucene.analysis;

 import java.io.IOException;

-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

 /**
 * TokenStream from a canned list of Tokens.
@ -30,23 +28,19 @@ import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 public final class CannedTokenStream extends TokenStream {
  private final Token[] tokens;
  private int upto = 0;
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final int finalOffset;
  private final int finalPosInc;

  public CannedTokenStream(Token... tokens) {
-    this.tokens = tokens;
-    finalOffset = 0;
-    finalPosInc = 0;
+    this(0, 0, tokens);
  }

  /** If you want trailing holes, pass a non-zero
   *  finalPosInc. */
  public CannedTokenStream(int finalPosInc, int finalOffset, Token... tokens) {
+    super(Token.TOKEN_ATTRIBUTE_FACTORY);
    this.tokens = tokens;
    this.finalOffset = finalOffset;
    this.finalPosInc = finalPosInc;
@ -62,16 +56,10 @@ public final class CannedTokenStream extends TokenStream {
  @Override
  public boolean incrementToken() {
    if (upto < tokens.length) {
-      final Token token = tokens[upto++];     
-      // TODO: can we just capture/restoreState so
-      // we get all attrs...?
-      clearAttributes();      
-      termAtt.setEmpty();
-      termAtt.append(token.toString());
-      posIncrAtt.setPositionIncrement(token.getPositionIncrement());
-      posLengthAtt.setPositionLength(token.getPositionLength());
-      offsetAtt.setOffset(token.startOffset(), token.endOffset());
-      payloadAtt.setPayload(token.getPayload());
+      clearAttributes();
+      // NOTE: this looks weird, casting offsetAtt to Token, but because we are using the Token class's AttributeFactory, all attributes are
+      // in fact backed by the Token class, so we just copy the current token into our Token:
+      tokens[upto++].copyTo((Token) offsetAtt);
      return true;
    } else {
      return false;