WordDelimiterFilter could loose position info

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@419321 13f79535-47bb-0310-9956-ffa450edef68
2006-07-05 19:36:08 +00:00 · 2006-07-05 19:36:08 +00:00 · a35e30cb35
parent b5919a7dc1
commit a35e30cb35
4 changed files with 58 additions and 26 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -43,6 +43,7 @@ Bug Fixes
 2. Added escaping of attribute values in the XML response (Erik Hatcher)
 3. Added empty extractTerms() to FunctionQuery to enable use in
    a MultiSearcher (Yonik)
+ 4. WordDelimiterFilter sometimes lost token positionIncrement information

 Other Changes
 1. Upgrade to Lucene 2.0 nightly build 2006-06-22, lucene SVN revision 416224,
--- a/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
+++ b/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
@ -123,10 +123,6 @@ final class WordDelimiterFilter extends TokenFilter {
    }
  }

-  private int charType(String s, int pos) {
-    return charType(s.charAt(pos));
-  }
-
  // use the type of the first char as the type
  // of the token.
  private int tokType(Token t) {
@ -170,16 +166,18 @@ final class WordDelimiterFilter extends TokenFilter {
    // Would it actually be faster to check for the common form
    // of isLetter() isLower()*, and then backtrack if it doesn't match?

+    int origPosIncrement;
    while(true) {
      Token t = input.next();
      if (t == null) return null;

      String s = t.termText();
-      int off=t.startOffset();
      int start=0;
      int end=s.length();
      if (end==0) continue;

+      origPosIncrement = t.getPositionIncrement();
+
      // Avoid calling charType more than once for each char (basically
      // avoid any backtracking).
      // makes code slightly more difficult, but faster.
@ -273,6 +271,7 @@ final class WordDelimiterFilter extends TokenFilter {
            // optimization... if this is the only token,
            // return it immediately.
            if (queue.size()==0) {
+              newtok.setPositionIncrement(origPosIncrement);
              return newtok;
            }

@ -376,7 +375,9 @@ final class WordDelimiterFilter extends TokenFilter {
    // System.out.println("##########AFTER COMBINATIONS:"+ str(queue));

    queuePos=1;
-    return queue.get(0);
+    Token tok = queue.get(0);
+    tok.setPositionIncrement(origPosIncrement);
+    return tok;
  }


@ -416,29 +417,10 @@ final class WordDelimiterFilter extends TokenFilter {
    }
  }

-  private String str(List<Token> lst) {
-    StringBuilder sb = new StringBuilder();
-    sb.append('{');
-    for (Token t : lst) {
-      sb.append('(');
-      sb.append('"');
-      sb.append(t.termText());
-      sb.append("\",increment=");
-      sb.append(Integer.toString(t.getPositionIncrement()));
-      sb.append(')');
-
-      sb.append(',');
-    }
-    sb.append('}');
-    return sb.toString();
-  }
-
-

  // questions:
  // negative numbers?  -42 indexed as just 42?
  // dollar sign?  $42
  // percent sign?  33%
  // downsides:  if source text is "powershot" then a query of "PowerShot" won't match!
-
 }
--- a/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
+++ b/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
@ -0,0 +1,49 @@
+package org.apache.solr.analysis;
+
+import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.solr.util.TestHarness;
+import org.apache.solr.request.SolrQueryRequest;
+
+/**
+ * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
+ */
+public class TestWordDelimiterFilter extends AbstractSolrTestCase {
+  public String getSchemaFile() { return "solr/conf/schema.xml"; }
+  public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
+
+
+  public void posTst(String v1, String v2, String s1, String s2) {
+    assertU(adoc("id",  "42",
+                 "subword", v1,
+                 "subword", v2));
+    assertU(commit());
+
+    // there is a positionIncrementGap of 100 between field values, so
+    // we test if that was maintained.
+    assertQ("position increment lost",
+            req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
+            ,"//result[@numFound=0]"
+    );
+    assertQ("position increment lost",
+            req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
+            ,"//result[@numFound=1]"
+    );
+  }
+
+
+  public void testRetainPositionIncrement() {
+    posTst("foo","bar","foo","bar");
+    posTst("-foo-","-bar-","foo","bar");
+    posTst("foo","bar","-foo-","-bar-");
+
+    posTst("123","456","123","456");
+    posTst("/123/","/456/","123","456");
+
+    posTst("/123/abc","qwe/456/","abc","qwe");
+
+    posTst("zoo-foo","bar-baz","foo","bar");
+    posTst("zoo-foo-123","456-bar-baz","foo","bar");
+  }
+
+
+}
--- a/src/test/test-files/solr/conf/schema.xml
+++ b/src/test/test-files/solr/conf/schema.xml
@ -174,7 +174,7 @@
      </analyzer>
    </fieldtype>

-    <fieldtype name="subword" class="solr.TextField">
+    <fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
      <analyzer type="index">
          <tokenizer class="solr.WhitespaceTokenizerFactory"/>
          <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>