SOLR-89 new PatternReplaceFilter, TrimFilter, and corresponding Factories

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@494675 13f79535-47bb-0310-9956-ffa450edef68
2007-01-10 01:18:38 +00:00 · 2007-01-10 01:18:38 +00:00 · a2beac1e38
parent ae6c58f278
commit a2beac1e38
11 changed files with 456 additions and 1 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -37,6 +37,11 @@ Detailed Change List
 New Features
 1. SOLR-82: Default field values can be specified in the schema.xml.
    (Ryan McKinley via hossman)
+ 2. SOLR-89: Two new TokenFilters with corresponding Factories...
+    * TrimFilter - Trims leading and trailing whitespace from Tokens
+    * PatternReplaceFilter - applies a Pattern to each token in the
+      stream, replacing match occurrences with a specified replacement.
+    (hossman)

 Changes in runtime behavior
 1. Highlighting using DisMax will only pick up terms from the main 
--- a/example/exampledocs/mem.xml
+++ b/example/exampledocs/mem.xml
@ -18,7 +18,7 @@
 <add>
 <doc>
  <field name="id">TWINX2048-3200PRO</field>
-  <field name="name">CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
+  <field name="name">CORSAIR  XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
  <field name="manu">Corsair Microsystems Inc.</field>
  <field name="cat">electronics</field>
  <field name="cat">memory</field>
--- a/example/solr/conf/schema.xml
+++ b/example/solr/conf/schema.xml
@ -182,6 +182,39 @@
      </analyzer>
    </fieldtype>

+    <!-- This is an example of using the KeywordTokenizer along
+         with various TokenFilterFactories to produce a sortable field
+         that does not include some properties of the source text
+      -->
+    <fieldtype name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
+      <analyzer>
+        <!-- KeywordTokenizer does no actual tokenizing, so the entire
+             input string is preserved as a single token
+          -->
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <!-- The LowerCase TokenFilter does what you expect, which can be
+             useful when you want your sorting to be case insensitive
+          -->
+        <filter class="solr.LowerCaseFilterFactory" />
+        <!-- The TrimFilter removes any leading or trailing whitespace -->
+        <filter class="solr.TrimFilterFactory" />
+        <!-- The PatternReplaceFilter gives you the flexibility to use
+             Java Regular expressions to replace any sequence of characters
+             matching a pattern with an arbitrary replacement string, 
+             which may include back references to portions of the original
+             string matched by the pattern.
+             
+             See the Java Regular Expression documentation for more
+             information on pattern and replacement string syntax.
+             
+             http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
+          -->
+        <filter class="solr.PatternReplaceFilterFactory"
+                pattern="([^a-z])" replacement="" replace="all"
+        />
+      </analyzer>
+    </fieldtype>
+
 </types>


@ -204,6 +237,8 @@
   <field name="id" type="string" indexed="true" stored="true"/>
   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
   <field name="name" type="text" indexed="true" stored="true"/>
+   <field name="nameSort" type="string" indexed="true" stored="false"/>
+   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
   <field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
   <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true"/>
   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
@ -264,6 +299,8 @@

   <copyField source="cat" dest="text"/>
   <copyField source="name" dest="text"/>
+   <copyField source="name" dest="nameSort"/>
+   <copyField source="name" dest="alphaNameSort"/>
   <copyField source="manu" dest="text"/>
   <copyField source="features" dest="text"/>
   <copyField source="includes" dest="text"/>
--- a/src/java/org/apache/solr/analysis/PatternReplaceFilter.java
+++ b/src/java/org/apache/solr/analysis/PatternReplaceFilter.java
@ -0,0 +1,82 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import java.io.IOException;
+
+/**
+ * A TokenFilter which applies a Pattern to each token in the stream,
+ * replacing match occurrences with the specified replacement string.
+ *
+ * <p>
+ * <b>Note:</b> Depending on the pattern used and the contents of the input
+ * TokenStream, this TokenFilter may produce Tokens whose text is the empty
+ * string.
+ * </p>
+ * 
+ * @version $Id:$
+ * @see Pattern
+ */
+public final class PatternReplaceFilter extends TokenFilter {
+  /** the compiled pattern matched against each token's text */
+  Pattern p;
+  /** the replacement string handed to the Matcher, never null */
+  String replacement;
+  /** true to replace every match, false to replace only the first */
+  boolean all = true;
+
+  /**
+   * Constructs an instance to replace either the first, or all occurrences
+   *
+   * @param in the TokenStream to process
+   * @param p the pattern to apply to each Token
+   * @param replacement the "replacement string" to substitute, if null a
+   *        blank string will be used. Note that this is not the literal
+   *        string that will be used, '$' and '\' have special meaning.
+   * @param all if true, all matches will be replaced otherwise just the first match.
+   * @see Matcher#quoteReplacement
+   */
+  public PatternReplaceFilter(TokenStream in,
+                              Pattern p,
+                              String replacement,
+                              boolean all) {
+    super(in);
+    this.p=p;
+    this.replacement = (null == replacement) ? "" : replacement;
+    this.all=all;
+  }
+  
+  /**
+   * Pulls the next Token from the input and rewrites its term text by
+   * applying the pattern replacement. Only the term text is modified here.
+   *
+   * @return the (modified) Token, or null when the input is exhausted
+   * @throws IOException if the input stream throws it
+   */
+  public final Token next() throws IOException {
+    Token t = input.next();
+    // Nothing to rewrite at end of stream or for a token without text;
+    // passing such tokens through matches what TrimFilter does and avoids
+    // a NullPointerException from Pattern.matcher(null).
+    if (null == t || null == t.termText())
+      return t;
+
+    Matcher m = p.matcher(t.termText());
+    if (all) {
+      t.setTermText(m.replaceAll(replacement));
+    } else {
+      t.setTermText(m.replaceFirst(replacement));
+    }
+
+    return t;
+  }
+
+}
--- a/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+
+import java.util.Map;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * @version $Id:$
+ * @see PatternReplaceFilter
+ */
+public class PatternReplaceFilterFactory extends BaseTokenFilterFactory {
+  Pattern p;
+  String replacement;
+  boolean all = true;
+  
+  public void init(Map<String, String> args) {
+    super.init(args);
+    try {
+      p = Pattern.compile(args.get("pattern"));
+    } catch (PatternSyntaxException e) {
+      throw new RuntimeException
+        ("Configuration Error: 'pattern' can not be parsed in " +
+         this.getClass().getName(), e);
+    }
+    
+    replacement = args.get("replacement");
+    
+    String r = args.get("replace");
+    if (null != r) {
+      if (r.equals("all")) {
+        all = true;
+      } else {
+        if (r.equals("first")) {
+          all = false;
+        } else {
+          throw new RuntimeException
+            ("Configuration Error: 'replace' must be 'first' or 'all' in "
+             + this.getClass().getName());
+        }
+      }
+    }
+
+  }
+  public TokenStream create(TokenStream input) {
+    return new PatternReplaceFilter(input, p, replacement, all);
+  }
+}
--- a/src/java/org/apache/solr/analysis/TrimFilter.java
+++ b/src/java/org/apache/solr/analysis/TrimFilter.java
@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+import java.io.IOException;
+
+/**
+ * A TokenFilter that strips leading and trailing whitespace (as defined by
+ * {@link String#trim}) from the text of each Token in the stream.
+ *
+ * @version $Id:$
+ */
+public final class TrimFilter extends TokenFilter {
+
+  public TrimFilter(TokenStream in) {
+    super(in);
+  }
+
+  /**
+   * Returns the next Token of the input with its term text trimmed, or
+   * null when the input is exhausted. A Token whose term text is null is
+   * returned unchanged.
+   */
+  public final Token next() throws IOException {
+    Token tok = input.next();
+    if (tok != null) {
+      String text = tok.termText();
+      if (text != null) {
+        tok.setTermText(text.trim());
+      }
+    }
+    return tok;
+  }
+}
--- a/src/java/org/apache/solr/analysis/TrimFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/TrimFilterFactory.java
@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * @version $Id:$
+ * @see TrimFilter
+ */
+public class TrimFilterFactory extends BaseTokenFilterFactory {
+  public TokenStream create(TokenStream input) {
+    return new TrimFilter(input);
+  }
+}
--- a/src/test/org/apache/solr/BasicFunctionalityTest.java
+++ b/src/test/org/apache/solr/BasicFunctionalityTest.java
@ -686,6 +686,26 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
    
  }
  
+  /**
+   * Indexes two documents into the "patternreplacefilt" field and checks
+   * that each can be found by the pattern-replaced form of its whole value,
+   * while a single word of the original value finds nothing.
+   *
+   * NOTE(review): relies on the "patternreplacefilt" field type of the test
+   * schema.xml, whose index analyzer keeps the value as one token and
+   * replaces every character outside a-zA-Z with '_', and whose query
+   * analyzer applies no filters - confirm if that schema changes.
+   */
+  public void testPatternReplaceFilter() {
+
+    assertU(adoc("id", "1",
+                 "patternreplacefilt", "My  fine-feathered friend!"));
+    assertU(adoc("id", "2",
+                 "patternreplacefilt", "  What's Up Doc?"));
+    assertU(commit());
+ 
+    // a word from inside the value is not a term of its own
+    assertQ("don't find Up",
+            req("q", "patternreplacefilt:Up"),
+            "*[count(//doc)=0]");
+    
+    // the query must spell out the already-replaced form of the whole value
+    assertQ("find doc",
+            req("q", "patternreplacefilt:__What_s_Up_Doc_"),
+            "*[count(//doc)=1]");
+
+    assertQ("find birds",
+            req("q", "patternreplacefilt:My__fine_feathered_friend_"),
+            "*[count(//doc)=1]");
+  }

 //   /** this doesn't work, but if it did, this is how we'd test it. */
 //   public void testOverwriteFalse() {
--- a/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java
+++ b/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java
@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Unit tests for PatternReplaceFilter: replacing the first or all matches,
+ * stripping matches via a null replacement, and using a back reference in
+ * the replacement string.
+ *
+ * @version $Id:$
+ */
+public class TestPatternReplaceFilter extends TestCase {
+
+  /** Whitespace separated text every test runs the filter over. */
+  private static final String INPUT = "aabfooaabfooabfoob ab caaaaaaaaab";
+
+  /** Builds the filter under test on top of a WhitespaceTokenizer over INPUT. */
+  private static TokenStream filter(String regex, String replacement, boolean all) {
+    return new PatternReplaceFilter
+      (new WhitespaceTokenizer(new StringReader(INPUT)),
+       Pattern.compile(regex),
+       replacement, all);
+  }
+
+  /** Asserts the stream yields exactly the expected term texts and then ends. */
+  private static void assertTerms(TokenStream ts, String... expected) throws IOException {
+    for (String term : expected) {
+      assertEquals(term, ts.next().termText());
+    }
+    assertNull(ts.next());
+  }
+  
+  public void testReplaceAll() throws Exception {
+    assertTerms(filter("a*b", "-", true),
+                "-foo-foo-foo-", "-", "c-");
+  }
+  
+  public void testReplaceFirst() throws Exception {
+    assertTerms(filter("a*b", "-", false),
+                "-fooaabfooabfoob", "-", "c-");
+  }
+  
+  public void testStripFirst() throws Exception {
+    assertTerms(filter("a*b", null, false),
+                "fooaabfooabfoob", "", "c");
+  }
+  
+  public void testStripAll() throws Exception {
+    assertTerms(filter("a*b", null, true),
+                "foofoofoo", "", "c");
+  }
+  
+  public void testReplaceAllWithBackRef() throws Exception {
+    assertTerms(filter("(a*)b", "$1\\$", true),
+                "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$");
+  }
+
+}
--- a/src/test/org/apache/solr/analysis/TestTrimFilter.java
+++ b/src/test/org/apache/solr/analysis/TestTrimFilter.java
@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+
+/**
+ * @version $Id:$
+ */
+public class TestTrimFilter extends TestCase {
+  
+  public void testTrim() throws Exception {
+    TokenStream ts = new TrimFilter
+      (new IterTokenStream(new Token(" a ", 1, 5),
+                           new Token("b   ",6,10),
+                           new Token("cCc",11,15),
+                           new Token("   ",16,20)));
+
+    assertEquals("a", ts.next().termText());
+    assertEquals("b", ts.next().termText());
+    assertEquals("cCc", ts.next().termText());
+    assertEquals("", ts.next().termText());
+    assertNull(ts.next());
+  }
+
+  public static class IterTokenStream extends TokenStream {
+    Iterator<Token> toks;
+    public IterTokenStream(Token... toks) {
+      this.toks = Arrays.asList(toks).iterator();
+    }
+    public Token next() {
+      if (toks.hasNext()) {
+        return toks.next();
+      }
+      return null;
+    }
+  }
+}
--- a/src/test/test-files/solr/conf/schema.xml
+++ b/src/test/test-files/solr/conf/schema.xml
@ -164,6 +164,17 @@
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldtype>
+    <fieldtype name="patternreplacefilt" class="solr.TextField">
+      <analyzer type="index">
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.PatternReplaceFilterFactory"
+                pattern="([^a-zA-Z])" replacement="_" replace="all"
+        />
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+      </analyzer>
+    </fieldtype>
    <fieldtype name="porterfilt" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -340,6 +351,7 @@
   <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
   <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
   <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
+   <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
   <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
   <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
   <field name="custengporterfilt" type="custengporterfilt" indexed="true" stored="true"/>