SOLR-89 new PatternReplaceFilter, TrimFilter, and corrisponding Factories

git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@494675 13f79535-47bb-0310-9956-ffa450edef68
2007-01-10 01:18:38 +00:00 · 2007-01-10 01:18:38 +00:00 · a2beac1e38
parent ae6c58f278
commit a2beac1e38
11 changed files with 456 additions and 1 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -37,6 +37,11 @@ Detailed Change List
 New Features
 1. SOLR-82: Default field values can be specified in the schema.xml.
    (Ryan McKinley via hossman)
 2. SOLR-89: Two new TokenFilters with corrisponding Factories...
    * TrimFilter - Trims leading and trailing whitespace from Tokens
    * PatternReplaceFilter - applies a Pattern to each token in the
      stream, replacing match occurances with a specified replacement.
    (hossman)
 Changes in runtime behavior
 1. Highlighting using DisMax will only pick up terms from the main 
--- a/example/exampledocs/mem.xml
+++ b/example/exampledocs/mem.xml
@ -18,7 +18,7 @@
 <add>
 <doc>
  <field name="id">TWINX2048-3200PRO</field>
-  <field name="name">CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
+  <field name="name">CORSAIR  XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
  <field name="manu">Corsair Microsystems Inc.</field>
  <field name="cat">electronics</field>
  <field name="cat">memory</field>
--- a/example/solr/conf/schema.xml
+++ b/example/solr/conf/schema.xml
@ -182,6 +182,39 @@
      </analyzer>
    </fieldtype>
    <!-- This is an example of using the KeywordTokenizer along
         With various TokenFilterFactories to produce a sortable field
         that does not include some properties of the source text
      -->
    <fieldtype name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer>
        <!-- KeywordTokenizer does no actual tokenizing, so the entire
             input string is preserved as a single token
          -->
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <!-- The LowerCase TokenFilter does what you expect, which can be
             when you want your sorting to be case insensitive
          -->
        <filter class="solr.LowerCaseFilterFactory" />
        <!-- The TrimFilter removes any leading or trailing whitespace -->
        <filter class="solr.TrimFilterFactory" />
        <!-- The PatternReplaceFilter gives you the flexibility to use
             Java Regular expression to replace any sequence of characters
             matching a pattern with an arbitrary replacement string, 
             which may include back refrences to portions of the orriginal
             string matched by the pattern.
             See the Java Regular Expression documentation for more
             infomation on pattern and replacement string syntax.
             http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
          -->
        <filter class="solr.PatternReplaceFilterFactory"
                pattern="([^a-z])" replacement="" replace="all"
        />
      </analyzer>
    </fieldtype>
 </types>
@ -204,6 +237,8 @@
   <field name="id" type="string" indexed="true" stored="true"/>
   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
   <field name="name" type="text" indexed="true" stored="true"/>
   <field name="nameSort" type="string" indexed="true" stored="false"/>
   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
   <field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
   <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true"/>
   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
@ -264,6 +299,8 @@
   <copyField source="cat" dest="text"/>
   <copyField source="name" dest="text"/>
   <copyField source="name" dest="nameSort"/>
   <copyField source="name" dest="alphaNameSort"/>
   <copyField source="manu" dest="text"/>
   <copyField source="features" dest="text"/>
   <copyField source="includes" dest="text"/>
--- a/src/java/org/apache/solr/analysis/PatternReplaceFilter.java
+++ b/src/java/org/apache/solr/analysis/PatternReplaceFilter.java
@ -0,0 +1,82 @@
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import java.io.IOException;
 /**
 * A TokenFilter which applies a Pattern to each token in the stream,
 * replacing match occurances with the specified replacement string.
 *
 * <p>
 * <b>Note:</b> Depending on the input and the pattern used and the input
 * TokenStream, this TokenFilter may produce Tokens whose text is the empty
 * string.
 * </p>
 * 
 * @version $Id:$
 * @see Pattern
 */
 public final class PatternReplaceFilter extends TokenFilter {
  Pattern p;
  String replacement;
  boolean all = true;
  /**
   * Constructs an instance to replace either the first, or all occurances
   *
   * @param in the TokenStream to process
   * @param p the patterm to apply to each Token
   * @param replacement the "replacement string" to substitute, if null a
   *        blank string will be used. Note that this is not the literal
   *        string that will be used, '$' and '\' have special meaning.
   * @param all if true, all matches will be replaced otherwise just the first match.
   * @see Matcher#quoteReplacement
   */
  public PatternReplaceFilter(TokenStream in,
                              Pattern p,
                              String replacement,
                              boolean all) {
    super(in);
    this.p=p;
    this.replacement = (null == replacement) ? "" : replacement;
    this.all=all;
  }
  public final Token next() throws IOException {
    Token t = input.next();
    if (t == null)
      return null;
    Matcher m = p.matcher(t.termText());
    if (all) {
      t.setTermText(m.replaceAll(replacement));
    } else {
      t.setTermText(m.replaceFirst(replacement));
    }
    return t;
  }
 }
--- a/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
@ -0,0 +1,67 @@
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import java.util.Map;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import java.util.regex.PatternSyntaxException;
 /**
 * @version $Id:$
 * @see PatternReplaceFilter
 */
 public class PatternReplaceFilterFactory extends BaseTokenFilterFactory {
  Pattern p;
  String replacement;
  boolean all = true;
  public void init(Map<String, String> args) {
    super.init(args);
    try {
      p = Pattern.compile(args.get("pattern"));
    } catch (PatternSyntaxException e) {
      throw new RuntimeException
        ("Configuration Error: 'pattern' can not be parsed in " +
         this.getClass().getName(), e);
    }
    replacement = args.get("replacement");
    String r = args.get("replace");
    if (null != r) {
      if (r.equals("all")) {
        all = true;
      } else {
        if (r.equals("first")) {
          all = false;
        } else {
          throw new RuntimeException
            ("Configuration Error: 'replace' must be 'first' or 'all' in "
             + this.getClass().getName());
        }
      }
    }
  }
  public TokenStream create(TokenStream input) {
    return new PatternReplaceFilter(input, p, replacement, all);
  }
 }
--- a/src/java/org/apache/solr/analysis/TrimFilter.java
+++ b/src/java/org/apache/solr/analysis/TrimFilter.java
@ -0,0 +1,45 @@
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
 import java.io.IOException;
 /**
 * Trims leading and trailing whitespace from Tokens in the stream.
 *
 * @version $Id:$
 */
 public final class TrimFilter extends TokenFilter {
  public TrimFilter(TokenStream in) {
    super(in);
  }
  public final Token next() throws IOException {
    Token t = input.next();
    if (null == t || null == t.termText())
      return t;
    t.setTermText(t.termText().trim());
    return t;
  }
 }
--- a/src/java/org/apache/solr/analysis/TrimFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/TrimFilterFactory.java
@ -0,0 +1,30 @@
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 /**
 * @version $Id:$
 * @see TrimFilter
 */
 public class TrimFilterFactory extends BaseTokenFilterFactory {
  public TokenStream create(TokenStream input) {
    return new TrimFilter(input);
  }
 }
--- a/src/test/org/apache/solr/BasicFunctionalityTest.java
+++ b/src/test/org/apache/solr/BasicFunctionalityTest.java
@ -686,6 +686,26 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
  }
  public void testPatternReplaceFilter() {
    assertU(adoc("id", "1",
                 "patternreplacefilt", "My  fine-feathered friend!"));
    assertU(adoc("id", "2",
                 "patternreplacefilt", "  What's Up Doc?"));
    assertU(commit());
    assertQ("don't find Up",
            req("q", "patternreplacefilt:Up"),
            "*[count(//doc)=0]");
    assertQ("find doc",
            req("q", "patternreplacefilt:__What_s_Up_Doc_"),
            "*[count(//doc)=1]");
    assertQ("find birds",
            req("q", "patternreplacefilt:My__fine_feathered_friend_"),
            "*[count(//doc)=1]");
  }
 //   /** this doesn't work, but if it did, this is how we'd test it. */
 //   public void testOverwriteFalse() {
--- a/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java
+++ b/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java
@ -0,0 +1,96 @@
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.solr.analysis;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.regex.Pattern;
 import java.util.regex.Matcher;
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
 * @version $Id:$
 */
 public class TestPatternReplaceFilter extends TestCase {
  public void testReplaceAll() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
      (new WhitespaceTokenizer(new StringReader(input)),
       Pattern.compile("a*b"),
       "-", true);
    assertEquals("-foo-foo-foo-", ts.next().termText());
    assertEquals("-", ts.next().termText());
    assertEquals("c-", ts.next().termText());
    assertNull(ts.next());
  }
  public void testReplaceFirst() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
      (new WhitespaceTokenizer(new StringReader(input)),
       Pattern.compile("a*b"),
       "-", false);
    assertEquals("-fooaabfooabfoob", ts.next().termText());
    assertEquals("-", ts.next().termText());
    assertEquals("c-", ts.next().termText());
    assertNull(ts.next());
  }
  public void testStripFirst() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
      (new WhitespaceTokenizer(new StringReader(input)),
       Pattern.compile("a*b"),
       null, false);
    assertEquals("fooaabfooabfoob", ts.next().termText());
    assertEquals("", ts.next().termText());
    assertEquals("c", ts.next().termText());
    assertNull(ts.next());
  }
  public void testStripAll() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
      (new WhitespaceTokenizer(new StringReader(input)),
       Pattern.compile("a*b"),
       null, true);
    assertEquals("foofoofoo", ts.next().termText());
    assertEquals("", ts.next().termText());
    assertEquals("c", ts.next().termText());
    assertNull(ts.next());
  }
  public void testReplaceAllWithBackRef() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
      (new WhitespaceTokenizer(new StringReader(input)),
       Pattern.compile("(a*)b"),
       "$1\\$", true);
    assertEquals("aa$fooaa$fooa$foo$", ts.next().termText());
    assertEquals("a$", ts.next().termText());
    assertEquals("caaaaaaaaa$", ts.next().termText());
    assertNull(ts.next());
  }
 }
--- a/src/test/org/apache/solr/analysis/TestTrimFilter.java
+++ b/src/test/org/apache/solr/analysis/TestTrimFilter.java
@ -0,0 +1,61 @@
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.solr.analysis;
 import java.io.IOException;
 import java.util.Iterator;
 import java.util.Arrays;
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 /**
 * @version $Id:$
 */
 public class TestTrimFilter extends TestCase {
  public void testTrim() throws Exception {
    TokenStream ts = new TrimFilter
      (new IterTokenStream(new Token(" a ", 1, 5),
                           new Token("b   ",6,10),
                           new Token("cCc",11,15),
                           new Token("   ",16,20)));
    assertEquals("a", ts.next().termText());
    assertEquals("b", ts.next().termText());
    assertEquals("cCc", ts.next().termText());
    assertEquals("", ts.next().termText());
    assertNull(ts.next());
  }
  public static class IterTokenStream extends TokenStream {
    Iterator<Token> toks;
    public IterTokenStream(Token... toks) {
      this.toks = Arrays.asList(toks).iterator();
    }
    public Token next() {
      if (toks.hasNext()) {
        return toks.next();
      }
      return null;
    }
  }
 }
--- a/src/test/test-files/solr/conf/schema.xml
+++ b/src/test/test-files/solr/conf/schema.xml
@ -164,6 +164,17 @@
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldtype>
    <fieldtype name="patternreplacefilt" class="solr.TextField">
      <analyzer type="index">
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.PatternReplaceFilterFactory"
                pattern="([^a-zA-Z])" replacement="_" replace="all"
        />
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.KeywordTokenizerFactory"/>
      </analyzer>
    </fieldtype>
    <fieldtype name="porterfilt" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
@ -340,6 +351,7 @@
   <field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
   <field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
   <field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
   <field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
   <field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
   <field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
   <field name="custengporterfilt" type="custengporterfilt" indexed="true" stored="true"/>