From a2beac1e383fe62801f5b0170feaa6d06db438c5 Mon Sep 17 00:00:00 2001 From: "Chris M. Hostetter" Date: Wed, 10 Jan 2007 01:18:38 +0000 Subject: [PATCH] SOLR-89 new PatternReplaceFilter, TrimFilter, and corrisponding Factories git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@494675 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 5 + example/exampledocs/mem.xml | 2 +- example/solr/conf/schema.xml | 37 +++++++ .../solr/analysis/PatternReplaceFilter.java | 82 ++++++++++++++++ .../analysis/PatternReplaceFilterFactory.java | 67 +++++++++++++ .../org/apache/solr/analysis/TrimFilter.java | 45 +++++++++ .../solr/analysis/TrimFilterFactory.java | 30 ++++++ .../apache/solr/BasicFunctionalityTest.java | 20 ++++ .../analysis/TestPatternReplaceFilter.java | 96 +++++++++++++++++++ .../apache/solr/analysis/TestTrimFilter.java | 61 ++++++++++++ src/test/test-files/solr/conf/schema.xml | 12 +++ 11 files changed, 456 insertions(+), 1 deletion(-) create mode 100644 src/java/org/apache/solr/analysis/PatternReplaceFilter.java create mode 100644 src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java create mode 100644 src/java/org/apache/solr/analysis/TrimFilter.java create mode 100644 src/java/org/apache/solr/analysis/TrimFilterFactory.java create mode 100644 src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java create mode 100644 src/test/org/apache/solr/analysis/TestTrimFilter.java diff --git a/CHANGES.txt b/CHANGES.txt index ba482af2e4b..35243496bfe 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -37,6 +37,11 @@ Detailed Change List New Features 1. SOLR-82: Default field values can be specified in the schema.xml. (Ryan McKinley via hossman) + 2. SOLR-89: Two new TokenFilters with corrisponding Factories... + * TrimFilter - Trims leading and trailing whitespace from Tokens + * PatternReplaceFilter - applies a Pattern to each token in the + stream, replacing match occurances with a specified replacement. 
+ (hossman) Changes in runtime behavior 1. Highlighting using DisMax will only pick up terms from the main diff --git a/example/exampledocs/mem.xml b/example/exampledocs/mem.xml index 99783da4248..d4a0d179eb2 100644 --- a/example/exampledocs/mem.xml +++ b/example/exampledocs/mem.xml @@ -18,7 +18,7 @@ TWINX2048-3200PRO - CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail + CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail Corsair Microsystems Inc. electronics memory diff --git a/example/solr/conf/schema.xml b/example/solr/conf/schema.xml index eccd9f82dfd..2808e2a79af 100755 --- a/example/solr/conf/schema.xml +++ b/example/solr/conf/schema.xml @@ -182,6 +182,39 @@ + + + + + + + + + + + + + + @@ -204,6 +237,8 @@ + + @@ -264,6 +299,8 @@ + + diff --git a/src/java/org/apache/solr/analysis/PatternReplaceFilter.java b/src/java/org/apache/solr/analysis/PatternReplaceFilter.java new file mode 100644 index 00000000000..b9477148303 --- /dev/null +++ b/src/java/org/apache/solr/analysis/PatternReplaceFilter.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.solr.analysis; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; + +import java.util.regex.Pattern; +import java.util.regex.Matcher; +import java.io.IOException; + +/** + * A TokenFilter which applies a Pattern to each token in the stream, + * replacing match occurances with the specified replacement string. + * + *

+ * Note: Depending on the pattern used and the contents of the input + * TokenStream, this TokenFilter may produce Tokens whose text is the empty + * string. + *

+ *
+ * @version $Id:$
+ * @see Pattern
+ */
+public final class PatternReplaceFilter extends TokenFilter {
+  /** Pattern matched against the text of every Token. */
+  Pattern p;
+  /** Replacement string handed to the Matcher; the constructor maps null to "". */
+  String replacement;
+  /** If true every match is replaced, otherwise only the first one. */
+  boolean all = true;
+
+  /**
+   * Constructs an instance to replace either the first, or all occurrences
+   *
+   * @param in the TokenStream to process
+   * @param p the pattern to apply to each Token
+   * @param replacement the "replacement string" to substitute, if null a
+   *        blank string will be used. Note that this is not the literal
+   *        string that will be used, '$' and '\' have special meaning.
+   * @param all if true, all matches will be replaced otherwise just the first match.
+   * @see Matcher#quoteReplacement
+   */
+  public PatternReplaceFilter(TokenStream in,
+                              Pattern p,
+                              String replacement,
+                              boolean all) {
+    super(in);
+    this.p=p;
+    this.replacement = (null == replacement) ? "" : replacement;
+    this.all=all;
+  }
+
+  /**
+   * Returns the next Token of the wrapped stream after rewriting its text
+   * with the configured pattern and replacement. The Token is modified in
+   * place and only its text is changed.
+   *
+   * @return the rewritten Token, the untouched Token if it has no text, or
+   *         null once the wrapped stream is exhausted
+   * @throws IOException propagated from the wrapped stream
+   */
+  public final Token next() throws IOException {
+    Token t = input.next();
+    // End of stream, or a Token that carries no text: there is nothing to
+    // match, and Pattern.matcher(null) would throw a NullPointerException.
+    // TrimFilter passes such Tokens through untouched in the same way.
+    if (null == t || null == t.termText())
+      return t;
+
+    Matcher m = p.matcher(t.termText());
+    t.setTermText(all ? m.replaceAll(replacement)
+                      : m.replaceFirst(replacement));
+    return t;
+  }
+
+}
diff --git a/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java b/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
new file mode 100644
index 00000000000..b38063f5a39
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+
+import java.util.Map;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Factory for {@link PatternReplaceFilter}, configured from these init args:
+ * <ul>
+ *  <li><code>pattern</code> - required, the regular expression to apply</li>
+ *  <li><code>replacement</code> - optional, passed to the filter as is
+ *      (the filter treats null as the empty string)</li>
+ *  <li><code>replace</code> - optional, <code>all</code> (the default)
+ *      or <code>first</code></li>
+ * </ul>
+ *
+ * @version $Id:$
+ * @see PatternReplaceFilter
+ */
+public class PatternReplaceFilterFactory extends BaseTokenFilterFactory {
+  /** Compiled form of the 'pattern' init arg. */
+  Pattern p;
+  /** Value of the 'replacement' init arg, possibly null. */
+  String replacement;
+  /** False only when the 'replace' init arg is "first". */
+  boolean all = true;
+
+  /**
+   * Reads and validates the init args listed in the class comment.
+   *
+   * @param args the init args of this factory
+   * @throws RuntimeException if 'pattern' is missing or can not be
+   *         compiled, or if 'replace' is neither 'first' nor 'all'
+   */
+  public void init(Map<String,String> args) {
+    super.init(args);
+
+    // Pattern.compile(null) would escape as a bare NullPointerException;
+    // report a missing attribute the same way as the other config errors.
+    String regex = args.get("pattern");
+    if (null == regex) {
+      throw new RuntimeException
+        ("Configuration Error: 'pattern' must be specified in " +
+         this.getClass().getName());
+    }
+    try {
+      p = Pattern.compile(regex);
+    } catch (PatternSyntaxException e) {
+      throw new RuntimeException
+        ("Configuration Error: 'pattern' can not be parsed in " +
+         this.getClass().getName(), e);
+    }
+
+    replacement = args.get("replacement");
+
+    // 'replace' is optional; when absent the default (all) is kept.
+    String r = args.get("replace");
+    if (null != r) {
+      if (r.equals("all")) {
+        all = true;
+      } else if (r.equals("first")) {
+        all = false;
+      } else {
+        throw new RuntimeException
+          ("Configuration Error: 'replace' must be 'first' or 'all' in " +
+           this.getClass().getName());
+      }
+    }
+
+  }
+
+  /**
+   * Wraps the given stream in a PatternReplaceFilter that uses the
+   * settings read by {@link #init}.
+   */
+  public TokenStream create(TokenStream input) {
+    return new PatternReplaceFilter(input, p, replacement, all);
+  }
+}
diff --git a/src/java/org/apache/solr/analysis/TrimFilter.java b/src/java/org/apache/solr/analysis/TrimFilter.java
new file mode 100644
index 00000000000..c4a2d3f8304
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/TrimFilter.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+import java.io.IOException;
+
+/**
+ * Trims leading and trailing whitespace from Tokens in the stream.
+ * Whitespace is whatever {@link String#trim} removes; a Token whose text
+ * is only whitespace ends up with the empty string as its text.
+ *
+ * @version $Id:$
+ */
+public final class TrimFilter extends TokenFilter {
+
+  /**
+   * @param in the TokenStream whose Tokens are to be trimmed
+   */
+  public TrimFilter(TokenStream in) {
+    super(in);
+  }
+
+  /**
+   * Returns the next Token of the wrapped stream with its text trimmed
+   * in place. Only the text of the Token is changed.
+   *
+   * @return the trimmed Token, the untouched Token if it has no text, or
+   *         null once the wrapped stream is exhausted
+   * @throws IOException propagated from the wrapped stream
+   */
+  public final Token next() throws IOException {
+    Token tok = input.next();
+    if (tok != null) {
+      String text = tok.termText();
+      if (text != null) {
+        tok.setTermText(text.trim());
+      }
+    }
+    return tok;
+  }
+}
diff --git a/src/java/org/apache/solr/analysis/TrimFilterFactory.java b/src/java/org/apache/solr/analysis/TrimFilterFactory.java
new file mode 100644
index 00000000000..3e5e03ed42e
--- /dev/null
+++ b/src/java/org/apache/solr/analysis/TrimFilterFactory.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory that wraps a TokenStream in a {@link TrimFilter}; it reads no
+ * init args of its own.
+ *
+ * @version $Id:$
+ * @see TrimFilter
+ */
+public class TrimFilterFactory extends BaseTokenFilterFactory {
+  /** Returns a new TrimFilter reading from the given stream. */
+  public TokenStream create(TokenStream input) {
+    return new TrimFilter(input);
+  }
+}
diff --git a/src/test/org/apache/solr/BasicFunctionalityTest.java b/src/test/org/apache/solr/BasicFunctionalityTest.java
index d3b2a441077..b614ca72d1d 100644
--- a/src/test/org/apache/solr/BasicFunctionalityTest.java
+++ b/src/test/org/apache/solr/BasicFunctionalityTest.java
@@ -686,6 +686,26 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
   }
+  /**
+   * Indexes two documents into the "patternreplacefilt" field and checks
+   * which terms can be queried afterwards.
+   * NOTE(review): the expected terms suggest the field's analyzer keeps
+   * the whole value as one token and replaces non-alphanumeric characters
+   * with '_'; the field definition lives in the test schema.xml - confirm.
+   */
+  public void testPatternReplaceFilter() {
+
+    assertU(adoc("id", "1",
+                 "patternreplacefilt", "My fine-feathered friend!"));
+    assertU(adoc("id", "2",
+                 "patternreplacefilt", " What's Up Doc?"));
+    assertU(commit());
+
+    // a single word of the raw input must not match on its own
+    assertQ("don't find Up",
+            req("q", "patternreplacefilt:Up"),
+            "*[count(//doc)=0]");
+
+    // the complete, rewritten value of document 2 matches exactly once
+    assertQ("find doc",
+            req("q", "patternreplacefilt:__What_s_Up_Doc_"),
+            "*[count(//doc)=1]");
+
+    // the complete, rewritten value of document 1 matches exactly once
+    assertQ("find birds",
+            req("q", "patternreplacefilt:My__fine_feathered_friend_"),
+            "*[count(//doc)=1]");
+  }
 // /** this doesn't work, but if it did, this is how we'd test it.
*/ // public void testOverwriteFalse() { diff --git a/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java b/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java new file mode 100644 index 00000000000..cb0b50ec34e --- /dev/null +++ b/src/test/org/apache/solr/analysis/TestPatternReplaceFilter.java @@ -0,0 +1,96 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Runs PatternReplaceFilter over one fixed, whitespace-tokenized input in
+ * replace, strip (null replacement) and back-reference modes.
+ *
+ * @version $Id:$
+ */
+public class TestPatternReplaceFilter extends TestCase {
+
+  /** Three whitespace separated tokens shared by every test. */
+  private static final String INPUT = "aabfooaabfooabfoob ab caaaaaaaaab";
+
+  /** Wraps a WhitespaceTokenizer over INPUT in the filter under test. */
+  private static TokenStream filtered(String regex,
+                                      String replacement,
+                                      boolean all) {
+    return new PatternReplaceFilter
+      (new WhitespaceTokenizer(new StringReader(INPUT)),
+       Pattern.compile(regex),
+       replacement, all);
+  }
+
+  /** Asserts the stream yields exactly the given token texts and then ends. */
+  private static void assertTokens(TokenStream ts, String... expected)
+    throws IOException {
+    for (String text : expected) {
+      assertEquals(text, ts.next().termText());
+    }
+    assertNull(ts.next());
+  }
+
+  /** Every match in every token is replaced by "-". */
+  public void testReplaceAll() throws Exception {
+    assertTokens(filtered("a*b", "-", true),
+                 "-foo-foo-foo-", "-", "c-");
+  }
+
+  /** Only the first match of each token is replaced by "-". */
+  public void testReplaceFirst() throws Exception {
+    assertTokens(filtered("a*b", "-", false),
+                 "-fooaabfooabfoob", "-", "c-");
+  }
+
+  /** A null replacement removes the first match of each token. */
+  public void testStripFirst() throws Exception {
+    assertTokens(filtered("a*b", null, false),
+                 "fooaabfooabfoob", "", "c");
+  }
+
+  /** A null replacement removes every match, possibly leaving "". */
+  public void testStripAll() throws Exception {
+    assertTokens(filtered("a*b", null, true),
+                 "foofoofoo", "", "c");
+  }
+
+  /** "$1" is a group reference and "\$" a literal dollar sign. */
+  public void testReplaceAllWithBackRef() throws Exception {
+    assertTokens(filtered("(a*)b", "$1\\$", true),
+                 "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$");
+  }
+
+}
diff --git a/src/test/org/apache/solr/analysis/TestTrimFilter.java b/src/test/org/apache/solr/analysis/TestTrimFilter.java
new file mode 100644
index 00000000000..a61c63e4060
--- /dev/null
+++ b/src/test/org/apache/solr/analysis/TestTrimFilter.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+
+/**
+ * Checks that TrimFilter strips leading and trailing whitespace from the
+ * text of hand-built Tokens.
+ *
+ * @version $Id:$
+ */
+public class TestTrimFilter extends TestCase {
+
+  /**
+   * Padded, right-padded, unpadded and whitespace-only Tokens must come
+   * out as "a", "b", "cCc" and the empty string, followed by end of stream.
+   */
+  public void testTrim() throws Exception {
+    TokenStream ts = new TrimFilter
+      (new IterTokenStream(new Token(" a ", 1, 5),
+                           new Token("b ",6,10),
+                           new Token("cCc",11,15),
+                           new Token(" ",16,20)));
+
+    assertEquals("a", ts.next().termText());
+    assertEquals("b", ts.next().termText());
+    assertEquals("cCc", ts.next().termText());
+    assertEquals("", ts.next().termText());
+    assertNull(ts.next());
+  }
+
+  /** A TokenStream that replays the Tokens it was constructed with. */
+  public static class IterTokenStream extends TokenStream {
+    // Typed iterator: next() hands its elements back as Token without a
+    // cast, which a raw Iterator would not allow.
+    Iterator<Token> toks;
+
+    /** @param toks the Tokens to replay, in order */
+    public IterTokenStream(Token... toks) {
+      this.toks = Arrays.asList(toks).iterator();
+    }
+
+    /** @return the next Token, or null once all Tokens were returned */
+    public Token next() {
+      if (toks.hasNext()) {
+        return toks.next();
+      }
+      return null;
+    }
+  }
+}
diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml
index 11db8b68bc0..fdc0e312ede 100644
--- a/src/test/test-files/solr/conf/schema.xml
+++ b/src/test/test-files/solr/conf/schema.xml
@@ -164,6 +164,17 @@ + + + + + + + + +
@@ -340,6 +351,7 @@ +