mirror of https://github.com/apache/lucene.git
SOLR-89 new PatternReplaceFilter, TrimFilter, and corrisponding Factories
git-svn-id: https://svn.apache.org/repos/asf/incubator/solr/trunk@494675 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ae6c58f278
commit
a2beac1e38
|
@ -37,6 +37,11 @@ Detailed Change List
|
|||
New Features
|
||||
1. SOLR-82: Default field values can be specified in the schema.xml.
|
||||
(Ryan McKinley via hossman)
|
||||
2. SOLR-89: Two new TokenFilters with corrisponding Factories...
|
||||
* TrimFilter - Trims leading and trailing whitespace from Tokens
|
||||
* PatternReplaceFilter - applies a Pattern to each token in the
|
||||
stream, replacing match occurances with a specified replacement.
|
||||
(hossman)
|
||||
|
||||
Changes in runtime behavior
|
||||
1. Highlighting using DisMax will only pick up terms from the main
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
<add>
|
||||
<doc>
|
||||
<field name="id">TWINX2048-3200PRO</field>
|
||||
<field name="name">CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
|
||||
<field name="name">CORSAIR XMS 2GB (2 x 1GB) 184-Pin DDR SDRAM Unbuffered DDR 400 (PC 3200) Dual Channel Kit System Memory - Retail</field>
|
||||
<field name="manu">Corsair Microsystems Inc.</field>
|
||||
<field name="cat">electronics</field>
|
||||
<field name="cat">memory</field>
|
||||
|
|
|
@ -182,6 +182,39 @@
|
|||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<!-- This is an example of using the KeywordTokenizer along
|
||||
With various TokenFilterFactories to produce a sortable field
|
||||
that does not include some properties of the source text
|
||||
-->
|
||||
<fieldtype name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
|
||||
<analyzer>
|
||||
<!-- KeywordTokenizer does no actual tokenizing, so the entire
|
||||
input string is preserved as a single token
|
||||
-->
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<!-- The LowerCase TokenFilter does what you expect, which can be
|
||||
when you want your sorting to be case insensitive
|
||||
-->
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
<!-- The TrimFilter removes any leading or trailing whitespace -->
|
||||
<filter class="solr.TrimFilterFactory" />
|
||||
<!-- The PatternReplaceFilter gives you the flexibility to use
|
||||
Java Regular expression to replace any sequence of characters
|
||||
matching a pattern with an arbitrary replacement string,
|
||||
which may include back refrences to portions of the orriginal
|
||||
string matched by the pattern.
|
||||
|
||||
See the Java Regular Expression documentation for more
|
||||
infomation on pattern and replacement string syntax.
|
||||
|
||||
http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
|
||||
-->
|
||||
<filter class="solr.PatternReplaceFilterFactory"
|
||||
pattern="([^a-z])" replacement="" replace="all"
|
||||
/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
</types>
|
||||
|
||||
|
||||
|
@ -204,6 +237,8 @@
|
|||
<field name="id" type="string" indexed="true" stored="true"/>
|
||||
<field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
|
||||
<field name="name" type="text" indexed="true" stored="true"/>
|
||||
<field name="nameSort" type="string" indexed="true" stored="false"/>
|
||||
<field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
|
||||
<field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
|
||||
<field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true"/>
|
||||
<field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
|
@ -264,6 +299,8 @@
|
|||
|
||||
<copyField source="cat" dest="text"/>
|
||||
<copyField source="name" dest="text"/>
|
||||
<copyField source="name" dest="nameSort"/>
|
||||
<copyField source="name" dest="alphaNameSort"/>
|
||||
<copyField source="manu" dest="text"/>
|
||||
<copyField source="features" dest="text"/>
|
||||
<copyField source="includes" dest="text"/>
|
||||
|
|
|
@ -0,0 +1,82 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* A TokenFilter which applies a Pattern to each token in the stream,
|
||||
* replacing match occurances with the specified replacement string.
|
||||
*
|
||||
* <p>
|
||||
* <b>Note:</b> Depending on the input and the pattern used and the input
|
||||
* TokenStream, this TokenFilter may produce Tokens whose text is the empty
|
||||
* string.
|
||||
* </p>
|
||||
*
|
||||
* @version $Id:$
|
||||
* @see Pattern
|
||||
*/
|
||||
public final class PatternReplaceFilter extends TokenFilter {
|
||||
Pattern p;
|
||||
String replacement;
|
||||
boolean all = true;
|
||||
|
||||
/**
|
||||
* Constructs an instance to replace either the first, or all occurances
|
||||
*
|
||||
* @param in the TokenStream to process
|
||||
* @param p the patterm to apply to each Token
|
||||
* @param replacement the "replacement string" to substitute, if null a
|
||||
* blank string will be used. Note that this is not the literal
|
||||
* string that will be used, '$' and '\' have special meaning.
|
||||
* @param all if true, all matches will be replaced otherwise just the first match.
|
||||
* @see Matcher#quoteReplacement
|
||||
*/
|
||||
public PatternReplaceFilter(TokenStream in,
|
||||
Pattern p,
|
||||
String replacement,
|
||||
boolean all) {
|
||||
super(in);
|
||||
this.p=p;
|
||||
this.replacement = (null == replacement) ? "" : replacement;
|
||||
this.all=all;
|
||||
}
|
||||
|
||||
public final Token next() throws IOException {
|
||||
Token t = input.next();
|
||||
if (t == null)
|
||||
return null;
|
||||
|
||||
Matcher m = p.matcher(t.termText());
|
||||
if (all) {
|
||||
t.setTermText(m.replaceAll(replacement));
|
||||
} else {
|
||||
t.setTermText(m.replaceFirst(replacement));
|
||||
}
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
/**
|
||||
* @version $Id:$
|
||||
* @see PatternReplaceFilter
|
||||
*/
|
||||
public class PatternReplaceFilterFactory extends BaseTokenFilterFactory {
|
||||
Pattern p;
|
||||
String replacement;
|
||||
boolean all = true;
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
try {
|
||||
p = Pattern.compile(args.get("pattern"));
|
||||
} catch (PatternSyntaxException e) {
|
||||
throw new RuntimeException
|
||||
("Configuration Error: 'pattern' can not be parsed in " +
|
||||
this.getClass().getName(), e);
|
||||
}
|
||||
|
||||
replacement = args.get("replacement");
|
||||
|
||||
String r = args.get("replace");
|
||||
if (null != r) {
|
||||
if (r.equals("all")) {
|
||||
all = true;
|
||||
} else {
|
||||
if (r.equals("first")) {
|
||||
all = false;
|
||||
} else {
|
||||
throw new RuntimeException
|
||||
("Configuration Error: 'replace' must be 'first' or 'all' in "
|
||||
+ this.getClass().getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new PatternReplaceFilter(input, p, replacement, all);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Trims leading and trailing whitespace from Tokens in the stream.
|
||||
*
|
||||
* @version $Id:$
|
||||
*/
|
||||
public final class TrimFilter extends TokenFilter {
|
||||
|
||||
public TrimFilter(TokenStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
public final Token next() throws IOException {
|
||||
Token t = input.next();
|
||||
if (null == t || null == t.termText())
|
||||
return t;
|
||||
|
||||
t.setTermText(t.termText().trim());
|
||||
return t;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* @version $Id:$
|
||||
* @see TrimFilter
|
||||
*/
|
||||
public class TrimFilterFactory extends BaseTokenFilterFactory {
|
||||
public TokenStream create(TokenStream input) {
|
||||
return new TrimFilter(input);
|
||||
}
|
||||
}
|
|
@ -686,6 +686,26 @@ public class BasicFunctionalityTest extends AbstractSolrTestCase {
|
|||
|
||||
}
|
||||
|
||||
public void testPatternReplaceFilter() {
|
||||
|
||||
assertU(adoc("id", "1",
|
||||
"patternreplacefilt", "My fine-feathered friend!"));
|
||||
assertU(adoc("id", "2",
|
||||
"patternreplacefilt", " What's Up Doc?"));
|
||||
assertU(commit());
|
||||
|
||||
assertQ("don't find Up",
|
||||
req("q", "patternreplacefilt:Up"),
|
||||
"*[count(//doc)=0]");
|
||||
|
||||
assertQ("find doc",
|
||||
req("q", "patternreplacefilt:__What_s_Up_Doc_"),
|
||||
"*[count(//doc)=1]");
|
||||
|
||||
assertQ("find birds",
|
||||
req("q", "patternreplacefilt:My__fine_feathered_friend_"),
|
||||
"*[count(//doc)=1]");
|
||||
}
|
||||
|
||||
// /** this doesn't work, but if it did, this is how we'd test it. */
|
||||
// public void testOverwriteFalse() {
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class TestPatternReplaceFilter extends TestCase {
|
||||
|
||||
public void testReplaceAll() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", true);
|
||||
assertEquals("-foo-foo-foo-", ts.next().termText());
|
||||
assertEquals("-", ts.next().termText());
|
||||
assertEquals("c-", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
}
|
||||
|
||||
public void testReplaceFirst() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", false);
|
||||
assertEquals("-fooaabfooabfoob", ts.next().termText());
|
||||
assertEquals("-", ts.next().termText());
|
||||
assertEquals("c-", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
}
|
||||
|
||||
public void testStripFirst() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, false);
|
||||
assertEquals("fooaabfooabfoob", ts.next().termText());
|
||||
assertEquals("", ts.next().termText());
|
||||
assertEquals("c", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
}
|
||||
|
||||
public void testStripAll() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, true);
|
||||
assertEquals("foofoofoo", ts.next().termText());
|
||||
assertEquals("", ts.next().termText());
|
||||
assertEquals("c", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
}
|
||||
|
||||
public void testReplaceAllWithBackRef() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("(a*)b"),
|
||||
"$1\\$", true);
|
||||
assertEquals("aa$fooaa$fooa$foo$", ts.next().termText());
|
||||
assertEquals("a$", ts.next().termText());
|
||||
assertEquals("caaaaaaaaa$", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
|
||||
/**
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class TestTrimFilter extends TestCase {
|
||||
|
||||
public void testTrim() throws Exception {
|
||||
TokenStream ts = new TrimFilter
|
||||
(new IterTokenStream(new Token(" a ", 1, 5),
|
||||
new Token("b ",6,10),
|
||||
new Token("cCc",11,15),
|
||||
new Token(" ",16,20)));
|
||||
|
||||
assertEquals("a", ts.next().termText());
|
||||
assertEquals("b", ts.next().termText());
|
||||
assertEquals("cCc", ts.next().termText());
|
||||
assertEquals("", ts.next().termText());
|
||||
assertNull(ts.next());
|
||||
}
|
||||
|
||||
public static class IterTokenStream extends TokenStream {
|
||||
Iterator<Token> toks;
|
||||
public IterTokenStream(Token... toks) {
|
||||
this.toks = Arrays.asList(toks).iterator();
|
||||
}
|
||||
public Token next() {
|
||||
if (toks.hasNext()) {
|
||||
return toks.next();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -164,6 +164,17 @@
|
|||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
<fieldtype name="patternreplacefilt" class="solr.TextField">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.PatternReplaceFilterFactory"
|
||||
pattern="([^a-zA-Z])" replacement="_" replace="all"
|
||||
/>
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
<fieldtype name="porterfilt" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
|
@ -340,6 +351,7 @@
|
|||
<field name="standardtokfilt" type="standardtokfilt" indexed="true" stored="true"/>
|
||||
<field name="standardfilt" type="standardfilt" indexed="true" stored="true"/>
|
||||
<field name="lowerfilt" type="lowerfilt" indexed="true" stored="true"/>
|
||||
<field name="patternreplacefilt" type="patternreplacefilt" indexed="true" stored="true"/>
|
||||
<field name="porterfilt" type="porterfilt" indexed="true" stored="true"/>
|
||||
<field name="engporterfilt" type="engporterfilt" indexed="true" stored="true"/>
|
||||
<field name="custengporterfilt" type="custengporterfilt" indexed="true" stored="true"/>
|
||||
|
|
Loading…
Reference in New Issue