LUCENE-2413: consolidate pattern analysis into contrib/analyzers, deprecate the old PatternAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940813 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-05-04 11:57:21 +00:00
parent 6fa480a5ed
commit cd320b5f57
15 changed files with 333 additions and 156 deletions

CHANGES.txt

@@ -92,6 +92,9 @@ API Changes
stemming. Add Turkish and Romanian stopwords lists to support this.
(Robert Muir, Uwe Schindler, Simon Willnauer)
* LUCENE-2413: Deprecated PatternAnalyzer in contrib/analyzers, in favor of the
pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir)
New features
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
@@ -165,6 +168,8 @@ New features
into subwords and performs optional transformations on subword groups.
- o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which
filters out Tokens at the same position and Term text as the previous token.
- o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
(... in progress)
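For orientation, the new package's three pieces compose into one analysis chain; a minimal sketch (the class name, input text, and patterns here are illustrative, not taken from this commit):

import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PatternChainSketch {
  public static void main(String[] args) throws Exception {
    // CharFilter: rewrite characters before tokenization; offsets are corrected
    CharStream cs = new PatternReplaceCharFilter(
        Pattern.compile("&amp;"), "&",
        CharReader.get(new StringReader("fish &amp; chips; peas")));
    // Tokenizer: group -1 gives String.split-like behavior on the pattern
    TokenStream ts = new PatternTokenizer(cs, Pattern.compile("[;\\s]+"), -1);
    // TokenFilter: rewrite each token after tokenization
    ts = new PatternReplaceFilter(ts, Pattern.compile("chips"), "crisps", true);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // fish, &, crisps, peas
    }
    ts.close();
  }
}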
Build

PatternAnalyzer.java

@@ -62,8 +62,10 @@ import org.apache.lucene.util.Version;
* pat.tokenStream("content", "James is running round in the woods"),
* "English"));
* </pre>
*
* @deprecated use the pattern-based analysis in the analysis/pattern package instead.
* This analyzer will be removed in a future release (4.1)
*/
@Deprecated
public final class PatternAnalyzer extends Analyzer {
/** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
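For users of the deprecated class, a minimal migration sketch built from the new package (the class name is hypothetical, and it assumes core's LowerCaseFilter and StopFilter cover PatternAnalyzer's lowercasing and stopword options):

import java.io.IOException;
import java.io.Reader;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical name; approximates PatternAnalyzer(matchVersion, pattern, true, stopWords)
public final class MigratedPatternAnalyzer extends Analyzer {
  private final Version matchVersion;
  private final Pattern pattern;
  private final Set<?> stopWords;

  public MigratedPatternAnalyzer(Version matchVersion, Pattern pattern, Set<?> stopWords) {
    this.matchVersion = matchVersion;
    this.pattern = pattern;
    this.stopWords = stopWords;
  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    try {
      TokenStream ts = new PatternTokenizer(reader, pattern, -1); // split semantics
      ts = new LowerCaseFilter(matchVersion, ts);   // covers lowercase == true
      ts = new StopFilter(matchVersion, ts, stopWords);
      return ts;
    } catch (IOException e) {
      // PatternTokenizer reads its entire input in the constructor
      throw new RuntimeException(e);
    }
  }
}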

org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java

@@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.util.LinkedList;
@@ -45,7 +45,6 @@ import org.apache.lucene.analysis.CharStream;
* highlight snippet="aa1&lt;em&gt;23bb&lt;/em&gt;"
* </p>
*
* @version $Id$
* @since Solr 1.5
*/
public class PatternReplaceCharFilter extends BaseCharFilter {
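Usage in a nutshell, with data borrowed from the tests later in this commit (the wrapper class and the Version constant are illustrative):

import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
import org.apache.lucene.util.Version;

public class PatternReplaceCharFilterSketch {
  public static void main(String[] args) throws Exception {
    // "aa bb cc" is rewritten to "aa#bb#cc" before tokenization; the corrected
    // offsets (0,8) still span the original text, so highlighting stays aligned.
    CharStream cs = new PatternReplaceCharFilter(
        Pattern.compile("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
        CharReader.get(new StringReader("aa bb cc")));
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT, cs);
    while (ts.incrementToken()) {
      // single token "aa#bb#cc", startOffset=0, endOffset=8
    }
    ts.close();
  }
}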

org/apache/lucene/analysis/pattern/PatternReplaceFilter.java

@@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.pattern;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -35,7 +35,6 @@ import java.io.IOException;
* string.
* </p>
*
* @version $Id:$
* @see Pattern
*/
public final class PatternReplaceFilter extends TokenFilter {
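And in token-stream form, reusing the input from TestPatternReplaceFilter below (the wrapper class and the Version constant are illustrative):

import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.util.Version;

public class PatternReplaceFilterSketch {
  public static void main(String[] args) throws Exception {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
        new StringReader("aabfooaabfooabfoob ab caaaaaaaaab"));
    // replaceAll=true rewrites every "a*b" match within each token,
    // yielding the tokens "-foo-foo-foo-", "-", "c-"
    ts = new PatternReplaceFilter(ts, Pattern.compile("a*b"), "-", true);
    while (ts.incrementToken()) {
      // consume rewritten tokens
    }
    ts.close();
  }
}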

org/apache/lucene/analysis/pattern/PatternTokenizer.java

@@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.Reader;
@@ -24,7 +24,6 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.commons.io.IOUtils;
/**
* This tokenizer uses regex pattern matching to construct distinct tokens
@@ -51,7 +50,6 @@ import org.apache.commons.io.IOUtils;
* </p>
* <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
*
* @version $Id$
* @see Pattern
*/
public final class PatternTokenizer extends Tokenizer {
@@ -59,7 +57,7 @@ public final class PatternTokenizer extends Tokenizer {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private String str;
private final StringBuilder str = new StringBuilder();
private int index;
private final Pattern pattern;
@@ -71,7 +69,7 @@ public final class PatternTokenizer extends Tokenizer {
super(input);
this.pattern = pattern;
this.group = group;
str = IOUtils.toString(input);
fillBuffer(str, input);
matcher = pattern.matcher(str);
index = 0;
}
@@ -84,11 +82,11 @@ public final class PatternTokenizer extends Tokenizer {
// match a specific group
while (matcher.find()) {
final String match = matcher.group(group);
if (match.length() == 0) continue;
termAtt.setEmpty().append(match);
index = matcher.start(group);
offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
final int endIndex = matcher.end(group);
if (index == endIndex) continue;
termAtt.setEmpty().append(str, index, endIndex);
offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
return true;
}
@@ -131,9 +129,19 @@ public final class PatternTokenizer extends Tokenizer {
@Override
public void reset(Reader input) throws IOException {
super.reset(input);
str = IOUtils.toString(input);
fillBuffer(str, input);
matcher.reset(str);
index = 0;
}
// TODO: we should see if we can make this tokenizer work without reading
// the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
final char[] buffer = new char[8192];
private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
int len;
sb.setLength(0);
while ((len = input.read(buffer)) > 0) {
sb.append(buffer, 0, len);
}
}
}
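The group argument picks between two behaviors; a short sketch using rows from the test table later in this commit (the wrapper class is illustrative):

import java.io.StringReader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PatternTokenizerSketch {
  public static void main(String[] args) throws Exception {
    // group = -1: split on the pattern (String.split semantics); zero-length
    // tokens are dropped. Yields "aaa", "bbb", "ccc".
    dump(new PatternTokenizer(
        new StringReader("aaa--bbb--ccc"), Pattern.compile("--"), -1));
    // group >= 0: emit that capturing group of each match instead.
    // Group 1 of the quote pattern yields "bbb", "ccc".
    dump(new PatternTokenizer(
        new StringReader("aaa 'bbb' 'ccc'"), Pattern.compile("'([^']+)'"), 1));
  }

  private static void dump(TokenStream ts) throws Exception {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.close();
  }
}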

org/apache/lucene/analysis/pattern/package.html

@@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Set of components for pattern-based (regex) analysis.
</body>
</html>

org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java

@@ -15,39 +15,31 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
*
* @version $Id$
*
* Tests {@link PatternReplaceCharFilter}
*/
public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
// 1111
// 01234567890123
// this is test.
public void testNothingChange() throws IOException {
final String BLOCK = "this is test.";
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
args.put("replacement", "$1$2$3");
factory.init(args);
CharStream cs = factory.create(
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "this", "is", "test." },
new int[] { 0, 5, 8 },
@@ -58,13 +50,9 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
// aa bb cc
public void testReplaceByEmpty() throws IOException {
final String BLOCK = "aa bb cc";
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
factory.init(args);
CharStream cs = factory.create(
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertFalse(ts.incrementToken());
}
@@ -73,14 +61,9 @@
// aa#bb#cc
public void test1block1matchSameLength() throws IOException {
final String BLOCK = "aa bb cc";
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
args.put("replacement", "$1#$2#$3");
factory.init(args);
CharStream cs = factory.create(
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa#bb#cc" },
new int[] { 0 },
@@ -95,7 +78,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa##bb###cc", "dd" },
new int[] { 0, 9 },
@@ -109,7 +92,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
final String BLOCK = " a a";
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa", "aa" },
new int[] { 1, 4 },
@@ -124,7 +107,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
final String BLOCK = "aa bb cc dd";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa#bb", "dd" },
new int[] { 0, 12 },
@@ -139,7 +122,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
final String BLOCK = " aa bb cc --- aa bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
@@ -154,7 +137,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
@@ -171,7 +154,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
CharReader.get( new StringReader( BLOCK ) ) );
cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
assertTokenStreamContents(ts,
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },

org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java

@@ -15,8 +15,9 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
@@ -26,12 +27,12 @@ import java.util.regex.Pattern;
/**
* @version $Id:$
*/
public class TestPatternReplaceFilter extends BaseTokenTestCase {
public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
public void testReplaceAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("a*b"),
"-", true);
assertTokenStreamContents(ts,
@@ -41,7 +42,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
public void testReplaceFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("a*b"),
"-", false);
assertTokenStreamContents(ts,
@@ -51,7 +52,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
public void testStripFirst() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("a*b"),
null, false);
assertTokenStreamContents(ts,
@@ -61,7 +62,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
public void testStripAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("a*b"),
null, true);
assertTokenStreamContents(ts,
@@ -71,7 +72,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
public void testReplaceAllWithBackRef() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
TokenStream ts = new PatternReplaceFilter
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
Pattern.compile("(a*)b"),
"$1\\$", true);
assertTokenStreamContents(ts,

org/apache/lucene/analysis/pattern/TestPatternTokenizer.java

@@ -0,0 +1,119 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.pattern;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class TestPatternTokenizer extends BaseTokenStreamTestCase
{
public void testSplitting() throws Exception
{
String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
String[][] tests = {
// group pattern input output
{ "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
{ "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
{ "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
{ "-1", ":", "boo:and:foo", "boo and foo" },
{ "-1", "o", "boo:and:foo", "b :and:f" },
{ "0", ":", "boo:and:foo", ": :" },
{ "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
{ "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
};
for( String[] test : tests ) {
TokenStream stream = new PatternTokenizer(new StringReader(test[2]), Pattern.compile(test[1]), Integer.parseInt(test[0]));
String out = tsToString( stream );
// System.out.println( test[2] + " ==> " + out );
assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
// Make sure it is the same as if we called 'split'
// test disabled, as we remove empty tokens
/*if( "-1".equals( test[0] ) ) {
String[] split = test[2].split( test[1] );
stream = tokenizer.create( new StringReader( test[2] ) );
int i=0;
for( Token t = stream.next(); null != t; t = stream.next() )
{
assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
}
}*/
}
}
public void testOffsetCorrection() throws Exception {
final String INPUT = "G&uuml;nther G&uuml;nther is here";
// create MappingCharFilter
List<String> mappingRules = new ArrayList<String>();
mappingRules.add( "\"&uuml;\" => \"ü\"" );
NormalizeCharMap normMap = new NormalizeCharMap();
normMap.add("&uuml;", "ü");
CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
// create PatternTokenizer
TokenStream stream = new PatternTokenizer(charStream, Pattern.compile("[,;/\\s]+"), -1);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" },
new int[] { 0, 13, 26, 29 },
new int[] { 12, 25, 28, 33 });
charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0);
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther" },
new int[] { 0, 13 },
new int[] { 12, 25 });
}
/**
* TODO: rewrite tests not to use string comparison.
* @deprecated only tests TermAttribute!
*/
private static String tsToString(TokenStream in) throws IOException {
StringBuilder out = new StringBuilder();
CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
// extra safety to enforce, that the state is not preserved and also
// assign bogus values
in.clearAttributes();
termAtt.setEmpty().append("bogusTerm");
while (in.incrementToken()) {
if (out.length() > 0)
out.append(' ');
out.append(termAtt.toString());
in.clearAttributes();
termAtt.setEmpty().append("bogusTerm");
}
in.close();
return out.toString();
}
}

org/apache/solr/analysis/PatternReplaceCharFilterFactory.java

@@ -22,6 +22,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
/**
*

org/apache/solr/analysis/PatternReplaceFilterFactory.java

@@ -17,6 +17,7 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import java.util.Map;
import java.util.regex.Pattern;

org/apache/solr/analysis/PatternTokenizerFactory.java

@@ -27,6 +27,7 @@ import java.util.regex.Pattern;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.solr.common.SolrException;

org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java

@@ -0,0 +1,86 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure this factory is working
*/
public class TestPatternReplaceCharFilterFactory extends BaseTokenTestCase {
// 1111
// 01234567890123
// this is test.
public void testNothingChange() throws IOException {
final String BLOCK = "this is test.";
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
args.put("replacement", "$1$2$3");
factory.init(args);
CharStream cs = factory.create(
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
assertTokenStreamContents(ts,
new String[] { "this", "is", "test." },
new int[] { 0, 5, 8 },
new int[] { 4, 7, 13 });
}
// 012345678
// aa bb cc
public void testReplaceByEmpty() throws IOException {
final String BLOCK = "aa bb cc";
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
factory.init(args);
CharStream cs = factory.create(
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
assertFalse(ts.incrementToken());
}
// 012345678
// aa bb cc
// aa#bb#cc
public void test1block1matchSameLength() throws IOException {
final String BLOCK = "aa bb cc";
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
args.put("replacement", "$1#$2#$3");
factory.init(args);
CharStream cs = factory.create(
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
assertTokenStreamContents(ts,
new String[] { "aa#bb#cc" },
new int[] { 0 },
new int[] { 8 });
}
}

org/apache/solr/analysis/TestPatternReplaceFilterFactory.java

@@ -0,0 +1,45 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
/**
* Simple tests to ensure this factory is working
*/
public class TestPatternReplaceFilterFactory extends BaseTokenTestCase {
public void testReplaceAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "a*b");
args.put("replacement", "-");
factory.init(args);
TokenStream ts = factory.create
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
assertTokenStreamContents(ts,
new String[] { "-foo-foo-foo-", "-", "c-" });
}
}

org/apache/solr/analysis/TestPatternTokenizerFactory.java

@@ -17,120 +17,25 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/** Simple Tests to ensure this factory is working */
public class TestPatternTokenizerFactory extends BaseTokenTestCase
{
public void testSplitting() throws Exception
{
String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
String[][] tests = {
// group pattern input output
{ "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
{ "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
{ "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
{ "-1", ":", "boo:and:foo", "boo and foo" },
{ "-1", "o", "boo:and:foo", "b :and:f" },
{ "0", ":", "boo:and:foo", ": :" },
{ "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
{ "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
};
Map<String,String> args = new HashMap<String, String>();
for( String[] test : tests ) {
args.put( PatternTokenizerFactory.GROUP, test[0] );
args.put( PatternTokenizerFactory.PATTERN, test[1] );
PatternTokenizerFactory tokenizer = new PatternTokenizerFactory();
tokenizer.init( args );
TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
String out = tsToString( stream );
// System.out.println( test[2] + " ==> " + out );
assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
// Make sure it is the same as if we called 'split'
// test disabled, as we remove empty tokens
/*if( "-1".equals( test[0] ) ) {
String[] split = test[2].split( test[1] );
stream = tokenizer.create( new StringReader( test[2] ) );
int i=0;
for( Token t = stream.next(); null != t; t = stream.next() )
{
assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
}
}*/
}
}
public void testOffsetCorrection() throws Exception {
final String INPUT = "G&uuml;nther G&uuml;nther is here";
// create MappingCharFilter
MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
List<String> mappingRules = new ArrayList<String>();
mappingRules.add( "\"&uuml;\" => \"ü\"" );
NormalizeCharMap normMap = new NormalizeCharMap();
cfFactory.parseRules( mappingRules, normMap );
CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
public void testFactory() throws Exception {
final String INPUT = "Günther Günther is here";
// create PatternTokenizer
Map<String,String> args = new HashMap<String, String>();
args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
tokFactory.init( args );
TokenStream stream = tokFactory.create( charStream );
TokenStream stream = tokFactory.create( new StringReader(INPUT) );
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" },
new int[] { 0, 13, 26, 29 },
new int[] { 12, 25, 28, 33 });
charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
args.put( PatternTokenizerFactory.PATTERN, "Günther" );
args.put( PatternTokenizerFactory.GROUP, "0" );
tokFactory = new PatternTokenizerFactory();
tokFactory.init( args );
stream = tokFactory.create( charStream );
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther" },
new int[] { 0, 13 },
new int[] { 12, 25 });
}
/**
* TODO: rewrite tests not to use string comparison.
* @deprecated only tests TermAttribute!
*/
private static String tsToString(TokenStream in) throws IOException {
StringBuilder out = new StringBuilder();
CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
// extra safety to enforce, that the state is not preserved and also
// assign bogus values
in.clearAttributes();
termAtt.setEmpty().append("bogusTerm");
while (in.incrementToken()) {
if (out.length() > 0)
out.append(' ');
out.append(termAtt.toString());
in.clearAttributes();
termAtt.setEmpty().append("bogusTerm");
}
in.close();
return out.toString();
new String[] { "Günther", "Günther", "is", "here" });
}
}