mirror of https://github.com/apache/lucene.git
LUCENE-2413: consolidate pattern analysis into contrib/analyzers, deprecate the old PatternAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940813 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6fa480a5ed
commit
cd320b5f57
|
@ -92,6 +92,9 @@ API Changes
|
|||
stemming. Add Turkish and Romanian stopwords lists to support this.
|
||||
(Robert Muir, Uwe Schindler, Simon Willnauer)
|
||||
|
||||
* LUCENE-2413: Deprecated PatternAnalyzer in contrib/analyzers, in favor of the
|
||||
pattern package (CharFilter, Tokenizer, TokenFilter). (Robert Muir)
|
||||
|
||||
New features
|
||||
|
||||
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
|
||||
|
@ -165,6 +168,8 @@ New features
|
|||
into subwords and performs optional transformations on subword groups.
|
||||
- o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which
|
||||
filters out Tokens at the same position and Term text as the previous token.
|
||||
- o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
|
||||
CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
|
||||
(... in progress)
|
||||
|
||||
Build
|
||||
|
|
|
@ -62,8 +62,10 @@ import org.apache.lucene.util.Version;
|
|||
* pat.tokenStream("content", "James is running round in the woods"),
|
||||
* "English"));
|
||||
* </pre>
|
||||
*
|
||||
* @deprecated use the pattern-based analysis in the analysis/pattern package instead.
|
||||
* This analyzer will be removed in a future release (4.1)
|
||||
*/
|
||||
@Deprecated
|
||||
public final class PatternAnalyzer extends Analyzer {
|
||||
|
||||
/** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
|
@ -45,7 +45,6 @@ import org.apache.lucene.analysis.CharStream;
|
|||
* highlight snippet="aa1<em>23bb</em>"
|
||||
* </p>
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.5
|
||||
*/
|
||||
public class PatternReplaceCharFilter extends BaseCharFilter {
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
@ -35,7 +35,6 @@ import java.io.IOException;
|
|||
* string.
|
||||
* </p>
|
||||
*
|
||||
* @version $Id:$
|
||||
* @see Pattern
|
||||
*/
|
||||
public final class PatternReplaceFilter extends TokenFilter {
|
|
@ -15,7 +15,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
@ -24,7 +24,6 @@ import java.util.regex.Pattern;
|
|||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
/**
|
||||
* This tokenizer uses regex pattern matching to construct distinct tokens
|
||||
|
@ -51,7 +50,6 @@ import org.apache.commons.io.IOUtils;
|
|||
* </p>
|
||||
* <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
|
||||
*
|
||||
* @version $Id$
|
||||
* @see Pattern
|
||||
*/
|
||||
public final class PatternTokenizer extends Tokenizer {
|
||||
|
@ -59,7 +57,7 @@ public final class PatternTokenizer extends Tokenizer {
|
|||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
private String str;
|
||||
private final StringBuilder str = new StringBuilder();
|
||||
private int index;
|
||||
|
||||
private final Pattern pattern;
|
||||
|
@ -71,7 +69,7 @@ public final class PatternTokenizer extends Tokenizer {
|
|||
super(input);
|
||||
this.pattern = pattern;
|
||||
this.group = group;
|
||||
str = IOUtils.toString(input);
|
||||
fillBuffer(str, input);
|
||||
matcher = pattern.matcher(str);
|
||||
index = 0;
|
||||
}
|
||||
|
@ -84,11 +82,11 @@ public final class PatternTokenizer extends Tokenizer {
|
|||
|
||||
// match a specific group
|
||||
while (matcher.find()) {
|
||||
final String match = matcher.group(group);
|
||||
if (match.length() == 0) continue;
|
||||
termAtt.setEmpty().append(match);
|
||||
index = matcher.start(group);
|
||||
offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
|
||||
final int endIndex = matcher.end(group);
|
||||
if (index == endIndex) continue;
|
||||
termAtt.setEmpty().append(str, index, endIndex);
|
||||
offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -131,9 +129,19 @@ public final class PatternTokenizer extends Tokenizer {
|
|||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
str = IOUtils.toString(input);
|
||||
fillBuffer(str, input);
|
||||
matcher.reset(str);
|
||||
index = 0;
|
||||
}
|
||||
|
||||
|
||||
// TODO: we should see if we can make this tokenizer work without reading
|
||||
// the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
|
||||
final char[] buffer = new char[8192];
|
||||
private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
|
||||
int len;
|
||||
sb.setLength(0);
|
||||
while ((len = input.read(buffer)) > 0) {
|
||||
sb.append(buffer, 0, len);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<html><head></head>
|
||||
<body>
|
||||
Set of components for pattern-based (regex) analysis.
|
||||
</body>
|
||||
</html>
|
|
@ -15,39 +15,31 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
*
|
||||
* Tests {@link PatternReplaceCharFilter}
|
||||
*/
|
||||
public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
||||
public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
// 1111
|
||||
// 01234567890123
|
||||
// this is test.
|
||||
public void testNothingChange() throws IOException {
|
||||
final String BLOCK = "this is test.";
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
args.put("replacement", "$1$2$3");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "this", "is", "test." },
|
||||
new int[] { 0, 5, 8 },
|
||||
|
@ -58,13 +50,9 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
// aa bb cc
|
||||
public void testReplaceByEmpty() throws IOException {
|
||||
final String BLOCK = "aa bb cc";
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
|
@ -73,14 +61,9 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
// aa#bb#cc
|
||||
public void test1block1matchSameLength() throws IOException {
|
||||
final String BLOCK = "aa bb cc";
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
args.put("replacement", "$1#$2#$3");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa#bb#cc" },
|
||||
new int[] { 0 },
|
||||
|
@ -95,7 +78,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
final String BLOCK = "aa bb cc dd";
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa##bb###cc", "dd" },
|
||||
new int[] { 0, 9 },
|
||||
|
@ -109,7 +92,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
final String BLOCK = " a a";
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa", "aa" },
|
||||
new int[] { 1, 4 },
|
||||
|
@ -124,7 +107,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
final String BLOCK = "aa bb cc dd";
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa#bb", "dd" },
|
||||
new int[] { 0, 12 },
|
||||
|
@ -139,7 +122,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
final String BLOCK = " aa bb cc --- aa bb aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1 $2 $3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
|
||||
new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
|
||||
|
@ -154,7 +137,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
final String BLOCK = " aa bb cc --- aa bb aa. bb aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
|
||||
new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
|
||||
|
@ -171,7 +154,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
|
||||
cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
|
||||
new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
|
|
@ -15,8 +15,9 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
|
@ -26,12 +27,12 @@ import java.util.regex.Pattern;
|
|||
/**
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class TestPatternReplaceFilter extends BaseTokenTestCase {
|
||||
public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testReplaceAll() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
|
||||
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", true);
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -41,7 +42,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
|
|||
public void testReplaceFirst() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
|
||||
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", false);
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -51,7 +52,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
|
|||
public void testStripFirst() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
|
||||
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, false);
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -61,7 +62,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
|
|||
public void testStripAll() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
|
||||
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, true);
|
||||
assertTokenStreamContents(ts,
|
||||
|
@ -71,7 +72,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
|
|||
public void testReplaceAllWithBackRef() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
TokenStream ts = new PatternReplaceFilter
|
||||
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
|
||||
(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
|
||||
Pattern.compile("(a*)b"),
|
||||
"$1\\$", true);
|
||||
assertTokenStreamContents(ts,
|
|
@ -0,0 +1,119 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
public class TestPatternTokenizer extends BaseTokenStreamTestCase
|
||||
{
|
||||
public void testSplitting() throws Exception
|
||||
{
|
||||
String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
|
||||
String[][] tests = {
|
||||
// group pattern input output
|
||||
{ "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
|
||||
{ "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
|
||||
{ "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
|
||||
{ "-1", ":", "boo:and:foo", "boo and foo" },
|
||||
{ "-1", "o", "boo:and:foo", "b :and:f" },
|
||||
{ "0", ":", "boo:and:foo", ": :" },
|
||||
{ "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
|
||||
{ "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
|
||||
};
|
||||
|
||||
for( String[] test : tests ) {
|
||||
TokenStream stream = new PatternTokenizer(new StringReader(test[2]), Pattern.compile(test[1]), Integer.parseInt(test[0]));
|
||||
String out = tsToString( stream );
|
||||
// System.out.println( test[2] + " ==> " + out );
|
||||
|
||||
assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
|
||||
|
||||
// Make sure it is the same as if we called 'split'
|
||||
// test disabled, as we remove empty tokens
|
||||
/*if( "-1".equals( test[0] ) ) {
|
||||
String[] split = test[2].split( test[1] );
|
||||
stream = tokenizer.create( new StringReader( test[2] ) );
|
||||
int i=0;
|
||||
for( Token t = stream.next(); null != t; t = stream.next() )
|
||||
{
|
||||
assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
|
||||
}
|
||||
}*/
|
||||
}
|
||||
}
|
||||
|
||||
public void testOffsetCorrection() throws Exception {
|
||||
final String INPUT = "G&uuml;nther G&uuml;nther is here";
|
||||
|
||||
// create MappingCharFilter
|
||||
List<String> mappingRules = new ArrayList<String>();
|
||||
mappingRules.add( "\"&uuml;\" => \"ü\"" );
|
||||
NormalizeCharMap normMap = new NormalizeCharMap();
|
||||
normMap.add("&uuml;", "ü");
|
||||
CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
|
||||
|
||||
// create PatternTokenizer
|
||||
TokenStream stream = new PatternTokenizer(charStream, Pattern.compile("[,;/\\s]+"), -1);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "Günther", "Günther", "is", "here" },
|
||||
new int[] { 0, 13, 26, 29 },
|
||||
new int[] { 12, 25, 28, 33 });
|
||||
|
||||
charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
|
||||
stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "Günther", "Günther" },
|
||||
new int[] { 0, 13 },
|
||||
new int[] { 12, 25 });
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO: rewrite tests not to use string comparison.
|
||||
* @deprecated only tests TermAttribute!
|
||||
*/
|
||||
private static String tsToString(TokenStream in) throws IOException {
|
||||
StringBuilder out = new StringBuilder();
|
||||
CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
|
||||
// extra safety to enforce, that the state is not preserved and also
|
||||
// assign bogus values
|
||||
in.clearAttributes();
|
||||
termAtt.setEmpty().append("bogusTerm");
|
||||
while (in.incrementToken()) {
|
||||
if (out.length() > 0)
|
||||
out.append(' ');
|
||||
out.append(termAtt.toString());
|
||||
in.clearAttributes();
|
||||
termAtt.setEmpty().append("bogusTerm");
|
||||
}
|
||||
|
||||
in.close();
|
||||
return out.toString();
|
||||
}
|
||||
}
|
|
@ -22,6 +22,7 @@ import java.util.regex.Pattern;
|
|||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.util.regex.Pattern;
|
|||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.pattern.PatternTokenizer;
|
||||
import org.apache.solr.common.SolrException;
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure this factory is working
|
||||
*/
|
||||
public class TestPatternReplaceCharFilterFactory extends BaseTokenTestCase {
|
||||
|
||||
// 1111
|
||||
// 01234567890123
|
||||
// this is test.
|
||||
public void testNothingChange() throws IOException {
|
||||
final String BLOCK = "this is test.";
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
args.put("replacement", "$1$2$3");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "this", "is", "test." },
|
||||
new int[] { 0, 5, 8 },
|
||||
new int[] { 4, 7, 13 });
|
||||
}
|
||||
|
||||
// 012345678
|
||||
// aa bb cc
|
||||
public void testReplaceByEmpty() throws IOException {
|
||||
final String BLOCK = "aa bb cc";
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
// 012345678
|
||||
// aa bb cc
|
||||
// aa#bb#cc
|
||||
public void test1block1matchSameLength() throws IOException {
|
||||
final String BLOCK = "aa bb cc";
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
args.put("replacement", "$1#$2#$3");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa#bb#cc" },
|
||||
new int[] { 0 },
|
||||
new int[] { 8 });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure this factory is working
|
||||
*/
|
||||
public class TestPatternReplaceFilterFactory extends BaseTokenTestCase {
|
||||
|
||||
public void testReplaceAll() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "a*b");
|
||||
args.put("replacement", "-");
|
||||
factory.init(args);
|
||||
TokenStream ts = factory.create
|
||||
(new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
|
||||
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "-foo-foo-foo-", "-", "c-" });
|
||||
}
|
||||
}
|
|
@ -17,120 +17,25 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
|
||||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/** Simple Tests to ensure this factory is working */
|
||||
public class TestPatternTokenizerFactory extends BaseTokenTestCase
|
||||
{
|
||||
public void testSplitting() throws Exception
|
||||
{
|
||||
String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
|
||||
String[][] tests = {
|
||||
// group pattern input output
|
||||
{ "-1", "--", "aaa--bbb--ccc", "aaa bbb ccc" },
|
||||
{ "-1", ":", "aaa:bbb:ccc", "aaa bbb ccc" },
|
||||
{ "-1", "\\p{Space}", "aaa bbb \t\tccc ", "aaa bbb ccc" },
|
||||
{ "-1", ":", "boo:and:foo", "boo and foo" },
|
||||
{ "-1", "o", "boo:and:foo", "b :and:f" },
|
||||
{ "0", ":", "boo:and:foo", ": :" },
|
||||
{ "0", qpattern, "aaa 'bbb' 'ccc'", "'bbb' 'ccc'" },
|
||||
{ "1", qpattern, "aaa 'bbb' 'ccc'", "bbb ccc" }
|
||||
};
|
||||
|
||||
|
||||
Map<String,String> args = new HashMap<String, String>();
|
||||
for( String[] test : tests ) {
|
||||
args.put( PatternTokenizerFactory.GROUP, test[0] );
|
||||
args.put( PatternTokenizerFactory.PATTERN, test[1] );
|
||||
|
||||
PatternTokenizerFactory tokenizer = new PatternTokenizerFactory();
|
||||
tokenizer.init( args );
|
||||
|
||||
TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
|
||||
String out = tsToString( stream );
|
||||
// System.out.println( test[2] + " ==> " + out );
|
||||
|
||||
assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
|
||||
|
||||
// Make sure it is the same as if we called 'split'
|
||||
// test disabled, as we remove empty tokens
|
||||
/*if( "-1".equals( test[0] ) ) {
|
||||
String[] split = test[2].split( test[1] );
|
||||
stream = tokenizer.create( new StringReader( test[2] ) );
|
||||
int i=0;
|
||||
for( Token t = stream.next(); null != t; t = stream.next() )
|
||||
{
|
||||
assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
|
||||
}
|
||||
}*/
|
||||
}
|
||||
}
|
||||
|
||||
public void testOffsetCorrection() throws Exception {
|
||||
final String INPUT = "G&uuml;nther G&uuml;nther is here";
|
||||
|
||||
// create MappingCharFilter
|
||||
MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
|
||||
List<String> mappingRules = new ArrayList<String>();
|
||||
mappingRules.add( "\"&uuml;\" => \"ü\"" );
|
||||
NormalizeCharMap normMap = new NormalizeCharMap();
|
||||
cfFactory.parseRules( mappingRules, normMap );
|
||||
CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
|
||||
public void testFactory() throws Exception {
|
||||
final String INPUT = "Günther Günther is here";
|
||||
|
||||
// create PatternTokenizer
|
||||
Map<String,String> args = new HashMap<String, String>();
|
||||
args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
|
||||
PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
|
||||
tokFactory.init( args );
|
||||
TokenStream stream = tokFactory.create( charStream );
|
||||
TokenStream stream = tokFactory.create( new StringReader(INPUT) );
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "Günther", "Günther", "is", "here" },
|
||||
new int[] { 0, 13, 26, 29 },
|
||||
new int[] { 12, 25, 28, 33 });
|
||||
|
||||
charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
|
||||
args.put( PatternTokenizerFactory.PATTERN, "Günther" );
|
||||
args.put( PatternTokenizerFactory.GROUP, "0" );
|
||||
tokFactory = new PatternTokenizerFactory();
|
||||
tokFactory.init( args );
|
||||
stream = tokFactory.create( charStream );
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "Günther", "Günther" },
|
||||
new int[] { 0, 13 },
|
||||
new int[] { 12, 25 });
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO: rewrite tests not to use string comparison.
|
||||
* @deprecated only tests TermAttribute!
|
||||
*/
|
||||
private static String tsToString(TokenStream in) throws IOException {
|
||||
StringBuilder out = new StringBuilder();
|
||||
CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
|
||||
// extra safety to enforce, that the state is not preserved and also
|
||||
// assign bogus values
|
||||
in.clearAttributes();
|
||||
termAtt.setEmpty().append("bogusTerm");
|
||||
while (in.incrementToken()) {
|
||||
if (out.length() > 0)
|
||||
out.append(' ');
|
||||
out.append(termAtt.toString());
|
||||
in.clearAttributes();
|
||||
termAtt.setEmpty().append("bogusTerm");
|
||||
}
|
||||
|
||||
in.close();
|
||||
return out.toString();
|
||||
new String[] { "Günther", "Günther", "is", "here" });
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue