LUCENE-2413: consolidate WDF into contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940781 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-04 09:11:05 +00:00
parent a9ef636cb1
commit 9238c5f5ca
6 changed files with 203 additions and 157 deletions

View File

@@ -161,6 +161,8 @@ New features
and phrases.
- o.a.l.analysis.charfilter.HTMLStripCharFilter: CharFilter that strips HTML
constructs.
- o.a.l.analysis.miscellaneous.WordDelimiterFilter: TokenFilter that splits words
into subwords and performs optional transformations on subword groups.
(... in progress)
Build

View File

@@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -58,10 +58,9 @@ import java.io.IOException;
* in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
* delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
*
* @version $Id$
*/
final class WordDelimiterFilter extends TokenFilter {
public final class WordDelimiterFilter extends TokenFilter {
public static final int LOWER = 0x01;
public static final int UPPER = 0x02;

View File

@@ -1,4 +1,4 @@
package org.apache.solr.analysis;
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,7 @@ package org.apache.solr.analysis;
* limitations under the License.
*/
import static org.apache.solr.analysis.WordDelimiterFilter.*;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
/**
* A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.

View File

@@ -15,9 +15,10 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.StopFilter;
@@ -29,12 +30,8 @@ import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;
import static org.apache.solr.analysis.BaseTokenTestCase.*;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@@ -43,96 +40,10 @@ import java.util.HashSet;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
* TODO: should explicitly test things like protWords and not rely on
* the factory tests in Solr.
*/
public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig.xml","schema.xml");
}
public void posTst(String v1, String v2, String s1, String s2) {
assertU(adoc("id", "42",
"subword", v1,
"subword", v2));
assertU(commit());
// there is a positionIncrementGap of 100 between field values, so
// we test if that was maintained.
assertQ("position increment lost",
req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
,"//result[@numFound=0]"
);
assertQ("position increment lost",
req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
,"//result[@numFound=1]"
);
clearIndex();
}
@Test
public void testRetainPositionIncrement() {
posTst("foo","bar","foo","bar");
posTst("-foo-","-bar-","foo","bar");
posTst("foo","bar","-foo-","-bar-");
posTst("123","456","123","456");
posTst("/123/","/456/","123","456");
posTst("/123/abc","qwe/456/","abc","qwe");
posTst("zoo-foo","bar-baz","foo","bar");
posTst("zoo-foo-123","456-bar-baz","foo","bar");
}
@Test
public void testNoGenerationEdgeCase() {
assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
clearIndex();
}
@Test
public void testIgnoreCaseChange() {
assertU(adoc("id", "43",
"wdf_nocase", "HellO WilliAM",
"subword", "GoodBye JonEs"));
assertU(commit());
assertQ("no case change",
req("wdf_nocase:(hell o am)")
,"//result[@numFound=0]"
);
assertQ("case change",
req("subword:(good jon)")
,"//result[@numFound=1]"
);
clearIndex();
}
@Test
public void testPreserveOrignalTrue() {
assertU(adoc("id", "144",
"wdf_preserve", "404-123"));
assertU(commit());
assertQ("preserving original word",
req("wdf_preserve:404")
,"//result[@numFound=1]"
);
assertQ("preserving original word",
req("wdf_preserve:123")
,"//result[@numFound=1]"
);
assertQ("preserving original word",
req("wdf_preserve:404-123*")
,"//result[@numFound=1]"
);
clearIndex();
}
public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
/***
public void testPerformance() throws IOException {
@@ -232,59 +143,6 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
new int[] { 11, 15, 15 });
}
@Test
public void testAlphaNumericWords(){
assertU(adoc("id", "68","numericsubword","Java/J2SE"));
assertU(commit());
assertQ("j2se found",
req("numericsubword:(J2SE)")
,"//result[@numFound=1]"
);
assertQ("no j2 or se",
req("numericsubword:(J2 OR SE)")
,"//result[@numFound=0]"
);
clearIndex();
}
@Test
public void testProtectedWords(){
assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
assertU(commit());
assertQ("java found",
req("protectedsubword:(java)")
,"//result[@numFound=1]"
);
assertQ(".net found",
req("protectedsubword:(.net)")
,"//result[@numFound=1]"
);
assertQ("c# found",
req("protectedsubword:(c#)")
,"//result[@numFound=1]"
);
assertQ("c++ found",
req("protectedsubword:(c++)")
,"//result[@numFound=1]"
);
assertQ("c found?",
req("protectedsubword:c")
,"//result[@numFound=0]"
);
assertQ("net found?",
req("protectedsubword:net")
,"//result[@numFound=0]"
);
clearIndex();
}
public void doSplit(final String input, String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
new StringReader(input)), 1, 1, 0, 0, 0);
@@ -368,13 +226,13 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
@Test
public void testPositionIncrements() throws Exception {
final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet<String>(Arrays.asList("NUTCH")), false);
final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("NUTCH")), false);
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new WhitespaceTokenizer(DEFAULT_VERSION, reader),
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
}
};
@@ -401,7 +259,7 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
public TokenStream tokenStream(String field, Reader reader) {
return new WordDelimiterFilter(
new LargePosIncTokenFilter(
new WhitespaceTokenizer(DEFAULT_VERSION, reader)),
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader)),
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
}
};
@@ -431,8 +289,8 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
Analyzer a3 = new Analyzer() {
public TokenStream tokenStream(String field, Reader reader) {
StopFilter filter = new StopFilter(DEFAULT_VERSION,
new WhitespaceTokenizer(DEFAULT_VERSION, reader), StandardAnalyzer.STOP_WORDS_SET);
StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET);
filter.setEnablePositionIncrements(true);
return new WordDelimiterFilter(filter,
1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);

View File

@@ -18,6 +18,7 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.common.ResourceLoader;

View File

@@ -0,0 +1,186 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
*/
/**
 * Factory-level tests for WordDelimiterFilter, run against a full Solr core so
 * that the field types configured in schema.xml (subword, wdf_nocase,
 * wdf_preserve, numericsubword, protectedsubword, numberpartfail) are
 * exercised end to end through indexing and querying.
 */
public class TestWordDelimiterFilterFactory extends SolrTestCaseJ4 {

  /** Bring up a core once for the whole class; individual tests clear the index. */
  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig.xml","schema.xml");
  }

  /**
   * Indexes a document whose "subword" field has the two values {@code v1} and
   * {@code v2}, then verifies that the positionIncrementGap of 100 between
   * field values survives the WordDelimiterFilter: a phrase query on
   * {@code s1 s2} must miss at slop 90 and hit at slop 110.
   */
  public void posTst(String v1, String v2, String s1, String s2) {
    assertU(adoc("id", "42",
        "subword", v1,
        "subword", v2));
    assertU(commit());

    // there is a positionIncrementGap of 100 between field values, so
    // we test if that was maintained.
    assertQ("position increment lost",
        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
        ,"//result[@numFound=0]"
    );
    assertQ("position increment lost",
        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
        ,"//result[@numFound=1]"
    );
    clearIndex();
  }

  /** Position increments must be retained across a variety of delimiter shapes. */
  @Test
  public void testRetainPositionIncrement() {
    posTst("foo","bar","foo","bar");
    posTst("-foo-","-bar-","foo","bar");
    posTst("foo","bar","-foo-","-bar-");
    posTst("123","456","123","456");
    posTst("/123/","/456/","123","456");
    posTst("/123/abc","qwe/456/","abc","qwe");
    posTst("zoo-foo","bar-baz","foo","bar");
    posTst("zoo-foo-123","456-bar-baz","foo","bar");
  }

  /**
   * A dotted-number value indexed into a field whose WDF config generates no
   * parts must not blow up during analysis (edge case: nothing is emitted).
   */
  @Test
  public void testNoGenerationEdgeCase() {
    assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
    clearIndex();
  }

  /**
   * wdf_nocase ignores case changes (no split on HellO/WilliAM), while the
   * default "subword" field splits on them (GoodBye -> good, JonEs -> jon).
   */
  @Test
  public void testIgnoreCaseChange() {
    assertU(adoc("id", "43",
        "wdf_nocase", "HellO WilliAM",
        "subword", "GoodBye JonEs"));
    assertU(commit());

    assertQ("no case change",
        req("wdf_nocase:(hell o am)")
        ,"//result[@numFound=0]"
    );
    assertQ("case change",
        req("subword:(good jon)")
        ,"//result[@numFound=1]"
    );
    clearIndex();
  }

  /**
   * With preserveOriginal enabled, both the generated subwords (404, 123) and
   * the unmodified original token (404-123) must be searchable.
   *
   * NOTE(review): "Orignal" in the method name is a historical typo; it is
   * kept because renaming would change the reported test name.
   */
  @Test
  public void testPreserveOrignalTrue() {
    assertU(adoc("id", "144",
        "wdf_preserve", "404-123"));
    assertU(commit());

    assertQ("preserving original word",
        req("wdf_preserve:404")
        ,"//result[@numFound=1]"
    );
    assertQ("preserving original word",
        req("wdf_preserve:123")
        ,"//result[@numFound=1]"
    );
    assertQ("preserving original word",
        req("wdf_preserve:404-123*")
        ,"//result[@numFound=1]"
    );
    clearIndex();
  }

  /***
  public void testPerformance() throws IOException {
    String s = "now is the time-for all good men to come to-the aid of their country.";
    Token tok = new Token();
    long start = System.currentTimeMillis();
    int ret=0;
    for (int i=0; i<1000000; i++) {
      StringReader r = new StringReader(s);
      TokenStream ts = new WhitespaceTokenizer(r);
      ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
      while (ts.next(tok) != null) ret++;
    }
    System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
  }
  ***/

  /** Java/J2SE must index as the whole token J2SE, not as the parts J2 and SE. */
  @Test
  public void testAlphaNumericWords(){
    assertU(adoc("id", "68","numericsubword","Java/J2SE"));
    assertU(commit());

    assertQ("j2se found",
        req("numericsubword:(J2SE)")
        ,"//result[@numFound=1]"
    );
    assertQ("no j2 or se",
        req("numericsubword:(J2 OR SE)")
        ,"//result[@numFound=0]"
    );
    clearIndex();
  }

  /**
   * Tokens listed in the protected-words file (c#, c++, .net) must pass through
   * unsplit, while unprotected Java/J2SE is still delimited as usual.
   */
  @Test
  public void testProtectedWords(){
    assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
    assertU(commit());

    assertQ("java found",
        req("protectedsubword:(java)")
        ,"//result[@numFound=1]"
    );
    assertQ(".net found",
        req("protectedsubword:(.net)")
        ,"//result[@numFound=1]"
    );
    assertQ("c# found",
        req("protectedsubword:(c#)")
        ,"//result[@numFound=1]"
    );
    assertQ("c++ found",
        req("protectedsubword:(c++)")
        ,"//result[@numFound=1]"
    );
    // the bare single letters must NOT have been generated from the protected tokens
    assertQ("c found?",
        req("protectedsubword:c")
        ,"//result[@numFound=0]"
    );
    assertQ("net found?",
        req("protectedsubword:net")
        ,"//result[@numFound=0]"
    );
    clearIndex();
  }
}