mirror of https://github.com/apache/lucene.git

LUCENE-2413: consolidate WDF into contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940781 13f79535-47bb-0310-9956-ffa450edef68

parent a9ef636cb1
commit 9238c5f5ca
CHANGES.txt

@@ -161,6 +161,8 @@ New features
    and phrases.
 - o.a.l.analysis.charfilter.HTMLStripCharFilter: CharFilter that strips HTML
    constructs.
+- o.a.l.analysis.miscellaneous.WordDelimiterFilter: TokenFilter that splits words
+   into subwords and performs optional transformations on subword groups.
 (... in progress)
 
 Build
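A quick illustration of the new WordDelimiterFilter entry above (a sketch, not code from this commit): one token is run through the filter using the five-flag constructor that appears in doSplit() further down. The flag order (generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll) is an assumption for illustration; only the call shape is confirmed by this diff.

    import java.io.StringReader;

    import org.apache.lucene.analysis.KeywordTokenizer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class WdfSplitDemo {
      public static void main(String[] args) throws Exception {
        // KeywordTokenizer emits the whole input as a single token; the filter
        // then splits it into subwords at the intra-word delimiter.
        TokenStream ts = new WordDelimiterFilter(
            new KeywordTokenizer(new StringReader("Wi-Fi")), 1, 1, 0, 0, 0);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term); // expected: "Wi", then "Fi"
        }
        ts.close();
      }
    }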
WordDelimiterFilter.java

@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -58,10 +58,9 @@ import java.io.IOException;
  * in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
  * delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
  *
- * @version $Id$
  */
-final class WordDelimiterFilter extends TokenFilter {
+public final class WordDelimiterFilter extends TokenFilter {
 
   public static final int LOWER = 0x01;
   public static final int UPPER = 0x02;
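A hedged sketch of the chaining the javadoc above recommends: WordDelimiterFilter after WhitespaceTokenizer, so the intra-word delimiters survive tokenization (StandardTokenizer would already have removed many of them). Version.LUCENE_CURRENT and the five-flag constructor are assumptions for illustration; the tests below use TEST_VERSION_CURRENT and the longer constructor.

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class WdfChainDemo {
      public static void main(String[] args) throws Exception {
        // WhitespaceTokenizer leaves "SD500-IS" intact, so the filter still
        // sees the hyphen it is meant to split on.
        TokenStream ts = new WordDelimiterFilter(
            new WhitespaceTokenizer(Version.LUCENE_CURRENT,
                new StringReader("PowerShot SD500-IS")),
            1, 1, 0, 0, 0);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term); // subword tokens; exact splits depend on flag defaults
        }
        ts.close();
      }
    }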
WordDelimiterIterator.java

@@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,7 @@ package org.apache.solr.analysis;
  * limitations under the License.
  */
 
-import static org.apache.solr.analysis.WordDelimiterFilter.*;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
 
 /**
  * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
TestWordDelimiterFilter.java

@@ -15,9 +15,10 @@
  * limitations under the License.
  */
 
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.StopFilter;
@@ -29,12 +30,8 @@ import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.solr.SolrTestCaseJ4;
-import org.junit.BeforeClass;
 import org.junit.Test;
 
-import static org.apache.solr.analysis.BaseTokenTestCase.*;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -43,96 +40,10 @@ import java.util.HashSet;
 
 /**
  * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
  * TODO: should explicitly test things like protWords and not rely on
  * the factory tests in Solr.
  */
-public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
-
-  @BeforeClass
-  public static void beforeClass() throws Exception {
-    initCore("solrconfig.xml","schema.xml");
-  }
-
-  public void posTst(String v1, String v2, String s1, String s2) {
-    assertU(adoc("id", "42",
-        "subword", v1,
-        "subword", v2));
-    assertU(commit());
-
-    // there is a positionIncrementGap of 100 between field values, so
-    // we test if that was maintained.
-    assertQ("position increment lost",
-        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
-        ,"//result[@numFound=0]"
-    );
-    assertQ("position increment lost",
-        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
-        ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testRetainPositionIncrement() {
-    posTst("foo","bar","foo","bar");
-    posTst("-foo-","-bar-","foo","bar");
-    posTst("foo","bar","-foo-","-bar-");
-
-    posTst("123","456","123","456");
-    posTst("/123/","/456/","123","456");
-
-    posTst("/123/abc","qwe/456/","abc","qwe");
-
-    posTst("zoo-foo","bar-baz","foo","bar");
-    posTst("zoo-foo-123","456-bar-baz","foo","bar");
-  }
-
-  @Test
-  public void testNoGenerationEdgeCase() {
-    assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
-    clearIndex();
-  }
-
-  @Test
-  public void testIgnoreCaseChange() {
-
-    assertU(adoc("id", "43",
-        "wdf_nocase", "HellO WilliAM",
-        "subword", "GoodBye JonEs"));
-    assertU(commit());
-
-    assertQ("no case change",
-        req("wdf_nocase:(hell o am)")
-        ,"//result[@numFound=0]"
-    );
-    assertQ("case change",
-        req("subword:(good jon)")
-        ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testPreserveOrignalTrue() {
-
-    assertU(adoc("id", "144",
-        "wdf_preserve", "404-123"));
-    assertU(commit());
-
-    assertQ("preserving original word",
-        req("wdf_preserve:404")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("preserving original word",
-        req("wdf_preserve:123")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("preserving original word",
-        req("wdf_preserve:404-123*")
-        ,"//result[@numFound=1]"
-    );
-    clearIndex();
-  }
+public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
 
   /***
   public void testPerformance() throws IOException {
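The Solr assertU/assertQ tests removed above are replaced by direct token-stream assertions against BaseTokenStreamTestCase. A minimal sketch of that style (the method name is hypothetical; assertTokenStreamContents is the inherited helper the surviving doSplit() code below feeds into):

      public void testSplitSketch() throws Exception {
        // KeywordTokenizer + the five-flag constructor, as in doSplit() below.
        WordDelimiterFilter wdf = new WordDelimiterFilter(
            new KeywordTokenizer(new StringReader("foo-bar")), 1, 1, 0, 0, 0);
        assertTokenStreamContents(wdf, new String[] { "foo", "bar" });
      }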
@@ -232,59 +143,6 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
         new int[] { 11, 15, 15 });
   }
-
-  @Test
-  public void testAlphaNumericWords(){
-    assertU(adoc("id", "68","numericsubword","Java/J2SE"));
-    assertU(commit());
-
-    assertQ("j2se found",
-        req("numericsubword:(J2SE)")
-        ,"//result[@numFound=1]"
-    );
-    assertQ("no j2 or se",
-        req("numericsubword:(J2 OR SE)")
-        ,"//result[@numFound=0]"
-    );
-    clearIndex();
-  }
-
-  @Test
-  public void testProtectedWords(){
-    assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
-    assertU(commit());
-
-    assertQ("java found",
-        req("protectedsubword:(java)")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ(".net found",
-        req("protectedsubword:(.net)")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("c# found",
-        req("protectedsubword:(c#)")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("c++ found",
-        req("protectedsubword:(c++)")
-        ,"//result[@numFound=1]"
-    );
-
-    assertQ("c found?",
-        req("protectedsubword:c")
-        ,"//result[@numFound=0]"
-    );
-    assertQ("net found?",
-        req("protectedsubword:net")
-        ,"//result[@numFound=0]"
-    );
-    clearIndex();
-  }
-
 
   public void doSplit(final String input, String... output) throws Exception {
     WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
         new StringReader(input)), 1, 1, 0, 0, 0);
@@ -368,13 +226,13 @@ public class TestWordDelimiterFilter extends SolrTestCaseJ4 {
 
   @Test
   public void testPositionIncrements() throws Exception {
-    final CharArraySet protWords = new CharArraySet(DEFAULT_VERSION, new HashSet<String>(Arrays.asList("NUTCH")), false);
+    final CharArraySet protWords = new CharArraySet(TEST_VERSION_CURRENT, new HashSet<String>(Arrays.asList("NUTCH")), false);
 
     /* analyzer that uses whitespace + wdf */
     Analyzer a = new Analyzer() {
       public TokenStream tokenStream(String field, Reader reader) {
         return new WordDelimiterFilter(
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader),
+            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
       }
     };
@@ -401,7 +259,7 @@
       public TokenStream tokenStream(String field, Reader reader) {
         return new WordDelimiterFilter(
             new LargePosIncTokenFilter(
-                new WhitespaceTokenizer(DEFAULT_VERSION, reader)),
+                new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader)),
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
       }
     };
@@ -431,8 +289,8 @@
 
     Analyzer a3 = new Analyzer() {
       public TokenStream tokenStream(String field, Reader reader) {
-        StopFilter filter = new StopFilter(DEFAULT_VERSION,
-            new WhitespaceTokenizer(DEFAULT_VERSION, reader), StandardAnalyzer.STOP_WORDS_SET);
+        StopFilter filter = new StopFilter(TEST_VERSION_CURRENT,
+            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET);
         filter.setEnablePositionIncrements(true);
         return new WordDelimiterFilter(filter,
             1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
WordDelimiterFilterFactory.java

@@ -18,6 +18,7 @@
 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.apache.solr.common.ResourceLoader;
TestWordDelimiterFilterFactory.java (new file)

@@ -0,0 +1,186 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
+ */
+public class TestWordDelimiterFilterFactory extends SolrTestCaseJ4 {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    initCore("solrconfig.xml","schema.xml");
+  }
+
+  public void posTst(String v1, String v2, String s1, String s2) {
+    assertU(adoc("id", "42",
+        "subword", v1,
+        "subword", v2));
+    assertU(commit());
+
+    // there is a positionIncrementGap of 100 between field values, so
+    // we test if that was maintained.
+    assertQ("position increment lost",
+        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~90")
+        ,"//result[@numFound=0]"
+    );
+    assertQ("position increment lost",
+        req("+id:42 +subword:\"" + s1 + ' ' + s2 + "\"~110")
+        ,"//result[@numFound=1]"
+    );
+    clearIndex();
+  }
+
+  @Test
+  public void testRetainPositionIncrement() {
+    posTst("foo","bar","foo","bar");
+    posTst("-foo-","-bar-","foo","bar");
+    posTst("foo","bar","-foo-","-bar-");
+
+    posTst("123","456","123","456");
+    posTst("/123/","/456/","123","456");
+
+    posTst("/123/abc","qwe/456/","abc","qwe");
+
+    posTst("zoo-foo","bar-baz","foo","bar");
+    posTst("zoo-foo-123","456-bar-baz","foo","bar");
+  }
+
+  @Test
+  public void testNoGenerationEdgeCase() {
+    assertU(adoc("id", "222", "numberpartfail", "123.123.123.123"));
+    clearIndex();
+  }
+
+  @Test
+  public void testIgnoreCaseChange() {
+
+    assertU(adoc("id", "43",
+        "wdf_nocase", "HellO WilliAM",
+        "subword", "GoodBye JonEs"));
+    assertU(commit());
+
+    assertQ("no case change",
+        req("wdf_nocase:(hell o am)")
+        ,"//result[@numFound=0]"
+    );
+    assertQ("case change",
+        req("subword:(good jon)")
+        ,"//result[@numFound=1]"
+    );
+    clearIndex();
+  }
+
+  @Test
+  public void testPreserveOrignalTrue() {
+
+    assertU(adoc("id", "144",
+        "wdf_preserve", "404-123"));
+    assertU(commit());
+
+    assertQ("preserving original word",
+        req("wdf_preserve:404")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("preserving original word",
+        req("wdf_preserve:123")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("preserving original word",
+        req("wdf_preserve:404-123*")
+        ,"//result[@numFound=1]"
+    );
+    clearIndex();
+  }
+
+  /***
+  public void testPerformance() throws IOException {
+    String s = "now is the time-for all good men to come to-the aid of their country.";
+    Token tok = new Token();
+    long start = System.currentTimeMillis();
+    int ret=0;
+    for (int i=0; i<1000000; i++) {
+      StringReader r = new StringReader(s);
+      TokenStream ts = new WhitespaceTokenizer(r);
+      ts = new WordDelimiterFilter(ts, 1,1,1,1,0);
+
+      while (ts.next(tok) != null) ret++;
+    }
+
+    System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
+  }
+  ***/
+
+  @Test
+  public void testAlphaNumericWords(){
+    assertU(adoc("id", "68","numericsubword","Java/J2SE"));
+    assertU(commit());
+
+    assertQ("j2se found",
+        req("numericsubword:(J2SE)")
+        ,"//result[@numFound=1]"
+    );
+    assertQ("no j2 or se",
+        req("numericsubword:(J2 OR SE)")
+        ,"//result[@numFound=0]"
+    );
+    clearIndex();
+  }
+
+  @Test
+  public void testProtectedWords(){
+    assertU(adoc("id", "70","protectedsubword","c# c++ .net Java/J2SE"));
+    assertU(commit());
+
+    assertQ("java found",
+        req("protectedsubword:(java)")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ(".net found",
+        req("protectedsubword:(.net)")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("c# found",
+        req("protectedsubword:(c#)")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("c++ found",
+        req("protectedsubword:(c++)")
+        ,"//result[@numFound=1]"
+    );
+
+    assertQ("c found?",
+        req("protectedsubword:c")
+        ,"//result[@numFound=0]"
+    );
+    assertQ("net found?",
+        req("protectedsubword:net")
+        ,"//result[@numFound=0]"
+    );
+    clearIndex();
+  }
+}