LUCENE-4044: port over synfilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene2510@1364907 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-07-24 06:18:49 +00:00
parent dfe9a8444a
commit 5249e46aee
5 changed files with 43 additions and 43 deletions

View File

@ -1,4 +1,4 @@
package org.apache.solr.analysis; package org.apache.lucene.analysis.synonym;
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -38,9 +38,6 @@ import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser; import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
import org.apache.lucene.analysis.util.*; import org.apache.lucene.analysis.util.*;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
import org.apache.solr.common.util.StrUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** /**
* Factory for {@link SynonymFilter}. * Factory for {@link SynonymFilter}.
@ -55,9 +52,6 @@ import org.slf4j.LoggerFactory;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
*/ */
public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
public static final Logger log = LoggerFactory.getLogger(SynonymFilterFactory.class);
private SynonymMap map; private SynonymMap map;
private boolean ignoreCase; private boolean ignoreCase;
@ -100,10 +94,6 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
} catch (Exception e) { } catch (Exception e) {
throw new InitializationException("Exception thrown while loading synonyms", e); throw new InitializationException("Exception thrown while loading synonyms", e);
} }
if (map.fst == null) {
log.warn("Synonyms loaded with " + args + " has empty rule set!");
}
} }
/** /**
@ -125,7 +115,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
decoder.reset(); decoder.reset();
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
} else { } else {
List<String> files = StrUtils.splitFileNames(synonyms); List<String> files = splitFileNames(synonyms);
for (String file : files) { for (String file : files) {
decoder.reset(); decoder.reset();
parser.add(new InputStreamReader(loader.openResource(file), decoder)); parser.add(new InputStreamReader(loader.openResource(file), decoder));
@ -153,7 +143,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
decoder.reset(); decoder.reset();
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder)); parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
} else { } else {
List<String> files = StrUtils.splitFileNames(synonyms); List<String> files = splitFileNames(synonyms);
for (String file : files) { for (String file : files) {
decoder.reset(); decoder.reset();
parser.add(new InputStreamReader(loader.openResource(file), decoder)); parser.add(new InputStreamReader(loader.openResource(file), decoder));
@ -162,6 +152,8 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
return parser.build(); return parser.build();
} }
// nocommit: support the legacy solr.xxx and org.apache.solr.analysis.xxx class names via an SPI delegator
// (this functionality currently has no tests)
private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname){ private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname){
TokenizerFactory tokFactory = loader.newInstance(cname, TokenizerFactory.class); TokenizerFactory tokFactory = loader.newInstance(cname, TokenizerFactory.class);
tokFactory.setLuceneMatchVersion(luceneMatchVersion); tokFactory.setLuceneMatchVersion(luceneMatchVersion);

View File

@ -85,5 +85,6 @@ org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory org.apache.lucene.analysis.standard.ClassicFilterFactory
org.apache.lucene.analysis.standard.StandardFilterFactory org.apache.lucene.analysis.standard.StandardFilterFactory
org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
org.apache.lucene.analysis.synonym.SynonymFilterFactory
org.apache.lucene.analysis.th.ThaiWordFilterFactory org.apache.lucene.analysis.th.ThaiWordFilterFactory
org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory

View File

@ -1,4 +1,4 @@
package org.apache.solr.analysis; package org.apache.lucene.analysis.synonym;
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,21 +17,16 @@ package org.apache.solr.analysis;
* limitations under the License. * limitations under the License.
*/ */
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader; import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.util.ResourceLoader; import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader;
import org.apache.solr.core.SolrResourceLoader; import org.apache.lucene.analysis.util.StringMockResourceLoader;
public class TestSynonymFilterFactory extends BaseTokenStreamTestCase { public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
/** test that we can parse and use the solr syn file */ /** test that we can parse and use the solr syn file */
@ -41,7 +36,7 @@ public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
args.put("synonyms", "synonyms.txt"); args.put("synonyms", "synonyms.txt");
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
factory.init(args); factory.init(args);
factory.inform(new SolrResourceLoader("solr/collection1")); factory.inform(new ResourceAsStreamResourceLoader(getClass()));
TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false)); TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
assertTrue(ts instanceof SynonymFilter); assertTrue(ts instanceof SynonymFilter);
assertTokenStreamContents(ts, assertTokenStreamContents(ts,
@ -56,28 +51,8 @@ public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
args.put("synonyms", "synonyms.txt"); args.put("synonyms", "synonyms.txt");
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT); factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
factory.init(args); factory.init(args);
factory.inform(new StringMockSolrResourceLoader("")); // empty file! factory.inform(new StringMockResourceLoader("")); // empty file!
TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false)); TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
assertTokenStreamContents(ts, new String[] { "GB" }); assertTokenStreamContents(ts, new String[] { "GB" });
} }
private class StringMockSolrResourceLoader implements ResourceLoader {
String text;
StringMockSolrResourceLoader(String text) {
this.text = text;
}
public List<String> getLines(String resource) throws IOException {
return Arrays.asList(text.split("\n"));
}
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
return null;
}
public InputStream openResource(String resource) throws IOException {
return new ByteArrayInputStream(text.getBytes("UTF-8"));
}
}
} }

View File

@ -0,0 +1,31 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
# Some test synonym mappings unlikely to appear in real input text
aaa => aaaa
bbb => bbbb1 bbbb2
ccc => cccc1,cccc2
a\=>a => b\=>b
a\,a => b\,b
fooaaa,baraaa,bazaaa
# Some synonym groups specific to this example
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
# Note that we use "gib" instead of "GiB" so that any WordDelimiterFilter coming
# after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
pixima => pixma

View File

@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ResourceLoader; import org.apache.lucene.analysis.util.ResourceLoader;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;