mirror of
https://github.com/apache/lucene.git
synced 2025-02-23 02:35:02 +00:00
LUCENE-4044: port over synfilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene2510@1364907 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dfe9a8444a
commit
5249e46aee
@ -1,4 +1,4 @@
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
@ -38,9 +38,6 @@ import org.apache.lucene.analysis.synonym.SolrSynonymParser;
|
||||
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
|
||||
import org.apache.lucene.analysis.util.*;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link SynonymFilter}.
|
||||
@ -55,9 +52,6 @@ import org.slf4j.LoggerFactory;
|
||||
* </fieldType></pre>
|
||||
*/
|
||||
public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
|
||||
public static final Logger log = LoggerFactory.getLogger(SynonymFilterFactory.class);
|
||||
|
||||
private SynonymMap map;
|
||||
private boolean ignoreCase;
|
||||
|
||||
@ -100,10 +94,6 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
|
||||
} catch (Exception e) {
|
||||
throw new InitializationException("Exception thrown while loading synonyms", e);
|
||||
}
|
||||
|
||||
if (map.fst == null) {
|
||||
log.warn("Synonyms loaded with " + args + " has empty rule set!");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -125,7 +115,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
|
||||
decoder.reset();
|
||||
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
|
||||
} else {
|
||||
List<String> files = StrUtils.splitFileNames(synonyms);
|
||||
List<String> files = splitFileNames(synonyms);
|
||||
for (String file : files) {
|
||||
decoder.reset();
|
||||
parser.add(new InputStreamReader(loader.openResource(file), decoder));
|
||||
@ -153,7 +143,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
|
||||
decoder.reset();
|
||||
parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
|
||||
} else {
|
||||
List<String> files = StrUtils.splitFileNames(synonyms);
|
||||
List<String> files = splitFileNames(synonyms);
|
||||
for (String file : files) {
|
||||
decoder.reset();
|
||||
parser.add(new InputStreamReader(loader.openResource(file), decoder));
|
||||
@ -162,6 +152,8 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
|
||||
return parser.build();
|
||||
}
|
||||
|
||||
// nocommit: spi-hack solr.xxx and o.a.solr.analysis.xxx via a delegator
|
||||
// (there are no tests for this functionality)
|
||||
private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname){
|
||||
TokenizerFactory tokFactory = loader.newInstance(cname, TokenizerFactory.class);
|
||||
tokFactory.setLuceneMatchVersion(luceneMatchVersion);
|
@ -85,5 +85,6 @@ org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
|
||||
org.apache.lucene.analysis.standard.ClassicFilterFactory
|
||||
org.apache.lucene.analysis.standard.StandardFilterFactory
|
||||
org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
|
||||
org.apache.lucene.analysis.synonym.SynonymFilterFactory
|
||||
org.apache.lucene.analysis.th.ThaiWordFilterFactory
|
||||
org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
|
||||
|
@ -1,4 +1,4 @@
|
||||
package org.apache.solr.analysis;
|
||||
package org.apache.lucene.analysis.synonym;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
@ -17,21 +17,16 @@ package org.apache.solr.analysis;
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.synonym.SynonymFilter;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader;
|
||||
import org.apache.lucene.analysis.util.StringMockResourceLoader;
|
||||
|
||||
public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
|
||||
/** test that we can parse and use the solr syn file */
|
||||
@ -41,7 +36,7 @@ public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
|
||||
args.put("synonyms", "synonyms.txt");
|
||||
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
|
||||
factory.init(args);
|
||||
factory.inform(new SolrResourceLoader("solr/collection1"));
|
||||
factory.inform(new ResourceAsStreamResourceLoader(getClass()));
|
||||
TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
|
||||
assertTrue(ts instanceof SynonymFilter);
|
||||
assertTokenStreamContents(ts,
|
||||
@ -56,28 +51,8 @@ public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
|
||||
args.put("synonyms", "synonyms.txt");
|
||||
factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader("")); // empty file!
|
||||
factory.inform(new StringMockResourceLoader("")); // empty file!
|
||||
TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
|
||||
assertTokenStreamContents(ts, new String[] { "GB" });
|
||||
}
|
||||
|
||||
private class StringMockSolrResourceLoader implements ResourceLoader {
|
||||
String text;
|
||||
|
||||
StringMockSolrResourceLoader(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
return Arrays.asList(text.split("\n"));
|
||||
}
|
||||
|
||||
public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public InputStream openResource(String resource) throws IOException {
|
||||
return new ByteArrayInputStream(text.getBytes("UTF-8"));
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,31 @@
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
#some test synonym mappings unlikely to appear in real input text
|
||||
aaa => aaaa
|
||||
bbb => bbbb1 bbbb2
|
||||
ccc => cccc1,cccc2
|
||||
a\=>a => b\=>b
|
||||
a\,a => b\,b
|
||||
fooaaa,baraaa,bazaaa
|
||||
|
||||
# Some synonym groups specific to this example
|
||||
GB,gib,gigabyte,gigabytes
|
||||
MB,mib,megabyte,megabytes
|
||||
Television, Televisions, TV, TVs
|
||||
#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
|
||||
#after us won't split it into two words.
|
||||
|
||||
# Synonym mappings can be used for spelling correction too
|
||||
pixima => pixma
|
||||
|
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
Loading…
x
Reference in New Issue
Block a user