LUCENE-4044: port SynonymFilterFactory (and its test) over from Solr to the Lucene analysis module

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene2510@1364907 13f79535-47bb-0310-9956-ffa450edef68
2025-02-23 02:35:02 +00:00 · 2012-07-24 06:18:49 +00:00 · 2012-07-24 06:18:49 +00:00 · 5249e46aee
commit 5249e46aee
parent dfe9a8444a
5 changed files with 43 additions and 43 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.synonym;

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -38,9 +38,6 @@ import org.apache.lucene.analysis.synonym.SolrSynonymParser;
 import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
 import org.apache.lucene.analysis.util.*;
 import org.apache.lucene.util.Version;
-import org.apache.solr.common.util.StrUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 /**
 * Factory for {@link SynonymFilter}.
@ -55,9 +52,6 @@ import org.slf4j.LoggerFactory;
 * &lt;/fieldType&gt;</pre>
 */
 public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
-
-  public static final Logger log = LoggerFactory.getLogger(SynonymFilterFactory.class);
-
  private SynonymMap map;
  private boolean ignoreCase;
  
@ -100,10 +94,6 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
    } catch (Exception e) {
      throw new InitializationException("Exception thrown while loading synonyms", e);
    }
-    
-    if (map.fst == null) {
-      log.warn("Synonyms loaded with " + args + " has empty rule set!");
-    }
  }
  
  /**
@ -125,7 +115,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
      decoder.reset();
      parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
-      List<String> files = StrUtils.splitFileNames(synonyms);
+      List<String> files = splitFileNames(synonyms);
      for (String file : files) {
        decoder.reset();
        parser.add(new InputStreamReader(loader.openResource(file), decoder));
@ -153,7 +143,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
      decoder.reset();
      parser.add(new InputStreamReader(loader.openResource(synonyms), decoder));
    } else {
-      List<String> files = StrUtils.splitFileNames(synonyms);
+      List<String> files = splitFileNames(synonyms);
      for (String file : files) {
        decoder.reset();
        parser.add(new InputStreamReader(loader.openResource(file), decoder));
@ -162,6 +152,8 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
    return parser.build();
  }
  
+  // nocommit: hack SPI lookup so that solr.xxx and org.apache.solr.analysis.xxx
+  // class names resolve via a delegator (there are no tests for this functionality yet)
  private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname){
    TokenizerFactory tokFactory = loader.newInstance(cname, TokenizerFactory.class);
    tokFactory.setLuceneMatchVersion(luceneMatchVersion);
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -85,5 +85,6 @@ org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
 org.apache.lucene.analysis.standard.ClassicFilterFactory
 org.apache.lucene.analysis.standard.StandardFilterFactory
 org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
+org.apache.lucene.analysis.synonym.SynonymFilterFactory
 org.apache.lucene.analysis.th.ThaiWordFilterFactory
 org.apache.lucene.analysis.tr.TurkishLowerCaseFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymFilterFactory.java
@ -1,4 +1,4 @@
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.synonym;

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,21 +17,16 @@ package org.apache.solr.analysis;
 * limitations under the License.
 */

-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
 import java.io.StringReader;
-import java.util.Arrays;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.solr.core.SolrResourceLoader;
+import org.apache.lucene.analysis.util.ResourceAsStreamResourceLoader;
+import org.apache.lucene.analysis.util.StringMockResourceLoader;

 public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
  /** test that we can parse and use the solr syn file */
@ -41,7 +36,7 @@ public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
    args.put("synonyms", "synonyms.txt");
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    factory.init(args);
-    factory.inform(new SolrResourceLoader("solr/collection1"));
+    factory.inform(new ResourceAsStreamResourceLoader(getClass()));
    TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
    assertTrue(ts instanceof SynonymFilter);
    assertTokenStreamContents(ts, 
@ -56,28 +51,8 @@ public class TestSynonymFilterFactory extends BaseTokenStreamTestCase {
    args.put("synonyms", "synonyms.txt");
    factory.setLuceneMatchVersion(TEST_VERSION_CURRENT);
    factory.init(args);
-    factory.inform(new StringMockSolrResourceLoader("")); // empty file!
+    factory.inform(new StringMockResourceLoader("")); // empty file!
    TokenStream ts = factory.create(new MockTokenizer(new StringReader("GB"), MockTokenizer.WHITESPACE, false));
    assertTokenStreamContents(ts, new String[] { "GB" });
  }
-  
-  private class StringMockSolrResourceLoader implements ResourceLoader {
-    String text;
-
-    StringMockSolrResourceLoader(String text) {
-      this.text = text;
-    }
-
-    public List<String> getLines(String resource) throws IOException {
-      return Arrays.asList(text.split("\n"));
-    }
-
-    public <T> T newInstance(String cname, Class<T> expectedType, String... subpackages) {
-      return null;
-    }
-
-    public InputStream openResource(String resource) throws IOException {
-      return new ByteArrayInputStream(text.getBytes("UTF-8"));
-    }
-  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/synonyms.txt
@ -0,0 +1,31 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-----------------------------------------------------------------------
+# Some test synonym mappings unlikely to appear in real input text
+aaa => aaaa
+bbb => bbbb1 bbbb2
+ccc => cccc1,cccc2
+a\=>a => b\=>b
+a\,a => b\,b
+fooaaa,baraaa,bazaaa
+
+# Some synonym groups specific to this example
+GB,gib,gigabyte,gigabytes
+MB,mib,megabyte,megabytes
+Television, Televisions, TV, TVs
+# Notice we use "gib" instead of "GiB" so that any WordDelimiterFilter coming
+# after us won't split it into two words.
+
+# Synonym mappings can be used for spelling correction too
+pixima => pixma
+
--- a/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java
+++ b/solr/core/src/test/org/apache/solr/analysis/TestMultiWordSynonyms.java
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
 import org.apache.lucene.analysis.util.ResourceLoader;

 import java.io.ByteArrayInputStream;