SOLR-3359: add analyzer attribute/property to SynonymFilterFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1504037 13f79535-47bb-0310-9956-ffa450edef68
Koji Sekiguchi 2013-07-17 07:50:32 +00:00
parent fefa4517c9
commit ec803e133e
4 changed files with 74 additions and 8 deletions
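
For context, a minimal sketch (not part of the commit) of how the new "analyzer" argument could be used when constructing the factory programmatically. The class name, the "synonyms.txt" path, and the resource loader choice are illustrative assumptions:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

public class AnalyzerArgSketch {
  public static void main(String[] args) throws Exception {
    Map<String, String> params = new HashMap<String, String>();
    params.put("synonyms", "synonyms.txt");              // hypothetical synonym file on the classpath
    params.put("analyzer", CJKAnalyzer.class.getName()); // new in SOLR-3359: parse the rules with CJKAnalyzer
    // params.put("tokenizerFactory", "...");            // specifying both now throws IllegalArgumentException

    SynonymFilterFactory factory = new SynonymFilterFactory(params);
    // The synonym map is built when the factory is handed its ResourceLoader.
    factory.inform(new ClasspathResourceLoader(AnalyzerArgSketch.class));
  }
}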

@@ -57,6 +57,9 @@ New features
* LUCENE-5098: New broadword utility methods in oal.util.BroadWord.
  (Paul Elschot via Adrien Grand, Dawid Weiss)

* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
  (Ryo Onodera via Koji Sekiguchi)

API Changes

* LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.

@@ -68,6 +68,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware
  private final String synonyms;
  private final String format;
  private final boolean expand;
  private final String analyzerName;
  private final Map<String, String> tokArgs = new HashMap<String, String>();

  private SynonymMap map;
@@ -79,7 +80,13 @@ public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware
    format = get(args, "format");
    expand = getBoolean(args, "expand", true);
    analyzerName = get(args, "analyzer");
    tokenizerFactory = get(args, "tokenizerFactory");

    if (analyzerName != null && tokenizerFactory != null) {
      throw new IllegalArgumentException("Analyzer and TokenizerFactory can't both be specified: " +
          analyzerName + " and " + tokenizerFactory);
    }

    if (tokenizerFactory != null) {
      assureMatchVersion();
      tokArgs.put("luceneMatchVersion", getLuceneMatchVersion().toString());
@@ -104,15 +111,20 @@ public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware
  @Override
  public void inform(ResourceLoader loader) throws IOException {
    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);

    Analyzer analyzer;
    if (analyzerName != null) {
      analyzer = loadAnalyzer(loader, analyzerName);
    } else {
      analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_50, reader) : factory.create(reader);
          TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_50, tokenizer) : tokenizer;
          return new TokenStreamComponents(tokenizer, stream);
        }
      };
    }

    try {
      if (format == null || format.equals("solr")) {
@@ -188,4 +200,17 @@ public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware
      throw new RuntimeException(e);
    }
  }

  private Analyzer loadAnalyzer(ResourceLoader loader, String cname) throws IOException {
    Class<? extends Analyzer> clazz = loader.findClass(cname, Analyzer.class);
    try {
      Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_50);
      if (analyzer instanceof ResourceLoaderAware) {
        ((ResourceLoaderAware) analyzer).inform(loader);
      }
      return analyzer;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
}
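
The reflective call in loadAnalyzer (clazz.getConstructor(Version.class)) implies a contract for any class referenced through the new attribute: it must be a public Analyzer subclass with a public constructor taking a Version. A minimal sketch of such an analyzer follows; the class itself is hypothetical and not part of the commit:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.Version;

public class MyRuleParsingAnalyzer extends Analyzer {
  private final Version matchVersion;

  // Required by loadAnalyzer's getConstructor(Version.class).newInstance(...)
  public MyRuleParsingAnalyzer(Version matchVersion) {
    this.matchVersion = matchVersion;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    // Arbitrary choice for the sketch: tokenize synonym rules on whitespace.
    Tokenizer tokenizer = new WhitespaceTokenizer(matchVersion, reader);
    return new TokenStreamComponents(tokenizer);
  }
}

If the referenced analyzer also implements ResourceLoaderAware, inform(loader) is called on it, as the code above shows.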

@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.StringMockResourceLoader;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
  /** test that we can parse and use the solr syn file */
@@ -64,6 +65,28 @@ public class TestSynonymFilterFactory extends BaseTokenStreamFactoryTestCase {
}
}
  /** test the analyzer attribute, and that analyzer and tokenizerFactory may not both be specified */
  public void testAnalyzer() throws Exception {
    final String analyzer = CJKAnalyzer.class.getName();
    final String tokenizerFactory = PatternTokenizerFactory.class.getName();

    TokenFilterFactory factory = tokenFilterFactory("Synonym",
        "synonyms", "synonyms2.txt",
        "analyzer", analyzer);
    assertNotNull(factory);

    try {
      tokenFilterFactory("Synonym",
          "synonyms", "synonyms.txt",
          "analyzer", analyzer,
          "tokenizerFactory", tokenizerFactory);
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Analyzer and TokenizerFactory can't both be specified"));
    }
  }

  static final String TOK_SYN_ARG_VAL = "argument";
  static final String TOK_FOO_ARG_VAL = "foofoofoo";
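
For completeness, a sketch of how an already-informed factory is typically consumed: its create(TokenStream) wraps a chain with a SynonymFilter backed by the map that was parsed using the configured analyzer. The wrapper class and the whitespace tokenizer below are illustrative assumptions, not part of the commit:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.util.Version;

public class SynonymChainSketch {
  /** Wraps an already-informed SynonymFilterFactory into a simple field Analyzer. */
  public static Analyzer wrap(final SynonymFilterFactory factory) {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_50, reader);
        // factory.create(...) appends the SynonymFilter built from the parsed map.
        return new TokenStreamComponents(tokenizer, factory.create(tokenizer));
      }
    };
  }
}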

@@ -0,0 +1,15 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
蛙 => カエル