SOLR-1984: add HyphenationCompoundWordTokenFilterFactory

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@962555 13f79535-47bb-0310-9956-ffa450edef68
2010-07-09 15:20:51 +00:00 · 2010-07-09 15:20:51 +00:00 · c5bc95a357
parent 70d7eb1766
commit c5bc95a357
6 changed files with 1479 additions and 0 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -196,6 +196,8 @@ New Features
    will now return warnings about (gsingers)

 * SOLR-1985: FastVectorHighlighter: add wrapper class for Lucene's SingleFragListBuilder (koji)
+
+* SOLR-1984: Add HyphenationCompoundWordTokenFilterFactory. (PB via rmuir)
   
 Optimizations
 ----------------------
--- a/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java
@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
+import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
+import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.solr.analysis.BaseTokenFilterFactory;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+import java.util.Map;
+
+/**
+ * Factory for {@link HyphenationCompoundWordTokenFilter}
+ * <p>
+ * This factory accepts the following parameters:
+ * <ul>
+ *  <li><code>hyphenator</code> (mandatory): path to the FOP xml hyphenation pattern. 
+ *  See <a href="http://offo.sourceforge.net/hyphenation/">http://offo.sourceforge.net/hyphenation/</a>.
+ *  <li><code>encoding</code> (optional): encoding of the xml hyphenation file. defaults to UTF-8.
+ *  <li><code>dictionary</code> (optional): dictionary of words. defaults to no dictionary.
+ *  <li><code>minWordSize</code> (optional): minimal word length that gets decomposed. defaults to 5.
+ *  <li><code>minSubwordSize</code> (optional): minimum length of subwords. defaults to 2.
+ *  <li><code>maxSubwordSize</code> (optional): maximum length of subwords. defaults to 15.
+ *  <li><code>onlyLongestMatch</code> (optional): if true, adds only the longest matching subword 
+ *    to the stream. defaults to false.
+ * </ul>
+ * <p>
+ * @see HyphenationCompoundWordTokenFilter
+ */
+public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+  /** Word set loaded from {@link #dictFile}; stays null when no dictionary is configured. */
+  private CharArraySet dictionary;
+  /** Hyphenation tree built from {@link #hypFile} by {@link #inform(ResourceLoader)}. */
+  private HyphenationTree hyphenator;
+  /** Value of the optional <code>dictionary</code> parameter, or null. */
+  private String dictFile;
+  /** Value of the mandatory <code>hyphenator</code> parameter. */
+  private String hypFile;
+  /** Charset name used to decode the hyphenation file. */
+  private String encoding = "UTF-8"; // default to UTF-8 encoding
+  private int minWordSize;
+  private int minSubwordSize;
+  private int maxSubwordSize;
+  private boolean onlyLongestMatch;
+  
+  /**
+   * Reads the factory configuration from the supplied arguments.
+   * No resources are opened here; that happens in {@link #inform(ResourceLoader)}.
+   *
+   * @param args factory parameters as listed in the class documentation
+   * @throws SolrException if the mandatory <code>hyphenator</code> parameter is missing
+   */
+  public void init(Map<String, String> args) {
+    super.init(args);
+    assureMatchVersion();
+    dictFile = args.get("dictionary");
+    if (args.containsKey("encoding")) {
+      encoding = args.get("encoding");
+    }
+    hypFile = args.get("hyphenator");
+    if (null == hypFile) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+          "Missing required parameter: hyphenator");
+    }
+
+    minWordSize = getInt("minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE);
+    minSubwordSize = getInt("minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE);
+    maxSubwordSize = getInt("maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE);
+    onlyLongestMatch = getBoolean("onlyLongestMatch", false);
+  }
+  
+  /**
+   * Loads the optional dictionary and the hyphenation tree through the given loader.
+   *
+   * @param loader resource loader used to resolve the configured file names
+   * @throws RuntimeException wrapping any exception raised while loading either resource
+   */
+  public void inform(ResourceLoader loader) {
+    InputStream hyph = null;
+    Reader reader = null;
+    try {
+      if (dictFile != null) { // the dictionary is optional
+        dictionary = getWordSet(loader, dictFile, false);
+      }
+      
+      hyph = loader.openResource(hypFile);
+      reader = new InputStreamReader(hyph, encoding);
+      hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(reader);
+    } catch (Exception e) { // TODO: getHyphenationTree really shouldn't throw "Exception"
+      throw new RuntimeException(e);
+    } finally {
+      // Close the raw stream too: when the InputStreamReader constructor fails
+      // (e.g. unsupported encoding) reader is still null, and the already
+      // opened stream would otherwise never be closed.
+      IOUtils.closeQuietly(reader);
+      IOUtils.closeQuietly(hyph);
+    }
+  }
+  
+  /**
+   * Wraps the given stream in a {@link HyphenationCompoundWordTokenFilter}
+   * configured with the settings read in {@link #init(Map)} and the resources
+   * loaded in {@link #inform(ResourceLoader)}.
+   *
+   * @param input token stream to decompose
+   * @return the compound word filter reading from <code>input</code>
+   */
+  public HyphenationCompoundWordTokenFilter create(TokenStream input) {
+    return new HyphenationCompoundWordTokenFilter(luceneMatchVersion, input, hyphenator, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  }
+}
--- a/solr/src/test/org/apache/solr/analysis/TestHyphenationCompoundWordTokenFilterFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestHyphenationCompoundWordTokenFilterFactory.java
@ -0,0 +1,78 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.core.SolrResourceLoader;
+
+/**
+ * Simple tests to ensure the Hyphenation compound filter factory is working.
+ */
+public class TestHyphenationCompoundWordTokenFilterFactory extends BaseTokenTestCase {
+  /**
+   * Ensure the factory works with hyphenation grammar+dictionary: using default options.
+   */
+  public void testHyphenationWithDictionary() throws Exception {
+    Reader reader = new StringReader("min veninde som er lidt af en læsehest");
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory();
+    ResourceLoader loader = new SolrResourceLoader(null, null);
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
+    args.put("hyphenator", "da_UTF8.xml");
+    args.put("dictionary", "da_compoundDictionary.txt");
+    factory.init(args);
+    factory.inform(loader);
+    TokenStream stream = factory.create(tokenizer);
+    
+    assertTokenStreamContents(stream, 
+        new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
+    );
+  }
+
+  /**
+   * Ensure the factory works with no dictionary: using hyphenation grammar only.
+   * Also change the min/max subword sizes from the default. When using no dictionary,
+   * its generally necessary to tweak these, or you get lots of expansions.
+   */
+  public void testHyphenationOnly() throws Exception {
+    Reader reader = new StringReader("basketballkurv");
+    Tokenizer tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, reader);
+    HyphenationCompoundWordTokenFilterFactory factory = new HyphenationCompoundWordTokenFilterFactory();
+    ResourceLoader loader = new SolrResourceLoader(null, null);
+    Map<String,String> args = new HashMap<String,String>(DEFAULT_VERSION_PARAM);
+    args.put("hyphenator", "da_UTF8.xml");
+    args.put("minSubwordSize", "2");
+    args.put("maxSubwordSize", "4");
+    factory.init(args);
+    factory.inform(loader);
+    TokenStream stream = factory.create(tokenizer);
+    
+    assertTokenStreamContents(stream,
+        new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
+    );
+  }
+}
--- a/solr/src/test/test-files/solr/conf/da_UTF8.xml
+++ b/solr/src/test/test-files/solr/conf/da_UTF8.xml
--- a/solr/src/test/test-files/solr/conf/da_compoundDictionary.txt
+++ b/solr/src/test/test-files/solr/conf/da_compoundDictionary.txt
@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# A set of words for testing the HyphenationCompound factory,
+# in conjunction with the danish hyphenation grammar.
+læse
+hest
--- a/solr/src/test/test-files/solr/conf/hyphenation.dtd
+++ b/solr/src/test/test-files/solr/conf/hyphenation.dtd
@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+  Copyright 1999-2004 The Apache Software Foundation
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
+
+<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
+                           classes, exceptions?, patterns)>
+
+<!-- Hyphen character to be used in the exception list as shortcut for
+     <hyphen pre-break="-"/>. Defaults to '-'
+-->
+<!ELEMENT hyphen-char EMPTY>
+<!ATTLIST hyphen-char value CDATA #REQUIRED>
+
+<!-- Default minimum length in characters of hyphenated word fragments
+     before and after the line break. For some languages this is not
+     only for aesthetic purposes, wrong hyphens may be generated if this
+     is not accounted for.
+-->
+<!ELEMENT hyphen-min EMPTY>
+<!ATTLIST hyphen-min before CDATA #REQUIRED>
+<!ATTLIST hyphen-min after CDATA #REQUIRED>
+
+<!-- Character equivalent classes: space separated list of character groups, all
+     characters in a group are to be treated equivalent as far as
+     the hyphenation algorithm is concerned. The first character in a group
+     is the group's equivalent character. Patterns should only contain
+     first characters. It also defines word characters, i.e. a word that
+     contains characters not present in any of the classes is not hyphenated.
+-->
+<!ELEMENT classes (#PCDATA)>
+
+<!-- Hyphenation exceptions: space separated list of hyphenated words.
+     A hyphen is indicated by the hyphen tag, but you can use the
+     hyphen-char defined previously as shortcut. This is in cases
+     when the algorithm procedure finds wrong hyphens or you want
+     to provide your own hyphenation for some words.
+-->
+<!ELEMENT exceptions (#PCDATA|hyphen)* >
+
+<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
+     characters as described before, between any two word characters a digit
+     in the range 0 to 9 may be specified. The absence of a digit is equivalent
+     to zero. The '.' character is reserved to indicate beginning or ending
+     of words. -->
+<!ELEMENT patterns (#PCDATA)>
+
+<!-- A "full hyphen" equivalent to TeX's \discretionary
+     with pre-break, post-break and no-break attributes.
+     To be used in the exceptions list, the hyphen character is not
+     automatically added -->
+<!ELEMENT hyphen EMPTY>
+<!ATTLIST hyphen pre CDATA #IMPLIED>
+<!ATTLIST hyphen no CDATA #IMPLIED>
+<!ATTLIST hyphen post CDATA #IMPLIED>