mirror of https://github.com/apache/lucene.git
LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without a dictionary
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@946139 13f79535-47bb-0310-9956-ffa450edef68
parent c5ae13f39e
commit fe5f1aabcb

CHANGES.txt
@@ -160,6 +160,9 @@ New features
 * LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words
   when Version is set to 3.1 or higher. (Robert Muir)
 
+* LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without a dictionary.
+  (Thomas Peuss via Robert Muir)
+
 Build
 
 * LUCENE-2124: Moved the JDK-based collation support from contrib/collation
CompoundWordTokenFilterBase.java
@@ -154,7 +154,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     this.maxSubwordSize=maxSubwordSize;
     this.onlyLongestMatch=onlyLongestMatch;
 
-    if (dictionary instanceof CharArraySet) {
+    if (dictionary==null || dictionary instanceof CharArraySet) {
       this.dictionary = (CharArraySet) dictionary;
     } else {
       this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
@@ -181,6 +181,9 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   }
 
   public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
+    if (dictionary == null) {
+      return null;
+    }
     // is the below really case insensitive?
     CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
     addAllLowerCase(dict, Arrays.asList(dictionary));
HyphenationCompoundWordTokenFilter.java
@@ -153,6 +153,33 @@ public class HyphenationCompoundWordTokenFilter extends
     this.hyphenator = hyphenator;
   }
 
+  /**
+   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
+   * <p>
+   * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
+   * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+   * null, minWordSize, minSubwordSize, maxSubwordSize, false)}
+   */
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+      HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
+      int maxSubwordSize) {
+    this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
+        maxSubwordSize, false);
+  }
+
+  /**
+   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
+   * <p>
+   * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
+   * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+   * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}
+   */
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+      HyphenationTree hyphenator) {
+    this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
+        DEFAULT_MAX_SUBWORD_SIZE);
+  }
+
   /**
    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
    *
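Editor's note: the two convenience constructors above simply chain to the
pre-existing full constructor, passing a null dictionary and hard-wiring
onlyLongestMatch to false. A minimal sketch of the equivalent direct call
(variable names are illustrative, not from the patch):

    // What the no-dictionary convenience constructors expand to:
    // a null dictionary set means "accept every size-bounded subword".
    new HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
        (Set<?>) null,                                   // no dictionary
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);                                          // onlyLongestMatch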
@@ -305,7 +332,7 @@ public class HyphenationCompoundWordTokenFilter extends
       }
 
       // check the dictionary
-      if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+      if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
         if (this.onlyLongestMatch) {
           if (longestMatchToken != null) {
             if (longestMatchToken.termLength() < partLength) {
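Editor's note: the one-line change above is the heart of this patch. With a null
dictionary, every hyphenation-derived fragment that passes the min/max subword
bounds is emitted as a token. A hedged sketch of the new check in isolation
("accept" is a hypothetical helper written for illustration, not part of the class):

    // Hypothetical helper: a null dictionary accepts any candidate subword;
    // otherwise the candidate must be present in the dictionary.
    static boolean accept(CharArraySet dictionary, char[] lowerCaseTermBuffer,
        int start, int partLength) {
      return dictionary == null
          || dictionary.contains(lowerCaseTermBuffer, start, partLength);
    }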
package.html (org.apache.lucene.analysis.compound)
@@ -81,8 +81,9 @@ filter available:
 The {@link
 org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
 HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
-potential subwords that a worth to check against the dictionary. The
-quality of the output tokens is directly connected to the quality of the
+potential subwords that are worth checking against the dictionary. It can be used
+without a dictionary as well, but then produces a lot of "nonword" tokens.
+The quality of the output tokens is directly connected to the quality of the
 grammar file you use. For languages like German they are quite good.
 <h5>Grammar file</h5>
 Unfortunately we cannot bundle the hyphenation grammar files with Lucene
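Editor's note: as a concrete illustration of the dictionary-less usage described
above, a minimal sketch using the constructors added in this commit (the grammar
file name is a placeholder, Version.LUCENE_31 stands in for whatever matchVersion
you target, and the usual org.apache.lucene.analysis imports are assumed):

    // Decompose compound words using only a hyphenation grammar, no dictionary.
    Reader grammar = new FileReader("hyphenation.xml");  // placeholder file name
    HyphenationTree hyphenator =
        HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);

    TokenStream tf = new HyphenationCompoundWordTokenFilter(
        Version.LUCENE_31,
        new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("basketballkurv")),
        hyphenator,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        2, 4);  // tight subword bounds (min=2, max=4) reduce "nonword" noise

    CharTermAttribute term = tf.addAttribute(CharTermAttribute.class);
    while (tf.incrementToken()) {
      System.out.println(term);
    }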
@@ -157,8 +158,24 @@ This decision matrix should help you:
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
-    Token t;
-    while ((t=tf.next())!=null) {
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
       System.out.println(t);
     }
   }
+
+  public void testHyphenationCompoundWordsWithoutDictionaryDE() throws Exception {
+    Reader reader = new FileReader("de_DR.xml");
+
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        new WhitespaceTokenizer(new StringReader(
+            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator);
+
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
+      System.out.println(t);
+    }
+  }
@@ -173,8 +190,8 @@ This decision matrix should help you:
         new StringReader(
             "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
         dict);
-    Token t;
-    while ((t=tf.next())!=null) {
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
       System.out.println(t);
     }
   }
TestCompoundWordTokenFilter.java
@@ -70,6 +70,54 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
   }
 
+  /**
+   * With hyphenation-only, you can get a lot of nonsense tokens.
+   * This can be controlled with the min/max subword size.
+   */
+  public void testHyphenationOnly() throws Exception {
+    Reader reader = getHyphenationReader();
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        2, 4);
+
+    // min=2, max=4
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
+    );
+
+    tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        4, 6);
+
+    // min=4, max=6
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
+    );
+
+    tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        4, 10);
+
+    // min=4, max=10
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
+            "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
+    );
+  }
+
   public void testDumbCompoundWordsSE() throws Exception {
     String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
         "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",