LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be used with custom attributes. All those attributes are preserved and set on all added decompounded tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1188597 13f79535-47bb-0310-9956-ffa450edef68
Uwe Schindler 2011-10-25 10:44:36 +00:00
parent 51d010010c
commit ec186e7280
5 changed files with 250 additions and 195 deletions

CHANGES.txt

@@ -82,6 +82,9 @@ Changes in backwards compatibility policy
* LUCENE-3446: Removed BooleanFilter.finalResult() due to change to
FixedBitSet. (Uwe Schindler)
* LUCENE-3508: Changed some method signatures in decompounding TokenFilters
to make them no longer use the Token class. (Uwe Schindler)
New Features
* LUCENE-1824: Add BoundaryScanner interface and its implementation classes,
@@ -129,6 +132,10 @@ Bug Fixes
Java that crash on certain inputs containing supplementary characters.
(Robert Muir)
* LUCENE-3508: Decompounders based on CompoundWordTokenFilterBase can now be
used with custom attributes. All those attributes are preserved and set on all
added decompounded tokens. (Spyros Kapnissis, Uwe Schindler)
API Changes
* LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
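
To illustrate the LUCENE-3508 entry above: a minimal usage sketch (not part of this commit; the class name, version constant, and dictionary entries are made up for the example). The decompounder emits the original token first and then each matched subword with position increment 0, restoring every other attribute from the original token.

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class DecompoundExample {
  public static void main(String[] args) throws Exception {
    final Version v = Version.LUCENE_40; // assumption: trunk (4.0-dev); use your release's constant
    CharArraySet dict = new CharArraySet(v, Arrays.asList("donau", "dampf", "schiff"), true);
    TokenStream ts = new DictionaryCompoundWordTokenFilter(v,
        new WhitespaceTokenizer(v, new StringReader("Donaudampfschiff")), dict);

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);

    ts.reset();
    while (ts.incrementToken()) {
      // prints the original token (posInc=1) followed by the Donau/dampf/schiff slices (posInc=0);
      // attributes other than term, offset and posInc are restored from the original token
      System.out.println(term + " " + offset.startOffset() + "-" + offset.endOffset()
          + " posInc=" + posInc.getPositionIncrement());
    }
    ts.end();
    ts.close();
  }
}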

CompoundWordTokenFilterBase.java

@@ -24,20 +24,17 @@ import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
/**
* Base class for decomposition token filters. <a name="version"/>
* Base class for decomposition token filters.
* <p>
* You must specify the required {@link Version} compatibility when creating
* CompoundWordTokenFilterBase:
@@ -46,6 +43,13 @@ import org.apache.lucene.util.Version;
* supplementary characters in strings and char arrays provided as compound word
* dictionaries.
* </ul>
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
* it should be case-insensitive unless it contains only lowercased entries and you
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
* For optimal performance (as this filter does lots of lookups to the dictionary),
* you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
* {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
* transformed to case-insensitive!
*/
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
/**
@@ -64,37 +68,22 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
protected final CharArraySet dictionary;
protected final LinkedList<Token> tokens;
protected final LinkedList<CompoundToken> tokens;
protected final int minWordSize;
protected final int minSubwordSize;
protected final int maxSubwordSize;
protected final boolean onlyLongestMatch;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
private final Token wrapper = new Token();
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
private AttributeSource.State current;
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
@@ -102,7 +91,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(input);
this.tokens=new LinkedList<Token>();
this.tokens=new LinkedList<CompoundToken>();
this.minWordSize=minWordSize;
this.minSubwordSize=minSubwordSize;
this.maxSubwordSize=maxSubwordSize;
@@ -111,113 +100,95 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
if (dictionary==null || dictionary instanceof CharArraySet) {
this.dictionary = (CharArraySet) dictionary;
} else {
this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
addAllLowerCase(this.dictionary, dictionary);
this.dictionary = new CharArraySet(matchVersion, dictionary, true);
}
}
/**
* Create a set of words from an array
* The resulting Set does case insensitive matching
* TODO We should look for a faster dictionary lookup approach.
* @param dictionary
* @return {@link Set} of lowercased terms
*/
public static Set<?> makeDictionary(final String[] dictionary) {
return makeDictionary(Version.LUCENE_30, dictionary);
/** @deprecated Use the constructors taking {@link Set} */
@Deprecated
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
}
/** @deprecated Use the constructors taking {@link Set} */
@Deprecated
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
this(matchVersion, input,makeDictionary(matchVersion,dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
}
public static Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
/** @deprecated Use the constructors taking {@link Set} */
@Deprecated
protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
this(matchVersion, input,makeDictionary(matchVersion,dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
}
/** @deprecated Only available for backwards compatibility. */
@Deprecated
public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {
if (dictionary == null) {
return null;
}
// is the below really case insensitive?
CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
addAllLowerCase(dict, Arrays.asList(dictionary));
return dict;
}
private void setToken(final Token token) throws IOException {
clearAttributes();
termAtt.copyBuffer(token.buffer(), 0, token.length());
flagsAtt.setFlags(token.getFlags());
typeAtt.setType(token.type());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
posIncAtt.setPositionIncrement(token.getPositionIncrement());
payloadAtt.setPayload(token.getPayload());
return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);
}
@Override
public final boolean incrementToken() throws IOException {
if (tokens.size() > 0) {
setToken(tokens.removeFirst());
if (!tokens.isEmpty()) {
assert current != null;
CompoundToken token = tokens.removeFirst();
restoreState(current); // keep all other attributes untouched
termAtt.setEmpty().append(token.txt);
offsetAtt.setOffset(token.startOffset, token.endOffset);
posIncAtt.setPositionIncrement(0);
return true;
}
if (!input.incrementToken())
return false;
wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
wrapper.setStartOffset(offsetAtt.startOffset());
wrapper.setEndOffset(offsetAtt.endOffset());
wrapper.setFlags(flagsAtt.getFlags());
wrapper.setType(typeAtt.type());
wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
wrapper.setPayload(payloadAtt.getPayload());
decompose(wrapper);
if (tokens.size() > 0) {
setToken(tokens.removeFirst());
current = null; // not really needed, but for safety
if (input.incrementToken()) {
// Only words longer than minWordSize get processed
if (termAtt.length() >= this.minWordSize) {
decompose();
// only capture the state if we really need it for producing new tokens
if (!tokens.isEmpty()) {
current = captureState();
}
}
// return original token:
return true;
} else {
return false;
}
}
protected static void addAllLowerCase(CharArraySet target, Collection<?> col) {
for (Object obj : col) {
String string = (String) obj;
target.add(string.toLowerCase(Locale.ENGLISH));
}
}
protected static char[] makeLowerCaseCopy(final char[] buffer) {
char[] result=new char[buffer.length];
System.arraycopy(buffer, 0, result, 0, buffer.length);
for (int i=0;i<buffer.length;++i) {
result[i]=Character.toLowerCase(buffer[i]);
}
return result;
}
protected final Token createToken(final int offset, final int length,
final Token prototype) {
int newStart = prototype.startOffset() + offset;
Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart+length);
t.setPositionIncrement(0);
return t;
}
protected void decompose(final Token token) {
// In any case we give the original token back
tokens.add((Token) token.clone());
// Only words longer than minWordSize get processed
if (token.length() < this.minWordSize) {
return;
}
decomposeInternal(token);
}
protected abstract void decomposeInternal(final Token token);
/** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.
* The original token may not be placed in the list, as it is automatically passed through this filter.
*/
protected abstract void decompose();
@Override
public void reset() throws IOException {
super.reset();
tokens.clear();
current = null;
}
/**
* Helper class to hold decompounded token information
*/
protected class CompoundToken {
public final CharSequence txt;
public final int startOffset, endOffset;
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
public CompoundToken(int offset, int length) {
final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
// TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
// chars from the term, offsets may not match correctly (other filters producing tokens
// may also have this problem):
this.startOffset = newStart;
this.endOffset = newStart + length;
}
}
}
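
A sketch of the new subclassing contract defined by decompose() and CompoundToken above (the class, its package placement, and the fixed-slice rule are invented for illustration, not part of this commit): a subclass only fills the protected tokens list; the base class emits the original token itself and copies its remaining attribute state onto every subword.

package org.apache.lucene.analysis.compound; // same package as the base class, like the shipped subclasses

import java.util.Set;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

public final class FixedSliceDecompounder extends CompoundWordTokenFilterBase {
  private final int sliceLength;

  public FixedSliceDecompounder(Version matchVersion, TokenStream input, int sliceLength) {
    super(matchVersion, input, (Set<?>) null); // no dictionary; the cast picks the Set<?> constructor
    this.sliceLength = sliceLength;
  }

  @Override
  protected void decompose() {
    // termAtt still holds the original token; add only the subwords.
    // Tokens shorter than DEFAULT_MIN_WORD_SIZE never reach this method.
    final int len = termAtt.length();
    for (int start = 0; start + sliceLength <= len; start += sliceLength) {
      tokens.add(new CompoundToken(start, sliceLength)); // slice of termAtt, offsets derived from offsetAtt
    }
  }
}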

DictionaryCompoundWordTokenFilter.java

@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.compound;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
@@ -31,12 +30,25 @@ import org.apache.lucene.util.Version;
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
* "Donaudampfschiff" even when you only enter "schiff".
* It uses a brute-force algorithm to achieve this.
* </p>
* <p>
* You must specify the required {@link Version} compatibility when creating
* CompoundWordTokenFilterBase:
* <ul>
* <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
* supplementary characters in strings and char arrays provided as compound word
* dictionaries.
* </ul>
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
* it should be case-insensitive unless it contains only lowercased entries and you
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
* For optimal performance (as this filter does lots of lookups to the dictionary),
* you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
* {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
* transformed to case-insensitive!
*/
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
/**
* Creates a new {@link DictionaryCompoundWordTokenFilter}
*
* Creates a new {@link DictionaryCompoundWordTokenFilter}.
* @param matchVersion
* Lucene version to enable correct Unicode 4.0 behavior in the
* dictionaries if Version > 3.0. See <a
@@ -54,7 +66,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
* @deprecated Use the constructors taking {@link Set}
*/
@Deprecated
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
@@ -73,7 +87,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* the {@link TokenStream} to process
* @param dictionary
* the word dictionary to match against
* @deprecated Use the constructors taking {@link Set}
*/
@Deprecated
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
super(matchVersion, input, dictionary);
}
@@ -89,12 +105,9 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* @param input
* the {@link TokenStream} to process
* @param dictionary
* the word dictionary to match against. If this is a
* {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
* must have set ignoreCase=false and only contain lower case
* strings.
* the word dictionary to match against.
*/
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
super(matchVersion, input, dictionary);
}
@@ -109,10 +122,7 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* @param input
* the {@link TokenStream} to process
* @param dictionary
* the word dictionary to match against. If this is a
* {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
* must have set ignoreCase=false and only contain lower case
* strings.
* the word dictionary to match against.
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
@@ -122,37 +132,31 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
*/
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
}
@Override
protected void decomposeInternal(final Token token) {
// Only words longer than minWordSize get processed
if (token.length() < this.minWordSize) {
return;
}
char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
for (int i=0;i<=token.length()-this.minSubwordSize;++i) {
Token longestMatchToken=null;
protected void decompose() {
final int len = termAtt.length();
for (int i=0;i<=len-this.minSubwordSize;++i) {
CompoundToken longestMatchToken=null;
for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
if(i+j>token.length()) {
if(i+j>len) {
break;
}
if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
if(dictionary.contains(termAtt.buffer(), i, j)) {
if (this.onlyLongestMatch) {
if (longestMatchToken!=null) {
if (longestMatchToken.length()<j) {
longestMatchToken=createToken(i,j,token);
if (longestMatchToken.txt.length()<j) {
longestMatchToken=new CompoundToken(i,j);
}
} else {
longestMatchToken=createToken(i,j,token);
longestMatchToken=new CompoundToken(i,j);
}
} else {
tokens.add(createToken(i,j,token));
tokens.add(new CompoundToken(i,j));
}
}
}
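
The javadoc above recommends lowercasing before this filter so the dictionary can skip per-lookup case folding. A minimal sketch of that chain (class and method names invented; the dictionary entries are placeholders):

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class LowercasedCompoundChain {
  public static TokenStream build(Version v, String text) {
    // case-sensitive set with lowercase-only entries, as suggested for optimal performance
    CharArraySet dict = new CharArraySet(v, Arrays.asList("donau", "dampf", "schiff"), false);
    TokenStream ts = new WhitespaceTokenizer(v, new StringReader(text));
    ts = new LowerCaseFilter(v, ts); // terms reach the decompounder already lowercased
    return new DictionaryCompoundWordTokenFilter(v, ts, dict);
  }
}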

HyphenationCompoundWordTokenFilter.java

@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.compound;
import java.io.File;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
@@ -34,7 +33,21 @@ import org.xml.sax.InputSource;
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
* "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
* grammar and a word dictionary to achieve this.
* </p>
* <p>
* You must specify the required {@link Version} compatibility when creating
* CompoundWordTokenFilterBase:
* <ul>
* <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
* supplementary characters in strings and char arrays provided as compound word
* dictionaries.
* </ul>
* <p>If you pass in a {@link org.apache.lucene.analysis.util.CharArraySet} as dictionary,
* it should be case-insensitive unless it contains only lowercased entries and you
* have {@link org.apache.lucene.analysis.core.LowerCaseFilter} before this filter in your analysis chain.
* For optimal performance (as this filter does lots of lookups to the dictionary),
* you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
* {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
* transformed to case-insensitive!
*/
public class HyphenationCompoundWordTokenFilter extends
CompoundWordTokenFilterBase {
@@ -62,7 +75,9 @@ public class HyphenationCompoundWordTokenFilter extends
* only subwords shorter than this get to the output stream
* @param onlyLongestMatch
* Add only the longest matching subword to the stream
* @deprecated Use the constructors taking {@link Set}
*/
@Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, String[] dictionary, int minWordSize,
int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
@@ -86,10 +101,12 @@ public class HyphenationCompoundWordTokenFilter extends
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against
* @deprecated Use the constructors taking {@link Set}
*/
@Deprecated
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, String[] dictionary) {
this(matchVersion, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
this(matchVersion, input, hyphenator, makeDictionary(matchVersion,dictionary), DEFAULT_MIN_WORD_SIZE,
DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
}
@@ -106,10 +123,7 @@ public class HyphenationCompoundWordTokenFilter extends
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against. If this is a
* {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
* must have set ignoreCase=false and only contain lower case
* strings.
* the word dictionary to match against.
*/
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, Set<?> dictionary) {
@@ -130,10 +144,7 @@ public class HyphenationCompoundWordTokenFilter extends
* @param hyphenator
* the hyphenation pattern tree to use for hyphenation
* @param dictionary
* the word dictionary to match against. If this is a
* {@link org.apache.lucene.analysis.util.CharArraySet CharArraySet} it
* must have set ignoreCase=false and only contain lower case
* strings.
* the word dictionary to match against.
* @param minWordSize
* only words longer than this get processed
* @param minSubwordSize
@@ -218,22 +229,20 @@ public class HyphenationCompoundWordTokenFilter extends
}
@Override
protected void decomposeInternal(final Token token) {
protected void decompose() {
// get the hyphenation points
Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
.length(), 1, 1);
Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
// No hyphen points found -> exit
if (hyphens == null) {
return;
}
final int[] hyp = hyphens.getHyphenationPoints();
char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
for (int i = 0; i < hyp.length; ++i) {
int remaining = hyp.length - i;
int start = hyp[i];
Token longestMatchToken = null;
CompoundToken longestMatchToken = null;
for (int j = 1; j < remaining; j++) {
int partLength = hyp[i + j] - start;
@@ -250,34 +259,33 @@ public class HyphenationCompoundWordTokenFilter extends
}
// check the dictionary
if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
if (this.onlyLongestMatch) {
if (longestMatchToken != null) {
if (longestMatchToken.length() < partLength) {
longestMatchToken = createToken(start, partLength, token);
if (longestMatchToken.txt.length() < partLength) {
longestMatchToken = new CompoundToken(start, partLength);
}
} else {
longestMatchToken = createToken(start, partLength, token);
longestMatchToken = new CompoundToken(start, partLength);
}
} else {
tokens.add(createToken(start, partLength, token));
tokens.add(new CompoundToken(start, partLength));
}
} else if (dictionary.contains(lowerCaseTermBuffer, start,
partLength - 1)) {
} else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
// check the dictionary again with a word that is one character
// shorter
// to avoid problems with genitive 's characters and other binding
// characters
if (this.onlyLongestMatch) {
if (longestMatchToken != null) {
if (longestMatchToken.length() < partLength - 1) {
longestMatchToken = createToken(start, partLength - 1, token);
if (longestMatchToken.txt.length() < partLength - 1) {
longestMatchToken = new CompoundToken(start, partLength - 1);
}
} else {
longestMatchToken = createToken(start, partLength - 1, token);
longestMatchToken = new CompoundToken(start, partLength - 1);
}
} else {
tokens.add(createToken(start, partLength - 1, token));
tokens.add(new CompoundToken(start, partLength - 1));
}
}
}
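
A matching sketch for the hyphenation-based variant (the grammar file name, dictionary words, and class name are placeholders, not part of this commit): load a HyphenationTree from a hyphenation pattern XML, then decompound against an optional dictionary.

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.xml.sax.InputSource;

public class HyphenationChainSketch {
  public static TokenStream build(Version v, String text) throws Exception {
    // "da_UTF8.xml" stands in for whatever hyphenation grammar you actually ship
    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(new InputSource("da_UTF8.xml"));
    CharArraySet dict = new CharArraySet(v, Arrays.asList("læse", "hest"), true);
    TokenStream ts = new WhitespaceTokenizer(v, new StringReader(text));
    // with a dictionary, only hyphenation fragments found in it are emitted as subwords
    return new HyphenationCompoundWordTokenFilter(v, ts, hyphenator, dict);
  }
}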

TestCompoundWordTokenFilter.java

@@ -17,15 +17,20 @@ package org.apache.lucene.analysis.compound;
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.xml.sax.InputSource;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.xml.sax.InputSource;
public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
public void testHyphenationCompoundWordsDA() throws Exception {
@@ -166,45 +171,45 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
String[] dict = {"ab", "cd", "ef"};
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(
"abcdef")
),
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(
"abcdef")
),
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
assertTokenStreamContents(tf,
new String[] { "abcdef", "ab", "cd", "ef" },
new int[] { 0, 0, 2, 4},
new int[] { 6, 2, 4, 6},
new int[] { 1, 0, 0, 0}
);
new String[] { "abcdef", "ab", "cd", "ef" },
new int[] { 0, 0, 2, 4},
new int[] { 6, 2, 4, 6},
new int[] { 1, 0, 0, 0}
);
}
public void testWordComponentWithLessThanMinimumLength() throws Exception {
String[] dict = {"abc", "d", "efg"};
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(
"abcdefg")
),
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(
"abcdefg")
),
dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
// since "d" is shorter than the minimum subword size, it should not be added to the token stream
// since "d" is shorter than the minimum subword size, it should not be added to the token stream
assertTokenStreamContents(tf,
new String[] { "abcdefg", "abc", "efg" },
new int[] { 0, 0, 4},
new int[] { 7, 3, 7},
new int[] { 1, 0, 0}
);
new String[] { "abcdefg", "abc", "efg" },
new int[] { 0, 0, 4},
new int[] { 7, 3, 7},
new int[] { 1, 0, 0}
);
}
public void testReset() throws Exception {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung" };
@@ -228,4 +233,64 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
public void testRetainMockAttribute() throws Exception {
String[] dict = { "abc", "d", "efg" };
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("abcdefg"));
TokenStream stream = new MockRetainAttributeFilter(tokenizer);
stream = new DictionaryCompoundWordTokenFilter(
TEST_VERSION_CURRENT, stream, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
while (stream.incrementToken()) {
assertTrue("Custom attribute value was lost", retAtt.getRetain());
}
}
public static interface MockRetainAttribute extends Attribute {
void setRetain(boolean attr);
boolean getRetain();
}
public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
private boolean retain = false;
@Override
public void clear() {
retain = false;
}
public boolean getRetain() {
return retain;
}
public void setRetain(boolean retain) {
this.retain = retain;
}
@Override
public void copyTo(AttributeImpl target) {
MockRetainAttribute t = (MockRetainAttribute) target;
t.setRetain(retain);
}
}
private static class MockRetainAttributeFilter extends TokenFilter {
MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
MockRetainAttributeFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()){
retainAtt.setRetain(true);
return true;
} else {
return false;
}
}
}
}
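
For reference, the state capture/restore that the reworked incrementToken() relies on is plain AttributeSource functionality, not anything specific to the compound filters. A standalone sketch (class name invented) of how captureState()/restoreState() behave:

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;

public class CaptureRestoreSketch {
  public static void main(String[] args) {
    AttributeSource src = new AttributeSource();
    CharTermAttribute term = src.addAttribute(CharTermAttribute.class);

    term.setEmpty().append("original");
    AttributeSource.State saved = src.captureState(); // snapshot of every attribute's state

    term.setEmpty().append("changed");
    src.restoreState(saved); // all attributes revert to the snapshot

    System.out.println(term); // prints "original"
  }
}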