From 7a27cdcbc901cd5e5ebb44065e3158866156dc08 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Fri, 16 May 2008 12:22:50 +0000 Subject: [PATCH] LUCENE-1166: Added token filter for decomposing compound words git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@657027 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 1 + .../compound/CompoundWordTokenFilterBase.java | 169 +++++ .../DictionaryCompoundWordTokenFilter.java | 114 +++ .../HyphenationCompoundWordTokenFilter.java | 217 ++++++ .../compound/hyphenation/ByteVector.java | 126 ++++ .../compound/hyphenation/CharVector.java | 136 ++++ .../analysis/compound/hyphenation/Hyphen.java | 69 ++ .../compound/hyphenation/Hyphenation.java | 54 ++ .../hyphenation/HyphenationException.java | 32 + .../compound/hyphenation/HyphenationTree.java | 475 +++++++++++++ .../compound/hyphenation/PatternConsumer.java | 55 ++ .../compound/hyphenation/PatternParser.java | 518 ++++++++++++++ .../compound/hyphenation/TernaryTree.java | 663 ++++++++++++++++++ .../compound/hyphenation/hyphenation.dtd | 68 ++ .../compound/hyphenation/package.html | 10 + .../lucene/analysis/compound/package.html | 166 +++++ .../compound/TestCompoundWordTokenFilter.java | 214 ++++++ 17 files changed, 3087 insertions(+) create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationException.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/package.html create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/package.html create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java diff --git a/CHANGES.txt b/CHANGES.txt index 3023c14462c..5be526d213e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -159,6 +159,7 @@ New features 12. LUCENE-400: Added word based n-gram filter (in contrib/analyzers) called ShingleFilter and an Analyzer wrapper that wraps another Analyzer's token stream with a ShingleFilter (Sebastian Kirsch, Steve Rowe via Grant Ingersoll) +13. 
LUCENE-1166: Decomposition tokenfilter for languages like German and Swedish (Thomas Peuss via Grant Ingersoll) Optimizations diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java new file mode 100644 index 00000000000..7876977177e --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -0,0 +1,169 @@ +package org.apache.lucene.analysis.compound; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Set; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * Base class for decomposition token filters. 
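+ * <p>
+ * A minimal usage sketch (illustrative only; it assumes the
+ * {@link DictionaryCompoundWordTokenFilter} subclass mirrors the constructors
+ * declared in this base class, and that reader is any java.io.Reader):
+ * </p>
+ * <pre>
+ * TokenStream stream = new DictionaryCompoundWordTokenFilter(
+ *     new WhitespaceTokenizer(reader),
+ *     new String[] { "soft", "ball" });
+ * // the original token is kept; dictionary subwords are added after it
+ * </pre>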
+ */ +public abstract class CompoundWordTokenFilterBase extends TokenFilter { + /** + * The default for minimal word length that gets decomposed + */ + public static final int DEFAULT_MIN_WORD_SIZE = 5; + + /** + * The default for minimal length of subwords that get propagated to the output of this filter + */ + public static final int DEFAULT_MIN_SUBWORD_SIZE = 2; + + /** + * The default for maximal length of subwords that get propagated to the output of this filter + */ + public static final int DEFAULT_MAX_SUBWORD_SIZE = 15; + + protected final CharArraySet dictionary; + protected final LinkedList tokens; + protected final int minWordSize; + protected final int minSubwordSize; + protected final int maxSubwordSize; + protected final boolean onlyLongestMatch; + + protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { + this(input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch); + } + + protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) { + this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch); + } + + protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, boolean onlyLongestMatch) { + this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch); + } + + protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) { + this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false); + } + + protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary) { + this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false); + } + + protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { + super(input); + + this.tokens=new LinkedList(); + this.minWordSize=minWordSize; + this.minSubwordSize=minSubwordSize; + this.maxSubwordSize=maxSubwordSize; + this.onlyLongestMatch=onlyLongestMatch; + + if (dictionary instanceof CharArraySet) { + this.dictionary = (CharArraySet) dictionary; + } else { + this.dictionary = new CharArraySet(dictionary.size(), false); + addAllLowerCase(this.dictionary, dictionary); + } + } + + /** + * Create a set of words from an array + * The resulting Set does case insensitive matching + * TODO We should look for a faster dictionary lookup approach. 
+ * @param dictionary + * @return + */ + public static final Set makeDictionary(final String[] dictionary) { + CharArraySet dict = new CharArraySet(dictionary.length, false); + addAllLowerCase(dict, Arrays.asList(dictionary)); + return dict; + } + + public Token next() throws IOException { + if (tokens.size() > 0) { + return (Token)tokens.removeFirst(); + } + + Token token = input.next(); + if (token == null) { + return null; + } + + decompose(token); + + if (tokens.size() > 0) { + return (Token)tokens.removeFirst(); + } else { + return null; + } + } + + protected static final void addAllLowerCase(Set target, Collection col) { + Iterator iter=col.iterator(); + + while (iter.hasNext()) { + target.add(((String)iter.next()).toLowerCase()); + } + } + + protected static char[] makeLowerCaseCopy(final char[] buffer) { + char[] result=new char[buffer.length]; + System.arraycopy(buffer, 0, result, 0, buffer.length); + + for (int i=0;itoken.termLength()) { + break; + } + if(dictionary.contains(lowerCaseTermBuffer, i, j)) { + if (this.onlyLongestMatch) { + if (longestMatchToken!=null) { + if (longestMatchToken.termLength() exit + if (hyphens == null) { + return; + } + + final int[] hyp = hyphens.getHyphenationPoints(); + char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.termBuffer()); + + for (int i = 0; i < hyp.length; ++i) { + int remaining = hyp.length - i; + int start = hyp[i]; + Token longestMatchToken = null; + for (int j = 1; j < remaining; j++) { + int partLength = hyp[i + j] - start; + + // if the part is longer than maxSubwordSize we + // are done with this round + if (partLength > this.maxSubwordSize) { + break; + } + + // we only put subwords to the token stream + // that are longer than minPartSize + if (partLength < this.minSubwordSize) { + continue; + } + + // check the dictionary + if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) { + if (this.onlyLongestMatch) { + if (longestMatchToken != null) { + if (longestMatchToken.termLength() < partLength) { + longestMatchToken = createToken(start, partLength, token); + } + } else { + longestMatchToken = createToken(start, partLength, token); + } + } else { + tokens.add(createToken(start, partLength, token)); + } + } else if (dictionary.contains(lowerCaseTermBuffer, start, + partLength - 1)) { + // check the dictionary again with a word that is one character + // shorter + // to avoid problems with genitive 's characters and other binding + // characters + if (this.onlyLongestMatch) { + if (longestMatchToken != null) { + if (longestMatchToken.termLength() < partLength - 1) { + longestMatchToken = createToken(start, partLength - 1, token); + } + } else { + longestMatchToken = createToken(start, partLength - 1, token); + } + } else { + tokens.add(createToken(start, partLength - 1, token)); + } + } + } + if (this.onlyLongestMatch && longestMatchToken!=null) { + tokens.add(longestMatchToken); + } + } + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java new file mode 100644 index 00000000000..64768d435c7 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.io.Serializable; + +/** + * This class implements a simple byte vector with access to the underlying + * array. + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class ByteVector implements Serializable { + + /** + * Capacity increment size + */ + private static final int DEFAULT_BLOCK_SIZE = 2048; + + private int blockSize; + + /** + * The encapsulated array + */ + private byte[] array; + + /** + * Points to next free item + */ + private int n; + + public ByteVector() { + this(DEFAULT_BLOCK_SIZE); + } + + public ByteVector(int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = new byte[blockSize]; + n = 0; + } + + public ByteVector(byte[] a) { + blockSize = DEFAULT_BLOCK_SIZE; + array = a; + n = 0; + } + + public ByteVector(byte[] a, int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = a; + n = 0; + } + + public byte[] getArray() { + return array; + } + + /** + * return number of items in array + */ + public int length() { + return n; + } + + /** + * returns current capacity of array + */ + public int capacity() { + return array.length; + } + + public void put(int index, byte val) { + array[index] = val; + } + + public byte get(int index) { + return array[index]; + } + + /** + * This is to implement memory allocation in the array. Like malloc(). + */ + public int alloc(int size) { + int index = n; + int len = array.length; + if (n + size >= len) { + byte[] aux = new byte[len + blockSize]; + System.arraycopy(array, 0, aux, 0, len); + array = aux; + } + n += size; + return index; + } + + public void trimToSize() { + if (n < array.length) { + byte[] aux = new byte[n]; + System.arraycopy(array, 0, aux, 0, n); + array = aux; + } + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java new file mode 100644 index 00000000000..00521808b88 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.io.Serializable; + +/** + * This class implements a simple char vector with access to the underlying + * array. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class CharVector implements Cloneable, Serializable { + + /** + * Capacity increment size + */ + private static final int DEFAULT_BLOCK_SIZE = 2048; + + private int blockSize; + + /** + * The encapsulated array + */ + private char[] array; + + /** + * Points to next free item + */ + private int n; + + public CharVector() { + this(DEFAULT_BLOCK_SIZE); + } + + public CharVector(int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = new char[blockSize]; + n = 0; + } + + public CharVector(char[] a) { + blockSize = DEFAULT_BLOCK_SIZE; + array = a; + n = a.length; + } + + public CharVector(char[] a, int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = a; + n = a.length; + } + + /** + * Reset Vector but don't resize or clear elements + */ + public void clear() { + n = 0; + } + + public Object clone() { + CharVector cv = new CharVector((char[]) array.clone(), blockSize); + cv.n = this.n; + return cv; + } + + public char[] getArray() { + return array; + } + + /** + * return number of items in array + */ + public int length() { + return n; + } + + /** + * returns current capacity of array + */ + public int capacity() { + return array.length; + } + + public void put(int index, char val) { + array[index] = val; + } + + public char get(int index) { + return array[index]; + } + + public int alloc(int size) { + int index = n; + int len = array.length; + if (n + size >= len) { + char[] aux = new char[len + blockSize]; + System.arraycopy(array, 0, aux, 0, len); + array = aux; + } + n += size; + return index; + } + + public void trimToSize() { + if (n < array.length) { + char[] aux = new char[n]; + System.arraycopy(array, 0, aux, 0, n); + array = aux; + } + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java new file mode 100644 index 00000000000..65a3873afe8 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.io.Serializable; + +/** + * This class represents a hyphen. A 'full' hyphen is made of 3 parts: the + * pre-break text, post-break text and no-break. If no line-break is generated + * at this position, the no-break text is used, otherwise, pre-break and + * post-break are used. Typically, pre-break is equal to the hyphen character + * and the others are empty. However, this general scheme allows support for + * cases in some languages where words change spelling if they're split across + * lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes + * from TeX. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ + +public class Hyphen implements Serializable { + public String preBreak; + + public String noBreak; + + public String postBreak; + + Hyphen(String pre, String no, String post) { + preBreak = pre; + noBreak = no; + postBreak = post; + } + + Hyphen(String pre) { + preBreak = pre; + noBreak = null; + postBreak = null; + } + + public String toString() { + if (noBreak == null && postBreak == null && preBreak != null + && preBreak.equals("-")) { + return "-"; + } + StringBuffer res = new StringBuffer("{"); + res.append(preBreak); + res.append("}{"); + res.append(postBreak); + res.append("}{"); + res.append(noBreak); + res.append('}'); + return res.toString(); + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java new file mode 100644 index 00000000000..7a276a8a7a2 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +/** + * This class represents a hyphenated word. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. 
+ */ +public class Hyphenation { + + private int[] hyphenPoints; + + /** + * number of hyphenation points in word + */ + private int len; + + /** + * rawWord as made of alternating strings and {@link Hyphen Hyphen} instances + */ + Hyphenation(int[] points) { + hyphenPoints = points; + } + + /** + * @return the number of hyphenation points in the word + */ + public int length() { + return hyphenPoints.length; + } + + /** + * @return the hyphenation points + */ + public int[] getHyphenationPoints() { + return hyphenPoints; + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationException.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationException.java new file mode 100644 index 00000000000..3965244735f --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationException.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +/** + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class HyphenationException extends Exception { + + /** + * @see java.lang.Throwable#Throwable(String) + */ + public HyphenationException(String msg) { + super(msg); + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java new file mode 100644 index 00000000000..a836494d4bf --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.io.File; +import java.io.Serializable; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.HashMap; + +import org.xml.sax.InputSource; + +/** + * This tree structure stores the hyphenation patterns in an efficient way for + * fast lookup. It provides the provides the method to hyphenate a word. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class HyphenationTree extends TernaryTree implements PatternConsumer, + Serializable { + + private static final long serialVersionUID = -7842107987915665573L; + + /** + * value space: stores the interletter values + */ + protected ByteVector vspace; + + /** + * This map stores hyphenation exceptions + */ + protected HashMap stoplist; + + /** + * This map stores the character classes + */ + protected TernaryTree classmap; + + /** + * Temporary map to store interletter values on pattern loading. + */ + private transient TernaryTree ivalues; + + public HyphenationTree() { + stoplist = new HashMap(23); // usually a small table + classmap = new TernaryTree(); + vspace = new ByteVector(); + vspace.alloc(1); // this reserves index 0, which we don't use + } + + /** + * Packs the values by storing them in 4 bits, two values into a byte Values + * range is from 0 to 9. We use zero as terminator, so we'll add 1 to the + * value. + * + * @param values a string of digits from '0' to '9' representing the + * interletter values. + * @return the index into the vspace array where the packed values are stored. + */ + protected int packValues(String values) { + int i, n = values.length(); + int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1; + int offset = vspace.alloc(m); + byte[] va = vspace.getArray(); + for (i = 0; i < n; i++) { + int j = i >> 1; + byte v = (byte) ((values.charAt(i) - '0' + 1) & 0x0f); + if ((i & 1) == 1) { + va[j + offset] = (byte) (va[j + offset] | v); + } else { + va[j + offset] = (byte) (v << 4); // big endian + } + } + va[m - 1 + offset] = 0; // terminator + return offset; + } + + protected String unpackValues(int k) { + StringBuffer buf = new StringBuffer(); + byte v = vspace.get(k++); + while (v != 0) { + char c = (char) ((v >>> 4) - 1 + '0'); + buf.append(c); + c = (char) (v & 0x0f); + if (c == 0) { + break; + } + c = (char) (c - 1 + '0'); + buf.append(c); + v = vspace.get(k++); + } + return buf.toString(); + } + + /** + * Read hyphenation patterns from an XML file. + * + * @param filename the filename + * @throws HyphenationException In case the parsing fails + */ + public void loadPatterns(File f) throws HyphenationException { + try { + InputSource src = new InputSource(f.toURL().toExternalForm()); + loadPatterns(src); + } catch (MalformedURLException e) { + throw new HyphenationException("Error converting the File '" + f + + "' to a URL: " + e.getMessage()); + } + } + + /** + * Read hyphenation patterns from an XML file. 
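+ * <p>
+ * A usage sketch (the file name is hypothetical; any pattern file matching
+ * hyphenation.dtd should work):
+ * </p>
+ * <pre>
+ * HyphenationTree tree = new HyphenationTree();
+ * tree.loadPatterns(new InputSource("file:///path/to/patterns.xml"));
+ * Hyphenation h = tree.hyphenate("hyphenation", 2, 2); // null if no points found
+ * </pre>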
+ * + * @param source the InputSource for the file + * @throws HyphenationException In case the parsing fails + */ + public void loadPatterns(InputSource source) throws HyphenationException { + PatternParser pp = new PatternParser(this); + ivalues = new TernaryTree(); + + pp.parse(source); + + // patterns/values should be now in the tree + // let's optimize a bit + trimToSize(); + vspace.trimToSize(); + classmap.trimToSize(); + + // get rid of the auxiliary map + ivalues = null; + } + + public String findPattern(String pat) { + int k = super.find(pat); + if (k >= 0) { + return unpackValues(k); + } + return ""; + } + + /** + * String compare, returns 0 if equal or t is a substring of s + */ + protected int hstrcmp(char[] s, int si, char[] t, int ti) { + for (; s[si] == t[ti]; si++, ti++) { + if (s[si] == 0) { + return 0; + } + } + if (t[ti] == 0) { + return 0; + } + return s[si] - t[ti]; + } + + protected byte[] getValues(int k) { + StringBuffer buf = new StringBuffer(); + byte v = vspace.get(k++); + while (v != 0) { + char c = (char) ((v >>> 4) - 1); + buf.append(c); + c = (char) (v & 0x0f); + if (c == 0) { + break; + } + c = (char) (c - 1); + buf.append(c); + v = vspace.get(k++); + } + byte[] res = new byte[buf.length()]; + for (int i = 0; i < res.length; i++) { + res[i] = (byte) buf.charAt(i); + } + return res; + } + + /** + *
+ * <p>
+ * Search for all possible partial matches of word starting at index and update
+ * interletter values. In other words, it does something like:
+ * </p>
+ * <code>
+ * for (int i = 0; i < patterns.length; i++) {
+ *   if (word.substring(index).startsWith(patterns[i])) update_interletter_values(patterns[i]);
+ * }
+ * </code>
+ * <p>
+ * But it is done in an efficient way since the patterns are stored in a
+ * ternary tree. In fact, this is the whole purpose of having the tree: doing
+ * this search without having to test every single pattern. The number of
+ * patterns for languages such as English ranges from 4000 to 10000. Thus,
+ * doing thousands of string comparisons for each word to hyphenate would be
+ * really slow without the tree. The tradeoff is memory, but using a ternary
+ * tree instead of a trie almost halves the memory used by Lout or TeX.
+ * It's also faster than using a hash table.
+ * </p>
+ * + * @param word null terminated word to match + * @param index start index from word + * @param il interletter values array to update + */ + protected void searchPatterns(char[] word, int index, byte[] il) { + byte[] values; + int i = index; + char p, q; + char sp = word[i]; + p = root; + + while (p > 0 && p < sc.length) { + if (sc[p] == 0xFFFF) { + if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { + values = getValues(eq[p]); // data pointer is in eq[] + int j = index; + for (int k = 0; k < values.length; k++) { + if (j < il.length && values[k] > il[j]) { + il[j] = values[k]; + } + j++; + } + } + return; + } + int d = sp - sc[p]; + if (d == 0) { + if (sp == 0) { + break; + } + sp = word[++i]; + p = eq[p]; + q = p; + + // look for a pattern ending at this position by searching for + // the null char ( splitchar == 0 ) + while (q > 0 && q < sc.length) { + if (sc[q] == 0xFFFF) { // stop at compressed branch + break; + } + if (sc[q] == 0) { + values = getValues(eq[q]); + int j = index; + for (int k = 0; k < values.length; k++) { + if (j < il.length && values[k] > il[j]) { + il[j] = values[k]; + } + j++; + } + break; + } else { + q = lo[q]; + + /** + * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but + * java chars are unsigned + */ + } + } + } else { + p = d < 0 ? lo[p] : hi[p]; + } + } + } + + /** + * Hyphenate word and return a Hyphenation object. + * + * @param word the word to be hyphenated + * @param remainCharCount Minimum number of characters allowed before the + * hyphenation point. + * @param pushCharCount Minimum number of characters allowed after the + * hyphenation point. + * @return a {@link Hyphenation Hyphenation} object representing the + * hyphenated word or null if word is not hyphenated. + */ + public Hyphenation hyphenate(String word, int remainCharCount, + int pushCharCount) { + char[] w = word.toCharArray(); + return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); + } + + /** + * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n + * may be absent, the first n is at offset, the first l is at offset + + * iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied + * into word. In the first part of the routine len = w.length, in the second + * part of the routine len = word.length. Three indices are used: index(w), + * the index in w, index(word), the index in word, letterindex(word), the + * index in the letter part of word. The following relations exist: index(w) = + * offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) = + * index(word) - 1 (see first loop). It follows that: index(w) - index(word) = + * offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset + + * iIgnoreAtBeginning + */ + + /** + * Hyphenate word and return an array of hyphenation points. + * + * @param w char array that contains the word + * @param offset Offset to first character in word + * @param len Length of word + * @param remainCharCount Minimum number of characters allowed before the + * hyphenation point. + * @param pushCharCount Minimum number of characters allowed after the + * hyphenation point. + * @return a {@link Hyphenation Hyphenation} object representing the + * hyphenated word or null if word is not hyphenated. 
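+ * <p>
+ * A sketch of a call (assumes patterns have already been loaded into this
+ * tree):
+ * </p>
+ * <pre>
+ * char[] w = "hyphenation".toCharArray();
+ * Hyphenation h = tree.hyphenate(w, 0, w.length, 2, 2);
+ * int[] points = (h == null) ? null : h.getHyphenationPoints();
+ * </pre>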
+ */ + public Hyphenation hyphenate(char[] w, int offset, int len, + int remainCharCount, int pushCharCount) { + int i; + char[] word = new char[len + 3]; + + // normalize word + char[] c = new char[2]; + int iIgnoreAtBeginning = 0; + int iLength = len; + boolean bEndOfLetters = false; + for (i = 1; i <= len; i++) { + c[0] = w[offset + i - 1]; + int nc = classmap.find(c, 0); + if (nc < 0) { // found a non-letter character ... + if (i == (1 + iIgnoreAtBeginning)) { + // ... before any letter character + iIgnoreAtBeginning++; + } else { + // ... after a letter character + bEndOfLetters = true; + } + iLength--; + } else { + if (!bEndOfLetters) { + word[i - iIgnoreAtBeginning] = (char) nc; + } else { + return null; + } + } + } + len = iLength; + if (len < (remainCharCount + pushCharCount)) { + // word is too short to be hyphenated + return null; + } + int[] result = new int[len + 1]; + int k = 0; + + // check exception list first + String sw = new String(word, 1, len); + if (stoplist.containsKey(sw)) { + // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = + // null) + ArrayList hw = (ArrayList) stoplist.get(sw); + int j = 0; + for (i = 0; i < hw.size(); i++) { + Object o = hw.get(i); + // j = index(sw) = letterindex(word)? + // result[k] = corresponding index(w) + if (o instanceof String) { + j += ((String) o).length(); + if (j >= remainCharCount && j < (len - pushCharCount)) { + result[k++] = j + iIgnoreAtBeginning; + } + } + } + } else { + // use algorithm to get hyphenation points + word[0] = '.'; // word start marker + word[len + 1] = '.'; // word end marker + word[len + 2] = 0; // null terminated + byte[] il = new byte[len + 3]; // initialized to zero + for (i = 0; i < len + 1; i++) { + searchPatterns(word, i, il); + } + + // hyphenation points are located where interletter value is odd + // i is letterindex(word), + // i + 1 is index(word), + // result[k] = corresponding index(w) + for (i = 0; i < len; i++) { + if (((il[i + 1] & 1) == 1) && i >= remainCharCount + && i <= (len - pushCharCount)) { + result[k++] = i + iIgnoreAtBeginning; + } + } + } + + if (k > 0) { + // trim result array + int[] res = new int[k+2]; + System.arraycopy(result, 0, res, 1, k); + // We add the synthetical hyphenation points + // at the beginning and end of the word + res[0]=0; + res[k+1]=len; + return new Hyphenation(res); + } else { + return null; + } + } + + /** + * Add a character class to the tree. It is used by + * {@link PatternParser PatternParser} as callback to add character classes. + * Character classes define the valid word characters for hyphenation. If a + * word contains a character not defined in any of the classes, it is not + * hyphenated. It also defines a way to normalize the characters in order to + * compare them with the stored patterns. Usually pattern files use only lower + * case characters, in this case a class for letter 'a', for example, should + * be defined as "aA", the first character being the normalization char. + */ + public void addClass(String chargroup) { + if (chargroup.length() > 0) { + char equivChar = chargroup.charAt(0); + char[] key = new char[2]; + key[1] = 0; + for (int i = 0; i < chargroup.length(); i++) { + key[0] = chargroup.charAt(i); + classmap.insert(key, 0, equivChar); + } + } + } + + /** + * Add an exception to the tree. It is used by + * {@link PatternParser PatternParser} class as callback to store the + * hyphenation exceptions. 
+ * + * @param word normalized word + * @param hyphenatedword a vector of alternating strings and + * {@link Hyphen hyphen} objects. + */ + public void addException(String word, ArrayList hyphenatedword) { + stoplist.put(word, hyphenatedword); + } + + /** + * Add a pattern to the tree. Mainly, to be used by + * {@link PatternParser PatternParser} class as callback to add a pattern to + * the tree. + * + * @param pattern the hyphenation pattern + * @param ivalue interletter weight values indicating the desirability and + * priority of hyphenating at a given point within the pattern. It + * should contain only digit characters. (i.e. '0' to '9'). + */ + public void addPattern(String pattern, String ivalue) { + int k = ivalues.find(ivalue); + if (k <= 0) { + k = packValues(ivalue); + ivalues.insert(ivalue, (char) k); + } + insert(pattern, (char) k); + } + + public void printStats() { + System.out.println("Value space size = " + + Integer.toString(vspace.length())); + super.printStats(); + + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java new file mode 100644 index 00000000000..243f2487811 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.util.ArrayList; + +/** + * This interface is used to connect the XML pattern file parser to the + * hyphenation tree. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public interface PatternConsumer { + + /** + * Add a character class. A character class defines characters that are + * considered equivalent for the purpose of hyphenation (e.g. "aA"). It + * usually means to ignore case. + * + * @param chargroup character group + */ + void addClass(String chargroup); + + /** + * Add a hyphenation exception. An exception replaces the result obtained by + * the algorithm for cases for which this fails or the user wants to provide + * his own hyphenation. A hyphenatedword is a vector of alternating String's + * and {@link Hyphen Hyphen} instances + */ + void addException(String word, ArrayList hyphenatedword); + + /** + * Add hyphenation patterns. + * + * @param pattern the pattern + * @param values interletter values expressed as a string of digit characters. 
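+ * <p>
+ * For example, the parser splits the TeX-style pattern "hy3ph" into the
+ * pattern "hyph" and the values "00300": one digit per possible break
+ * position, with '0' filled in wherever the pattern file gives no digit.
+ * </p>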
+ */ + void addPattern(String pattern, String values); + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java new file mode 100644 index 00000000000..5108c71d55e --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java @@ -0,0 +1,518 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id: PatternParser.java 426576 2006-07-28 15:44:37Z jeremias $ */ + +package org.apache.lucene.analysis.compound.hyphenation; + +// SAX +import org.xml.sax.XMLReader; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.Attributes; + +// Java +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.util.ArrayList; + +import javax.xml.parsers.SAXParserFactory; + +/** + * A SAX document handler to read and parse hyphenation patterns from a XML + * file. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class PatternParser extends DefaultHandler implements PatternConsumer { + + XMLReader parser; + + int currElement; + + PatternConsumer consumer; + + StringBuffer token; + + ArrayList exception; + + char hyphenChar; + + String errMsg; + + static final int ELEM_CLASSES = 1; + + static final int ELEM_EXCEPTIONS = 2; + + static final int ELEM_PATTERNS = 3; + + static final int ELEM_HYPHEN = 4; + + public PatternParser() throws HyphenationException { + token = new StringBuffer(); + parser = createParser(); + parser.setContentHandler(this); + parser.setErrorHandler(this); + parser.setEntityResolver(this); + hyphenChar = '-'; // default + + } + + public PatternParser(PatternConsumer consumer) throws HyphenationException { + this(); + this.consumer = consumer; + } + + public void setConsumer(PatternConsumer consumer) { + this.consumer = consumer; + } + + /** + * Parses a hyphenation pattern file. + * + * @param filename the filename + * @throws HyphenationException In case of an exception while parsing + */ + public void parse(String filename) throws HyphenationException { + parse(new File(filename)); + } + + /** + * Parses a hyphenation pattern file. 
+ * + * @param file the pattern file + * @throws HyphenationException In case of an exception while parsing + */ + public void parse(File file) throws HyphenationException { + try { + InputSource src = new InputSource(file.toURL().toExternalForm()); + parse(src); + } catch (MalformedURLException e) { + throw new HyphenationException("Error converting the File '" + file + + "' to a URL: " + e.getMessage()); + } + } + + /** + * Parses a hyphenation pattern file. + * + * @param source the InputSource for the file + * @throws HyphenationException In case of an exception while parsing + */ + public void parse(InputSource source) throws HyphenationException { + try { + parser.parse(source); + } catch (FileNotFoundException fnfe) { + throw new HyphenationException("File not found: " + fnfe.getMessage()); + } catch (IOException ioe) { + throw new HyphenationException(ioe.getMessage()); + } catch (SAXException e) { + throw new HyphenationException(errMsg); + } + } + + /** + * Creates a SAX parser using JAXP + * + * @return the created SAX parser + */ + static XMLReader createParser() { + try { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + return factory.newSAXParser().getXMLReader(); + } catch (Exception e) { + throw new RuntimeException("Couldn't create XMLReader: " + e.getMessage()); + } + } + + protected String readToken(StringBuffer chars) { + String word; + boolean space = false; + int i; + for (i = 0; i < chars.length(); i++) { + if (Character.isWhitespace(chars.charAt(i))) { + space = true; + } else { + break; + } + } + if (space) { + // chars.delete(0,i); + for (int countr = i; countr < chars.length(); countr++) { + chars.setCharAt(countr - i, chars.charAt(countr)); + } + chars.setLength(chars.length() - i); + if (token.length() > 0) { + word = token.toString(); + token.setLength(0); + return word; + } + } + space = false; + for (i = 0; i < chars.length(); i++) { + if (Character.isWhitespace(chars.charAt(i))) { + space = true; + break; + } + } + token.append(chars.toString().substring(0, i)); + // chars.delete(0,i); + for (int countr = i; countr < chars.length(); countr++) { + chars.setCharAt(countr - i, chars.charAt(countr)); + } + chars.setLength(chars.length() - i); + if (space) { + word = token.toString(); + token.setLength(0); + return word; + } + token.append(chars); + return null; + } + + protected static String getPattern(String word) { + StringBuffer pat = new StringBuffer(); + int len = word.length(); + for (int i = 0; i < len; i++) { + if (!Character.isDigit(word.charAt(i))) { + pat.append(word.charAt(i)); + } + } + return pat.toString(); + } + + protected ArrayList normalizeException(ArrayList ex) { + ArrayList res = new ArrayList(); + for (int i = 0; i < ex.size(); i++) { + Object item = ex.get(i); + if (item instanceof String) { + String str = (String) item; + StringBuffer buf = new StringBuffer(); + for (int j = 0; j < str.length(); j++) { + char c = str.charAt(j); + if (c != hyphenChar) { + buf.append(c); + } else { + res.add(buf.toString()); + buf.setLength(0); + char[] h = new char[1]; + h[0] = hyphenChar; + // we use here hyphenChar which is not necessarily + // the one to be printed + res.add(new Hyphen(new String(h), null, null)); + } + } + if (buf.length() > 0) { + res.add(buf.toString()); + } + } else { + res.add(item); + } + } + return res; + } + + protected String getExceptionWord(ArrayList ex) { + StringBuffer res = new StringBuffer(); + for (int i = 0; i < ex.size(); i++) { + Object item = ex.get(i); + if (item 
instanceof String) { + res.append((String) item); + } else { + if (((Hyphen) item).noBreak != null) { + res.append(((Hyphen) item).noBreak); + } + } + } + return res.toString(); + } + + protected static String getInterletterValues(String pat) { + StringBuffer il = new StringBuffer(); + String word = pat + "a"; // add dummy letter to serve as sentinel + int len = word.length(); + for (int i = 0; i < len; i++) { + char c = word.charAt(i); + if (Character.isDigit(c)) { + il.append(c); + i++; + } else { + il.append('0'); + } + } + return il.toString(); + } + + // + // EntityResolver methods + // + public InputSource resolveEntity(String publicId, String systemId) + throws SAXException, IOException { + return HyphenationDTDGenerator.generateDTD(); + } + + // + // ContentHandler methods + // + + /** + * @see org.xml.sax.ContentHandler#startElement(java.lang.String, + * java.lang.String, java.lang.String, org.xml.sax.Attributes) + */ + public void startElement(String uri, String local, String raw, + Attributes attrs) { + if (local.equals("hyphen-char")) { + String h = attrs.getValue("value"); + if (h != null && h.length() == 1) { + hyphenChar = h.charAt(0); + } + } else if (local.equals("classes")) { + currElement = ELEM_CLASSES; + } else if (local.equals("patterns")) { + currElement = ELEM_PATTERNS; + } else if (local.equals("exceptions")) { + currElement = ELEM_EXCEPTIONS; + exception = new ArrayList(); + } else if (local.equals("hyphen")) { + if (token.length() > 0) { + exception.add(token.toString()); + } + exception.add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"), + attrs.getValue("post"))); + currElement = ELEM_HYPHEN; + } + token.setLength(0); + } + + /** + * @see org.xml.sax.ContentHandler#endElement(java.lang.String, + * java.lang.String, java.lang.String) + */ + public void endElement(String uri, String local, String raw) { + + if (token.length() > 0) { + String word = token.toString(); + switch (currElement) { + case ELEM_CLASSES: + consumer.addClass(word); + break; + case ELEM_EXCEPTIONS: + exception.add(word); + exception = normalizeException(exception); + consumer.addException(getExceptionWord(exception), + (ArrayList) exception.clone()); + break; + case ELEM_PATTERNS: + consumer.addPattern(getPattern(word), getInterletterValues(word)); + break; + case ELEM_HYPHEN: + // nothing to do + break; + } + if (currElement != ELEM_HYPHEN) { + token.setLength(0); + } + } + if (currElement == ELEM_HYPHEN) { + currElement = ELEM_EXCEPTIONS; + } else { + currElement = 0; + } + + } + + /** + * @see org.xml.sax.ContentHandler#characters(char[], int, int) + */ + public void characters(char ch[], int start, int length) { + StringBuffer chars = new StringBuffer(length); + chars.append(ch, start, length); + String word = readToken(chars); + while (word != null) { + // System.out.println("\"" + word + "\""); + switch (currElement) { + case ELEM_CLASSES: + consumer.addClass(word); + break; + case ELEM_EXCEPTIONS: + exception.add(word); + exception = normalizeException(exception); + consumer.addException(getExceptionWord(exception), + (ArrayList) exception.clone()); + exception.clear(); + break; + case ELEM_PATTERNS: + consumer.addPattern(getPattern(word), getInterletterValues(word)); + break; + } + word = readToken(chars); + } + + } + + // + // ErrorHandler methods + // + + /** + * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException) + */ + public void warning(SAXParseException ex) { + errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage(); + } + + /** + * @see 
org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException) + */ + public void error(SAXParseException ex) { + errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage(); + } + + /** + * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException) + */ + public void fatalError(SAXParseException ex) throws SAXException { + errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage(); + throw ex; + } + + /** + * Returns a string of the location. + */ + private String getLocationString(SAXParseException ex) { + StringBuffer str = new StringBuffer(); + + String systemId = ex.getSystemId(); + if (systemId != null) { + int index = systemId.lastIndexOf('/'); + if (index != -1) { + systemId = systemId.substring(index + 1); + } + str.append(systemId); + } + str.append(':'); + str.append(ex.getLineNumber()); + str.append(':'); + str.append(ex.getColumnNumber()); + + return str.toString(); + + } // getLocationString(SAXParseException):String + + // PatternConsumer implementation for testing purposes + public void addClass(String c) { + System.out.println("class: " + c); + } + + public void addException(String w, ArrayList e) { + System.out.println("exception: " + w + " : " + e.toString()); + } + + public void addPattern(String p, String v) { + System.out.println("pattern: " + p + " : " + v); + } + + public static void main(String[] args) throws Exception { + if (args.length > 0) { + PatternParser pp = new PatternParser(); + pp.setConsumer(pp); + pp.parse(args[0]); + } + } +} + +class HyphenationDTDGenerator { + public static final String DTD_STRING= + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"; + + public static InputSource generateDTD() { + return new InputSource(new StringReader(DTD_STRING)); + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java new file mode 100644 index 00000000000..b327cd7ba4a --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java @@ -0,0 +1,663 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.util.Enumeration; +import java.util.Stack; +import java.io.Serializable; + +/** + *
+ * Ternary Search Tree.
+ *
+ * <p>
+ * A ternary search tree is a hybrid between a binary tree and a digital search
+ * tree (trie). Keys are limited to strings. A data value of type char is stored
+ * in each leaf node. It can be used as an index (or pointer) to the data.
+ * Branches that only contain one key are compressed to one node by storing a
+ * pointer to the trailer substring of the key. This class is intended to serve
+ * as a base class or helper class to implement Dictionary collections or the
+ * like. Ternary trees have some nice properties, such as the following: the tree
+ * can be traversed in sorted order, partial matches (wildcard) can be
+ * implemented, retrieval of all keys within a given distance from the target is
+ * possible, etc. The storage requirements are higher than for a binary tree but
+ * a lot less than for a trie. Performance is comparable with a hash table;
+ * sometimes it outperforms a hash function (most of the time it can determine a
+ * miss faster than a hash).
+ * </p>
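+ *
+ * <p>
+ * A sketch of the flattened node layout used below (four parallel char arrays
+ * instead of a node class):
+ * </p>
+ * <pre>
+ * lo[p]  low child (splitchar smaller), or a pointer into kv when compressed
+ * eq[p]  equal child, or the data value when this node terminates a key
+ * hi[p]  high child (splitchar greater)
+ * sc[p]  split character; 0 ends a key, 0xFFFF marks a compressed branch
+ * </pre>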
+ *
+ * <p>
+ * The main purpose of this java port is to serve as a base for implementing
+ * TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
+ * requires from 5000 to 15000 hyphenation patterns which will be keys in this
+ * tree. The string patterns are usually small (from 2 to 5 characters), but
+ * each char in the tree is stored in a node. Thus memory usage is the main
+ * concern. We will sacrifice 'elegance' to keep memory requirements to the
+ * minimum. Using java's char type as a pointer (yes, I know pointer is a
+ * forbidden word in java) we can keep the size of the node to just 8 bytes
+ * (3 pointers and the data char). This gives room for about 65000 nodes. In my
+ * tests the English patterns took 7694 nodes and the German patterns 10055
+ * nodes, so I think we are safe.
+ * </p>
+ *
+ * <p>
+ * All said, this is a map with strings as keys and char as value. Pretty
+ * limited! It can be extended to a general map by using the string
+ * representation of an object and using the char value as an index to an array
+ * that contains the object values.
+ * </p>
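+ *
+ * <p>
+ * A minimal sketch of that map-like use (the constructor is package-private,
+ * so this only works from within this package; find(String) is assumed to
+ * return the stored char value, with non-positive results treated as a miss,
+ * as HyphenationTree does):
+ * </p>
+ * <pre>
+ * String[] names = { "low", "high" };
+ * TernaryTree tree = new TernaryTree();
+ * tree.insert("lo", (char) 1); // store 1-based indices; 0 is ambiguous with a miss
+ * tree.insert("hi", (char) 2);
+ * int k = tree.find("lo");
+ * String value = k > 0 ? names[k - 1] : null; // "low"
+ * </pre>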
+ * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ + +public class TernaryTree implements Cloneable, Serializable { + + /** + * We use 4 arrays to represent a node. I guess I should have created a proper + * node class, but somehow Knuth's pascal code made me forget we now have a + * portable language with virtual memory management and automatic garbage + * collection! And now is kind of late, furthermore, if it ain't broken, don't + * fix it. + */ + + /** + * Pointer to low branch and to rest of the key when it is stored directly in + * this node, we don't have unions in java! + */ + protected char[] lo; + + /** + * Pointer to high branch. + */ + protected char[] hi; + + /** + * Pointer to equal branch and to data when this node is a string terminator. + */ + protected char[] eq; + + /** + *
+ * The character stored in this node: splitchar. Two special values are
+ * reserved:
+ * <ul>
+ * <li>0x0000 as string terminator</li>
+ * <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
+ * </ul>
+ * This shouldn't be a problem if we give the usual semantics to strings since
+ * 0xFFFF is guaranteed not to be a Unicode character.
+ *
+   */
+  protected char[] sc;
+
+  /**
+   * This vector holds the trailing substrings of the keys when a branch is
+   * compressed.
+   */
+  protected CharVector kv;
+
+  protected char root;
+
+  protected char freenode;
+
+  protected int length; // number of items in tree
+
+  protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
+
+  TernaryTree() {
+    init();
+  }
+
+  protected void init() {
+    root = 0;
+    freenode = 1;
+    length = 0;
+    lo = new char[BLOCK_SIZE];
+    hi = new char[BLOCK_SIZE];
+    eq = new char[BLOCK_SIZE];
+    sc = new char[BLOCK_SIZE];
+    kv = new CharVector();
+  }
+
+  /**
+   * Branches are initially compressed, needing one node per key plus the size
+   * of the string key. They are decompressed as needed when another key with
+   * the same prefix is inserted. This saves a lot of space, especially for
+   * long keys.
+   */
+  public void insert(String key, char val) {
+    // make sure we have enough room in the arrays
+    int len = key.length() + 1; // maximum number of nodes that may be generated
+    if (freenode + len > eq.length) {
+      redimNodeArrays(eq.length + BLOCK_SIZE);
+    }
+    char strkey[] = new char[len--];
+    key.getChars(0, len, strkey, 0);
+    strkey[len] = 0;
+    root = insert(root, strkey, 0, val);
+  }
+
+  public void insert(char[] key, int start, char val) {
+    int len = strlen(key) + 1;
+    if (freenode + len > eq.length) {
+      redimNodeArrays(eq.length + BLOCK_SIZE);
+    }
+    root = insert(root, key, start, val);
+  }
+
+  /**
+   * The actual insertion function, recursive version.
+   */
+  private char insert(char p, char[] key, int start, char val) {
+    int len = strlen(key, start);
+    if (p == 0) {
+      // this means there is no branch, this node will start a new branch.
+      // Instead of doing that, we store the key somewhere else and create
+      // only one node with a pointer to the key
+      p = freenode++;
+      eq[p] = val; // holds data
+      length++;
+      hi[p] = 0;
+      if (len > 0) {
+        sc[p] = 0xFFFF; // indicates branch is compressed
+        lo[p] = (char) kv.alloc(len + 1); // use 'lo' to hold pointer to key
+        strcpy(kv.getArray(), lo[p], key, start);
+      } else {
+        sc[p] = 0;
+        lo[p] = 0;
+      }
+      return p;
+    }
+
+    if (sc[p] == 0xFFFF) {
+      // branch is compressed: need to decompress
+      // this will generate garbage in the external key array
+      // but we can do some garbage collection later
+      char pp = freenode++;
+      lo[pp] = lo[p]; // previous pointer to key
+      eq[pp] = eq[p]; // previous pointer to data
+      lo[p] = 0;
+      if (len > 0) {
+        sc[p] = kv.get(lo[pp]);
+        eq[p] = pp;
+        lo[pp]++;
+        if (kv.get(lo[pp]) == 0) {
+          // key completely decompressed, leaving garbage in the key array
+          lo[pp] = 0;
+          sc[pp] = 0;
+          hi[pp] = 0;
+        } else {
+          // we only got the first char of the key, the rest is still there
+          sc[pp] = 0xFFFF;
+        }
+      } else {
+        // In this case we can save a node by swapping the new node
+        // with the compressed node
+        sc[pp] = 0xFFFF;
+        hi[p] = pp;
+        sc[p] = 0;
+        eq[p] = val;
+        length++;
+        return p;
+      }
+    }
+    char s = key[start];
+    if (s < sc[p]) {
+      lo[p] = insert(lo[p], key, start, val);
+    } else if (s == sc[p]) {
+      if (s != 0) {
+        eq[p] = insert(eq[p], key, start + 1, val);
+      } else {
+        // key already in tree, overwrite data
+        eq[p] = val;
+      }
+    } else {
+      hi[p] = insert(hi[p], key, start, val);
+    }
+    return p;
+  }
+
+  /**
+   * Compares two null-terminated char arrays.
+   */
+  public static int strcmp(char[] a, int startA, char[] b, int startB) {
+    for (; a[startA] == b[startB]; startA++, startB++) {
+      if (a[startA] == 0) {
+        return 0;
+      }
+    }
+    return a[startA] - b[startB];
+  }
+
+  /**
+   * Compares a string with a null-terminated char array.
+   */
+  public static int strcmp(String str, char[] a, int start) {
+    int i, d, len = str.length();
+    for (i = 0; i < len; i++) {
+      d = (int) str.charAt(i) - a[start + i];
+      if (d != 0) {
+        return d;
+      }
+      if (a[start + i] == 0) {
+        return d;
+      }
+    }
+    if (a[start + i] != 0) {
+      return (int) -a[start + i];
+    }
+    return 0;
+
+  }
+
+  public static void strcpy(char[] dst, int di, char[] src, int si) {
+    while (src[si] != 0) {
+      dst[di++] = src[si++];
+    }
+    dst[di] = 0;
+  }
+
+  public static int strlen(char[] a, int start) {
+    int len = 0;
+    for (int i = start; i < a.length && a[i] != 0; i++) {
+      len++;
+    }
+    return len;
+  }
+
+  public static int strlen(char[] a) {
+    return strlen(a, 0);
+  }
+
+  public int find(String key) {
+    int len = key.length();
+    char strkey[] = new char[len + 1];
+    key.getChars(0, len, strkey, 0);
+    strkey[len] = 0;
+
+    return find(strkey, 0);
+  }
+
+  public int find(char[] key, int start) {
+    int d;
+    char p = root;
+    int i = start;
+    char c;
+
+    while (p != 0) {
+      if (sc[p] == 0xFFFF) {
+        if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
+          return eq[p];
+        } else {
+          return -1;
+        }
+      }
+      c = key[i];
+      d = c - sc[p];
+      if (d == 0) {
+        if (c == 0) {
+          return eq[p];
+        }
+        i++;
+        p = eq[p];
+      } else if (d < 0) {
+        p = lo[p];
+      } else {
+        p = hi[p];
+      }
+    }
+    return -1;
+  }
+
+  public boolean knows(String key) {
+    return (find(key) >= 0);
+  }
+
+  // redimension the arrays
+  private void redimNodeArrays(int newsize) {
+    int len = newsize < lo.length ? newsize : lo.length;
+    char[] na = new char[newsize];
+    System.arraycopy(lo, 0, na, 0, len);
+    lo = na;
+    na = new char[newsize];
+    System.arraycopy(hi, 0, na, 0, len);
+    hi = na;
+    na = new char[newsize];
+    System.arraycopy(eq, 0, na, 0, len);
+    eq = na;
+    na = new char[newsize];
+    System.arraycopy(sc, 0, na, 0, len);
+    sc = na;
+  }
+
+  public int size() {
+    return length;
+  }
+
+  public Object clone() {
+    TernaryTree t = new TernaryTree();
+    t.lo = (char[]) this.lo.clone();
+    t.hi = (char[]) this.hi.clone();
+    t.eq = (char[]) this.eq.clone();
+    t.sc = (char[]) this.sc.clone();
+    t.kv = (CharVector) this.kv.clone();
+    t.root = this.root;
+    t.freenode = this.freenode;
+    t.length = this.length;
+
+    return t;
+  }
+
+  /**
+   * Recursively insert the median first, and then the medians of the lower
+   * and upper halves, and so on, in order to get a balanced tree. The array
+   * of keys is assumed to be sorted in ascending order.
+   */
+  protected void insertBalanced(String[] k, char[] v, int offset, int n) {
+    int m;
+    if (n < 1) {
+      return;
+    }
+    m = n >> 1;
+
+    insert(k[m + offset], v[m + offset]);
+    insertBalanced(k, v, offset, m);
+
+    insertBalanced(k, v, offset + m + 1, n - m - 1);
+  }
+
+  /**
+   * Balance the tree for best search performance.
+   */
+  public void balance() {
+    // System.out.print("Before root splitchar = ");
+    // System.out.println(sc[root]);
+
+    int i = 0, n = length;
+    String[] k = new String[n];
+    char[] v = new char[n];
+    Iterator iter = new Iterator();
+    while (iter.hasMoreElements()) {
+      v[i] = iter.getValue();
+      k[i++] = (String) iter.nextElement();
+    }
+    init();
+    insertBalanced(k, v, 0, n);
+
+    // With uniform letter distribution sc[root] should be around 'm'
+    // System.out.print("After root splitchar = ");
+    // System.out.println(sc[root]);
+  }
+
+  /**
+   * Each node stores a character (splitchar) which is part of some key(s). In
+   * a compressed branch (one that only contains a single string key) the
+   * trailer of the key which is not already in nodes is stored externally in
+   * the kv array. 
As items are inserted, key substrings decrease. Some substrings may + * completely disappear when the whole branch is totally decompressed. The + * tree is traversed to find the key substrings actually used. In addition, + * duplicate substrings are removed using a map (implemented with a + * TernaryTree!). + * + */ + public void trimToSize() { + // first balance the tree for best performance + balance(); + + // redimension the node arrays + redimNodeArrays(freenode); + + // ok, compact kv array + CharVector kx = new CharVector(); + kx.alloc(1); + TernaryTree map = new TernaryTree(); + compact(kx, map, root); + kv = kx; + kv.trimToSize(); + } + + private void compact(CharVector kx, TernaryTree map, char p) { + int k; + if (p == 0) { + return; + } + if (sc[p] == 0xFFFF) { + k = map.find(kv.getArray(), lo[p]); + if (k < 0) { + k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1); + strcpy(kx.getArray(), k, kv.getArray(), lo[p]); + map.insert(kx.getArray(), k, (char) k); + } + lo[p] = (char) k; + } else { + compact(kx, map, lo[p]); + if (sc[p] != 0) { + compact(kx, map, eq[p]); + } + compact(kx, map, hi[p]); + } + } + + public Enumeration keys() { + return new Iterator(); + } + + public class Iterator implements Enumeration { + + /** + * current node index + */ + int cur; + + /** + * current key + */ + String curkey; + + private class Item implements Cloneable { + char parent; + + char child; + + public Item() { + parent = 0; + child = 0; + } + + public Item(char p, char c) { + parent = p; + child = c; + } + + public Object clone() { + return new Item(parent, child); + } + + } + + /** + * Node stack + */ + Stack ns; + + /** + * key stack implemented with a StringBuffer + */ + StringBuffer ks; + + public Iterator() { + cur = -1; + ns = new Stack(); + ks = new StringBuffer(); + rewind(); + } + + public void rewind() { + ns.removeAllElements(); + ks.setLength(0); + cur = root; + run(); + } + + public Object nextElement() { + String res = new String(curkey); + cur = up(); + run(); + return res; + } + + public char getValue() { + if (cur >= 0) { + return eq[cur]; + } + return 0; + } + + public boolean hasMoreElements() { + return (cur != -1); + } + + /** + * traverse upwards + */ + private int up() { + Item i = new Item(); + int res = 0; + + if (ns.empty()) { + return -1; + } + + if (cur != 0 && sc[cur] == 0) { + return lo[cur]; + } + + boolean climb = true; + + while (climb) { + i = (Item) ns.pop(); + i.child++; + switch (i.child) { + case 1: + if (sc[i.parent] != 0) { + res = eq[i.parent]; + ns.push(i.clone()); + ks.append(sc[i.parent]); + } else { + i.child++; + ns.push(i.clone()); + res = hi[i.parent]; + } + climb = false; + break; + + case 2: + res = hi[i.parent]; + ns.push(i.clone()); + if (ks.length() > 0) { + ks.setLength(ks.length() - 1); // pop + } + climb = false; + break; + + default: + if (ns.empty()) { + return -1; + } + climb = true; + break; + } + } + return res; + } + + /** + * traverse the tree to find next key + */ + private int run() { + if (cur == -1) { + return -1; + } + + boolean leaf = false; + while (true) { + // first go down on low branch until leaf or compressed branch + while (cur != 0) { + if (sc[cur] == 0xFFFF) { + leaf = true; + break; + } + ns.push(new Item((char) cur, '\u0000')); + if (sc[cur] == 0) { + leaf = true; + break; + } + cur = lo[cur]; + } + if (leaf) { + break; + } + // nothing found, go up one node and try again + cur = up(); + if (cur == -1) { + return -1; + } + } + // The current node should be a data node and + // the key should be in the key stack (at 
least partially)
+      StringBuffer buf = new StringBuffer(ks.toString());
+      if (sc[cur] == 0xFFFF) {
+        int p = lo[cur];
+        while (kv.get(p) != 0) {
+          buf.append(kv.get(p++));
+        }
+      }
+      curkey = buf.toString();
+      return 0;
+    }
+
+  }
+
+  public void printStats() {
+    System.out.println("Number of keys = " + Integer.toString(length));
+    System.out.println("Node count = " + Integer.toString(freenode));
+    // System.out.println("Array length = " + Integer.toString(eq.length));
+    System.out.println("Key Array length = " + Integer.toString(kv.length()));
+
+    /*
+     * for(int i=0; i
+     */
+  }
+
+}
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/package.html b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/package.html
new file mode 100644
index 00000000000..e62afc334ef
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/package.html
@@ -0,0 +1,10 @@
+<html>
+<head>
+    <title>Hyphenation code for the CompoundWordTokenFilter</title>
+</head>
+<body>

+<p>
+The code for the compound word hyphenation is taken from the Apache FOP
+project. All credits for the hyphenation code belong to them.
+</p>

+</body>
+</html>
\ No newline at end of file
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/package.html b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/package.html
new file mode 100644
index 00000000000..cf3e8bf07b6
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/package.html
@@ -0,0 +1,166 @@
+<html>
+<head>
+<title>CompoundWordTokenFilter</title>
+</head>
+<body>
+<p>
+A filter that decomposes compound words found in many Germanic languages
+into word parts. This example shows what it does:
+</p>
+<table>
+  <tr><th>Input token stream</th></tr>
+  <tr><td>Rindfleischüberwachungsgesetz Drahtschere abba</td></tr>
+</table>
+
+<table>
+  <tr><th>Output token stream</th></tr>
+  <tr><td>(Rindfleischüberwachungsgesetz,0,29)</td></tr>
+  <tr><td>(Rind,0,4,posIncr=0)</td></tr>
+  <tr><td>(fleisch,4,11,posIncr=0)</td></tr>
+  <tr><td>(überwachung,11,22,posIncr=0)</td></tr>
+  <tr><td>(gesetz,23,29,posIncr=0)</td></tr>
+  <tr><td>(Drahtschere,30,41)</td></tr>
+  <tr><td>(Draht,30,35,posIncr=0)</td></tr>
+  <tr><td>(schere,35,41,posIncr=0)</td></tr>
+  <tr><td>(abba,42,46)</td></tr>
+</table>
+
+<p>
+The input token is always preserved and the filters do not alter the case of
+word parts. There are two variants of the filter available:
+</p>
+<ul>
+<li><tt>HyphenationCompoundWordTokenFilter</tt>: uses a
+hyphenation-grammar-based approach to find potential word parts of a given
+word.</li>
+<li><tt>DictionaryCompoundWordTokenFilter</tt>: uses a brute-force
+dictionary-only approach to find the word parts of a given word.</li>
+</ul>
+

+<h2>Compound word token filters</h2>

+

+<h3>HyphenationCompoundWordTokenFilter</h3>

+The {@link
+org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
+HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
+potential subwords that are worth checking against the dictionary. The
+quality of the output tokens is directly connected to the quality of the
+grammar file you use. For languages like German the grammar files are
+quite good.
+<h4>Grammar file</h4>
+Unfortunately we cannot bundle the hyphenation grammar files with Lucene
+because they do not use an ASF-compatible license (they use the LaTeX
+Project Public License instead). You can find the XML-based grammar files
+at the Objects For Formatting Objects (OFFO) Sourceforge project (direct
+link to download the pattern files:
+http://downloads.sourceforge.net/offo/offo-hyphenation.zip ). The files
+you need are in the subfolder offo-hyphenation/hyph/.
+<p>
+Credits for the hyphenation code go to the Apache FOP project.
+</p>
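+<p>
+Loading a downloaded grammar file could look like this (a minimal sketch;
+"de_DR.xml" is the German pattern file from the OFFO distribution, and the
+path is an assumption):
+</p>
+<pre>
+  Reader reader = new FileReader("offo-hyphenation/hyph/de_DR.xml");
+  HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+      .getHyphenationTree(reader);
+</pre>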

+<h3>DictionaryCompoundWordTokenFilter</h3>

+The {@link
+org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter
+DictionaryCompoundWordTokenFilter} uses a dictionary-only approach to
+find subwords in a compound word. It is much slower than the filter that
+uses the hyphenation grammars. You can use it as a first step to check
+whether your dictionary is good or not, because it is much simpler in
+design.
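+<p>
+Constructing the filter only takes a token stream and the dictionary (a
+minimal sketch; <tt>reader</tt> and <tt>dict</tt> are assumed to be defined
+elsewhere):
+</p>
+<pre>
+  TokenStream stream = new DictionaryCompoundWordTokenFilter(
+      new WhitespaceTokenizer(reader), dict);
+</pre>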

+<h3>Dictionary</h3>

+The output quality of both token filters is directly connected to the
+quality of the dictionary you use. Dictionaries are, of course, language
+dependent. You should always use a dictionary that fits the text you want
+to index. If you index medical text, for example, you should use a
+dictionary that contains medical words. A good starting point for general
+text are the dictionaries you find at the OpenOffice dictionaries Wiki.
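+<p>
+If your dictionary is a plain text file with one word per line, loading it
+into the String[] that the filters expect could look like this (a minimal
+sketch; the file name, encoding and helper name are assumptions):
+</p>
+<pre>
+  // hypothetical helper - reads one dictionary word per line
+  private String[] loadDictionary(File wordFile) throws IOException {
+    List words = new LinkedList();
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(wordFile), "UTF-8"));
+    try {
+      String line;
+      while ((line = in.readLine()) != null) {
+        line = line.trim();
+        if (line.length() > 0) {
+          words.add(line);
+        }
+      }
+    } finally {
+      in.close();
+    }
+    return (String[]) words.toArray(new String[words.size()]);
+  }
+</pre>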

+<h3>Which variant should I use?</h3>

+This decision matrix should help you:
+<table>
+  <tr>
+    <th>Token filter</th>
+    <th>Output quality</th>
+    <th>Performance</th>
+  </tr>
+  <tr>
+    <td>HyphenationCompoundWordTokenFilter</td>
+    <td>good if grammar file is good – acceptable otherwise</td>
+    <td>fast</td>
+  </tr>
+  <tr>
+    <td>DictionaryCompoundWordTokenFilter</td>
+    <td>good</td>
+    <td>slow</td>
+  </tr>
+</table>

+<h3>Examples</h3>

+<pre>
+  public void testHyphenationCompoundWordsDE() throws Exception {
+    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
+        "Aufgabe", "Überwachung" };
+
+    Reader reader = new FileReader("de_DR.xml");
+
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        new WhitespaceTokenizer(new StringReader(
+            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
+        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+        
+    Token t;
+    while ((t=tf.next())!=null) {
+       System.out.println(t);
+    }
+  }
+  
+  public void testDumbCompoundWordsSE() throws Exception {
+    String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
+        "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
+        "Sko", "Vind", "Rute", "Torkare", "Blad" };
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
+        new WhitespaceTokenizer(
+            new StringReader(
+                "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
+        dict);
+    Token t;
+    while ((t=tf.next())!=null) {
+       System.out.println(t);
+    }
+  }
+</pre>
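+<p>
+In a real application you would typically wrap the tokenizer and the filter
+in an Analyzer (a minimal sketch; the class name is illustrative and not
+part of this contrib):
+</p>
+<pre>
+  public class CompoundAnalyzer extends Analyzer {
+    private final String[] dict;
+
+    public CompoundAnalyzer(String[] dict) {
+      this.dict = dict;
+    }
+
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      // decorate the whitespace tokenizer with the compound word filter
+      return new DictionaryCompoundWordTokenFilter(
+          new WhitespaceTokenizer(reader), dict);
+    }
+  }
+</pre>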
+ + \ No newline at end of file diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java new file mode 100644 index 00000000000..19f62a31a81 --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -0,0 +1,214 @@ +package org.apache.lucene.analysis.compound; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.net.URL; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; +import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; +import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; +import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; + +import junit.framework.TestCase; + +public class TestCompoundWordTokenFilter extends TestCase { + private static String[] locations = { + "http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", + "http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", + "http://superb-west.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", + "http://superb-east.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"}; + + private byte[] patternsFileContent; + + protected void setUp() throws Exception { + super.setUp(); + getHyphenationPatternFileContents(); + } + + public void testHyphenationCompoundWordsDE() throws Exception { + String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz", + "Aufgabe", "Überwachung" }; + + Reader reader = getHyphenationReader("de_DR.xml"); + if (reader == null) { + // we gracefully die if we have no reader + return; + } + + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(reader); + + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + new WhitespaceTokenizer(new StringReader( + "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator, + dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); + assertFiltersTo(tf, new String[] { 
"Rindfleischüberwachungsgesetz", "Rind", + "fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere", + "abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] { + 29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0, + 0, 1 }); + } + + public void testHyphenationCompoundWordsDELongestMatch() throws Exception { + String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz", + "Aufgabe", "Überwachung", "Rindfleisch", "Überwachungsgesetz" }; + + Reader reader = getHyphenationReader("de_DR.xml"); + if (reader == null) { + // we gracefully die if we have no reader + return; + } + + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(reader); + + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + new WhitespaceTokenizer(new StringReader( + "Rindfleischüberwachungsgesetz")), hyphenator, dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true); + assertFiltersTo(tf, new String[] { "Rindfleischüberwachungsgesetz", + "Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] { + 0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0, + 0, 0, 0 }); + } + + public void testDumbCompoundWordsSE() throws Exception { + String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", + "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll", + "Sko", "Vind", "Rute", "Torkare", "Blad" }; + + DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( + new WhitespaceTokenizer( + new StringReader( + "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")), + dict); + + assertFiltersTo(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor", + "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr", + "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr", + "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas", + "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol", + "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare", + "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad", + "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17, + 17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72, + 77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137, + 137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32, + 28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110, + 87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145, + 155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, + 0, 0, 0, 1 }); + } + + public void testDumbCompoundWordsSELongestMatch() throws Exception { + String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", + "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll", + "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" }; + + DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( + new WhitespaceTokenizer(new StringReader("Basfiolsfodralmakaregesäll")), + dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true); + + assertFiltersTo(tf, new 
String[] { "Basfiolsfodralmakaregesäll", "Bas",
+        "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
+        14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
+        0, 0 });
+  }
+
+  private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
+      int[] endOffset, int[] posIncr) throws Exception {
+    for (int i = 0; i < s.length; ++i) {
+      Token t = tf.next();
+      assertNotNull(t);
+      assertEquals(s[i], new String(t.termBuffer(), 0, t.termLength()));
+      assertEquals(startOffset[i], t.startOffset());
+      assertEquals(endOffset[i], t.endOffset());
+      assertEquals(posIncr[i], t.getPositionIncrement());
+    }
+    assertNull(tf.next());
+  }
+
+  private void getHyphenationPatternFileContents() {
+    try {
+      List urls = new LinkedList(Arrays.asList(locations));
+      Collections.shuffle(urls);
+      URL url = new URL((String) urls.get(0));
+      InputStream in = url.openStream();
+      byte[] buffer = new byte[1024];
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      int count;
+
+      while ((count = in.read(buffer)) != -1) {
+        out.write(buffer, 0, count);
+      }
+      in.close();
+      out.close();
+      patternsFileContent = out.toByteArray();
+    } catch (IOException e) {
+      // ignore the error - the user might have no internet connection, so
+      // the tests that need the pattern files are skipped
+    }
+  }
+
+  private Reader getHyphenationReader(String filename) throws Exception {
+    if (patternsFileContent == null) {
+      return null;
+    }
+
+    ZipInputStream zipstream = new ZipInputStream(new ByteArrayInputStream(
+        patternsFileContent));
+
+    ZipEntry entry;
+    while ((entry = zipstream.getNextEntry()) != null) {
+      if (entry.getName().equals("offo-hyphenation/hyph/" + filename)) {
+        byte[] buffer = new byte[1024];
+        ByteArrayOutputStream outstream = new ByteArrayOutputStream();
+        int count;
+        while ((count = zipstream.read(buffer)) != -1) {
+          outstream.write(buffer, 0, count);
+        }
+        outstream.close();
+        zipstream.close();
+        return new StringReader(new String(outstream.toByteArray(),
+            "ISO-8859-1"));
+      }
+    }
+    // we should never get here
+    return null;
+  }
+}