From 7a27cdcbc901cd5e5ebb44065e3158866156dc08 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Fri, 16 May 2008 12:22:50 +0000 Subject: [PATCH] LUCENE-1166: Added token filter for decomposing compound words git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@657027 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 1 + .../compound/CompoundWordTokenFilterBase.java | 169 +++++ .../DictionaryCompoundWordTokenFilter.java | 114 +++ .../HyphenationCompoundWordTokenFilter.java | 217 ++++++ .../compound/hyphenation/ByteVector.java | 126 ++++ .../compound/hyphenation/CharVector.java | 136 ++++ .../analysis/compound/hyphenation/Hyphen.java | 69 ++ .../compound/hyphenation/Hyphenation.java | 54 ++ .../hyphenation/HyphenationException.java | 32 + .../compound/hyphenation/HyphenationTree.java | 475 +++++++++++++ .../compound/hyphenation/PatternConsumer.java | 55 ++ .../compound/hyphenation/PatternParser.java | 518 ++++++++++++++ .../compound/hyphenation/TernaryTree.java | 663 ++++++++++++++++++ .../compound/hyphenation/hyphenation.dtd | 68 ++ .../compound/hyphenation/package.html | 10 + .../lucene/analysis/compound/package.html | 166 +++++ .../compound/TestCompoundWordTokenFilter.java | 214 ++++++ 17 files changed, 3087 insertions(+) create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationException.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/hyphenation.dtd create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/package.html create mode 100644 contrib/analyzers/src/java/org/apache/lucene/analysis/compound/package.html create mode 100644 contrib/analyzers/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java diff --git a/CHANGES.txt b/CHANGES.txt index 3023c14462c..5be526d213e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -159,6 +159,7 @@ New features 12. LUCENE-400: Added word based n-gram filter (in contrib/analyzers) called ShingleFilter and an Analyzer wrapper that wraps another Analyzer's token stream with a ShingleFilter (Sebastian Kirsch, Steve Rowe via Grant Ingersoll) +13. 
LUCENE-1166: Decomposition tokenfilter for languages like German and Swedish (Thomas Peuss via Grant Ingersoll) Optimizations diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java new file mode 100644 index 00000000000..7876977177e --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java @@ -0,0 +1,169 @@ +package org.apache.lucene.analysis.compound; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Set; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * Base class for decomposition token filters. 
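+ * <p>
+ * A minimal usage sketch (illustrative only; it assumes the
+ * {@link DictionaryCompoundWordTokenFilter} subclass mirrors the constructors
+ * declared in this base class, and that reader is any java.io.Reader):
+ * </p>
+ * <pre>
+ * TokenStream stream = new DictionaryCompoundWordTokenFilter(
+ *     new WhitespaceTokenizer(reader),
+ *     new String[] { "soft", "ball" });
+ * // the original token is kept; dictionary subwords are added after it
+ * </pre>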
+ */ +public abstract class CompoundWordTokenFilterBase extends TokenFilter { + /** + * The default for minimal word length that gets decomposed + */ + public static final int DEFAULT_MIN_WORD_SIZE = 5; + + /** + * The default for minimal length of subwords that get propagated to the output of this filter + */ + public static final int DEFAULT_MIN_SUBWORD_SIZE = 2; + + /** + * The default for maximal length of subwords that get propagated to the output of this filter + */ + public static final int DEFAULT_MAX_SUBWORD_SIZE = 15; + + protected final CharArraySet dictionary; + protected final LinkedList tokens; + protected final int minWordSize; + protected final int minSubwordSize; + protected final int maxSubwordSize; + protected final boolean onlyLongestMatch; + + protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { + this(input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch); + } + + protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) { + this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch); + } + + protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, boolean onlyLongestMatch) { + this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch); + } + + protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) { + this(input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false); + } + + protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary) { + this(input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false); + } + + protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) { + super(input); + + this.tokens=new LinkedList(); + this.minWordSize=minWordSize; + this.minSubwordSize=minSubwordSize; + this.maxSubwordSize=maxSubwordSize; + this.onlyLongestMatch=onlyLongestMatch; + + if (dictionary instanceof CharArraySet) { + this.dictionary = (CharArraySet) dictionary; + } else { + this.dictionary = new CharArraySet(dictionary.size(), false); + addAllLowerCase(this.dictionary, dictionary); + } + } + + /** + * Create a set of words from an array + * The resulting Set does case insensitive matching + * TODO We should look for a faster dictionary lookup approach. 
+ * @param dictionary + * @return + */ + public static final Set makeDictionary(final String[] dictionary) { + CharArraySet dict = new CharArraySet(dictionary.length, false); + addAllLowerCase(dict, Arrays.asList(dictionary)); + return dict; + } + + public Token next() throws IOException { + if (tokens.size() > 0) { + return (Token)tokens.removeFirst(); + } + + Token token = input.next(); + if (token == null) { + return null; + } + + decompose(token); + + if (tokens.size() > 0) { + return (Token)tokens.removeFirst(); + } else { + return null; + } + } + + protected static final void addAllLowerCase(Set target, Collection col) { + Iterator iter=col.iterator(); + + while (iter.hasNext()) { + target.add(((String)iter.next()).toLowerCase()); + } + } + + protected static char[] makeLowerCaseCopy(final char[] buffer) { + char[] result=new char[buffer.length]; + System.arraycopy(buffer, 0, result, 0, buffer.length); + + for (int i=0;itoken.termLength()) { + break; + } + if(dictionary.contains(lowerCaseTermBuffer, i, j)) { + if (this.onlyLongestMatch) { + if (longestMatchToken!=null) { + if (longestMatchToken.termLength() exit + if (hyphens == null) { + return; + } + + final int[] hyp = hyphens.getHyphenationPoints(); + char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.termBuffer()); + + for (int i = 0; i < hyp.length; ++i) { + int remaining = hyp.length - i; + int start = hyp[i]; + Token longestMatchToken = null; + for (int j = 1; j < remaining; j++) { + int partLength = hyp[i + j] - start; + + // if the part is longer than maxSubwordSize we + // are done with this round + if (partLength > this.maxSubwordSize) { + break; + } + + // we only put subwords to the token stream + // that are longer than minPartSize + if (partLength < this.minSubwordSize) { + continue; + } + + // check the dictionary + if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) { + if (this.onlyLongestMatch) { + if (longestMatchToken != null) { + if (longestMatchToken.termLength() < partLength) { + longestMatchToken = createToken(start, partLength, token); + } + } else { + longestMatchToken = createToken(start, partLength, token); + } + } else { + tokens.add(createToken(start, partLength, token)); + } + } else if (dictionary.contains(lowerCaseTermBuffer, start, + partLength - 1)) { + // check the dictionary again with a word that is one character + // shorter + // to avoid problems with genitive 's characters and other binding + // characters + if (this.onlyLongestMatch) { + if (longestMatchToken != null) { + if (longestMatchToken.termLength() < partLength - 1) { + longestMatchToken = createToken(start, partLength - 1, token); + } + } else { + longestMatchToken = createToken(start, partLength - 1, token); + } + } else { + tokens.add(createToken(start, partLength - 1, token)); + } + } + } + if (this.onlyLongestMatch && longestMatchToken!=null) { + tokens.add(longestMatchToken); + } + } + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java new file mode 100644 index 00000000000..64768d435c7 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/ByteVector.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.io.Serializable; + +/** + * This class implements a simple byte vector with access to the underlying + * array. + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class ByteVector implements Serializable { + + /** + * Capacity increment size + */ + private static final int DEFAULT_BLOCK_SIZE = 2048; + + private int blockSize; + + /** + * The encapsulated array + */ + private byte[] array; + + /** + * Points to next free item + */ + private int n; + + public ByteVector() { + this(DEFAULT_BLOCK_SIZE); + } + + public ByteVector(int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = new byte[blockSize]; + n = 0; + } + + public ByteVector(byte[] a) { + blockSize = DEFAULT_BLOCK_SIZE; + array = a; + n = 0; + } + + public ByteVector(byte[] a, int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = a; + n = 0; + } + + public byte[] getArray() { + return array; + } + + /** + * return number of items in array + */ + public int length() { + return n; + } + + /** + * returns current capacity of array + */ + public int capacity() { + return array.length; + } + + public void put(int index, byte val) { + array[index] = val; + } + + public byte get(int index) { + return array[index]; + } + + /** + * This is to implement memory allocation in the array. Like malloc(). + */ + public int alloc(int size) { + int index = n; + int len = array.length; + if (n + size >= len) { + byte[] aux = new byte[len + blockSize]; + System.arraycopy(array, 0, aux, 0, len); + array = aux; + } + n += size; + return index; + } + + public void trimToSize() { + if (n < array.length) { + byte[] aux = new byte[n]; + System.arraycopy(array, 0, aux, 0, n); + array = aux; + } + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java new file mode 100644 index 00000000000..00521808b88 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/CharVector.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.io.Serializable; + +/** + * This class implements a simple char vector with access to the underlying + * array. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class CharVector implements Cloneable, Serializable { + + /** + * Capacity increment size + */ + private static final int DEFAULT_BLOCK_SIZE = 2048; + + private int blockSize; + + /** + * The encapsulated array + */ + private char[] array; + + /** + * Points to next free item + */ + private int n; + + public CharVector() { + this(DEFAULT_BLOCK_SIZE); + } + + public CharVector(int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = new char[blockSize]; + n = 0; + } + + public CharVector(char[] a) { + blockSize = DEFAULT_BLOCK_SIZE; + array = a; + n = a.length; + } + + public CharVector(char[] a, int capacity) { + if (capacity > 0) { + blockSize = capacity; + } else { + blockSize = DEFAULT_BLOCK_SIZE; + } + array = a; + n = a.length; + } + + /** + * Reset Vector but don't resize or clear elements + */ + public void clear() { + n = 0; + } + + public Object clone() { + CharVector cv = new CharVector((char[]) array.clone(), blockSize); + cv.n = this.n; + return cv; + } + + public char[] getArray() { + return array; + } + + /** + * return number of items in array + */ + public int length() { + return n; + } + + /** + * returns current capacity of array + */ + public int capacity() { + return array.length; + } + + public void put(int index, char val) { + array[index] = val; + } + + public char get(int index) { + return array[index]; + } + + public int alloc(int size) { + int index = n; + int len = array.length; + if (n + size >= len) { + char[] aux = new char[len + blockSize]; + System.arraycopy(array, 0, aux, 0, len); + array = aux; + } + n += size; + return index; + } + + public void trimToSize() { + if (n < array.length) { + char[] aux = new char[n]; + System.arraycopy(array, 0, aux, 0, n); + array = aux; + } + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java new file mode 100644 index 00000000000..65a3873afe8 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphen.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.io.Serializable; + +/** + * This class represents a hyphen. A 'full' hyphen is made of 3 parts: the + * pre-break text, post-break text and no-break. If no line-break is generated + * at this position, the no-break text is used, otherwise, pre-break and + * post-break are used. Typically, pre-break is equal to the hyphen character + * and the others are empty. However, this general scheme allows support for + * cases in some languages where words change spelling if they're split across + * lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes + * from TeX. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ + +public class Hyphen implements Serializable { + public String preBreak; + + public String noBreak; + + public String postBreak; + + Hyphen(String pre, String no, String post) { + preBreak = pre; + noBreak = no; + postBreak = post; + } + + Hyphen(String pre) { + preBreak = pre; + noBreak = null; + postBreak = null; + } + + public String toString() { + if (noBreak == null && postBreak == null && preBreak != null + && preBreak.equals("-")) { + return "-"; + } + StringBuffer res = new StringBuffer("{"); + res.append(preBreak); + res.append("}{"); + res.append(postBreak); + res.append("}{"); + res.append(noBreak); + res.append('}'); + return res.toString(); + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java new file mode 100644 index 00000000000..7a276a8a7a2 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/Hyphenation.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +/** + * This class represents a hyphenated word. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. 
+ */ +public class Hyphenation { + + private int[] hyphenPoints; + + /** + * number of hyphenation points in word + */ + private int len; + + /** + * rawWord as made of alternating strings and {@link Hyphen Hyphen} instances + */ + Hyphenation(int[] points) { + hyphenPoints = points; + } + + /** + * @return the number of hyphenation points in the word + */ + public int length() { + return hyphenPoints.length; + } + + /** + * @return the hyphenation points + */ + public int[] getHyphenationPoints() { + return hyphenPoints; + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationException.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationException.java new file mode 100644 index 00000000000..3965244735f --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationException.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +/** + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class HyphenationException extends Exception { + + /** + * @see java.lang.Throwable#Throwable(String) + */ + public HyphenationException(String msg) { + super(msg); + } + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java new file mode 100644 index 00000000000..a836494d4bf --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.io.File; +import java.io.Serializable; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.HashMap; + +import org.xml.sax.InputSource; + +/** + * This tree structure stores the hyphenation patterns in an efficient way for + * fast lookup. It provides the provides the method to hyphenate a word. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class HyphenationTree extends TernaryTree implements PatternConsumer, + Serializable { + + private static final long serialVersionUID = -7842107987915665573L; + + /** + * value space: stores the interletter values + */ + protected ByteVector vspace; + + /** + * This map stores hyphenation exceptions + */ + protected HashMap stoplist; + + /** + * This map stores the character classes + */ + protected TernaryTree classmap; + + /** + * Temporary map to store interletter values on pattern loading. + */ + private transient TernaryTree ivalues; + + public HyphenationTree() { + stoplist = new HashMap(23); // usually a small table + classmap = new TernaryTree(); + vspace = new ByteVector(); + vspace.alloc(1); // this reserves index 0, which we don't use + } + + /** + * Packs the values by storing them in 4 bits, two values into a byte Values + * range is from 0 to 9. We use zero as terminator, so we'll add 1 to the + * value. + * + * @param values a string of digits from '0' to '9' representing the + * interletter values. + * @return the index into the vspace array where the packed values are stored. + */ + protected int packValues(String values) { + int i, n = values.length(); + int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1; + int offset = vspace.alloc(m); + byte[] va = vspace.getArray(); + for (i = 0; i < n; i++) { + int j = i >> 1; + byte v = (byte) ((values.charAt(i) - '0' + 1) & 0x0f); + if ((i & 1) == 1) { + va[j + offset] = (byte) (va[j + offset] | v); + } else { + va[j + offset] = (byte) (v << 4); // big endian + } + } + va[m - 1 + offset] = 0; // terminator + return offset; + } + + protected String unpackValues(int k) { + StringBuffer buf = new StringBuffer(); + byte v = vspace.get(k++); + while (v != 0) { + char c = (char) ((v >>> 4) - 1 + '0'); + buf.append(c); + c = (char) (v & 0x0f); + if (c == 0) { + break; + } + c = (char) (c - 1 + '0'); + buf.append(c); + v = vspace.get(k++); + } + return buf.toString(); + } + + /** + * Read hyphenation patterns from an XML file. + * + * @param filename the filename + * @throws HyphenationException In case the parsing fails + */ + public void loadPatterns(File f) throws HyphenationException { + try { + InputSource src = new InputSource(f.toURL().toExternalForm()); + loadPatterns(src); + } catch (MalformedURLException e) { + throw new HyphenationException("Error converting the File '" + f + + "' to a URL: " + e.getMessage()); + } + } + + /** + * Read hyphenation patterns from an XML file. 
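+ * <p>
+ * A usage sketch (the file name is hypothetical; any pattern file matching
+ * hyphenation.dtd should work):
+ * </p>
+ * <pre>
+ * HyphenationTree tree = new HyphenationTree();
+ * tree.loadPatterns(new InputSource("file:///path/to/patterns.xml"));
+ * Hyphenation h = tree.hyphenate("hyphenation", 2, 2); // null if no points found
+ * </pre>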
+ * + * @param source the InputSource for the file + * @throws HyphenationException In case the parsing fails + */ + public void loadPatterns(InputSource source) throws HyphenationException { + PatternParser pp = new PatternParser(this); + ivalues = new TernaryTree(); + + pp.parse(source); + + // patterns/values should be now in the tree + // let's optimize a bit + trimToSize(); + vspace.trimToSize(); + classmap.trimToSize(); + + // get rid of the auxiliary map + ivalues = null; + } + + public String findPattern(String pat) { + int k = super.find(pat); + if (k >= 0) { + return unpackValues(k); + } + return ""; + } + + /** + * String compare, returns 0 if equal or t is a substring of s + */ + protected int hstrcmp(char[] s, int si, char[] t, int ti) { + for (; s[si] == t[ti]; si++, ti++) { + if (s[si] == 0) { + return 0; + } + } + if (t[ti] == 0) { + return 0; + } + return s[si] - t[ti]; + } + + protected byte[] getValues(int k) { + StringBuffer buf = new StringBuffer(); + byte v = vspace.get(k++); + while (v != 0) { + char c = (char) ((v >>> 4) - 1); + buf.append(c); + c = (char) (v & 0x0f); + if (c == 0) { + break; + } + c = (char) (c - 1); + buf.append(c); + v = vspace.get(k++); + } + byte[] res = new byte[buf.length()]; + for (int i = 0; i < res.length; i++) { + res[i] = (byte) buf.charAt(i); + } + return res; + } + + /** + *
+ * <p>
+ * Search for all possible partial matches of word starting at index and update
+ * interletter values. In other words, it does something like:
+ * </p>
+ * <code>
+ * for (int i = 0; i < patterns.length; i++) {
+ *   if (word.substring(index).startsWith(patterns[i])) update_interletter_values(patterns[i]);
+ * }
+ * </code>
+ * <p>
+ * But it is done in an efficient way since the patterns are stored in a
+ * ternary tree. In fact, this is the whole purpose of having the tree: doing
+ * this search without having to test every single pattern. The number of
+ * patterns for languages such as English ranges from 4000 to 10000. Thus,
+ * doing thousands of string comparisons for each word to hyphenate would be
+ * really slow without the tree. The tradeoff is memory, but using a ternary
+ * tree instead of a trie almost halves the memory used by Lout or TeX.
+ * It's also faster than using a hash table.
+ * </p>
+ * + * @param word null terminated word to match + * @param index start index from word + * @param il interletter values array to update + */ + protected void searchPatterns(char[] word, int index, byte[] il) { + byte[] values; + int i = index; + char p, q; + char sp = word[i]; + p = root; + + while (p > 0 && p < sc.length) { + if (sc[p] == 0xFFFF) { + if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { + values = getValues(eq[p]); // data pointer is in eq[] + int j = index; + for (int k = 0; k < values.length; k++) { + if (j < il.length && values[k] > il[j]) { + il[j] = values[k]; + } + j++; + } + } + return; + } + int d = sp - sc[p]; + if (d == 0) { + if (sp == 0) { + break; + } + sp = word[++i]; + p = eq[p]; + q = p; + + // look for a pattern ending at this position by searching for + // the null char ( splitchar == 0 ) + while (q > 0 && q < sc.length) { + if (sc[q] == 0xFFFF) { // stop at compressed branch + break; + } + if (sc[q] == 0) { + values = getValues(eq[q]); + int j = index; + for (int k = 0; k < values.length; k++) { + if (j < il.length && values[k] > il[j]) { + il[j] = values[k]; + } + j++; + } + break; + } else { + q = lo[q]; + + /** + * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but + * java chars are unsigned + */ + } + } + } else { + p = d < 0 ? lo[p] : hi[p]; + } + } + } + + /** + * Hyphenate word and return a Hyphenation object. + * + * @param word the word to be hyphenated + * @param remainCharCount Minimum number of characters allowed before the + * hyphenation point. + * @param pushCharCount Minimum number of characters allowed after the + * hyphenation point. + * @return a {@link Hyphenation Hyphenation} object representing the + * hyphenated word or null if word is not hyphenated. + */ + public Hyphenation hyphenate(String word, int remainCharCount, + int pushCharCount) { + char[] w = word.toCharArray(); + return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); + } + + /** + * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n + * may be absent, the first n is at offset, the first l is at offset + + * iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied + * into word. In the first part of the routine len = w.length, in the second + * part of the routine len = word.length. Three indices are used: index(w), + * the index in w, index(word), the index in word, letterindex(word), the + * index in the letter part of word. The following relations exist: index(w) = + * offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) = + * index(word) - 1 (see first loop). It follows that: index(w) - index(word) = + * offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset + + * iIgnoreAtBeginning + */ + + /** + * Hyphenate word and return an array of hyphenation points. + * + * @param w char array that contains the word + * @param offset Offset to first character in word + * @param len Length of word + * @param remainCharCount Minimum number of characters allowed before the + * hyphenation point. + * @param pushCharCount Minimum number of characters allowed after the + * hyphenation point. + * @return a {@link Hyphenation Hyphenation} object representing the + * hyphenated word or null if word is not hyphenated. 
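+ * <p>
+ * A sketch of a call (assumes patterns have already been loaded into this
+ * tree):
+ * </p>
+ * <pre>
+ * char[] w = "hyphenation".toCharArray();
+ * Hyphenation h = tree.hyphenate(w, 0, w.length, 2, 2);
+ * int[] points = (h == null) ? null : h.getHyphenationPoints();
+ * </pre>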
+ */ + public Hyphenation hyphenate(char[] w, int offset, int len, + int remainCharCount, int pushCharCount) { + int i; + char[] word = new char[len + 3]; + + // normalize word + char[] c = new char[2]; + int iIgnoreAtBeginning = 0; + int iLength = len; + boolean bEndOfLetters = false; + for (i = 1; i <= len; i++) { + c[0] = w[offset + i - 1]; + int nc = classmap.find(c, 0); + if (nc < 0) { // found a non-letter character ... + if (i == (1 + iIgnoreAtBeginning)) { + // ... before any letter character + iIgnoreAtBeginning++; + } else { + // ... after a letter character + bEndOfLetters = true; + } + iLength--; + } else { + if (!bEndOfLetters) { + word[i - iIgnoreAtBeginning] = (char) nc; + } else { + return null; + } + } + } + len = iLength; + if (len < (remainCharCount + pushCharCount)) { + // word is too short to be hyphenated + return null; + } + int[] result = new int[len + 1]; + int k = 0; + + // check exception list first + String sw = new String(word, 1, len); + if (stoplist.containsKey(sw)) { + // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = + // null) + ArrayList hw = (ArrayList) stoplist.get(sw); + int j = 0; + for (i = 0; i < hw.size(); i++) { + Object o = hw.get(i); + // j = index(sw) = letterindex(word)? + // result[k] = corresponding index(w) + if (o instanceof String) { + j += ((String) o).length(); + if (j >= remainCharCount && j < (len - pushCharCount)) { + result[k++] = j + iIgnoreAtBeginning; + } + } + } + } else { + // use algorithm to get hyphenation points + word[0] = '.'; // word start marker + word[len + 1] = '.'; // word end marker + word[len + 2] = 0; // null terminated + byte[] il = new byte[len + 3]; // initialized to zero + for (i = 0; i < len + 1; i++) { + searchPatterns(word, i, il); + } + + // hyphenation points are located where interletter value is odd + // i is letterindex(word), + // i + 1 is index(word), + // result[k] = corresponding index(w) + for (i = 0; i < len; i++) { + if (((il[i + 1] & 1) == 1) && i >= remainCharCount + && i <= (len - pushCharCount)) { + result[k++] = i + iIgnoreAtBeginning; + } + } + } + + if (k > 0) { + // trim result array + int[] res = new int[k+2]; + System.arraycopy(result, 0, res, 1, k); + // We add the synthetical hyphenation points + // at the beginning and end of the word + res[0]=0; + res[k+1]=len; + return new Hyphenation(res); + } else { + return null; + } + } + + /** + * Add a character class to the tree. It is used by + * {@link PatternParser PatternParser} as callback to add character classes. + * Character classes define the valid word characters for hyphenation. If a + * word contains a character not defined in any of the classes, it is not + * hyphenated. It also defines a way to normalize the characters in order to + * compare them with the stored patterns. Usually pattern files use only lower + * case characters, in this case a class for letter 'a', for example, should + * be defined as "aA", the first character being the normalization char. + */ + public void addClass(String chargroup) { + if (chargroup.length() > 0) { + char equivChar = chargroup.charAt(0); + char[] key = new char[2]; + key[1] = 0; + for (int i = 0; i < chargroup.length(); i++) { + key[0] = chargroup.charAt(i); + classmap.insert(key, 0, equivChar); + } + } + } + + /** + * Add an exception to the tree. It is used by + * {@link PatternParser PatternParser} class as callback to store the + * hyphenation exceptions. 
+ * + * @param word normalized word + * @param hyphenatedword a vector of alternating strings and + * {@link Hyphen hyphen} objects. + */ + public void addException(String word, ArrayList hyphenatedword) { + stoplist.put(word, hyphenatedword); + } + + /** + * Add a pattern to the tree. Mainly, to be used by + * {@link PatternParser PatternParser} class as callback to add a pattern to + * the tree. + * + * @param pattern the hyphenation pattern + * @param ivalue interletter weight values indicating the desirability and + * priority of hyphenating at a given point within the pattern. It + * should contain only digit characters. (i.e. '0' to '9'). + */ + public void addPattern(String pattern, String ivalue) { + int k = ivalues.find(ivalue); + if (k <= 0) { + k = packValues(ivalue); + ivalues.insert(ivalue, (char) k); + } + insert(pattern, (char) k); + } + + public void printStats() { + System.out.println("Value space size = " + + Integer.toString(vspace.length())); + super.printStats(); + + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java new file mode 100644 index 00000000000..243f2487811 --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternConsumer.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.util.ArrayList; + +/** + * This interface is used to connect the XML pattern file parser to the + * hyphenation tree. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public interface PatternConsumer { + + /** + * Add a character class. A character class defines characters that are + * considered equivalent for the purpose of hyphenation (e.g. "aA"). It + * usually means to ignore case. + * + * @param chargroup character group + */ + void addClass(String chargroup); + + /** + * Add a hyphenation exception. An exception replaces the result obtained by + * the algorithm for cases for which this fails or the user wants to provide + * his own hyphenation. A hyphenatedword is a vector of alternating String's + * and {@link Hyphen Hyphen} instances + */ + void addException(String word, ArrayList hyphenatedword); + + /** + * Add hyphenation patterns. + * + * @param pattern the pattern + * @param values interletter values expressed as a string of digit characters. 
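+ * <p>
+ * For example, the parser splits the TeX-style pattern "hy3ph" into the
+ * pattern "hyph" and the values "00300": one digit per possible break
+ * position, with '0' filled in wherever the pattern file gives no digit.
+ * </p>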
+ */ + void addPattern(String pattern, String values); + +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java new file mode 100644 index 00000000000..5108c71d55e --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/PatternParser.java @@ -0,0 +1,518 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* $Id: PatternParser.java 426576 2006-07-28 15:44:37Z jeremias $ */ + +package org.apache.lucene.analysis.compound.hyphenation; + +// SAX +import org.xml.sax.XMLReader; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; +import org.xml.sax.helpers.DefaultHandler; +import org.xml.sax.Attributes; + +// Java +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.util.ArrayList; + +import javax.xml.parsers.SAXParserFactory; + +/** + * A SAX document handler to read and parse hyphenation patterns from a XML + * file. + * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ +public class PatternParser extends DefaultHandler implements PatternConsumer { + + XMLReader parser; + + int currElement; + + PatternConsumer consumer; + + StringBuffer token; + + ArrayList exception; + + char hyphenChar; + + String errMsg; + + static final int ELEM_CLASSES = 1; + + static final int ELEM_EXCEPTIONS = 2; + + static final int ELEM_PATTERNS = 3; + + static final int ELEM_HYPHEN = 4; + + public PatternParser() throws HyphenationException { + token = new StringBuffer(); + parser = createParser(); + parser.setContentHandler(this); + parser.setErrorHandler(this); + parser.setEntityResolver(this); + hyphenChar = '-'; // default + + } + + public PatternParser(PatternConsumer consumer) throws HyphenationException { + this(); + this.consumer = consumer; + } + + public void setConsumer(PatternConsumer consumer) { + this.consumer = consumer; + } + + /** + * Parses a hyphenation pattern file. + * + * @param filename the filename + * @throws HyphenationException In case of an exception while parsing + */ + public void parse(String filename) throws HyphenationException { + parse(new File(filename)); + } + + /** + * Parses a hyphenation pattern file. 
+ * + * @param file the pattern file + * @throws HyphenationException In case of an exception while parsing + */ + public void parse(File file) throws HyphenationException { + try { + InputSource src = new InputSource(file.toURL().toExternalForm()); + parse(src); + } catch (MalformedURLException e) { + throw new HyphenationException("Error converting the File '" + file + + "' to a URL: " + e.getMessage()); + } + } + + /** + * Parses a hyphenation pattern file. + * + * @param source the InputSource for the file + * @throws HyphenationException In case of an exception while parsing + */ + public void parse(InputSource source) throws HyphenationException { + try { + parser.parse(source); + } catch (FileNotFoundException fnfe) { + throw new HyphenationException("File not found: " + fnfe.getMessage()); + } catch (IOException ioe) { + throw new HyphenationException(ioe.getMessage()); + } catch (SAXException e) { + throw new HyphenationException(errMsg); + } + } + + /** + * Creates a SAX parser using JAXP + * + * @return the created SAX parser + */ + static XMLReader createParser() { + try { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + return factory.newSAXParser().getXMLReader(); + } catch (Exception e) { + throw new RuntimeException("Couldn't create XMLReader: " + e.getMessage()); + } + } + + protected String readToken(StringBuffer chars) { + String word; + boolean space = false; + int i; + for (i = 0; i < chars.length(); i++) { + if (Character.isWhitespace(chars.charAt(i))) { + space = true; + } else { + break; + } + } + if (space) { + // chars.delete(0,i); + for (int countr = i; countr < chars.length(); countr++) { + chars.setCharAt(countr - i, chars.charAt(countr)); + } + chars.setLength(chars.length() - i); + if (token.length() > 0) { + word = token.toString(); + token.setLength(0); + return word; + } + } + space = false; + for (i = 0; i < chars.length(); i++) { + if (Character.isWhitespace(chars.charAt(i))) { + space = true; + break; + } + } + token.append(chars.toString().substring(0, i)); + // chars.delete(0,i); + for (int countr = i; countr < chars.length(); countr++) { + chars.setCharAt(countr - i, chars.charAt(countr)); + } + chars.setLength(chars.length() - i); + if (space) { + word = token.toString(); + token.setLength(0); + return word; + } + token.append(chars); + return null; + } + + protected static String getPattern(String word) { + StringBuffer pat = new StringBuffer(); + int len = word.length(); + for (int i = 0; i < len; i++) { + if (!Character.isDigit(word.charAt(i))) { + pat.append(word.charAt(i)); + } + } + return pat.toString(); + } + + protected ArrayList normalizeException(ArrayList ex) { + ArrayList res = new ArrayList(); + for (int i = 0; i < ex.size(); i++) { + Object item = ex.get(i); + if (item instanceof String) { + String str = (String) item; + StringBuffer buf = new StringBuffer(); + for (int j = 0; j < str.length(); j++) { + char c = str.charAt(j); + if (c != hyphenChar) { + buf.append(c); + } else { + res.add(buf.toString()); + buf.setLength(0); + char[] h = new char[1]; + h[0] = hyphenChar; + // we use here hyphenChar which is not necessarily + // the one to be printed + res.add(new Hyphen(new String(h), null, null)); + } + } + if (buf.length() > 0) { + res.add(buf.toString()); + } + } else { + res.add(item); + } + } + return res; + } + + protected String getExceptionWord(ArrayList ex) { + StringBuffer res = new StringBuffer(); + for (int i = 0; i < ex.size(); i++) { + Object item = ex.get(i); + if (item 
instanceof String) { + res.append((String) item); + } else { + if (((Hyphen) item).noBreak != null) { + res.append(((Hyphen) item).noBreak); + } + } + } + return res.toString(); + } + + protected static String getInterletterValues(String pat) { + StringBuffer il = new StringBuffer(); + String word = pat + "a"; // add dummy letter to serve as sentinel + int len = word.length(); + for (int i = 0; i < len; i++) { + char c = word.charAt(i); + if (Character.isDigit(c)) { + il.append(c); + i++; + } else { + il.append('0'); + } + } + return il.toString(); + } + + // + // EntityResolver methods + // + public InputSource resolveEntity(String publicId, String systemId) + throws SAXException, IOException { + return HyphenationDTDGenerator.generateDTD(); + } + + // + // ContentHandler methods + // + + /** + * @see org.xml.sax.ContentHandler#startElement(java.lang.String, + * java.lang.String, java.lang.String, org.xml.sax.Attributes) + */ + public void startElement(String uri, String local, String raw, + Attributes attrs) { + if (local.equals("hyphen-char")) { + String h = attrs.getValue("value"); + if (h != null && h.length() == 1) { + hyphenChar = h.charAt(0); + } + } else if (local.equals("classes")) { + currElement = ELEM_CLASSES; + } else if (local.equals("patterns")) { + currElement = ELEM_PATTERNS; + } else if (local.equals("exceptions")) { + currElement = ELEM_EXCEPTIONS; + exception = new ArrayList(); + } else if (local.equals("hyphen")) { + if (token.length() > 0) { + exception.add(token.toString()); + } + exception.add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"), + attrs.getValue("post"))); + currElement = ELEM_HYPHEN; + } + token.setLength(0); + } + + /** + * @see org.xml.sax.ContentHandler#endElement(java.lang.String, + * java.lang.String, java.lang.String) + */ + public void endElement(String uri, String local, String raw) { + + if (token.length() > 0) { + String word = token.toString(); + switch (currElement) { + case ELEM_CLASSES: + consumer.addClass(word); + break; + case ELEM_EXCEPTIONS: + exception.add(word); + exception = normalizeException(exception); + consumer.addException(getExceptionWord(exception), + (ArrayList) exception.clone()); + break; + case ELEM_PATTERNS: + consumer.addPattern(getPattern(word), getInterletterValues(word)); + break; + case ELEM_HYPHEN: + // nothing to do + break; + } + if (currElement != ELEM_HYPHEN) { + token.setLength(0); + } + } + if (currElement == ELEM_HYPHEN) { + currElement = ELEM_EXCEPTIONS; + } else { + currElement = 0; + } + + } + + /** + * @see org.xml.sax.ContentHandler#characters(char[], int, int) + */ + public void characters(char ch[], int start, int length) { + StringBuffer chars = new StringBuffer(length); + chars.append(ch, start, length); + String word = readToken(chars); + while (word != null) { + // System.out.println("\"" + word + "\""); + switch (currElement) { + case ELEM_CLASSES: + consumer.addClass(word); + break; + case ELEM_EXCEPTIONS: + exception.add(word); + exception = normalizeException(exception); + consumer.addException(getExceptionWord(exception), + (ArrayList) exception.clone()); + exception.clear(); + break; + case ELEM_PATTERNS: + consumer.addPattern(getPattern(word), getInterletterValues(word)); + break; + } + word = readToken(chars); + } + + } + + // + // ErrorHandler methods + // + + /** + * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException) + */ + public void warning(SAXParseException ex) { + errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage(); + } + + /** + * @see 
org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException) + */ + public void error(SAXParseException ex) { + errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage(); + } + + /** + * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException) + */ + public void fatalError(SAXParseException ex) throws SAXException { + errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage(); + throw ex; + } + + /** + * Returns a string of the location. + */ + private String getLocationString(SAXParseException ex) { + StringBuffer str = new StringBuffer(); + + String systemId = ex.getSystemId(); + if (systemId != null) { + int index = systemId.lastIndexOf('/'); + if (index != -1) { + systemId = systemId.substring(index + 1); + } + str.append(systemId); + } + str.append(':'); + str.append(ex.getLineNumber()); + str.append(':'); + str.append(ex.getColumnNumber()); + + return str.toString(); + + } // getLocationString(SAXParseException):String + + // PatternConsumer implementation for testing purposes + public void addClass(String c) { + System.out.println("class: " + c); + } + + public void addException(String w, ArrayList e) { + System.out.println("exception: " + w + " : " + e.toString()); + } + + public void addPattern(String p, String v) { + System.out.println("pattern: " + p + " : " + v); + } + + public static void main(String[] args) throws Exception { + if (args.length > 0) { + PatternParser pp = new PatternParser(); + pp.setConsumer(pp); + pp.parse(args[0]); + } + } +} + +class HyphenationDTDGenerator { + public static final String DTD_STRING= + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"+ + "\n"; + + public static InputSource generateDTD() { + return new InputSource(new StringReader(DTD_STRING)); + } +} diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java new file mode 100644 index 00000000000..b327cd7ba4a --- /dev/null +++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/TernaryTree.java @@ -0,0 +1,663 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.analysis.compound.hyphenation; + +import java.util.Enumeration; +import java.util.Stack; +import java.io.Serializable; + +/** + *
+ * Ternary Search Tree.
+ *
+ * <p>
+ * A ternary search tree is a hybrid between a binary tree and a digital search
+ * tree (trie). Keys are limited to strings. A data value of type char is stored
+ * in each leaf node. It can be used as an index (or pointer) to the data.
+ * Branches that only contain one key are compressed to one node by storing a
+ * pointer to the trailer substring of the key. This class is intended to serve
+ * as a base class or helper class to implement Dictionary collections or the
+ * like. Ternary trees have some nice properties, such as the following: the tree
+ * can be traversed in sorted order, partial matches (wildcard) can be
+ * implemented, retrieval of all keys within a given distance from the target is
+ * possible, etc. The storage requirements are higher than for a binary tree but
+ * a lot less than for a trie. Performance is comparable with a hash table;
+ * sometimes it outperforms a hash function (most of the time it can determine a
+ * miss faster than a hash).
+ * </p>
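+ *
+ * <p>
+ * A sketch of the flattened node layout used below (four parallel char arrays
+ * instead of a node class):
+ * </p>
+ * <pre>
+ * lo[p]  low child (splitchar smaller), or a pointer into kv when compressed
+ * eq[p]  equal child, or the data value when this node terminates a key
+ * hi[p]  high child (splitchar greater)
+ * sc[p]  split character; 0 ends a key, 0xFFFF marks a compressed branch
+ * </pre>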
+ *
+ * <p>
+ * The main purpose of this java port is to serve as a base for implementing
+ * TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
+ * requires from 5000 to 15000 hyphenation patterns which will be keys in this
+ * tree. The string patterns are usually small (from 2 to 5 characters), but
+ * each char in the tree is stored in a node. Thus memory usage is the main
+ * concern. We will sacrifice 'elegance' to keep memory requirements to the
+ * minimum. Using java's char type as a pointer (yes, I know pointer is a
+ * forbidden word in java) we can keep the size of the node to just 8 bytes
+ * (3 pointers and the data char). This gives room for about 65000 nodes. In my
+ * tests the English patterns took 7694 nodes and the German patterns 10055
+ * nodes, so I think we are safe.
+ * </p>
+ *
+ * <p>
+ * All said, this is a map with strings as keys and char as value. Pretty
+ * limited! It can be extended to a general map by using the string
+ * representation of an object and using the char value as an index to an array
+ * that contains the object values.
+ * </p>
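+ *
+ * <p>
+ * A minimal sketch of that map-like use (the constructor is package-private,
+ * so this only works from within this package; find(String) is assumed to
+ * return the stored char value, with non-positive results treated as a miss,
+ * as HyphenationTree does):
+ * </p>
+ * <pre>
+ * String[] names = { "low", "high" };
+ * TernaryTree tree = new TernaryTree();
+ * tree.insert("lo", (char) 1); // store 1-based indices; 0 is ambiguous with a miss
+ * tree.insert("hi", (char) 2);
+ * int k = tree.find("lo");
+ * String value = k > 0 ? names[k - 1] : null; // "low"
+ * </pre>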
+ * + * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. + */ + +public class TernaryTree implements Cloneable, Serializable { + + /** + * We use 4 arrays to represent a node. I guess I should have created a proper + * node class, but somehow Knuth's pascal code made me forget we now have a + * portable language with virtual memory management and automatic garbage + * collection! And now is kind of late, furthermore, if it ain't broken, don't + * fix it. + */ + + /** + * Pointer to low branch and to rest of the key when it is stored directly in + * this node, we don't have unions in java! + */ + protected char[] lo; + + /** + * Pointer to high branch. + */ + protected char[] hi; + + /** + * Pointer to equal branch and to data when this node is a string terminator. + */ + protected char[] eq; + + /** + *
+ * The character stored in this node: splitchar. Two special values are
+ * reserved:
+ * <ul>
+ * <li>0x0000 as string terminator</li>
+ * <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
+ * </ul>
+ * This shouldn't be a problem if we give the usual semantics to strings since
+ * 0xFFFF is guaranteed not to be a Unicode character.
+ *
+   */
+  protected char[] sc;
+
+  /**
+   * This vector holds the trailing substrings of the keys when a branch is
+   * compressed.
+   */
+  protected CharVector kv;
+
+  protected char root;
+
+  protected char freenode;
+
+  protected int length; // number of items in tree
+
+  protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
+
+  TernaryTree() {
+    init();
+  }
+
+  protected void init() {
+    root = 0;
+    freenode = 1;
+    length = 0;
+    lo = new char[BLOCK_SIZE];
+    hi = new char[BLOCK_SIZE];
+    eq = new char[BLOCK_SIZE];
+    sc = new char[BLOCK_SIZE];
+    kv = new CharVector();
+  }
+
+  /**
+   * Branches are initially compressed, needing one node per key plus the size
+   * of the string key. They are decompressed as needed when another key with
+   * the same prefix is inserted. This saves a lot of space, especially for
+   * long keys.
+   */
+  public void insert(String key, char val) {
+    // make sure we have enough room in the arrays
+    int len = key.length() + 1; // maximum number of nodes that may be generated
+    if (freenode + len > eq.length) {
+      redimNodeArrays(eq.length + BLOCK_SIZE);
+    }
+    char strkey[] = new char[len--];
+    key.getChars(0, len, strkey, 0);
+    strkey[len] = 0;
+    root = insert(root, strkey, 0, val);
+  }
+
+  public void insert(char[] key, int start, char val) {
+    int len = strlen(key) + 1;
+    if (freenode + len > eq.length) {
+      redimNodeArrays(eq.length + BLOCK_SIZE);
+    }
+    root = insert(root, key, start, val);
+  }
+
+  /**
+   * The actual insertion function, recursive version.
+   */
+  private char insert(char p, char[] key, int start, char val) {
+    int len = strlen(key, start);
+    if (p == 0) {
+      // this means there is no branch, this node will start a new branch.
+      // Instead of doing that, we store the key somewhere else and create
+      // only one node with a pointer to the key
+      p = freenode++;
+      eq[p] = val; // holds data
+      length++;
+      hi[p] = 0;
+      if (len > 0) {
+        sc[p] = 0xFFFF; // indicates branch is compressed
+        lo[p] = (char) kv.alloc(len + 1); // use 'lo' to hold pointer to key
+        strcpy(kv.getArray(), lo[p], key, start);
+      } else {
+        sc[p] = 0;
+        lo[p] = 0;
+      }
+      return p;
+    }
+
+    if (sc[p] == 0xFFFF) {
+      // branch is compressed: need to decompress
+      // this will generate garbage in the external key array
+      // but we can do some garbage collection later
+      char pp = freenode++;
+      lo[pp] = lo[p]; // previous pointer to key
+      eq[pp] = eq[p]; // previous pointer to data
+      lo[p] = 0;
+      if (len > 0) {
+        sc[p] = kv.get(lo[pp]);
+        eq[p] = pp;
+        lo[pp]++;
+        if (kv.get(lo[pp]) == 0) {
+          // key completely decompressed, leaving garbage in the key array
+          lo[pp] = 0;
+          sc[pp] = 0;
+          hi[pp] = 0;
+        } else {
+          // we only got the first char of the key, the rest is still there
+          sc[pp] = 0xFFFF;
+        }
+      } else {
+        // In this case we can save a node by swapping the new node
+        // with the compressed node
+        sc[pp] = 0xFFFF;
+        hi[p] = pp;
+        sc[p] = 0;
+        eq[p] = val;
+        length++;
+        return p;
+      }
+    }
+    char s = key[start];
+    if (s < sc[p]) {
+      lo[p] = insert(lo[p], key, start, val);
+    } else if (s == sc[p]) {
+      if (s != 0) {
+        eq[p] = insert(eq[p], key, start + 1, val);
+      } else {
+        // key already in tree, overwrite data
+        eq[p] = val;
+      }
+    } else {
+      hi[p] = insert(hi[p], key, start, val);
+    }
+    return p;
+  }
+
+  /**
+   * Compares two null-terminated char arrays.
+   */
+  public static int strcmp(char[] a, int startA, char[] b, int startB) {
+    for (; a[startA] == b[startB]; startA++, startB++) {
+      if (a[startA] == 0) {
+        return 0;
+      }
+    }
+    return a[startA] - b[startB];
+  }
+
+  /**
+   * Compares a string with a null-terminated char array.
+   */
+  public static int strcmp(String str, char[] a, int start) {
+    int i, d, len = str.length();
+    for (i = 0; i < len; i++) {
+      d = (int) str.charAt(i) - a[start + i];
+      if (d != 0) {
+        return d;
+      }
+      if (a[start + i] == 0) {
+        return d;
+      }
+    }
+    if (a[start + i] != 0) {
+      return (int) -a[start + i];
+    }
+    return 0;
+
+  }
+
+  public static void strcpy(char[] dst, int di, char[] src, int si) {
+    while (src[si] != 0) {
+      dst[di++] = src[si++];
+    }
+    dst[di] = 0;
+  }
+
+  public static int strlen(char[] a, int start) {
+    int len = 0;
+    for (int i = start; i < a.length && a[i] != 0; i++) {
+      len++;
+    }
+    return len;
+  }
+
+  public static int strlen(char[] a) {
+    return strlen(a, 0);
+  }
+
+  public int find(String key) {
+    int len = key.length();
+    char strkey[] = new char[len + 1];
+    key.getChars(0, len, strkey, 0);
+    strkey[len] = 0;
+
+    return find(strkey, 0);
+  }
+
+  public int find(char[] key, int start) {
+    int d;
+    char p = root;
+    int i = start;
+    char c;
+
+    while (p != 0) {
+      if (sc[p] == 0xFFFF) {
+        if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
+          return eq[p];
+        } else {
+          return -1;
+        }
+      }
+      c = key[i];
+      d = c - sc[p];
+      if (d == 0) {
+        if (c == 0) {
+          return eq[p];
+        }
+        i++;
+        p = eq[p];
+      } else if (d < 0) {
+        p = lo[p];
+      } else {
+        p = hi[p];
+      }
+    }
+    return -1;
+  }
+
+  public boolean knows(String key) {
+    return (find(key) >= 0);
+  }
+
+  // redimension the arrays
+  private void redimNodeArrays(int newsize) {
+    int len = newsize < lo.length ? newsize : lo.length;
+    char[] na = new char[newsize];
+    System.arraycopy(lo, 0, na, 0, len);
+    lo = na;
+    na = new char[newsize];
+    System.arraycopy(hi, 0, na, 0, len);
+    hi = na;
+    na = new char[newsize];
+    System.arraycopy(eq, 0, na, 0, len);
+    eq = na;
+    na = new char[newsize];
+    System.arraycopy(sc, 0, na, 0, len);
+    sc = na;
+  }
+
+  public int size() {
+    return length;
+  }
+
+  public Object clone() {
+    TernaryTree t = new TernaryTree();
+    t.lo = (char[]) this.lo.clone();
+    t.hi = (char[]) this.hi.clone();
+    t.eq = (char[]) this.eq.clone();
+    t.sc = (char[]) this.sc.clone();
+    t.kv = (CharVector) this.kv.clone();
+    t.root = this.root;
+    t.freenode = this.freenode;
+    t.length = this.length;
+
+    return t;
+  }
+
+  /**
+   * Recursively insert the median first, and then the medians of the lower
+   * and upper halves, and so on, in order to get a balanced tree. The array
+   * of keys is assumed to be sorted in ascending order.
+   */
+  protected void insertBalanced(String[] k, char[] v, int offset, int n) {
+    int m;
+    if (n < 1) {
+      return;
+    }
+    m = n >> 1;
+
+    insert(k[m + offset], v[m + offset]);
+    insertBalanced(k, v, offset, m);
+
+    insertBalanced(k, v, offset + m + 1, n - m - 1);
+  }
+
+  /**
+   * Balance the tree for best search performance.
+   */
+  public void balance() {
+    // System.out.print("Before root splitchar = ");
+    // System.out.println(sc[root]);
+
+    int i = 0, n = length;
+    String[] k = new String[n];
+    char[] v = new char[n];
+    Iterator iter = new Iterator();
+    while (iter.hasMoreElements()) {
+      v[i] = iter.getValue();
+      k[i++] = (String) iter.nextElement();
+    }
+    init();
+    insertBalanced(k, v, 0, n);
+
+    // With uniform letter distribution sc[root] should be around 'm'
+    // System.out.print("After root splitchar = ");
+    // System.out.println(sc[root]);
+  }
+
+  /**
+   * Each node stores a character (splitchar) which is part of some key(s). In
+   * a compressed branch (one that only contains a single string key) the
+   * trailer of the key which is not already in nodes is stored externally in
+   * the kv array. 
As items are inserted, key substrings decrease. Some substrings may + * completely disappear when the whole branch is totally decompressed. The + * tree is traversed to find the key substrings actually used. In addition, + * duplicate substrings are removed using a map (implemented with a + * TernaryTree!). + * + */ + public void trimToSize() { + // first balance the tree for best performance + balance(); + + // redimension the node arrays + redimNodeArrays(freenode); + + // ok, compact kv array + CharVector kx = new CharVector(); + kx.alloc(1); + TernaryTree map = new TernaryTree(); + compact(kx, map, root); + kv = kx; + kv.trimToSize(); + } + + private void compact(CharVector kx, TernaryTree map, char p) { + int k; + if (p == 0) { + return; + } + if (sc[p] == 0xFFFF) { + k = map.find(kv.getArray(), lo[p]); + if (k < 0) { + k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1); + strcpy(kx.getArray(), k, kv.getArray(), lo[p]); + map.insert(kx.getArray(), k, (char) k); + } + lo[p] = (char) k; + } else { + compact(kx, map, lo[p]); + if (sc[p] != 0) { + compact(kx, map, eq[p]); + } + compact(kx, map, hi[p]); + } + } + + public Enumeration keys() { + return new Iterator(); + } + + public class Iterator implements Enumeration { + + /** + * current node index + */ + int cur; + + /** + * current key + */ + String curkey; + + private class Item implements Cloneable { + char parent; + + char child; + + public Item() { + parent = 0; + child = 0; + } + + public Item(char p, char c) { + parent = p; + child = c; + } + + public Object clone() { + return new Item(parent, child); + } + + } + + /** + * Node stack + */ + Stack ns; + + /** + * key stack implemented with a StringBuffer + */ + StringBuffer ks; + + public Iterator() { + cur = -1; + ns = new Stack(); + ks = new StringBuffer(); + rewind(); + } + + public void rewind() { + ns.removeAllElements(); + ks.setLength(0); + cur = root; + run(); + } + + public Object nextElement() { + String res = new String(curkey); + cur = up(); + run(); + return res; + } + + public char getValue() { + if (cur >= 0) { + return eq[cur]; + } + return 0; + } + + public boolean hasMoreElements() { + return (cur != -1); + } + + /** + * traverse upwards + */ + private int up() { + Item i = new Item(); + int res = 0; + + if (ns.empty()) { + return -1; + } + + if (cur != 0 && sc[cur] == 0) { + return lo[cur]; + } + + boolean climb = true; + + while (climb) { + i = (Item) ns.pop(); + i.child++; + switch (i.child) { + case 1: + if (sc[i.parent] != 0) { + res = eq[i.parent]; + ns.push(i.clone()); + ks.append(sc[i.parent]); + } else { + i.child++; + ns.push(i.clone()); + res = hi[i.parent]; + } + climb = false; + break; + + case 2: + res = hi[i.parent]; + ns.push(i.clone()); + if (ks.length() > 0) { + ks.setLength(ks.length() - 1); // pop + } + climb = false; + break; + + default: + if (ns.empty()) { + return -1; + } + climb = true; + break; + } + } + return res; + } + + /** + * traverse the tree to find next key + */ + private int run() { + if (cur == -1) { + return -1; + } + + boolean leaf = false; + while (true) { + // first go down on low branch until leaf or compressed branch + while (cur != 0) { + if (sc[cur] == 0xFFFF) { + leaf = true; + break; + } + ns.push(new Item((char) cur, '\u0000')); + if (sc[cur] == 0) { + leaf = true; + break; + } + cur = lo[cur]; + } + if (leaf) { + break; + } + // nothing found, go up one node and try again + cur = up(); + if (cur == -1) { + return -1; + } + } + // The current node should be a data node and + // the key should be in the key stack (at 
least partially)
+      StringBuffer buf = new StringBuffer(ks.toString());
+      if (sc[cur] == 0xFFFF) {
+        int p = lo[cur];
+        while (kv.get(p) != 0) {
+          buf.append(kv.get(p++));
+        }
+      }
+      curkey = buf.toString();
+      return 0;
+    }
+
+  }
+
+  public void printStats() {
+    System.out.println("Number of keys = " + Integer.toString(length));
+    System.out.println("Node count = " + Integer.toString(freenode));
+    // System.out.println("Array length = " + Integer.toString(eq.length));
+    System.out.println("Key Array length = " + Integer.toString(kv.length()));
+
+    /*
+     * for(int i=0; i
+     */
+  }
+
+}
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/package.html b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/package.html
new file mode 100644
index 00000000000..e62afc334ef
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/hyphenation/package.html
@@ -0,0 +1,10 @@
+<html>
+<head>
+    <title>Hyphenation code for the CompoundWordTokenFilter</title>
+</head>
+<body>

+<p>
+The code for the compound word hyphenation is taken from the Apache FOP
+project. All credits for the hyphenation code belong to them.
+</p>

+</body>
+</html>
\ No newline at end of file
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/package.html b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/package.html
new file mode 100644
index 00000000000..cf3e8bf07b6
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/compound/package.html
@@ -0,0 +1,166 @@
+<html>
+<head>
+<title>CompoundWordTokenFilter</title>
+</head>
+<body>
+<p>
+A filter that decomposes compound words found in many Germanic languages
+into word parts. This example shows what it does:
+</p>
+<table>
+  <tr><th>Input token stream</th></tr>
+  <tr><td>Rindfleischüberwachungsgesetz Drahtschere abba</td></tr>
+</table>
+
+<table>
+  <tr><th>Output token stream</th></tr>
+  <tr><td>(Rindfleischüberwachungsgesetz,0,29)</td></tr>
+  <tr><td>(Rind,0,4,posIncr=0)</td></tr>
+  <tr><td>(fleisch,4,11,posIncr=0)</td></tr>
+  <tr><td>(überwachung,11,22,posIncr=0)</td></tr>
+  <tr><td>(gesetz,23,29,posIncr=0)</td></tr>
+  <tr><td>(Drahtschere,30,41)</td></tr>
+  <tr><td>(Draht,30,35,posIncr=0)</td></tr>
+  <tr><td>(schere,35,41,posIncr=0)</td></tr>
+  <tr><td>(abba,42,46)</td></tr>
+</table>
+
+<p>
+The input token is always preserved and the filters do not alter the case of
+word parts. There are two variants of the filter available:
+</p>
+<ul>
+<li><tt>HyphenationCompoundWordTokenFilter</tt>: uses a
+hyphenation-grammar-based approach to find potential word parts of a given
+word.</li>
+<li><tt>DictionaryCompoundWordTokenFilter</tt>: uses a brute-force
+dictionary-only approach to find the word parts of a given word.</li>
+</ul>
+

+<h2>Compound word token filters</h2>

+

+<h3>HyphenationCompoundWordTokenFilter</h3>

+The {@link
+org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
+HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
+potential subwords that are worth checking against the dictionary. The
+quality of the output tokens is directly connected to the quality of the
+grammar file you use. For languages like German the grammar files are
+quite good.
+<h4>Grammar file</h4>
+Unfortunately we cannot bundle the hyphenation grammar files with Lucene
+because they do not use an ASF-compatible license (they use the LaTeX
+Project Public License instead). You can find the XML-based grammar files
+at the Objects For Formatting Objects (OFFO) Sourceforge project (direct
+link to download the pattern files:
+http://downloads.sourceforge.net/offo/offo-hyphenation.zip ). The files
+you need are in the subfolder offo-hyphenation/hyph/.
+<p>
+Credits for the hyphenation code go to the Apache FOP project.
+</p>
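+<p>
+Loading a downloaded grammar file could look like this (a minimal sketch;
+"de_DR.xml" is the German pattern file from the OFFO distribution, and the
+path is an assumption):
+</p>
+<pre>
+  Reader reader = new FileReader("offo-hyphenation/hyph/de_DR.xml");
+  HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+      .getHyphenationTree(reader);
+</pre>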

+<h3>DictionaryCompoundWordTokenFilter</h3>

+The {@link
+org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter
+DictionaryCompoundWordTokenFilter} uses a dictionary-only approach to
+find subwords in a compound word. It is much slower than the filter that
+uses the hyphenation grammars. You can use it as a first step to check
+whether your dictionary is good or not, because it is much simpler in
+design.
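+<p>
+Constructing the filter only takes a token stream and the dictionary (a
+minimal sketch; <tt>reader</tt> and <tt>dict</tt> are assumed to be defined
+elsewhere):
+</p>
+<pre>
+  TokenStream stream = new DictionaryCompoundWordTokenFilter(
+      new WhitespaceTokenizer(reader), dict);
+</pre>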

+<h3>Dictionary</h3>

+The output quality of both token filters is directly connected to the
+quality of the dictionary you use. Dictionaries are, of course, language
+dependent. You should always use a dictionary that fits the text you want
+to index. If you index medical text, for example, you should use a
+dictionary that contains medical words. A good starting point for general
+text are the dictionaries you find at the OpenOffice dictionaries Wiki.
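+<p>
+If your dictionary is a plain text file with one word per line, loading it
+into the String[] that the filters expect could look like this (a minimal
+sketch; the file name, encoding and helper name are assumptions):
+</p>
+<pre>
+  // hypothetical helper - reads one dictionary word per line
+  private String[] loadDictionary(File wordFile) throws IOException {
+    List words = new LinkedList();
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(wordFile), "UTF-8"));
+    try {
+      String line;
+      while ((line = in.readLine()) != null) {
+        line = line.trim();
+        if (line.length() > 0) {
+          words.add(line);
+        }
+      }
+    } finally {
+      in.close();
+    }
+    return (String[]) words.toArray(new String[words.size()]);
+  }
+</pre>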

+<h3>Which variant should I use?</h3>

+This decision matrix should help you:
+<table>
+  <tr>
+    <th>Token filter</th>
+    <th>Output quality</th>
+    <th>Performance</th>
+  </tr>
+  <tr>
+    <td>HyphenationCompoundWordTokenFilter</td>
+    <td>good if grammar file is good – acceptable otherwise</td>
+    <td>fast</td>
+  </tr>
+  <tr>
+    <td>DictionaryCompoundWordTokenFilter</td>
+    <td>good</td>
+    <td>slow</td>
+  </tr>
+</table>

+<h3>Examples</h3>

+<pre>
+  public void testHyphenationCompoundWordsDE() throws Exception {
+    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
+        "Aufgabe", "Überwachung" };
+
+    Reader reader = new FileReader("de_DR.xml");
+
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        new WhitespaceTokenizer(new StringReader(
+            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
+        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
+        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
+        
+    Token t;
+    while ((t=tf.next())!=null) {
+       System.out.println(t);
+    }
+  }
+  
+  public void testDumbCompoundWordsSE() throws Exception {
+    String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
+        "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
+        "Sko", "Vind", "Rute", "Torkare", "Blad" };
+
+    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
+        new WhitespaceTokenizer(
+            new StringReader(
+                "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
+        dict);
+    Token t;
+    while ((t=tf.next())!=null) {
+       System.out.println(t);
+    }
+  }
+</pre>
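+<p>
+In a real application you would typically wrap the tokenizer and the filter
+in an Analyzer (a minimal sketch; the class name is illustrative and not
+part of this contrib):
+</p>
+<pre>
+  public class CompoundAnalyzer extends Analyzer {
+    private final String[] dict;
+
+    public CompoundAnalyzer(String[] dict) {
+      this.dict = dict;
+    }
+
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      // decorate the whitespace tokenizer with the compound word filter
+      return new DictionaryCompoundWordTokenFilter(
+          new WhitespaceTokenizer(reader), dict);
+    }
+  }
+</pre>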
+ + \ No newline at end of file diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java new file mode 100644 index 00000000000..19f62a31a81 --- /dev/null +++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -0,0 +1,214 @@ +package org.apache.lucene.analysis.compound; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.net.URL; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase; +import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter; +import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; +import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; + +import junit.framework.TestCase; + +public class TestCompoundWordTokenFilter extends TestCase { + private static String[] locations = { + "http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", + "http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", + "http://superb-west.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip", + "http://superb-east.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"}; + + private byte[] patternsFileContent; + + protected void setUp() throws Exception { + super.setUp(); + getHyphenationPatternFileContents(); + } + + public void testHyphenationCompoundWordsDE() throws Exception { + String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz", + "Aufgabe", "Überwachung" }; + + Reader reader = getHyphenationReader("de_DR.xml"); + if (reader == null) { + // we gracefully die if we have no reader + return; + } + + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(reader); + + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + new WhitespaceTokenizer(new StringReader( + "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator, + dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false); + assertFiltersTo(tf, new String[] { 
"Rindfleischüberwachungsgesetz", "Rind", + "fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere", + "abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] { + 29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0, + 0, 1 }); + } + + public void testHyphenationCompoundWordsDELongestMatch() throws Exception { + String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz", + "Aufgabe", "Überwachung", "Rindfleisch", "Überwachungsgesetz" }; + + Reader reader = getHyphenationReader("de_DR.xml"); + if (reader == null) { + // we gracefully die if we have no reader + return; + } + + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(reader); + + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + new WhitespaceTokenizer(new StringReader( + "Rindfleischüberwachungsgesetz")), hyphenator, dict, + CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true); + assertFiltersTo(tf, new String[] { "Rindfleischüberwachungsgesetz", + "Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] { + 0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0, + 0, 0, 0 }); + } + + public void testDumbCompoundWordsSE() throws Exception { + String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", + "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll", + "Sko", "Vind", "Rute", "Torkare", "Blad" }; + + DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( + new WhitespaceTokenizer( + new StringReader( + "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")), + dict); + + assertFiltersTo(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor", + "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr", + "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr", + "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas", + "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol", + "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare", + "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad", + "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17, + 17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72, + 77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137, + 137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32, + 28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110, + 87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145, + 155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, + 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, + 0, 0, 0, 1 }); + } + + public void testDumbCompoundWordsSELongestMatch() throws Exception { + String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", + "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll", + "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" }; + + DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter( + new WhitespaceTokenizer(new StringReader("Basfiolsfodralmakaregesäll")), + dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, + CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true); + + assertFiltersTo(tf, new 
String[] { "Basfiolsfodralmakaregesäll", "Bas",
+        "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
+        14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
+        0, 0 });
+  }
+
+  private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
+      int[] endOffset, int[] posIncr) throws Exception {
+    for (int i = 0; i < s.length; ++i) {
+      Token t = tf.next();
+      assertNotNull(t);
+      assertEquals(s[i], new String(t.termBuffer(), 0, t.termLength()));
+      assertEquals(startOffset[i], t.startOffset());
+      assertEquals(endOffset[i], t.endOffset());
+      assertEquals(posIncr[i], t.getPositionIncrement());
+    }
+    assertNull(tf.next());
+  }
+
+  private void getHyphenationPatternFileContents() {
+    try {
+      List urls = new LinkedList(Arrays.asList(locations));
+      Collections.shuffle(urls);
+      URL url = new URL((String) urls.get(0));
+      InputStream in = url.openStream();
+      byte[] buffer = new byte[1024];
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      int count;
+
+      while ((count = in.read(buffer)) != -1) {
+        out.write(buffer, 0, count);
+      }
+      in.close();
+      out.close();
+      patternsFileContent = out.toByteArray();
+    } catch (IOException e) {
+      // ignore the error - the user might have no internet connection, so
+      // the tests that need the pattern files are skipped
+    }
+  }
+
+  private Reader getHyphenationReader(String filename) throws Exception {
+    if (patternsFileContent == null) {
+      return null;
+    }
+
+    ZipInputStream zipstream = new ZipInputStream(new ByteArrayInputStream(
+        patternsFileContent));
+
+    ZipEntry entry;
+    while ((entry = zipstream.getNextEntry()) != null) {
+      if (entry.getName().equals("offo-hyphenation/hyph/" + filename)) {
+        byte[] buffer = new byte[1024];
+        ByteArrayOutputStream outstream = new ByteArrayOutputStream();
+        int count;
+        while ((count = zipstream.read(buffer)) != -1) {
+          outstream.write(buffer, 0, count);
+        }
+        outstream.close();
+        zipstream.close();
+        return new StringReader(new String(outstream.toByteArray(),
+            "ISO-8859-1"));
+      }
+    }
+    // we should never get here
+    return null;
+  }
+}