LUCENE-1166: Added token filter for decomposing compound words

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@657027 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-05-16 12:22:50 +00:00
parent aa0074f5db
commit 7a27cdcbc9
17 changed files with 3087 additions and 0 deletions

View File

@ -159,6 +159,7 @@ New features
12. LUCENE-400: Added word based n-gram filter (in contrib/analyzers) called ShingleFilter and an Analyzer wrapper
    that wraps another Analyzer's token stream with a ShingleFilter (Sebastian Kirsch, Steve Rowe via Grant Ingersoll)
13. LUCENE-1166: Decomposition tokenfilter for languages like German and Swedish (Thomas Peuss via Grant Ingersoll)
Optimizations

View File

@ -0,0 +1,169 @@
package org.apache.lucene.analysis.compound;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* Base class for decomposition token filters.
*/
/**
 * Common superclass for token filters that split compound words into parts.
 * <p>
 * Subclasses implement {@link #decomposeInternal(Token)} and append subword
 * tokens to the {@link #tokens} queue; this class handles queue draining and
 * always emits the original token before any of its subwords.
 */
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
  /**
   * The default for minimal word length that gets decomposed
   */
  public static final int DEFAULT_MIN_WORD_SIZE = 5;

  /**
   * The default for minimal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;

  /**
   * The default for maximal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;

  /** Lower-cased word dictionary used by subclasses to recognize subwords. */
  protected final CharArraySet dictionary;
  /** Pending output tokens, filled by {@link #decompose(Token)} and drained by {@link #next()}. */
  protected final LinkedList tokens;
  /** Tokens shorter than this are passed through without decomposition. */
  protected final int minWordSize;
  /** Lower bound used by subclasses for subword lengths. */
  protected final int minSubwordSize;
  /** Upper bound used by subclasses for subword lengths. */
  protected final int maxSubwordSize;
  /** When true, subclasses emit only the longest match per position. */
  protected final boolean onlyLongestMatch;

  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(input, makeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
    this(input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, boolean onlyLongestMatch) {
    this(input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
    this(input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary) {
    this(input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input);

    this.tokens = new LinkedList();
    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;

    if (dictionary instanceof CharArraySet) {
      // Caller supplied a CharArraySet directly; it is trusted to be
      // lower-cased and created with ignoreCase=false.
      this.dictionary = (CharArraySet) dictionary;
    } else {
      this.dictionary = new CharArraySet(dictionary.size(), false);
      addAllLowerCase(this.dictionary, dictionary);
    }
  }

  /**
   * Create a set of words from an array
   * The resulting Set does case insensitive matching
   * TODO We should look for a faster dictionary lookup approach.
   * @param dictionary words to add to the set
   * @return a lower-cased {@link CharArraySet} containing all words
   */
  public static final Set makeDictionary(final String[] dictionary) {
    CharArraySet words = new CharArraySet(dictionary.length, false);
    addAllLowerCase(words, Arrays.asList(dictionary));
    return words;
  }

  /**
   * Returns queued subword tokens first; otherwise pulls the next input token,
   * decomposes it, and returns the first result (the original token itself).
   */
  public Token next() throws IOException {
    if (!tokens.isEmpty()) {
      return (Token) tokens.removeFirst();
    }

    Token current = input.next();
    if (current == null) {
      // End of the underlying stream.
      return null;
    }

    decompose(current);
    return tokens.isEmpty() ? null : (Token) tokens.removeFirst();
  }

  /** Lower-cases every String in {@code col} and adds it to {@code target}. */
  protected static final void addAllLowerCase(Set target, Collection col) {
    for (Iterator it = col.iterator(); it.hasNext();) {
      // NOTE(review): default-locale toLowerCase; may surprise under e.g. a
      // Turkish default locale — confirm intended.
      target.add(((String) it.next()).toLowerCase());
    }
  }

  /** Returns a lower-cased copy of {@code buffer}; the input is left untouched. */
  protected static char[] makeLowerCaseCopy(final char[] buffer) {
    char[] lowered = new char[buffer.length];
    for (int i = 0; i < buffer.length; ++i) {
      lowered[i] = Character.toLowerCase(buffer[i]);
    }
    return lowered;
  }

  /**
   * Creates a subword token positioned inside {@code prototype}.
   *
   * @param offset start of the subword, relative to the prototype's term buffer
   * @param length length of the subword
   * @param prototype token the subword was extracted from
   * @return a new token at position increment 0 (stacked on the original)
   */
  protected final Token createToken(final int offset, final int length,
      final Token prototype) {
    int start = prototype.startOffset() + offset;
    Token subword = new Token(start, start + length, prototype.type());
    subword.setTermBuffer(prototype.termBuffer(), offset, length);
    subword.setPositionIncrement(0);
    return subword;
  }

  /** Queues the original token, then asks the subclass for subwords if it is long enough. */
  protected void decompose(final Token token) {
    // In any case we give the original token back
    tokens.add(token);

    // Only words longer than minWordSize get processed
    if (token.termLength() < this.minWordSize) {
      return;
    }

    decomposeInternal(token);
  }

  /** Subclass hook: add subword tokens for {@code token} to {@link #tokens}. */
  protected abstract void decomposeInternal(final Token token);
}

View File

@ -0,0 +1,114 @@
package org.apache.lucene.analysis.compound;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
* A TokenFilter that decomposes compound words found in many germanic languages
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
* "Donaudampfschiff" even when you only enter "schiff".
* It uses a brute-force algorithm to achieve this.
*/
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
  /**
   * Creates a filter with explicit size limits and an array-based dictionary.
   *
   * @param input the token stream to process
   * @param dictionary the word dictionary to match against
   * @param minWordSize only words longer than this get processed
   * @param minSubwordSize only subwords longer than this get to the output stream
   * @param maxSubwordSize only subwords shorter than this get to the output stream
   * @param onlyLongestMatch Add only the longest matching subword to the stream
   */
  public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  /**
   * Creates a filter with default size limits and an array-based dictionary.
   *
   * @param input the token stream to process
   * @param dictionary the word dictionary to match against
   */
  public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
    super(input, dictionary);
  }

  /**
   * Creates a filter with default size limits.
   *
   * @param input the token stream to process
   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   *        lower case strings.
   */
  public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
    super(input, dictionary);
  }

  /**
   * Creates a filter with explicit size limits.
   *
   * @param input the token stream to process
   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   *        lower case strings.
   * @param minWordSize only words longer than this get processed
   * @param minSubwordSize only subwords longer than this get to the output stream
   * @param maxSubwordSize only subwords shorter than this get to the output stream
   * @param onlyLongestMatch Add only the longest matching subword to the stream
   */
  public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  /**
   * Brute-force decomposition: every start offset is probed against the
   * dictionary with every candidate subword length.
   */
  protected void decomposeInternal(final Token token) {
    // Only words longer than minWordSize get processed
    // (the base class already enforces this; kept as a defensive re-check)
    if (token.termLength() < this.minWordSize) {
      return;
    }

    // Lower-case once up front so each dictionary probe is a plain comparison.
    char[] lowerCaseTermBuffer = makeLowerCaseCopy(token.termBuffer());

    // NOTE(review): the bounds look off by one in two places — `j` ranges over
    // lengths minSubwordSize-1 .. maxSubwordSize-1 (one shorter than the
    // documented minimum, never reaching the documented maximum), and `i`
    // never reaches termLength()-minSubwordSize, so a minimal-length subword
    // at the very end of the word is never found. Confirm whether intended.
    for (int i = 0; i < token.termLength() - this.minSubwordSize; ++i) {
      Token longestMatchToken = null;
      for (int j = this.minSubwordSize - 1; j < this.maxSubwordSize; ++j) {
        // Candidate [i, i+j) must fit inside the term.
        if (i + j > token.termLength()) {
          break;
        }
        if (dictionary.contains(lowerCaseTermBuffer, i, j)) {
          if (this.onlyLongestMatch) {
            // Keep only the longest dictionary match starting at offset i.
            if (longestMatchToken != null) {
              if (longestMatchToken.termLength() < j) {
                longestMatchToken = createToken(i, j, token);
              }
            } else {
              longestMatchToken = createToken(i, j, token);
            }
          } else {
            // Emit every match, stacked at the original token's position.
            tokens.add(createToken(i, j, token));
          }
        }
      }
      if (this.onlyLongestMatch && longestMatchToken != null) {
        tokens.add(longestMatchToken);
      }
    }
  }
}

View File

@ -0,0 +1,217 @@
package org.apache.lucene.analysis.compound;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.xml.sax.InputSource;
/**
* A TokenFilter that decomposes compound words found in many germanic languages
* "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
* "Donaudampfschiff" even when you only enter "schiff" It uses a hyphenation
* grammar and a word dictionary to achieve this.
*/
public class HyphenationCompoundWordTokenFilter extends
    CompoundWordTokenFilterBase {
  /** Hyphenation grammar used to find candidate break points inside a token. */
  private HyphenationTree hyphenator;

  /**
   * Creates a filter with explicit size limits and an array-based dictionary.
   *
   * @param input the token stream to process
   * @param hyphenator the hyphenation pattern tree to use for hyphenation
   * @param dictionary the word dictionary to match against
   * @param minWordSize only words longer than this get processed
   * @param minSubwordSize only subwords longer than this get to the output
   *        stream
   * @param maxSubwordSize only subwords shorter than this get to the output
   *        stream
   * @param onlyLongestMatch Add only the longest matching subword to the stream
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, String[] dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(input, hyphenator, makeDictionary(dictionary), minWordSize,
        minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  /**
   * Creates a filter with default size limits and an array-based dictionary.
   *
   * @param input the token stream to process
   * @param hyphenator the hyphenation pattern tree to use for hyphenation
   * @param dictionary the word dictionary to match against
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, String[] dictionary) {
    this(input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * Creates a filter with default size limits.
   *
   * @param input the token stream to process
   * @param hyphenator the hyphenation pattern tree to use for hyphenation
   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   *        lower case strings.
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, Set dictionary) {
    this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * Creates a filter with explicit size limits.
   *
   * @param input the token stream to process
   * @param hyphenator the hyphenation pattern tree to use for hyphenation
   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   *        lower case strings.
   * @param minWordSize only words longer than this get processed
   * @param minSubwordSize only subwords longer than this get to the output
   *        stream
   * @param maxSubwordSize only subwords shorter than this get to the output
   *        stream
   * @param onlyLongestMatch Add only the longest matching subword to the stream
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, Set dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
        onlyLongestMatch);

    this.hyphenator = hyphenator;
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationFilename the filename of the XML grammar to load
   * @return An object representing the hyphenation patterns
   * @throws Exception In case the loading or parsing fails
   */
  public static HyphenationTree getHyphenationTree(String hyphenationFilename)
      throws Exception {
    return getHyphenationTree(new File(hyphenationFilename));
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationFile the file of the XML grammar to load
   * @return An object representing the hyphenation patterns
   * @throws Exception In case the loading or parsing fails
   */
  public static HyphenationTree getHyphenationTree(File hyphenationFile)
      throws Exception {
    // NOTE(review): assumes the grammar file is ISO-8859-1 encoded — confirm
    // for grammars in other encodings.
    return getHyphenationTree(new InputStreamReader(new FileInputStream(
        hyphenationFile), "ISO-8859-1"));
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationReader the reader of the XML grammar to load from
   * @return An object representing the hyphenation patterns
   * @throws Exception In case the parsing fails
   */
  public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
      throws Exception {
    // NOTE(review): the reader is never closed here (also not on a parse
    // failure) — potential file-handle leak; confirm ownership convention.
    HyphenationTree tree = new HyphenationTree();

    tree.loadPatterns(new InputSource(hyphenationReader));

    return tree;
  }

  /**
   * Uses the hyphenation grammar to propose break points, then keeps only
   * those fragments (between pairs of break points) found in the dictionary.
   */
  protected void decomposeInternal(final Token token) {
    // get the hyphenation points
    Hyphenation hyphens = hyphenator.hyphenate(token.termBuffer(), 0, token
        .termLength(), 1, 1);
    // No hyphen points found -> exit
    if (hyphens == null) {
      return;
    }

    final int[] hyp = hyphens.getHyphenationPoints();
    // Lower-case once so every dictionary probe is a plain comparison.
    char[] lowerCaseTermBuffer = makeLowerCaseCopy(token.termBuffer());

    // Try every fragment that starts at hyphenation point i and ends at a
    // later hyphenation point i+j.
    for (int i = 0; i < hyp.length; ++i) {
      int remaining = hyp.length - i;
      int start = hyp[i];
      Token longestMatchToken = null;
      for (int j = 1; j < remaining; j++) {
        int partLength = hyp[i + j] - start;

        // if the part is longer than maxSubwordSize we
        // are done with this round
        if (partLength > this.maxSubwordSize) {
          break;
        }

        // we only put subwords to the token stream
        // that are longer than minPartSize
        if (partLength < this.minSubwordSize) {
          continue;
        }

        // check the dictionary
        if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
          if (this.onlyLongestMatch) {
            // Keep only the longest dictionary match starting at this point.
            if (longestMatchToken != null) {
              if (longestMatchToken.termLength() < partLength) {
                longestMatchToken = createToken(start, partLength, token);
              }
            } else {
              longestMatchToken = createToken(start, partLength, token);
            }
          } else {
            tokens.add(createToken(start, partLength, token));
          }
        } else if (dictionary.contains(lowerCaseTermBuffer, start,
            partLength - 1)) {
          // check the dictionary again with a word that is one character
          // shorter
          // to avoid problems with genitive 's characters and other binding
          // characters
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.termLength() < partLength - 1) {
                longestMatchToken = createToken(start, partLength - 1, token);
              }
            } else {
              longestMatchToken = createToken(start, partLength - 1, token);
            }
          } else {
            tokens.add(createToken(start, partLength - 1, token));
          }
        }
      }
      if (this.onlyLongestMatch && longestMatchToken != null) {
        tokens.add(longestMatchToken);
      }
    }
  }
}

View File

@ -0,0 +1,126 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.compound.hyphenation;
import java.io.Serializable;
/**
* This class implements a simple byte vector with access to the underlying
* array.
* This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
/**
 * This class implements a simple byte vector with access to the underlying
 * array.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
 */
public class ByteVector implements Serializable {

  // Declared explicitly, consistent with HyphenationTree, to avoid a
  // compiler-computed default changing across builds.
  private static final long serialVersionUID = 1L;

  /**
   * Capacity increment size
   */
  private static final int DEFAULT_BLOCK_SIZE = 2048;

  /** Minimum amount by which the array grows when it runs out of space. */
  private int blockSize;

  /**
   * The encapsulated array
   */
  private byte[] array;

  /**
   * Points to next free item
   */
  private int n;

  public ByteVector() {
    this(DEFAULT_BLOCK_SIZE);
  }

  /**
   * @param capacity initial capacity and growth increment; values <= 0 fall
   *        back to {@link #DEFAULT_BLOCK_SIZE}
   */
  public ByteVector(int capacity) {
    if (capacity > 0) {
      blockSize = capacity;
    } else {
      blockSize = DEFAULT_BLOCK_SIZE;
    }
    array = new byte[blockSize];
    n = 0;
  }

  /** Wraps an existing array without copying it. */
  public ByteVector(byte[] a) {
    // NOTE(review): n is reset to 0 here whereas CharVector(char[]) sets
    // n = a.length; kept as-is to preserve upstream (FOP) behavior — confirm.
    blockSize = DEFAULT_BLOCK_SIZE;
    array = a;
    n = 0;
  }

  /** Wraps an existing array without copying it, with a custom growth increment. */
  public ByteVector(byte[] a, int capacity) {
    if (capacity > 0) {
      blockSize = capacity;
    } else {
      blockSize = DEFAULT_BLOCK_SIZE;
    }
    array = a;
    n = 0;
  }

  /** Returns the backing array (not a copy). */
  public byte[] getArray() {
    return array;
  }

  /**
   * return number of items in array
   */
  public int length() {
    return n;
  }

  /**
   * returns current capacity of array
   */
  public int capacity() {
    return array.length;
  }

  /** Stores {@code val} at {@code index}; the slot must have been alloc'd. */
  public void put(int index, byte val) {
    array[index] = val;
  }

  /** Returns the byte stored at {@code index}. */
  public byte get(int index) {
    return array[index];
  }

  /**
   * This is to implement memory allocation in the array. Like malloc().
   *
   * @param size number of bytes to reserve
   * @return the index of the first reserved byte
   */
  public int alloc(int size) {
    int index = n;
    int len = array.length;
    if (n + size >= len) {
      // Grow by at least blockSize, but always enough to satisfy the request.
      // (Growing by exactly blockSize, as before, left the array too small
      // when size > blockSize and caused out-of-bounds access on later puts.)
      byte[] aux = new byte[Math.max(len + blockSize, n + size)];
      System.arraycopy(array, 0, aux, 0, len);
      array = aux;
    }
    n += size;
    return index;
  }

  /** Shrinks the backing array to exactly the used length. */
  public void trimToSize() {
    if (n < array.length) {
      byte[] aux = new byte[n];
      System.arraycopy(array, 0, aux, 0, n);
      array = aux;
    }
  }
}

View File

@ -0,0 +1,136 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.compound.hyphenation;
import java.io.Serializable;
/**
* This class implements a simple char vector with access to the underlying
* array.
*
* This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
/**
 * This class implements a simple char vector with access to the underlying
 * array.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
 */
public class CharVector implements Cloneable, Serializable {

  // Declared explicitly, consistent with HyphenationTree, to avoid a
  // compiler-computed default changing across builds.
  private static final long serialVersionUID = 1L;

  /**
   * Capacity increment size
   */
  private static final int DEFAULT_BLOCK_SIZE = 2048;

  /** Minimum amount by which the array grows when it runs out of space. */
  private int blockSize;

  /**
   * The encapsulated array
   */
  private char[] array;

  /**
   * Points to next free item
   */
  private int n;

  public CharVector() {
    this(DEFAULT_BLOCK_SIZE);
  }

  /**
   * @param capacity initial capacity and growth increment; values <= 0 fall
   *        back to {@link #DEFAULT_BLOCK_SIZE}
   */
  public CharVector(int capacity) {
    if (capacity > 0) {
      blockSize = capacity;
    } else {
      blockSize = DEFAULT_BLOCK_SIZE;
    }
    array = new char[blockSize];
    n = 0;
  }

  /** Wraps an existing array without copying it; all of it counts as used. */
  public CharVector(char[] a) {
    blockSize = DEFAULT_BLOCK_SIZE;
    array = a;
    n = a.length;
  }

  /** Wraps an existing array without copying it, with a custom growth increment. */
  public CharVector(char[] a, int capacity) {
    if (capacity > 0) {
      blockSize = capacity;
    } else {
      blockSize = DEFAULT_BLOCK_SIZE;
    }
    array = a;
    n = a.length;
  }

  /**
   * Reset Vector but don't resize or clear elements
   */
  public void clear() {
    n = 0;
  }

  /** Deep copy: the backing array is cloned, the used length is carried over. */
  public Object clone() {
    CharVector cv = new CharVector((char[]) array.clone(), blockSize);
    cv.n = this.n;
    return cv;
  }

  /** Returns the backing array (not a copy). */
  public char[] getArray() {
    return array;
  }

  /**
   * return number of items in array
   */
  public int length() {
    return n;
  }

  /**
   * returns current capacity of array
   */
  public int capacity() {
    return array.length;
  }

  /** Stores {@code val} at {@code index}; the slot must have been alloc'd. */
  public void put(int index, char val) {
    array[index] = val;
  }

  /** Returns the char stored at {@code index}. */
  public char get(int index) {
    return array[index];
  }

  /**
   * Reserves {@code size} slots at the end of the vector, like malloc().
   *
   * @param size number of chars to reserve
   * @return the index of the first reserved slot
   */
  public int alloc(int size) {
    int index = n;
    int len = array.length;
    if (n + size >= len) {
      // Grow by at least blockSize, but always enough to satisfy the request.
      // (Growing by exactly blockSize, as before, left the array too small
      // when size > blockSize and caused out-of-bounds access on later puts.)
      char[] aux = new char[Math.max(len + blockSize, n + size)];
      System.arraycopy(array, 0, aux, 0, len);
      array = aux;
    }
    n += size;
    return index;
  }

  /** Shrinks the backing array to exactly the used length. */
  public void trimToSize() {
    if (n < array.length) {
      char[] aux = new char[n];
      System.arraycopy(array, 0, aux, 0, n);
      array = aux;
    }
  }
}

View File

@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.compound.hyphenation;
import java.io.Serializable;
/**
* This class represents a hyphen. A 'full' hyphen is made of 3 parts: the
* pre-break text, post-break text and no-break. If no line-break is generated
* at this position, the no-break text is used, otherwise, pre-break and
* post-break are used. Typically, pre-break is equal to the hyphen character
* and the others are empty. However, this general scheme allows support for
* cases in some languages where words change spelling if they're split across
* lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes
* from TeX.
*
* This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
/**
 * This class represents a hyphen. A 'full' hyphen is made of 3 parts: the
 * pre-break text, post-break text and no-break. If no line-break is generated
 * at this position, the no-break text is used, otherwise, pre-break and
 * post-break are used. Typically, pre-break is equal to the hyphen character
 * and the others are empty. However, this general scheme allows support for
 * cases in some languages where words change spelling if they're split across
 * lines, like german's 'backen' which hyphenates 'bak-ken'. BTW, this comes
 * from TeX.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
 */
public class Hyphen implements Serializable {
  /** Text inserted before the break when the word is split here. */
  public String preBreak;

  /** Text used when no break happens at this position. */
  public String noBreak;

  /** Text inserted after the break when the word is split here. */
  public String postBreak;

  Hyphen(String pre, String no, String post) {
    preBreak = pre;
    noBreak = no;
    postBreak = post;
  }

  Hyphen(String pre) {
    // A plain hyphen has neither no-break nor post-break text.
    this(pre, null, null);
  }

  /**
   * Renders the common case (a bare "-") compactly; otherwise shows all
   * three parts as {pre}{post}{no}.
   */
  public String toString() {
    if (noBreak == null && postBreak == null && "-".equals(preBreak)) {
      return "-";
    }
    StringBuffer rendered = new StringBuffer("{");
    rendered.append(preBreak)
        .append("}{")
        .append(postBreak)
        .append("}{")
        .append(noBreak)
        .append('}');
    return rendered.toString();
  }
}

View File

@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.compound.hyphenation;
/**
* This class represents a hyphenated word.
*
* This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
/**
 * This class represents a hyphenated word.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
 */
public class Hyphenation {

  /** Character offsets within the word at which it may be hyphenated. */
  private final int[] hyphenPoints;

  // (The original carried an unused private field `len` and a stale javadoc
  // fragment about "rawWord"; both removed — nothing read them.)

  /**
   * @param points the hyphenation points found for the word
   */
  Hyphenation(int[] points) {
    hyphenPoints = points;
  }

  /**
   * @return the number of hyphenation points in the word
   */
  public int length() {
    return hyphenPoints.length;
  }

  /**
   * @return the hyphenation points
   */
  public int[] getHyphenationPoints() {
    return hyphenPoints;
  }
}

View File

@ -0,0 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.compound.hyphenation;
/**
* This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
/**
 * Signals a problem while loading or parsing hyphenation patterns.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
 */
public class HyphenationException extends Exception {

  // Declared explicitly, consistent with HyphenationTree, since Exception
  // is Serializable.
  private static final long serialVersionUID = 1L;

  /**
   * @param msg detail message describing the failure
   * @see java.lang.Throwable#Throwable(String)
   */
  public HyphenationException(String msg) {
    super(msg);
  }
}

View File

@ -0,0 +1,475 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.compound.hyphenation;
import java.io.File;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import org.xml.sax.InputSource;
/**
* This tree structure stores the hyphenation patterns in an efficient way for
 * fast lookup. It provides the method to hyphenate a word.
*
* This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
public class HyphenationTree extends TernaryTree implements PatternConsumer,
Serializable {
private static final long serialVersionUID = -7842107987915665573L;
/**
* value space: stores the interletter values
*/
protected ByteVector vspace;
/**
* This map stores hyphenation exceptions
*/
protected HashMap stoplist;
/**
* This map stores the character classes
*/
protected TernaryTree classmap;
/**
* Temporary map to store interletter values on pattern loading.
*/
private transient TernaryTree ivalues;
/** Creates an empty tree with an empty exception list and class map. */
public HyphenationTree() {
  stoplist = new HashMap(23); // usually a small table
  classmap = new TernaryTree();
  vspace = new ByteVector();
  vspace.alloc(1); // this reserves index 0, which we don't use
}
/**
 * Packs the values by storing them in 4 bits, two values into a byte Values
 * range is from 0 to 9. We use zero as terminator, so we'll add 1 to the
 * value.
 *
 * @param values a string of digits from '0' to '9' representing the
 *        interletter values.
 * @return the index into the vspace array where the packed values are stored.
 */
protected int packValues(String values) {
  int i, n = values.length();
  // One nibble per value plus a zero terminator nibble, rounded up to bytes.
  int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
  int offset = vspace.alloc(m);
  byte[] va = vspace.getArray();
  for (i = 0; i < n; i++) {
    int j = i >> 1;
    // Shift digit range to 1..10 so that a 0 nibble can act as terminator.
    byte v = (byte) ((values.charAt(i) - '0' + 1) & 0x0f);
    if ((i & 1) == 1) {
      // Odd index: fill the low nibble of the current byte.
      va[j + offset] = (byte) (va[j + offset] | v);
    } else {
      va[j + offset] = (byte) (v << 4); // big endian
    }
  }
  va[m - 1 + offset] = 0; // terminator
  return offset;
}
/**
 * Inverse of {@link #packValues(String)}: reads nibbles from vspace starting
 * at index {@code k} until the 0 terminator and rebuilds the digit string.
 *
 * @param k index into vspace where the packed values start
 * @return the unpacked string of digits '0'..'9'
 */
protected String unpackValues(int k) {
  StringBuffer buf = new StringBuffer();
  byte v = vspace.get(k++);
  while (v != 0) {
    // High nibble first (matches the "big endian" packing order).
    char c = (char) ((v >>> 4) - 1 + '0');
    buf.append(c);
    c = (char) (v & 0x0f);
    if (c == 0) {
      // Low nibble is the terminator: the value count was odd.
      break;
    }
    c = (char) (c - 1 + '0');
    buf.append(c);
    v = vspace.get(k++);
  }
  return buf.toString();
}
/**
 * Read hyphenation patterns from an XML file.
 *
 * @param f the pattern file to load
 * @throws HyphenationException In case the parsing fails
 */
public void loadPatterns(File f) throws HyphenationException {
  try {
    // NOTE(review): File.toURL() does not escape special characters in the
    // path; File.toURI().toURL() is the safer form — confirm before changing.
    InputSource src = new InputSource(f.toURL().toExternalForm());
    loadPatterns(src);
  } catch (MalformedURLException e) {
    throw new HyphenationException("Error converting the File '" + f
        + "' to a URL: " + e.getMessage());
  }
}
/**
 * Read hyphenation patterns from an XML file.
 *
 * @param source the InputSource for the file
 * @throws HyphenationException In case the parsing fails
 */
public void loadPatterns(InputSource source) throws HyphenationException {
  PatternParser pp = new PatternParser(this);
  ivalues = new TernaryTree();

  pp.parse(source);

  // patterns/values should be now in the tree
  // let's optimize a bit
  trimToSize();
  vspace.trimToSize();
  classmap.trimToSize();

  // get rid of the auxiliary map
  ivalues = null;
}
/**
 * Looks up a pattern in the tree and returns its interletter values.
 *
 * @param pat the pattern to search for
 * @return the unpacked digit string for the pattern, or "" when absent
 */
public String findPattern(String pat) {
  int k = super.find(pat);
  return k >= 0 ? unpackValues(k) : "";
}
/**
 * String compare over null-terminated char arrays. Returns 0 if the strings
 * are equal or if t is a prefix of s (t's terminator is reached first);
 * otherwise the difference of the first mismatching characters.
 */
protected int hstrcmp(char[] s, int si, char[] t, int ti) {
  for (; s[si] == t[ti]; si++, ti++) {
    if (s[si] == 0) {
      return 0; // both terminated together: equal
    }
  }
  if (t[ti] == 0) {
    return 0; // t is a prefix of s
  }
  return s[si] - t[ti];
}
/**
 * Unpacks the interletter values stored at index k of the value space into
 * a byte array, one value per byte (decremented back from the stored
 * value+1 representation).
 *
 * @param k index into vspace where the packed values start
 * @return the unpacked values
 */
protected byte[] getValues(int k) {
  StringBuilder unpacked = new StringBuilder();
  for (byte packed = vspace.get(k++); packed != 0; packed = vspace.get(k++)) {
    // High nibble first; a zero nibble terminates the sequence.
    unpacked.append((char) ((packed >>> 4) - 1));
    char low = (char) (packed & 0x0f);
    if (low == 0) {
      break;
    }
    unpacked.append((char) (low - 1));
  }
  byte[] res = new byte[unpacked.length()];
  for (int i = 0; i < res.length; i++) {
    res[i] = (byte) unpacked.charAt(i);
  }
  return res;
}
/**
 * <p>
 * Search for all possible partial matches of word starting at index and
 * update interletter values. In other words, it does something like:
 * </p>
 * <code>
 * for(i=0; i&lt;patterns.length; i++) {
 *   if ( word.substring(index).startsWith(patterns[i]) )
 *     update_interletter_values(patterns[i]);
 * }
 * </code>
 * <p>
 * But it is done in an efficient way since the patterns are stored in a
 * ternary tree. In fact, this is the whole purpose of having the tree: doing
 * this search without having to test every single pattern. The number of
 * patterns for languages such as English range from 4000 to 10000. Thus,
 * doing thousands of string comparisons for each word to hyphenate would be
 * really slow without the tree. The tradeoff is memory, but using a ternary
 * tree instead of a trie, almost halves the memory used by Lout or TeX.
 * It's also faster than using a hash table
 * </p>
 *
 * @param word null terminated word to match
 * @param index start index from word
 * @param il interletter values array to update
 */
protected void searchPatterns(char[] word, int index, byte[] il) {
  byte[] values;
  int i = index;
  char p, q;
  char sp = word[i];
  p = root;
  // Walk the ternary tree matching word[index..] character by character.
  while (p > 0 && p < sc.length) {
    if (sc[p] == 0xFFFF) {
      // Compressed branch: the rest of the key lives in the kv array.
      if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
        values = getValues(eq[p]); // data pointer is in eq[]
        int j = index;
        for (int k = 0; k < values.length; k++) {
          // Keep the maximum interletter value seen at each position.
          if (j < il.length && values[k] > il[j]) {
            il[j] = values[k];
          }
          j++;
        }
      }
      return;
    }
    int d = sp - sc[p];
    if (d == 0) {
      if (sp == 0) {
        break; // end of word reached
      }
      sp = word[++i];
      p = eq[p];
      q = p;
      // look for a pattern ending at this position by searching for
      // the null char ( splitchar == 0 )
      while (q > 0 && q < sc.length) {
        if (sc[q] == 0xFFFF) { // stop at compressed branch
          break;
        }
        if (sc[q] == 0) {
          // Found a pattern ending here; merge its values (max per slot).
          values = getValues(eq[q]);
          int j = index;
          for (int k = 0; k < values.length; k++) {
            if (j < il.length && values[k] > il[j]) {
              il[j] = values[k];
            }
            j++;
          }
          break;
        } else {
          q = lo[q];
          /**
           * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
           * java chars are unsigned
           */
        }
      }
    } else {
      p = d < 0 ? lo[p] : hi[p];
    }
  }
}
/**
 * Hyphenate word and return a Hyphenation object.
 *
 * @param word the word to be hyphenated
 * @param remainCharCount Minimum number of characters allowed before the
 *        hyphenation point.
 * @param pushCharCount Minimum number of characters allowed after the
 *        hyphenation point.
 * @return a {@link Hyphenation Hyphenation} object representing the
 *         hyphenated word or null if word is not hyphenated.
 */
public Hyphenation hyphenate(String word, int remainCharCount,
    int pushCharCount) {
  // Delegate to the char[] variant over the whole string.
  char[] chars = word.toCharArray();
  return hyphenate(chars, 0, chars.length, remainCharCount, pushCharCount);
}
/**
* w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
* may be absent, the first n is at offset, the first l is at offset +
* iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
* into word. In the first part of the routine len = w.length, in the second
* part of the routine len = word.length. Three indices are used: index(w),
* the index in w, index(word), the index in word, letterindex(word), the
* index in the letter part of word. The following relations exist: index(w) =
* offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) =
* index(word) - 1 (see first loop). It follows that: index(w) - index(word) =
* offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset +
* iIgnoreAtBeginning
*/
/**
 * Hyphenate word and return an array of hyphenation points.
 *
 * @param w char array that contains the word
 * @param offset Offset to first character in word
 * @param len Length of word
 * @param remainCharCount Minimum number of characters allowed before the
 *        hyphenation point.
 * @param pushCharCount Minimum number of characters allowed after the
 *        hyphenation point.
 * @return a {@link Hyphenation Hyphenation} object representing the
 *         hyphenated word or null if word is not hyphenated.
 */
public Hyphenation hyphenate(char[] w, int offset, int len,
    int remainCharCount, int pushCharCount) {
  int i;
  char[] word = new char[len + 3]; // room for '.' markers and terminator

  // normalize word: map every character to its class-equivalent, skipping
  // leading non-letters, and bail out on embedded non-letters
  char[] c = new char[2];
  int iIgnoreAtBeginning = 0;
  int iLength = len;
  boolean bEndOfLetters = false;
  for (i = 1; i <= len; i++) {
    c[0] = w[offset + i - 1];
    // classmap yields the normalized char, or < 0 for a non-word character
    int nc = classmap.find(c, 0);
    if (nc < 0) { // found a non-letter character ...
      if (i == (1 + iIgnoreAtBeginning)) {
        // ... before any letter character
        iIgnoreAtBeginning++;
      } else {
        // ... after a letter character
        bEndOfLetters = true;
      }
      iLength--;
    } else {
      if (!bEndOfLetters) {
        word[i - iIgnoreAtBeginning] = (char) nc;
      } else {
        // a letter after a trailing non-letter: refuse to hyphenate
        return null;
      }
    }
  }
  len = iLength;
  if (len < (remainCharCount + pushCharCount)) {
    // word is too short to be hyphenated
    return null;
  }
  int[] result = new int[len + 1];
  int k = 0;

  // check exception list first
  String sw = new String(word, 1, len);
  if (stoplist.containsKey(sw)) {
    // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
    // null)
    ArrayList hw = (ArrayList) stoplist.get(sw);
    int j = 0;
    for (i = 0; i < hw.size(); i++) {
      Object o = hw.get(i);
      // j = index(sw) = letterindex(word)?
      // result[k] = corresponding index(w)
      if (o instanceof String) {
        j += ((String) o).length();
        if (j >= remainCharCount && j < (len - pushCharCount)) {
          result[k++] = j + iIgnoreAtBeginning;
        }
      }
    }
  } else {
    // use algorithm to get hyphenation points
    word[0] = '.'; // word start marker
    word[len + 1] = '.'; // word end marker
    word[len + 2] = 0; // null terminated
    byte[] il = new byte[len + 3]; // initialized to zero
    for (i = 0; i < len + 1; i++) {
      searchPatterns(word, i, il);
    }

    // hyphenation points are located where interletter value is odd
    // i is letterindex(word),
    // i + 1 is index(word),
    // result[k] = corresponding index(w)
    for (i = 0; i < len; i++) {
      if (((il[i + 1] & 1) == 1) && i >= remainCharCount
          && i <= (len - pushCharCount)) {
        result[k++] = i + iIgnoreAtBeginning;
      }
    }
  }

  if (k > 0) {
    // trim result array
    int[] res = new int[k + 2];
    System.arraycopy(result, 0, res, 1, k);
    // We add the synthetical hyphenation points
    // at the beginning and end of the word
    res[0] = 0;
    res[k + 1] = len;
    return new Hyphenation(res);
  } else {
    return null;
  }
}
/**
 * Add a character class to the tree. It is used by
 * {@link PatternParser PatternParser} as callback to add character classes.
 * Character classes define the valid word characters for hyphenation. If a
 * word contains a character not defined in any of the classes, it is not
 * hyphenated. It also defines a way to normalize the characters in order to
 * compare them with the stored patterns. Usually pattern files use only
 * lower case characters, in this case a class for letter 'a', for example,
 * should be defined as "aA", the first character being the normalization
 * char.
 */
public void addClass(String chargroup) {
  if (chargroup.length() == 0) {
    return;
  }
  // The first character of the group is the normalized equivalent for
  // every character in the group.
  char equivChar = chargroup.charAt(0);
  char[] key = {0, 0}; // two-char buffer: character + null terminator
  for (int i = 0; i < chargroup.length(); i++) {
    key[0] = chargroup.charAt(i);
    classmap.insert(key, 0, equivChar);
  }
}
/**
 * Add an exception to the tree. It is used by
 * {@link PatternParser PatternParser} class as callback to store the
 * hyphenation exceptions.
 *
 * @param word normalized word
 * @param hyphenatedword a vector of alternating strings and
 *        {@link Hyphen hyphen} objects.
 */
public void addException(String word, ArrayList hyphenatedword) {
  // Exceptions take precedence over the pattern algorithm (see hyphenate()).
  stoplist.put(word, hyphenatedword);
}
/**
 * Add a pattern to the tree. Mainly, to be used by
 * {@link PatternParser PatternParser} class as callback to add a pattern to
 * the tree.
 *
 * @param pattern the hyphenation pattern
 * @param ivalue interletter weight values indicating the desirability and
 *        priority of hyphenating at a given point within the pattern. It
 *        should contain only digit characters. (i.e. '0' to '9').
 */
public void addPattern(String pattern, String ivalue) {
  // Identical value strings are packed into the value space only once;
  // the auxiliary ivalues tree (alive only while loading, see
  // loadPatterns) maps each value string to its packed offset.
  int k = ivalues.find(ivalue);
  if (k <= 0) {
    k = packValues(ivalue);
    ivalues.insert(ivalue, (char) k);
  }
  insert(pattern, (char) k);
}
/**
 * Prints value-space and tree statistics to stdout (debugging aid).
 */
public void printStats() {
  System.out.println("Value space size = " + vspace.length());
  super.printStats();
}
}

View File

@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.compound.hyphenation;
import java.util.ArrayList;
/**
 * This interface is used to connect the XML pattern file parser to the
 * hyphenation tree.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
 */
public interface PatternConsumer {

  /**
   * Add a character class. A character class defines characters that are
   * considered equivalent for the purpose of hyphenation (e.g. "aA"). It
   * usually means to ignore case.
   *
   * @param chargroup character group; the first character is the
   *        normalization (equivalent) character for the whole group
   */
  void addClass(String chargroup);

  /**
   * Add a hyphenation exception. An exception replaces the result obtained by
   * the algorithm for cases for which this fails or the user wants to provide
   * his own hyphenation. A hyphenatedword is a vector of alternating String's
   * and {@link Hyphen Hyphen} instances
   *
   * @param word the word the exception applies to
   * @param hyphenatedword alternating Strings and Hyphen instances
   */
  void addException(String word, ArrayList hyphenatedword);

  /**
   * Add hyphenation patterns.
   *
   * @param pattern the pattern
   * @param values interletter values expressed as a string of digit characters.
   */
  void addPattern(String pattern, String values);
}

View File

@ -0,0 +1,518 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* $Id: PatternParser.java 426576 2006-07-28 15:44:37Z jeremias $ */
package org.apache.lucene.analysis.compound.hyphenation;
// SAX
import org.xml.sax.XMLReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.Attributes;
// Java
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import javax.xml.parsers.SAXParserFactory;
/**
 * A SAX document handler to read and parse hyphenation patterns from a XML
 * file.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
 */
public class PatternParser extends DefaultHandler implements PatternConsumer {

  XMLReader parser; // SAX parser driving this handler

  int currElement; // one of the ELEM_* constants: the element currently open

  PatternConsumer consumer; // receives classes, exceptions and patterns

  StringBuffer token; // accumulates character data between whitespace runs

  ArrayList exception; // exception entry being built (Strings and Hyphens)

  char hyphenChar; // shortcut hyphen character used in the exception list

  String errMsg; // message from the most recent parser warning/error

  static final int ELEM_CLASSES = 1;

  static final int ELEM_EXCEPTIONS = 2;

  static final int ELEM_PATTERNS = 3;

  static final int ELEM_HYPHEN = 4;

  /**
   * Creates a parser; this instance serves as content handler, error
   * handler and entity resolver for the underlying SAX reader.
   */
  public PatternParser() throws HyphenationException {
    token = new StringBuffer();
    parser = createParser();
    parser.setContentHandler(this);
    parser.setErrorHandler(this);
    parser.setEntityResolver(this);
    hyphenChar = '-'; // default
  }

  /**
   * Creates a parser that forwards parsed data to the given consumer.
   */
  public PatternParser(PatternConsumer consumer) throws HyphenationException {
    this();
    this.consumer = consumer;
  }

  public void setConsumer(PatternConsumer consumer) {
    this.consumer = consumer;
  }

  /**
   * Parses a hyphenation pattern file.
   *
   * @param filename the filename
   * @throws HyphenationException In case of an exception while parsing
   */
  public void parse(String filename) throws HyphenationException {
    parse(new File(filename));
  }

  /**
   * Parses a hyphenation pattern file.
   *
   * @param file the pattern file
   * @throws HyphenationException In case of an exception while parsing
   */
  public void parse(File file) throws HyphenationException {
    try {
      InputSource src = new InputSource(file.toURL().toExternalForm());
      parse(src);
    } catch (MalformedURLException e) {
      throw new HyphenationException("Error converting the File '" + file
          + "' to a URL: " + e.getMessage());
    }
  }

  /**
   * Parses a hyphenation pattern file.
   *
   * @param source the InputSource for the file
   * @throws HyphenationException In case of an exception while parsing
   */
  public void parse(InputSource source) throws HyphenationException {
    try {
      parser.parse(source);
    } catch (FileNotFoundException fnfe) {
      throw new HyphenationException("File not found: " + fnfe.getMessage());
    } catch (IOException ioe) {
      throw new HyphenationException(ioe.getMessage());
    } catch (SAXException e) {
      // errMsg was set by warning()/error()/fatalError(); it may be null
      // when the SAXException did not pass through the error handler.
      throw new HyphenationException(errMsg);
    }
  }

  /**
   * Creates a SAX parser using JAXP
   *
   * @return the created SAX parser
   */
  static XMLReader createParser() {
    try {
      SAXParserFactory factory = SAXParserFactory.newInstance();
      factory.setNamespaceAware(true);
      return factory.newSAXParser().getXMLReader();
    } catch (Exception e) {
      throw new RuntimeException("Couldn't create XMLReader: " + e.getMessage());
    }
  }

  /**
   * Extracts the next whitespace-delimited word from {@code chars},
   * accumulating partial words across calls in {@code token}.
   *
   * @return the completed word, or null when more input is needed
   */
  protected String readToken(StringBuffer chars) {
    String word;
    boolean space = false;
    int i;
    // skip leading whitespace
    for (i = 0; i < chars.length(); i++) {
      if (Character.isWhitespace(chars.charAt(i))) {
        space = true;
      } else {
        break;
      }
    }
    if (space) {
      // chars.delete(0,i)  -- shift the buffer left by i by hand
      for (int countr = i; countr < chars.length(); countr++) {
        chars.setCharAt(countr - i, chars.charAt(countr));
      }
      chars.setLength(chars.length() - i);
      // whitespace terminates any token carried over from a previous call
      if (token.length() > 0) {
        word = token.toString();
        token.setLength(0);
        return word;
      }
    }
    // find the end of the next word
    space = false;
    for (i = 0; i < chars.length(); i++) {
      if (Character.isWhitespace(chars.charAt(i))) {
        space = true;
        break;
      }
    }
    token.append(chars.toString().substring(0, i));
    // chars.delete(0,i)  -- shift the buffer left by i by hand
    for (int countr = i; countr < chars.length(); countr++) {
      chars.setCharAt(countr - i, chars.charAt(countr));
    }
    chars.setLength(chars.length() - i);
    if (space) {
      word = token.toString();
      token.setLength(0);
      return word;
    }
    // no trailing whitespace: keep the partial word for the next call
    token.append(chars);
    return null;
  }

  /**
   * Strips the digits out of a raw pattern word, leaving only the
   * pattern characters.
   */
  protected static String getPattern(String word) {
    StringBuffer pat = new StringBuffer();
    int len = word.length();
    for (int i = 0; i < len; i++) {
      if (!Character.isDigit(word.charAt(i))) {
        pat.append(word.charAt(i));
      }
    }
    return pat.toString();
  }

  /**
   * Splits plain strings in an exception entry on {@code hyphenChar},
   * replacing each occurrence with a {@link Hyphen} object.
   */
  protected ArrayList normalizeException(ArrayList ex) {
    ArrayList res = new ArrayList();
    for (int i = 0; i < ex.size(); i++) {
      Object item = ex.get(i);
      if (item instanceof String) {
        String str = (String) item;
        StringBuffer buf = new StringBuffer();
        for (int j = 0; j < str.length(); j++) {
          char c = str.charAt(j);
          if (c != hyphenChar) {
            buf.append(c);
          } else {
            res.add(buf.toString());
            buf.setLength(0);
            char[] h = new char[1];
            h[0] = hyphenChar;
            // we use here hyphenChar which is not necessarily
            // the one to be printed
            res.add(new Hyphen(new String(h), null, null));
          }
        }
        if (buf.length() > 0) {
          res.add(buf.toString());
        }
      } else {
        res.add(item);
      }
    }
    return res;
  }

  /**
   * Reconstructs the plain word from an exception entry by concatenating
   * its strings and the no-break text of its hyphens.
   */
  protected String getExceptionWord(ArrayList ex) {
    StringBuffer res = new StringBuffer();
    for (int i = 0; i < ex.size(); i++) {
      Object item = ex.get(i);
      if (item instanceof String) {
        res.append((String) item);
      } else {
        if (((Hyphen) item).noBreak != null) {
          res.append(((Hyphen) item).noBreak);
        }
      }
    }
    return res.toString();
  }

  /**
   * Extracts the interletter digit values from a raw pattern word: one
   * digit per letter position, defaulting to '0' where no digit appears.
   */
  protected static String getInterletterValues(String pat) {
    StringBuffer il = new StringBuffer();
    String word = pat + "a"; // add dummy letter to serve as sentinel
    int len = word.length();
    for (int i = 0; i < len; i++) {
      char c = word.charAt(i);
      if (Character.isDigit(c)) {
        il.append(c);
        i++; // skip the letter the digit refers to
      } else {
        il.append('0');
      }
    }
    return il.toString();
  }

  //
  // EntityResolver methods
  //
  // Serve the hyphenation DTD from memory instead of fetching it from an
  // external location.
  public InputSource resolveEntity(String publicId, String systemId)
      throws SAXException, IOException {
    return HyphenationDTDGenerator.generateDTD();
  }

  //
  // ContentHandler methods
  //

  /**
   * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
   *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
   */
  public void startElement(String uri, String local, String raw,
      Attributes attrs) {
    if (local.equals("hyphen-char")) {
      String h = attrs.getValue("value");
      if (h != null && h.length() == 1) {
        hyphenChar = h.charAt(0);
      }
    } else if (local.equals("classes")) {
      currElement = ELEM_CLASSES;
    } else if (local.equals("patterns")) {
      currElement = ELEM_PATTERNS;
    } else if (local.equals("exceptions")) {
      currElement = ELEM_EXCEPTIONS;
      exception = new ArrayList();
    } else if (local.equals("hyphen")) {
      // a <hyphen> element inside an exception word: flush the pending
      // text and record the explicit hyphen
      if (token.length() > 0) {
        exception.add(token.toString());
      }
      exception.add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"),
          attrs.getValue("post")));
      currElement = ELEM_HYPHEN;
    }
    token.setLength(0);
  }

  /**
   * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
   *      java.lang.String, java.lang.String)
   */
  public void endElement(String uri, String local, String raw) {
    // flush any pending token to the consumer for the closing element
    if (token.length() > 0) {
      String word = token.toString();
      switch (currElement) {
        case ELEM_CLASSES:
          consumer.addClass(word);
          break;
        case ELEM_EXCEPTIONS:
          exception.add(word);
          exception = normalizeException(exception);
          consumer.addException(getExceptionWord(exception),
              (ArrayList) exception.clone());
          break;
        case ELEM_PATTERNS:
          consumer.addPattern(getPattern(word), getInterletterValues(word));
          break;
        case ELEM_HYPHEN:
          // nothing to do
          break;
      }
      if (currElement != ELEM_HYPHEN) {
        token.setLength(0);
      }
    }

    // closing a <hyphen> returns us to the surrounding <exceptions> scope
    if (currElement == ELEM_HYPHEN) {
      currElement = ELEM_EXCEPTIONS;
    } else {
      currElement = 0;
    }

  }

  /**
   * @see org.xml.sax.ContentHandler#characters(char[], int, int)
   */
  public void characters(char ch[], int start, int length) {
    StringBuffer chars = new StringBuffer(length);
    chars.append(ch, start, length);
    // consume complete whitespace-separated words as they become available
    String word = readToken(chars);
    while (word != null) {
      // System.out.println("\"" + word + "\"");
      switch (currElement) {
        case ELEM_CLASSES:
          consumer.addClass(word);
          break;
        case ELEM_EXCEPTIONS:
          exception.add(word);
          exception = normalizeException(exception);
          consumer.addException(getExceptionWord(exception),
              (ArrayList) exception.clone());
          exception.clear();
          break;
        case ELEM_PATTERNS:
          consumer.addPattern(getPattern(word), getInterletterValues(word));
          break;
      }
      word = readToken(chars);
    }

  }

  //
  // ErrorHandler methods
  //

  /**
   * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
   */
  public void warning(SAXParseException ex) {
    errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage();
  }

  /**
   * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
   */
  public void error(SAXParseException ex) {
    errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
  }

  /**
   * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
   */
  public void fatalError(SAXParseException ex) throws SAXException {
    errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage();
    throw ex;
  }

  /**
   * Returns a string of the location.
   */
  private String getLocationString(SAXParseException ex) {
    StringBuffer str = new StringBuffer();

    String systemId = ex.getSystemId();
    if (systemId != null) {
      int index = systemId.lastIndexOf('/');
      if (index != -1) {
        systemId = systemId.substring(index + 1);
      }
      str.append(systemId);
    }
    str.append(':');
    str.append(ex.getLineNumber());
    str.append(':');
    str.append(ex.getColumnNumber());

    return str.toString();

  } // getLocationString(SAXParseException):String

  // PatternConsumer implementation for testing purposes
  public void addClass(String c) {
    System.out.println("class: " + c);
  }

  public void addException(String w, ArrayList e) {
    System.out.println("exception: " + w + " : " + e.toString());
  }

  public void addPattern(String p, String v) {
    System.out.println("pattern: " + p + " : " + v);
  }

  public static void main(String[] args) throws Exception {
    if (args.length > 0) {
      PatternParser pp = new PatternParser();
      pp.setConsumer(pp);
      pp.parse(args[0]);
    }
  }
}
/**
 * Supplies the hyphenation DTD from an in-memory string so that pattern
 * files can be parsed without resolving an external DTD (used by
 * {@code PatternParser.resolveEntity}).
 */
class HyphenationDTDGenerator {

  /** The complete hyphenation DTD text. */
  public static final String DTD_STRING =
      "<?xml version=\"1.0\" encoding=\"US-ASCII\"?>\n" +
      "<!--\n" +
      "  Copyright 1999-2004 The Apache Software Foundation\n" +
      "\n" +
      "  Licensed under the Apache License, Version 2.0 (the \"License\");\n" +
      "  you may not use this file except in compliance with the License.\n" +
      "  You may obtain a copy of the License at\n" +
      "\n" +
      "       http://www.apache.org/licenses/LICENSE-2.0\n" +
      "\n" +
      "  Unless required by applicable law or agreed to in writing, software\n" +
      "  distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
      "  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
      "  See the License for the specific language governing permissions and\n" +
      "  limitations under the License.\n" +
      "-->\n" +
      "<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->\n" +
      "\n" +
      "<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,\n" +
      "                           classes, exceptions?, patterns)>\n" +
      "\n" +
      "<!-- Hyphen character to be used in the exception list as shortcut for\n" +
      "     <hyphen pre-break=\"-\"/>. Defaults to '-'\n" +
      "-->\n" +
      "<!ELEMENT hyphen-char EMPTY>\n" +
      "<!ATTLIST hyphen-char value CDATA #REQUIRED>\n" +
      "\n" +
      "<!-- Default minimun length in characters of hyphenated word fragments\n" +
      "     before and after the line break. For some languages this is not\n" +
      "     only for aesthetic purposes, wrong hyphens may be generated if this\n" +
      "     is not accounted for.\n" +
      "-->\n" +
      "<!ELEMENT hyphen-min EMPTY>\n" +
      "<!ATTLIST hyphen-min before CDATA #REQUIRED>\n" +
      "<!ATTLIST hyphen-min after CDATA #REQUIRED>\n" +
      "\n" +
      "<!-- Character equivalent classes: space separated list of character groups, all\n" +
      "     characters in a group are to be treated equivalent as far as\n" +
      "     the hyphenation algorithm is concerned. The first character in a group\n" +
      "     is the group's equivalent character. Patterns should only contain\n" +
      "     first characters. It also defines word characters, i.e. a word that\n" +
      "     contains characters not present in any of the classes is not hyphenated.\n" +
      "-->\n" +
      "<!ELEMENT classes (#PCDATA)>\n" +
      "\n" +
      "<!-- Hyphenation exceptions: space separated list of hyphenated words.\n" +
      "     A hyphen is indicated by the hyphen tag, but you can use the\n" +
      "     hyphen-char defined previously as shortcut. This is in cases\n" +
      "     when the algorithm procedure finds wrong hyphens or you want\n" +
      "     to provide your own hyphenation for some words.\n" +
      "-->\n" +
      "<!ELEMENT exceptions (#PCDATA|hyphen)* >\n" +
      "\n" +
      "<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'\n" +
      "     characters as described before, between any two word characters a digit\n" +
      "     in the range 0 to 9 may be specified. The absence of a digit is equivalent\n" +
      "     to zero. The '.' character is reserved to indicate begining or ending\n" +
      "     of words. -->\n" +
      "<!ELEMENT patterns (#PCDATA)>\n" +
      "\n" +
      "<!-- A \"full hyphen\" equivalent to TeX's \\discretionary\n" +
      "     with pre-break, post-break and no-break attributes.\n" +
      "     To be used in the exceptions list, the hyphen character is not\n" +
      "     automatically added -->\n" +
      "<!ELEMENT hyphen EMPTY>\n" +
      "<!ATTLIST hyphen pre CDATA #IMPLIED>\n" +
      "<!ATTLIST hyphen no CDATA #IMPLIED>\n" +
      "<!ATTLIST hyphen post CDATA #IMPLIED>\n";

  /**
   * @return a fresh InputSource that reads the in-memory DTD
   */
  public static InputSource generateDTD() {
    StringReader dtdReader = new StringReader(DTD_STRING);
    return new InputSource(dtdReader);
  }
}

View File

@ -0,0 +1,663 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.compound.hyphenation;
import java.util.Enumeration;
import java.util.Stack;
import java.io.Serializable;
/**
* <h2>Ternary Search Tree.</h2>
*
* <p>
 * A ternary search tree is a hybrid between a binary tree and a digital search
* tree (trie). Keys are limited to strings. A data value of type char is stored
* in each leaf node. It can be used as an index (or pointer) to the data.
* Branches that only contain one key are compressed to one node by storing a
* pointer to the trailer substring of the key. This class is intended to serve
* as base class or helper class to implement Dictionary collections or the
* like. Ternary trees have some nice properties as the following: the tree can
* be traversed in sorted order, partial matches (wildcard) can be implemented,
* retrieval of all keys within a given distance from the target, etc. The
* storage requirements are higher than a binary tree but a lot less than a
* trie. Performance is comparable with a hash table, sometimes it outperforms a
* hash function (most of the time can determine a miss faster than a hash).
* </p>
*
* <p>
* The main purpose of this java port is to serve as a base for implementing
* TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
* requires from 5000 to 15000 hyphenation patterns which will be keys in this
* tree. The strings patterns are usually small (from 2 to 5 characters), but
* each char in the tree is stored in a node. Thus memory usage is the main
 * concern. We will sacrifice 'elegance' to keep memory requirements to the
* minimum. Using java's char type as pointer (yes, I know pointer it is a
* forbidden word in java) we can keep the size of the node to be just 8 bytes
* (3 pointers and the data char). This gives room for about 65000 nodes. In my
* tests the english patterns took 7694 nodes and the german patterns 10055
* nodes, so I think we are safe.
* </p>
*
* <p>
* All said, this is a map with strings as keys and char as value. Pretty
* limited!. It can be extended to a general map by using the string
* representation of an object and using the char value as an index to an array
* that contains the object values.
* </p>
*
* This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified.
*/
public class TernaryTree implements Cloneable, Serializable {
/**
* We use 4 arrays to represent a node. I guess I should have created a proper
* node class, but somehow Knuth's pascal code made me forget we now have a
* portable language with virtual memory management and automatic garbage
* collection! And now is kind of late, furthermore, if it ain't broken, don't
* fix it.
*/
/**
* Pointer to low branch and to rest of the key when it is stored directly in
* this node, we don't have unions in java!
*/
protected char[] lo;
/**
* Pointer to high branch.
*/
protected char[] hi;
/**
* Pointer to equal branch and to data when this node is a string terminator.
*/
protected char[] eq;
/**
* <P>
* The character stored in this node: splitchar. Two special values are
* reserved:
* </P>
* <ul>
* <li>0x0000 as string terminator</li>
* <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
* </ul>
* <p>
* This shouldn't be a problem if we give the usual semantics to strings since
 * 0xFFFF is guaranteed not to be a Unicode character.
* </p>
*/
protected char[] sc;
/**
* This vector holds the trailing of the keys when the branch is compressed.
*/
protected CharVector kv;
protected char root;
protected char freenode;
protected int length; // number of items in tree
protected static final int BLOCK_SIZE = 2048; // allocation size for arrays
// Package-private constructor: builds an empty tree.
TernaryTree() {
  init();
}
/**
 * Resets the tree to its empty state, allocating fresh node arrays.
 */
protected void init() {
  root = 0;
  freenode = 1; // node 0 is reserved as the null pointer
  length = 0;
  kv = new CharVector();
  lo = new char[BLOCK_SIZE];
  hi = new char[BLOCK_SIZE];
  eq = new char[BLOCK_SIZE];
  sc = new char[BLOCK_SIZE];
}
/**
 * Branches are initially compressed, needing one node per key plus the size
 * of the string key. They are decompressed as needed when another key with
 * the same prefix is inserted. This saves a lot of space, especially for
 * long keys.
 */
public void insert(String key, char val) {
  int len = key.length();
  // Worst case: one node per character plus the null terminator.
  if (freenode + len + 1 > eq.length) {
    redimNodeArrays(eq.length + BLOCK_SIZE);
  }
  char[] strkey = new char[len + 1];
  key.getChars(0, len, strkey, 0);
  strkey[len] = 0; // null terminator expected by the tree
  root = insert(root, strkey, 0, val);
}
/**
 * Inserts a null-terminated char array key starting at {@code start}.
 *
 * @param key null-terminated key characters
 * @param start offset of the first key character
 * @param val the value to associate with the key
 */
public void insert(char[] key, int start, char val) {
  // NOTE(review): length is measured from index 0, not from 'start', so
  // this over-reserves when start > 0 -- verify intended.
  int len = strlen(key) + 1;
  if (freenode + len > eq.length) {
    redimNodeArrays(eq.length + BLOCK_SIZE);
  }
  root = insert(root, key, start, val);
}
/**
 * The actual insertion function, recursive version.
 *
 * @param p the node to insert under (0 = no node yet)
 * @param key null-terminated key characters
 * @param start offset of the current key character
 * @param val the value to store at the key's terminator node
 * @return the (possibly newly created) node replacing p
 */
private char insert(char p, char[] key, int start, char val) {
  int len = strlen(key, start);
  if (p == 0) {
    // this means there is no branch, this node will start a new branch.
    // Instead of doing that, we store the key somewhere else and create
    // only one node with a pointer to the key
    p = freenode++;
    eq[p] = val; // holds data
    length++;
    hi[p] = 0;
    if (len > 0) {
      sc[p] = 0xFFFF; // indicates branch is compressed
      lo[p] = (char) kv.alloc(len + 1); // use 'lo' to hold pointer to key
      strcpy(kv.getArray(), lo[p], key, start);
    } else {
      sc[p] = 0;
      lo[p] = 0;
    }
    return p;
  }
  if (sc[p] == 0xFFFF) {
    // branch is compressed: need to decompress
    // this will generate garbage in the external key array
    // but we can do some garbage collection later
    char pp = freenode++;
    lo[pp] = lo[p]; // previous pointer to key
    eq[pp] = eq[p]; // previous pointer to data
    lo[p] = 0;
    if (len > 0) {
      // pull the first stored char up into this node; the remainder (if
      // any) stays compressed in the new child pp
      sc[p] = kv.get(lo[pp]);
      eq[p] = pp;
      lo[pp]++;
      if (kv.get(lo[pp]) == 0) {
        // key completly decompressed leaving garbage in key array
        lo[pp] = 0;
        sc[pp] = 0;
        hi[pp] = 0;
      } else {
        // we only got first char of key, rest is still there
        sc[pp] = 0xFFFF;
      }
    } else {
      // In this case we can save a node by swapping the new node
      // with the compressed node
      sc[pp] = 0xFFFF;
      hi[p] = pp;
      sc[p] = 0;
      eq[p] = val;
      length++;
      return p;
    }
  }
  char s = key[start];
  if (s < sc[p]) {
    lo[p] = insert(lo[p], key, start, val);
  } else if (s == sc[p]) {
    if (s != 0) {
      eq[p] = insert(eq[p], key, start + 1, val);
    } else {
      // key already in tree, overwrite data
      eq[p] = val;
    }
  } else {
    hi[p] = insert(hi[p], key, start, val);
  }
  return p;
}
/**
 * Compares two null-terminated char arrays, C-style.
 *
 * @return 0 if equal, a negative value if the first compares lower,
 *         a positive value otherwise
 */
public static int strcmp(char[] a, int startA, char[] b, int startB) {
  // Walk both arrays in lockstep until the characters differ or the
  // shared terminator (0) is reached.
  int i = startA;
  int j = startB;
  while (a[i] == b[j]) {
    if (a[i] == 0) {
      return 0; // both strings ended together: equal
    }
    i++;
    j++;
  }
  return a[i] - b[j];
}
/**
 * Compares a String with a null-terminated char array, C-style.
 *
 * @return 0 if equal, a negative value if the string compares lower,
 *         a positive value otherwise
 */
public static int strcmp(String str, char[] a, int start) {
  int n = str.length();
  int i = 0;
  while (i < n) {
    int diff = str.charAt(i) - a[start + i];
    if (diff != 0) {
      return diff;
    }
    if (a[start + i] == 0) {
      // Array terminator matched a 0 character in the string; diff is 0 here.
      return diff;
    }
    i++;
  }
  // The string is exhausted: any remaining array character decides the order.
  if (a[start + i] != 0) {
    return -a[start + i];
  }
  return 0;
}
/**
 * Copies a null-terminated char sequence, C-style, including the terminator.
 * The destination must be large enough to hold the source tail plus the 0.
 */
public static void strcpy(char[] dst, int di, char[] src, int si) {
  char ch;
  while ((ch = src[si++]) != 0) {
    dst[di++] = ch;
  }
  dst[di] = 0; // terminate the copy
}
/**
 * Length of a null-terminated char sequence starting at {@code start}:
 * counts characters up to the 0 terminator or the physical end of the
 * array, whichever comes first.
 */
public static int strlen(char[] a, int start) {
  int i = start;
  while (i < a.length && a[i] != 0) {
    i++;
  }
  return i - start;
}
/**
 * Length of a null-terminated char sequence starting at index 0
 * (equivalent to {@code strlen(a, 0)}).
 */
public static int strlen(char[] a) {
  int n = 0;
  while (n < a.length && a[n] != 0) {
    n++;
  }
  return n;
}
/**
 * Looks up a key given as a String.
 *
 * @return the data value stored for the key, or -1 if it is absent
 */
public int find(String key) {
  // Convert to a null-terminated char array and delegate.
  int n = key.length();
  char[] buf = new char[n + 1];
  key.getChars(0, n, buf, 0);
  buf[n] = 0;
  return find(buf, 0);
}
/**
 * Looks up a null-terminated key starting at {@code start}.
 *
 * @return the data value stored for the key, or -1 if it is absent
 */
public int find(char[] key, int start) {
int d;
char p = root;
int i = start;
char c;
while (p != 0) {
if (sc[p] == 0xFFFF) {
// Compressed branch: the remainder of the stored key lives in the
// external kv array; compare it wholesale against the query tail.
if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
return eq[p];
} else {
return -1;
}
}
c = key[i];
d = c - sc[p];
if (d == 0) {
if (c == 0) {
// Matched the 0 terminator: key fully consumed, eq holds the data.
return eq[p];
}
// Matched this node's split char: advance in the key and descend
// the equal branch.
i++;
p = eq[p];
} else if (d < 0) {
p = lo[p];
} else {
p = hi[p];
}
}
// Ran off the tree without consuming the key.
return -1;
}
/**
 * Returns true if the key is stored in the tree.
 */
public boolean knows(String key) {
  // find() returns a non-negative value exactly when the key is present.
  return find(key) >= 0;
}
// Redimension the four parallel node arrays to the requested size.
private void redimNodeArrays(int newsize) {
  // Number of entries that survive the resize (all of them when growing,
  // only the first 'newsize' when shrinking).
  int keep = newsize < lo.length ? newsize : lo.length;
  lo = resizeArray(lo, newsize, keep);
  hi = resizeArray(hi, newsize, keep);
  eq = resizeArray(eq, newsize, keep);
  sc = resizeArray(sc, newsize, keep);
}

// Returns a fresh array of length 'newLength' holding the first 'count'
// entries of 'src'.
private static char[] resizeArray(char[] src, int newLength, int count) {
  char[] dst = new char[newLength];
  System.arraycopy(src, 0, dst, 0, count);
  return dst;
}
/**
 * Returns the number of keys stored in the tree.
 */
public int size() {
return length;
}
/**
 * Returns a deep copy of this tree: the node arrays and the external key
 * vector are duplicated, so the copy can be modified independently.
 */
public Object clone() {
  TernaryTree copy = new TernaryTree();
  copy.root = this.root;
  copy.freenode = this.freenode;
  copy.length = this.length;
  copy.lo = (char[]) this.lo.clone();
  copy.hi = (char[]) this.hi.clone();
  copy.eq = (char[]) this.eq.clone();
  copy.sc = (char[]) this.sc.clone();
  copy.kv = (CharVector) this.kv.clone();
  return copy;
}
/**
 * Recursively insert the median first and then the medians of the lower and
 * upper halves, and so on, producing a balanced tree. The array of keys is
 * assumed to be sorted in ascending order.
 *
 * @param k sorted keys
 * @param v values parallel to {@code k}
 * @param offset start of the subrange to insert
 * @param n length of the subrange to insert
 */
protected void insertBalanced(String[] k, char[] v, int offset, int n) {
  if (n < 1) {
    return;
  }
  // Insert the median of the range, then recurse on each half.
  int mid = n >> 1;
  insert(k[offset + mid], v[offset + mid]);
  insertBalanced(k, v, offset, mid);
  insertBalanced(k, v, offset + mid + 1, n - mid - 1);
}
/**
 * Balance the tree for best search performance: dump all key/value pairs
 * in sorted order, reset the tree, and re-insert them median-first.
 */
public void balance() {
  int n = length;
  String[] keys = new String[n];
  char[] vals = new char[n];
  Iterator it = new Iterator();
  int idx = 0;
  while (it.hasMoreElements()) {
    // getValue() must be read before nextElement() advances the cursor.
    vals[idx] = it.getValue();
    keys[idx++] = (String) it.nextElement();
  }
  init();
  insertBalanced(keys, vals, 0, n);
  // With a uniform letter distribution sc[root] should end up around 'm'.
}
/**
 * Each node stores a character (splitchar) which is part of some key(s). In a
 * compressed branch (one that only contains a single string key) the trailer
 * of the key which is not already in nodes is stored externally in the kv
 * array. As items are inserted, key substrings decrease. Some substrings may
 * completely disappear when the whole branch is totally decompressed. This
 * method traverses the tree to find the key substrings actually used, removes
 * duplicates via a map (implemented with a TernaryTree!), and shrinks all
 * storage to the minimum required.
 */
public void trimToSize() {
  // Balance first so lookups stay fast after compaction.
  balance();
  // Shrink the node arrays to exactly the nodes in use.
  redimNodeArrays(freenode);
  // Rebuild the external key array, dropping garbage and duplicates.
  CharVector compacted = new CharVector();
  compacted.alloc(1); // NOTE(review): slot 0 appears reserved as a null offset — verify
  TernaryTree seen = new TernaryTree();
  compact(compacted, seen, root);
  kv = compacted;
  kv.trimToSize();
}
/**
 * Recursively rebuilds the external key storage: every key substring still
 * referenced by a compressed branch is copied into {@code kx}, de-duplicated
 * via {@code map} (itself a TernaryTree keyed by the substrings), and the
 * branch's pointer is updated to the new offset.
 *
 * @param kx target CharVector receiving the compacted key data
 * @param map records substrings already copied and their offsets in kx
 * @param p node index to process (0 = no node, recursion stops)
 */
private void compact(CharVector kx, TernaryTree map, char p) {
int k;
if (p == 0) {
return;
}
if (sc[p] == 0xFFFF) {
// Compressed branch: reuse an identical substring already copied into
// kx, or copy this one and remember its new offset.
k = map.find(kv.getArray(), lo[p]);
if (k < 0) {
k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
map.insert(kx.getArray(), k, (char) k);
}
lo[p] = (char) k;
} else {
// Ordinary node: recurse into all existing branches. The equal branch
// only exists when the split char is not the terminator.
compact(kx, map, lo[p]);
if (sc[p] != 0) {
compact(kx, map, eq[p]);
}
compact(kx, map, hi[p]);
}
}
/**
 * Returns an enumeration over all keys in the tree (the iterator walks
 * low branches first, so keys come out in order).
 */
public Enumeration keys() {
return new Iterator();
}
/**
 * Enumeration over all keys stored in the tree. The traversal keeps an
 * explicit stack of (parent node, child branch) pairs plus a parallel
 * "key stack" of split characters accumulated on the way down, so the
 * current key can be reconstructed at every data node.
 */
public class Iterator implements Enumeration {
/**
 * current node index
 */
int cur;
/**
 * current key
 */
String curkey;
// One stack frame: a node and which of its branches (0=low, 1=equal,
// 2=high) the traversal has already descended into.
private class Item implements Cloneable {
char parent;
char child;
public Item() {
parent = 0;
child = 0;
}
public Item(char p, char c) {
parent = p;
child = c;
}
public Object clone() {
return new Item(parent, child);
}
}
/**
 * Node stack
 */
Stack ns;
/**
 * key stack implemented with a StringBuffer
 */
StringBuffer ks;
public Iterator() {
cur = -1;
ns = new Stack();
ks = new StringBuffer();
rewind();
}
// Restart the enumeration from the first key.
public void rewind() {
ns.removeAllElements();
ks.setLength(0);
cur = root;
run();
}
// Returns the current key and advances to the next one.
public Object nextElement() {
String res = new String(curkey);
cur = up();
run();
return res;
}
// Data value stored at the current node, or 0 when exhausted.
public char getValue() {
if (cur >= 0) {
return eq[cur];
}
return 0;
}
public boolean hasMoreElements() {
return (cur != -1);
}
/**
 * traverse upwards
 */
private int up() {
Item i = new Item();
int res = 0;
if (ns.empty()) {
return -1;
}
if (cur != 0 && sc[cur] == 0) {
// NOTE(review): at a terminator node the low pointer is followed
// directly — confirm lo holds the continuation here.
return lo[cur];
}
boolean climb = true;
while (climb) {
i = (Item) ns.pop();
i.child++;
switch (i.child) {
case 1:
// Next branch to visit is the equal child (if the split char is
// not the terminator); record the split char on the key stack.
if (sc[i.parent] != 0) {
res = eq[i.parent];
ns.push(i.clone());
ks.append(sc[i.parent]);
} else {
// No equal branch: skip straight to the high child.
i.child++;
ns.push(i.clone());
res = hi[i.parent];
}
climb = false;
break;
case 2:
// Moving to the high child: the parent's split char is no longer
// part of the key prefix, so pop it off the key stack.
res = hi[i.parent];
ns.push(i.clone());
if (ks.length() > 0) {
ks.setLength(ks.length() - 1); // pop
}
climb = false;
break;
default:
// All three branches of this frame are done; climb further up.
if (ns.empty()) {
return -1;
}
climb = true;
break;
}
}
return res;
}
/**
 * traverse the tree to find next key
 */
private int run() {
if (cur == -1) {
return -1;
}
boolean leaf = false;
while (true) {
// first go down on low branch until leaf or compressed branch
while (cur != 0) {
if (sc[cur] == 0xFFFF) {
leaf = true;
break;
}
ns.push(new Item((char) cur, '\u0000'));
if (sc[cur] == 0) {
leaf = true;
break;
}
cur = lo[cur];
}
if (leaf) {
break;
}
// nothing found, go up one node and try again
cur = up();
if (cur == -1) {
return -1;
}
}
// The current node should be a data node and
// the key should be in the key stack (at least partially)
StringBuffer buf = new StringBuffer(ks.toString());
if (sc[cur] == 0xFFFF) {
// Compressed branch: append the externally stored key trailer.
int p = lo[cur];
while (kv.get(p) != 0) {
buf.append(kv.get(p++));
}
}
curkey = buf.toString();
return 0;
}
}
/**
 * Prints key/node statistics to stdout (debugging aid).
 */
public void printStats() {
System.out.println("Number of keys = " + Integer.toString(length));
// Integer.toString is deliberate here: freenode is a char, and plain string
// concatenation would append it as a character rather than as a number.
System.out.println("Node count = " + Integer.toString(freenode));
// System.out.println("Array length = " + Integer.toString(eq.length));
System.out.println("Key Array length = " + Integer.toString(kv.length()));
/*
 * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
 * System.out.print(kv.get(i)); else System.out.println("");
 * System.out.println("Keys:"); for(Enumeration enum = keys();
 * enum.hasMoreElements(); ) System.out.println(enum.nextElement());
 */
}
/**
 * Tiny manual smoke test: insert a few keys, compact the tree, and look
 * some of them up (the last lookup is expected to miss).
 */
public static void main(String[] args) throws Exception {
  TernaryTree tree = new TernaryTree();
  tree.insert("Carlos", 'C');
  tree.insert("Car", 'r');
  tree.insert("palos", 'l');
  tree.insert("pa", 'p');
  tree.trimToSize();
  System.out.println((char) tree.find("Car"));
  System.out.println((char) tree.find("Carlos"));
  System.out.println((char) tree.find("alto"));
  tree.printStats();
}
}

View File

@ -0,0 +1,68 @@
<?xml version="1.0" encoding="US-ASCII"?>
<!--
Copyright 1999-2004 The Apache Software Foundation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->
<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
classes, exceptions?, patterns)>
<!-- Hyphen character to be used in the exception list as shortcut for
<hyphen pre-break="-"/>. Defaults to '-'
-->
<!ELEMENT hyphen-char EMPTY>
<!ATTLIST hyphen-char value CDATA #REQUIRED>
<!-- Default minimum length in characters of hyphenated word fragments
before and after the line break. For some languages this is not
only for aesthetic purposes, wrong hyphens may be generated if this
is not accounted for.
-->
<!ELEMENT hyphen-min EMPTY>
<!ATTLIST hyphen-min before CDATA #REQUIRED>
<!ATTLIST hyphen-min after CDATA #REQUIRED>
<!-- Character equivalent classes: space separated list of character groups, all
characters in a group are to be treated equivalent as far as
the hyphenation algorithm is concerned. The first character in a group
is the group's equivalent character. Patterns should only contain
first characters. It also defines word characters, i.e. a word that
contains characters not present in any of the classes is not hyphenated.
-->
<!ELEMENT classes (#PCDATA)>
<!-- Hyphenation exceptions: space separated list of hyphenated words.
A hyphen is indicated by the hyphen tag, but you can use the
hyphen-char defined previously as shortcut. This is in cases
when the algorithm procedure finds wrong hyphens or you want
to provide your own hyphenation for some words.
-->
<!ELEMENT exceptions (#PCDATA|hyphen)* >
<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
characters as described before, between any two word characters a digit
in the range 0 to 9 may be specified. The absence of a digit is equivalent
to zero. The '.' character is reserved to indicate the beginning or ending
of words. -->
<!ELEMENT patterns (#PCDATA)>
<!-- A "full hyphen" equivalent to TeX's \discretionary
with pre-break, post-break and no-break attributes.
To be used in the exceptions list, the hyphen character is not
automatically added -->
<!ELEMENT hyphen EMPTY>
<!ATTLIST hyphen pre CDATA #IMPLIED>
<!ATTLIST hyphen no CDATA #IMPLIED>
<!ATTLIST hyphen post CDATA #IMPLIED>

View File

@ -0,0 +1,10 @@
<html>
<head>
<title>Hyphenation code for the CompoundWordTokenFilter</title>
</head>
<body>
<p>
The code for the compound word hyphenation is taken from the <a href="http://xmlgraphics.apache.org/fop/">Apache FOP project</a>. All credit for the hyphenation code belongs to them.
</p>
</body>
</html>

View File

@ -0,0 +1,166 @@
<html>
<head>
<title>CompoundWordTokenFilter</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></meta>
</head>
<body>
A filter that decomposes compound words you find in many Germanic
languages to the word parts. This example shows what it does:
<table border="1">
<tr>
<th>Input token stream</th>
</tr>
<tr>
<td>Rindfleisch&uuml;berwachungsgesetz Drahtschere abba</td>
</tr>
</table>
<br>
<table border="1">
<tr>
<th>Output token stream</th>
</tr>
<tr>
<td>(Rindfleisch&uuml;berwachungsgesetz,0,29)</td>
</tr>
<tr>
<td>(Rind,0,4,posIncr=0)</td>
</tr>
<tr>
<td>(fleisch,4,11,posIncr=0)</td>
</tr>
<tr>
<td>(&uuml;berwachung,11,22,posIncr=0)</td>
</tr>
<tr>
<td>(gesetz,23,29,posIncr=0)</td>
</tr>
<tr>
<td>(Drahtschere,30,41)</td>
</tr>
<tr>
<td>(Draht,30,35,posIncr=0)</td>
</tr>
<tr>
<td>(schere,35,41,posIncr=0)</td>
</tr>
<tr>
<td>(abba,42,46)</td>
</tr>
</table>
The input token is always preserved and the filters do not alter the case of word parts. There are two variants of the
filter available:
<ul>
<li><i>HyphenationCompoundWordTokenFilter</i>: it uses a
hyphenation grammar-based approach to find potential word parts of a
given word.</li>
<li><i>DictionaryCompoundWordTokenFilter</i>: it uses a
brute-force dictionary-only based approach to find the word parts of a given
word.</li>
</ul>
<h3>Compound word token filters</h3>
<h4>HyphenationCompoundWordTokenFilter</h4>
The {@link
org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
potential subwords that are worth checking against the dictionary. The
quality of the output tokens is directly connected to the quality of the
grammar file you use. For languages like German they are quite good.
<h5>Grammar file</h5>
Unfortunately we cannot bundle the hyphenation grammar files with Lucene
because they do not use an ASF compatible license (they use the LaTeX
Project Public License instead). You can find the XML based grammar
files at the
<a href="http://offo.sourceforge.net/hyphenation/index.html">Objects
For Formatting Objects</a>
(OFFO) Sourceforge project (direct link to download the pattern files:
<a href="http://downloads.sourceforge.net/offo/offo-hyphenation.zip">http://downloads.sourceforge.net/offo/offo-hyphenation.zip</a>
). The files you need are in the subfolder
<i>offo-hyphenation/hyph/</i>
.
<br />
Credits for the hyphenation code go to the
<a href="http://xmlgraphics.apache.org/fop/">Apache FOP project</a>
.
<h4>DictionaryCompoundWordTokenFilter</h4>
The {@link
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter
DictionaryCompoundWordTokenFilter} uses a dictionary-only approach to
find subwords in a compound word. It is much slower than the one that
uses the hyphenation grammars. You can use it as a first start to
see if your dictionary is good or not because it is much simpler in design.
<h3>Dictionary</h3>
The output quality of both token filters is directly connected to the
quality of the dictionary you use. They are language dependent of course.
You always should use a dictionary
that fits to the text you want to index. If you index medical text for
example then you should use a dictionary that contains medical words.
A good start for general text are the dictionaries you find at the
<a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice
dictionaries</a>
Wiki.
<h3>Which variant should I use?</h3>
This decision matrix should help you:
<table border="1">
<tr>
<th>Token filter</th>
<th>Output quality</th>
<th>Performance</th>
</tr>
<tr>
<td>HyphenationCompoundWordTokenFilter</td>
<td>good if grammar file is good &ndash; acceptable otherwise</td>
<td>fast</td>
</tr>
<tr>
<td>DictionaryCompoundWordTokenFilter</td>
<td>good</td>
<td>slow</td>
</tr>
</table>
<h3>Examples</h3>
<pre>
public void testHyphenationCompoundWordsDE() throws Exception {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "&Uuml;berwachung" };
Reader reader = new FileReader("de_DR.xml");
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
new WhitespaceTokenizer(new StringReader(
"Rindfleisch&uuml;berwachungsgesetz Drahtschere abba")), hyphenator,
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
Token t;
while ((t=tf.next())!=null) {
System.out.println(t);
}
}
public void testDumbCompoundWordsSE() throws Exception {
String[] dict = { "Bil", "D&ouml;rr", "Motor", "Tak", "Borr", "Slag", "Hammar",
"Pelar", "Glas", "&Ouml;gon", "Fodral", "Bas", "Fiol", "Makare", "Ges&auml;ll",
"Sko", "Vind", "Rute", "Torkare", "Blad" };
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
new WhitespaceTokenizer(
new StringReader(
"Bild&ouml;rr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glas&ouml;gonfodral Basfiolsfodral Basfiolsfodralmakareges&auml;ll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
dict);
Token t;
while ((t=tf.next())!=null) {
System.out.println(t);
}
}
</pre>
</body>
</html>

View File

@ -0,0 +1,214 @@
package org.apache.lucene.analysis.compound;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import junit.framework.TestCase;
/**
 * Tests for the compound word token filters. The hyphenation-based tests
 * need grammar files that cannot be bundled (license reasons), so setUp()
 * downloads them from a SourceForge mirror; if that fails the hyphenation
 * tests silently pass (see getHyphenationReader()).
 */
public class TestCompoundWordTokenFilter extends TestCase {
// Mirrors for the OFFO hyphenation pattern zip; one is picked at random.
private static String[] locations = {
"http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
"http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
"http://superb-west.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
"http://superb-east.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};
// Raw bytes of the downloaded pattern zip; null when the download failed.
private byte[] patternsFileContent;
protected void setUp() throws Exception {
super.setUp();
getHyphenationPatternFileContents();
}
// German decomposition with the default min/max subword sizes; the original
// compound token is preserved and subwords get position increment 0.
public void testHyphenationCompoundWordsDE() throws Exception {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung" };
Reader reader = getHyphenationReader("de_DR.xml");
if (reader == null) {
// we gracefully die if we have no reader
return;
}
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
new WhitespaceTokenizer(new StringReader(
"Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
assertFiltersTo(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind",
"fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere",
"abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] {
29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0,
0, 1 });
}
// With onlyLongestMatch=true only the longest dictionary match per
// hyphenation point is emitted ("Rindfleisch" instead of "Rind", etc.).
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung", "Rindfleisch", "Überwachungsgesetz" };
Reader reader = getHyphenationReader("de_DR.xml");
if (reader == null) {
// we gracefully die if we have no reader
return;
}
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
new WhitespaceTokenizer(new StringReader(
"Rindfleischüberwachungsgesetz")), hyphenator, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
assertFiltersTo(tf, new String[] { "Rindfleischüberwachungsgesetz",
"Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] {
0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0,
0, 0, 0 });
}
// Swedish decomposition using the dictionary-only (brute force) filter;
// needs no hyphenation grammar and therefore no download.
public void testDumbCompoundWordsSE() throws Exception {
String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
"Sko", "Vind", "Rute", "Torkare", "Blad" };
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
new WhitespaceTokenizer(
new StringReader(
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
dict);
assertFiltersTo(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
"Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
"Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
"Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
"fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
"fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
"Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
"Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
0, 0, 0, 1 });
}
// Dictionary-only filter with onlyLongestMatch=true: "Fiolsfodral" wins
// over "Fiols" where both match.
public void testDumbCompoundWordsSELongestMatch() throws Exception {
String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
"Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
new WhitespaceTokenizer(new StringReader("Basfiolsfodralmakaregesäll")),
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
assertFiltersTo(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
"fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
0, 0 });
}
/**
 * Drains the filter and asserts term text, start/end offsets and position
 * increments token by token, then asserts the stream is exhausted.
 */
private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
int[] endOffset, int[] posIncr) throws Exception {
for (int i = 0; i < s.length; ++i) {
Token t = tf.next();
assertNotNull(t);
assertEquals(s[i], new String(t.termBuffer(), 0, t.termLength()));
assertEquals(startOffset[i], t.startOffset());
assertEquals(endOffset[i], t.endOffset());
assertEquals(posIncr[i], t.getPositionIncrement());
}
assertNull(tf.next());
}
/**
 * Downloads the OFFO hyphenation pattern zip from a randomly chosen mirror
 * into {@link #patternsFileContent}; leaves it null on any I/O failure.
 */
private void getHyphenationPatternFileContents() {
try {
List urls = new LinkedList(Arrays.asList(locations));
Collections.shuffle(urls);
URL url = new URL((String)urls.get(0));
InputStream in = url.openStream();
byte[] buffer = new byte[1024];
ByteArrayOutputStream out = new ByteArrayOutputStream();
int count;
while ((count = in.read(buffer)) != -1) {
out.write(buffer, 0, count);
}
in.close();
out.close();
patternsFileContent = out.toByteArray();
} catch (IOException e) {
// we swallow all exceptions - the user might have no internet connection
}
}
/**
 * Extracts the named grammar file from the downloaded zip and returns it
 * as a Reader, or null when the download failed. The zip entries are
 * decoded as ISO-8859-1, matching the OFFO distribution.
 */
private Reader getHyphenationReader(String filename) throws Exception {
if (patternsFileContent == null) {
return null;
}
ZipInputStream zipstream = new ZipInputStream(new ByteArrayInputStream(
patternsFileContent));
ZipEntry entry;
while ((entry = zipstream.getNextEntry()) != null) {
if (entry.getName().equals("offo-hyphenation/hyph/" + filename)) {
byte[] buffer = new byte[1024];
ByteArrayOutputStream outstream = new ByteArrayOutputStream();
int count;
while ((count = zipstream.read(buffer)) != -1) {
outstream.write(buffer, 0, count);
}
outstream.close();
zipstream.close();
return new StringReader(new String(outstream.toByteArray(),
"ISO-8859-1"));
}
}
// we never should get here
return null;
}
}