mirror of https://github.com/apache/lucene.git
LUCENE-1166: Added token filter for decomposing compound words
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@657027 13f79535-47bb-0310-9956-ffa450edef68
parent aa0074f5db
commit 7a27cdcbc9
@@ -159,6 +159,7 @@ New features

12. LUCENE-400: Added word based n-gram filter (in contrib/analyzers) called ShingleFilter and an Analyzer wrapper
    that wraps another Analyzer's token stream with a ShingleFilter (Sebastian Kirsch, Steve Rowe via Grant Ingersoll)

13. LUCENE-1166: Decomposition tokenfilter for languages like German and Swedish (Thomas Peuss via Grant Ingersoll)

Optimizations
@@ -0,0 +1,169 @@
package org.apache.lucene.analysis.compound;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * Base class for decomposition token filters.
 */
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
  /**
   * The default for the minimal word length that gets decomposed
   */
  public static final int DEFAULT_MIN_WORD_SIZE = 5;

  /**
   * The default for the minimal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;

  /**
   * The default for the maximal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;

  protected final CharArraySet dictionary;
  protected final LinkedList tokens;
  protected final int minWordSize;
  protected final int minSubwordSize;
  protected final int maxSubwordSize;
  protected final boolean onlyLongestMatch;

  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(input, makeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
    this(input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, boolean onlyLongestMatch) {
    this(input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
    this(input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary) {
    this(input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  protected CompoundWordTokenFilterBase(TokenStream input, Set dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input);

    this.tokens = new LinkedList();
    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;

    if (dictionary instanceof CharArraySet) {
      this.dictionary = (CharArraySet) dictionary;
    } else {
      this.dictionary = new CharArraySet(dictionary.size(), false);
      addAllLowerCase(this.dictionary, dictionary);
    }
  }

  /**
   * Create a set of words from an array. The resulting Set does
   * case-insensitive matching.
   * TODO We should look for a faster dictionary lookup approach.
   * @param dictionary the dictionary words
   * @return a Set containing the lower-cased dictionary words
   */
  public static final Set makeDictionary(final String[] dictionary) {
    CharArraySet dict = new CharArraySet(dictionary.length, false);
    addAllLowerCase(dict, Arrays.asList(dictionary));
    return dict;
  }

  public Token next() throws IOException {
    if (tokens.size() > 0) {
      return (Token) tokens.removeFirst();
    }

    Token token = input.next();
    if (token == null) {
      return null;
    }

    decompose(token);

    if (tokens.size() > 0) {
      return (Token) tokens.removeFirst();
    } else {
      return null;
    }
  }

  protected static final void addAllLowerCase(Set target, Collection col) {
    Iterator iter = col.iterator();

    while (iter.hasNext()) {
      target.add(((String) iter.next()).toLowerCase());
    }
  }

  protected static char[] makeLowerCaseCopy(final char[] buffer) {
    char[] result = new char[buffer.length];
    System.arraycopy(buffer, 0, result, 0, buffer.length);

    for (int i = 0; i < buffer.length; ++i) {
      result[i] = Character.toLowerCase(buffer[i]);
    }

    return result;
  }

  protected final Token createToken(final int offset, final int length,
      final Token prototype) {
    Token t = new Token(prototype.startOffset() + offset,
        prototype.startOffset() + offset + length, prototype.type());
    t.setTermBuffer(prototype.termBuffer(), offset, length);
    t.setPositionIncrement(0);
    return t;
  }

  protected void decompose(final Token token) {
    // In any case we give the original token back
    tokens.add(token);

    // Only words longer than minWordSize get processed
    if (token.termLength() < this.minWordSize) {
      return;
    }

    decomposeInternal(token);
  }

  protected abstract void decomposeInternal(final Token token);
}
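To make the base-class contract concrete: next() drains the tokens list that decompose() fills, and decompose() has already re-emitted the original token before calling decomposeInternal(), so an implementation only appends subword tokens. Below is a minimal subclass sketch, not part of this commit; the class name and its naive splitting rule are invented for illustration.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Illustrative subclass: splits each processed token into two halves.
public class HalvingCompoundFilter extends CompoundWordTokenFilterBase {
  public HalvingCompoundFilter(TokenStream input, String[] dictionary) {
    super(input, dictionary);
  }

  protected void decomposeInternal(final Token token) {
    int half = token.termLength() / 2;
    if (half >= minSubwordSize && half <= maxSubwordSize) {
      // createToken() copies a slice of the prototype's term buffer and
      // stacks it at the original position (positionIncrement == 0).
      tokens.add(createToken(0, half, token));
      tokens.add(createToken(half, token.termLength() - half, token));
    }
  }
}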
@@ -0,0 +1,114 @@
package org.apache.lucene.analysis.compound;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Set;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/**
 * A TokenFilter that decomposes compound words found in many Germanic languages.
 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
 * "Donaudampfschiff" even when you only enter "schiff".
 * It uses a brute-force algorithm to achieve this.
 */
public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
  /**
   *
   * @param input the token stream to process
   * @param dictionary the word dictionary to match against
   * @param minWordSize only words longer than this get processed
   * @param minSubwordSize only subwords longer than this get to the output stream
   * @param maxSubwordSize only subwords shorter than this get to the output stream
   * @param onlyLongestMatch Add only the longest matching subword to the stream
   */
  public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  /**
   *
   * @param input the token stream to process
   * @param dictionary the word dictionary to match against
   */
  public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
    super(input, dictionary);
  }

  /**
   *
   * @param input the token stream to process
   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   *        lower case strings.
   */
  public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
    super(input, dictionary);
  }

  /**
   *
   * @param input the token stream to process
   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   *        lower case strings.
   * @param minWordSize only words longer than this get processed
   * @param minSubwordSize only subwords longer than this get to the output stream
   * @param maxSubwordSize only subwords shorter than this get to the output stream
   * @param onlyLongestMatch Add only the longest matching subword to the stream
   */
  public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
      int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  protected void decomposeInternal(final Token token) {
    // Only words longer than minWordSize get processed
    if (token.termLength() < this.minWordSize) {
      return;
    }

    char[] lowerCaseTermBuffer = makeLowerCaseCopy(token.termBuffer());

    for (int i = 0; i < token.termLength() - this.minSubwordSize; ++i) {
      Token longestMatchToken = null;
      for (int j = this.minSubwordSize - 1; j < this.maxSubwordSize; ++j) {
        if (i + j > token.termLength()) {
          break;
        }
        if (dictionary.contains(lowerCaseTermBuffer, i, j)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.termLength() < j) {
                longestMatchToken = createToken(i, j, token);
              }
            } else {
              longestMatchToken = createToken(i, j, token);
            }
          } else {
            tokens.add(createToken(i, j, token));
          }
        }
      }
      if (this.onlyLongestMatch && longestMatchToken != null) {
        tokens.add(longestMatchToken);
      }
    }
  }
}
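A hedged usage sketch against the Lucene token API of this era (TokenStream.next() returning Token); the three-word dictionary and the sample input are made up for illustration.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;

public class DictionaryCompoundExample {
  public static void main(String[] args) throws Exception {
    String[] dict = { "donau", "dampf", "schiff" };
    TokenStream ts = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("Donaudampfschiff")), dict);
    // Prints the original compound first, then the matching subwords,
    // all stacked at the same position (positionIncrement == 0).
    for (Token t = ts.next(); t != null; t = ts.next()) {
      System.out.println(new String(t.termBuffer(), 0, t.termLength()));
    }
  }
}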
@@ -0,0 +1,217 @@
package org.apache.lucene.analysis.compound;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.xml.sax.InputSource;

/**
 * A TokenFilter that decomposes compound words found in many Germanic languages.
 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
 * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
 * grammar and a word dictionary to achieve this.
 */
public class HyphenationCompoundWordTokenFilter extends
    CompoundWordTokenFilterBase {
  private HyphenationTree hyphenator;

  /**
   *
   * @param input the token stream to process
   * @param hyphenator the hyphenation pattern tree to use for hyphenation
   * @param dictionary the word dictionary to match against
   * @param minWordSize only words longer than this get processed
   * @param minSubwordSize only subwords longer than this get to the output
   *        stream
   * @param maxSubwordSize only subwords shorter than this get to the output
   *        stream
   * @param onlyLongestMatch Add only the longest matching subword to the stream
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, String[] dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(input, hyphenator, makeDictionary(dictionary), minWordSize,
        minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  /**
   *
   * @param input the token stream to process
   * @param hyphenator the hyphenation pattern tree to use for hyphenation
   * @param dictionary the word dictionary to match against
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, String[] dictionary) {
    this(input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   *
   * @param input the token stream to process
   * @param hyphenator the hyphenation pattern tree to use for hyphenation
   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   *        lower case strings.
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, Set dictionary) {
    this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
        DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   *
   * @param input the token stream to process
   * @param hyphenator the hyphenation pattern tree to use for hyphenation
   * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
   *        lower case strings.
   * @param minWordSize only words longer than this get processed
   * @param minSubwordSize only subwords longer than this get to the output
   *        stream
   * @param maxSubwordSize only subwords shorter than this get to the output
   *        stream
   * @param onlyLongestMatch Add only the longest matching subword to the stream
   */
  public HyphenationCompoundWordTokenFilter(TokenStream input,
      HyphenationTree hyphenator, Set dictionary, int minWordSize,
      int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
        onlyLongestMatch);

    this.hyphenator = hyphenator;
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationFilename the filename of the XML grammar to load
   * @return An object representing the hyphenation patterns
   * @throws Exception
   */
  public static HyphenationTree getHyphenationTree(String hyphenationFilename)
      throws Exception {
    return getHyphenationTree(new File(hyphenationFilename));
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationFile the file of the XML grammar to load
   * @return An object representing the hyphenation patterns
   * @throws Exception
   */
  public static HyphenationTree getHyphenationTree(File hyphenationFile)
      throws Exception {
    return getHyphenationTree(new InputStreamReader(new FileInputStream(
        hyphenationFile), "ISO-8859-1"));
  }

  /**
   * Create a hyphenator tree
   *
   * @param hyphenationReader the reader of the XML grammar to load from
   * @return An object representing the hyphenation patterns
   * @throws Exception
   */
  public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
      throws Exception {
    HyphenationTree tree = new HyphenationTree();

    tree.loadPatterns(new InputSource(hyphenationReader));

    return tree;
  }

  protected void decomposeInternal(final Token token) {
    // get the hyphenation points
    Hyphenation hyphens = hyphenator.hyphenate(token.termBuffer(), 0,
        token.termLength(), 1, 1);
    // No hyphen points found -> exit
    if (hyphens == null) {
      return;
    }

    final int[] hyp = hyphens.getHyphenationPoints();
    char[] lowerCaseTermBuffer = makeLowerCaseCopy(token.termBuffer());

    for (int i = 0; i < hyp.length; ++i) {
      int remaining = hyp.length - i;
      int start = hyp[i];
      Token longestMatchToken = null;
      for (int j = 1; j < remaining; j++) {
        int partLength = hyp[i + j] - start;

        // if the part is longer than maxSubwordSize we
        // are done with this round
        if (partLength > this.maxSubwordSize) {
          break;
        }

        // we only put subwords to the token stream
        // that are longer than minSubwordSize
        if (partLength < this.minSubwordSize) {
          continue;
        }

        // check the dictionary
        if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.termLength() < partLength) {
                longestMatchToken = createToken(start, partLength, token);
              }
            } else {
              longestMatchToken = createToken(start, partLength, token);
            }
          } else {
            tokens.add(createToken(start, partLength, token));
          }
        } else if (dictionary.contains(lowerCaseTermBuffer, start,
            partLength - 1)) {
          // check the dictionary again with a word that is one character
          // shorter to avoid problems with genitive 's characters and
          // other binding characters
          if (this.onlyLongestMatch) {
            if (longestMatchToken != null) {
              if (longestMatchToken.termLength() < partLength - 1) {
                longestMatchToken = createToken(start, partLength - 1, token);
              }
            } else {
              longestMatchToken = createToken(start, partLength - 1, token);
            }
          } else {
            tokens.add(createToken(start, partLength - 1, token));
          }
        }
      }
      if (this.onlyLongestMatch && longestMatchToken != null) {
        tokens.add(longestMatchToken);
      }
    }
  }
}
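Correspondingly, a hedged sketch for the hyphenation-based variant. "de_DR.xml" is a placeholder filename standing in for a hyphenation grammar file (for example one from the OFFO/FOP pattern collections), and the dictionary entries and sample word are illustrative.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;

public class HyphenationCompoundExample {
  public static void main(String[] args) throws Exception {
    // Load the hyphenation patterns once; the tree can be reused.
    HyphenationTree hyphenator =
        HyphenationCompoundWordTokenFilter.getHyphenationTree("de_DR.xml");
    TokenStream ts = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("Rindfleischetikettierung")),
        hyphenator, new String[] { "rind", "fleisch", "etikettierung" });
    for (Token t = ts.next(); t != null; t = ts.next()) {
      System.out.println(new String(t.termBuffer(), 0, t.termLength()));
    }
  }
}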
@@ -0,0 +1,126 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

import java.io.Serializable;

/**
 * This class implements a simple byte vector with access to the underlying
 * array.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
 */
public class ByteVector implements Serializable {

  /**
   * Capacity increment size
   */
  private static final int DEFAULT_BLOCK_SIZE = 2048;

  private int blockSize;

  /**
   * The encapsulated array
   */
  private byte[] array;

  /**
   * Points to next free item
   */
  private int n;

  public ByteVector() {
    this(DEFAULT_BLOCK_SIZE);
  }

  public ByteVector(int capacity) {
    if (capacity > 0) {
      blockSize = capacity;
    } else {
      blockSize = DEFAULT_BLOCK_SIZE;
    }
    array = new byte[blockSize];
    n = 0;
  }

  public ByteVector(byte[] a) {
    blockSize = DEFAULT_BLOCK_SIZE;
    array = a;
    n = 0;
  }

  public ByteVector(byte[] a, int capacity) {
    if (capacity > 0) {
      blockSize = capacity;
    } else {
      blockSize = DEFAULT_BLOCK_SIZE;
    }
    array = a;
    n = 0;
  }

  public byte[] getArray() {
    return array;
  }

  /**
   * return number of items in array
   */
  public int length() {
    return n;
  }

  /**
   * returns current capacity of array
   */
  public int capacity() {
    return array.length;
  }

  public void put(int index, byte val) {
    array[index] = val;
  }

  public byte get(int index) {
    return array[index];
  }

  /**
   * This is to implement memory allocation in the array. Like malloc().
   */
  public int alloc(int size) {
    int index = n;
    int len = array.length;
    if (n + size >= len) {
      byte[] aux = new byte[len + blockSize];
      System.arraycopy(array, 0, aux, 0, len);
      array = aux;
    }
    n += size;
    return index;
  }

  public void trimToSize() {
    if (n < array.length) {
      byte[] aux = new byte[n];
      System.arraycopy(array, 0, aux, 0, n);
      array = aux;
    }
  }

}
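The malloc()-like alloc() contract is easy to misread, so here is a short sketch with illustrative values: alloc(size) reserves size slots, growing the backing array in blockSize increments when needed, and returns the offset of the reservation.

import org.apache.lucene.analysis.compound.hyphenation.ByteVector;

public class ByteVectorDemo {
  public static void main(String[] args) {
    ByteVector v = new ByteVector(4); // capacity 4, growth increment 4
    int a = v.alloc(3);               // a == 0; length() is now 3
    int b = v.alloc(3);               // grows the array; b == 3, length() 6
    v.put(b, (byte) 7);               // write into the second reservation
    System.out.println(v.get(b) + " " + v.length() + " " + v.capacity()); // 7 6 8
  }
}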
@@ -0,0 +1,136 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

import java.io.Serializable;

/**
 * This class implements a simple char vector with access to the underlying
 * array.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
 */
public class CharVector implements Cloneable, Serializable {

  /**
   * Capacity increment size
   */
  private static final int DEFAULT_BLOCK_SIZE = 2048;

  private int blockSize;

  /**
   * The encapsulated array
   */
  private char[] array;

  /**
   * Points to next free item
   */
  private int n;

  public CharVector() {
    this(DEFAULT_BLOCK_SIZE);
  }

  public CharVector(int capacity) {
    if (capacity > 0) {
      blockSize = capacity;
    } else {
      blockSize = DEFAULT_BLOCK_SIZE;
    }
    array = new char[blockSize];
    n = 0;
  }

  public CharVector(char[] a) {
    blockSize = DEFAULT_BLOCK_SIZE;
    array = a;
    n = a.length;
  }

  public CharVector(char[] a, int capacity) {
    if (capacity > 0) {
      blockSize = capacity;
    } else {
      blockSize = DEFAULT_BLOCK_SIZE;
    }
    array = a;
    n = a.length;
  }

  /**
   * Reset Vector but don't resize or clear elements
   */
  public void clear() {
    n = 0;
  }

  public Object clone() {
    CharVector cv = new CharVector((char[]) array.clone(), blockSize);
    cv.n = this.n;
    return cv;
  }

  public char[] getArray() {
    return array;
  }

  /**
   * return number of items in array
   */
  public int length() {
    return n;
  }

  /**
   * returns current capacity of array
   */
  public int capacity() {
    return array.length;
  }

  public void put(int index, char val) {
    array[index] = val;
  }

  public char get(int index) {
    return array[index];
  }

  public int alloc(int size) {
    int index = n;
    int len = array.length;
    if (n + size >= len) {
      char[] aux = new char[len + blockSize];
      System.arraycopy(array, 0, aux, 0, len);
      array = aux;
    }
    n += size;
    return index;
  }

  public void trimToSize() {
    if (n < array.length) {
      char[] aux = new char[n];
      System.arraycopy(array, 0, aux, 0, n);
      array = aux;
    }
  }

}
@@ -0,0 +1,69 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

import java.io.Serializable;

/**
 * This class represents a hyphen. A 'full' hyphen is made of 3 parts: the
 * pre-break text, post-break text and no-break. If no line-break is generated
 * at this position, the no-break text is used, otherwise, pre-break and
 * post-break are used. Typically, pre-break is equal to the hyphen character
 * and the others are empty. However, this general scheme allows support for
 * cases in some languages where words change spelling if they're split across
 * lines, like German's 'backen', which hyphenates to 'bak-ken'. BTW, this
 * comes from TeX.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
 */

public class Hyphen implements Serializable {
  public String preBreak;

  public String noBreak;

  public String postBreak;

  Hyphen(String pre, String no, String post) {
    preBreak = pre;
    noBreak = no;
    postBreak = post;
  }

  Hyphen(String pre) {
    preBreak = pre;
    noBreak = null;
    postBreak = null;
  }

  public String toString() {
    if (noBreak == null && postBreak == null && preBreak != null
        && preBreak.equals("-")) {
      return "-";
    }
    StringBuffer res = new StringBuffer("{");
    res.append(preBreak);
    res.append("}{");
    res.append(postBreak);
    res.append("}{");
    res.append(noBreak);
    res.append('}');
    return res.toString();
  }

}
@@ -0,0 +1,54 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

/**
 * This class represents a hyphenated word.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
 */
public class Hyphenation {

  private int[] hyphenPoints;

  /**
   * number of hyphenation points in word
   */
  private int len;

  /**
   * Creates a new Hyphenation instance from the given hyphenation points.
   */
  Hyphenation(int[] points) {
    hyphenPoints = points;
  }

  /**
   * @return the number of hyphenation points in the word
   */
  public int length() {
    return hyphenPoints.length;
  }

  /**
   * @return the hyphenation points
   */
  public int[] getHyphenationPoints() {
    return hyphenPoints;
  }
}
@@ -0,0 +1,32 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

/**
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
 */
public class HyphenationException extends Exception {

  /**
   * @see java.lang.Throwable#Throwable(String)
   */
  public HyphenationException(String msg) {
    super(msg);
  }

}
@@ -0,0 +1,475 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

import java.io.File;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;

import org.xml.sax.InputSource;

/**
 * This tree structure stores the hyphenation patterns in an efficient way for
 * fast lookup. It provides the method to hyphenate a word.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
 */
public class HyphenationTree extends TernaryTree implements PatternConsumer,
    Serializable {

  private static final long serialVersionUID = -7842107987915665573L;

  /**
   * value space: stores the interletter values
   */
  protected ByteVector vspace;

  /**
   * This map stores hyphenation exceptions
   */
  protected HashMap stoplist;

  /**
   * This map stores the character classes
   */
  protected TernaryTree classmap;

  /**
   * Temporary map to store interletter values on pattern loading.
   */
  private transient TernaryTree ivalues;

  public HyphenationTree() {
    stoplist = new HashMap(23); // usually a small table
    classmap = new TernaryTree();
    vspace = new ByteVector();
    vspace.alloc(1); // this reserves index 0, which we don't use
  }

  /**
   * Packs the values by storing them in 4 bits, two values per byte. Values
   * range from 0 to 9. We use zero as terminator, so we'll add 1 to the
   * value.
   *
   * @param values a string of digits from '0' to '9' representing the
   *        interletter values.
   * @return the index into the vspace array where the packed values are stored.
   */
  protected int packValues(String values) {
    int i, n = values.length();
    int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
    int offset = vspace.alloc(m);
    byte[] va = vspace.getArray();
    for (i = 0; i < n; i++) {
      int j = i >> 1;
      byte v = (byte) ((values.charAt(i) - '0' + 1) & 0x0f);
      if ((i & 1) == 1) {
        va[j + offset] = (byte) (va[j + offset] | v);
      } else {
        va[j + offset] = (byte) (v << 4); // big endian
      }
    }
    va[m - 1 + offset] = 0; // terminator
    return offset;
  }

  protected String unpackValues(int k) {
    StringBuffer buf = new StringBuffer();
    byte v = vspace.get(k++);
    while (v != 0) {
      char c = (char) ((v >>> 4) - 1 + '0');
      buf.append(c);
      c = (char) (v & 0x0f);
      if (c == 0) {
        break;
      }
      c = (char) (c - 1 + '0');
      buf.append(c);
      v = vspace.get(k++);
    }
    return buf.toString();
  }

  /**
   * Read hyphenation patterns from an XML file.
   *
   * @param f the pattern file
   * @throws HyphenationException In case the parsing fails
   */
  public void loadPatterns(File f) throws HyphenationException {
    try {
      InputSource src = new InputSource(f.toURL().toExternalForm());
      loadPatterns(src);
    } catch (MalformedURLException e) {
      throw new HyphenationException("Error converting the File '" + f
          + "' to a URL: " + e.getMessage());
    }
  }

  /**
   * Read hyphenation patterns from an XML file.
   *
   * @param source the InputSource for the file
   * @throws HyphenationException In case the parsing fails
   */
  public void loadPatterns(InputSource source) throws HyphenationException {
    PatternParser pp = new PatternParser(this);
    ivalues = new TernaryTree();

    pp.parse(source);

    // patterns/values should now be in the tree
    // let's optimize a bit
    trimToSize();
    vspace.trimToSize();
    classmap.trimToSize();

    // get rid of the auxiliary map
    ivalues = null;
  }

  public String findPattern(String pat) {
    int k = super.find(pat);
    if (k >= 0) {
      return unpackValues(k);
    }
    return "";
  }

  /**
   * String compare, returns 0 if equal or t is a substring of s
   */
  protected int hstrcmp(char[] s, int si, char[] t, int ti) {
    for (; s[si] == t[ti]; si++, ti++) {
      if (s[si] == 0) {
        return 0;
      }
    }
    if (t[ti] == 0) {
      return 0;
    }
    return s[si] - t[ti];
  }

  protected byte[] getValues(int k) {
    StringBuffer buf = new StringBuffer();
    byte v = vspace.get(k++);
    while (v != 0) {
      char c = (char) ((v >>> 4) - 1);
      buf.append(c);
      c = (char) (v & 0x0f);
      if (c == 0) {
        break;
      }
      c = (char) (c - 1);
      buf.append(c);
      v = vspace.get(k++);
    }
    byte[] res = new byte[buf.length()];
    for (int i = 0; i < res.length; i++) {
      res[i] = (byte) buf.charAt(i);
    }
    return res;
  }

  /**
   * <p>
   * Search for all possible partial matches of word starting at index and
   * update the interletter values. In other words, it does something like:
   * </p>
   * <code>
   * for(i=0; i<patterns.length; i++) {
   *   if ( word.substring(index).startsWith(patterns[i]) )
   *     update_interletter_values(patterns[i]);
   * }
   * </code>
   * <p>
   * But it is done in an efficient way since the patterns are stored in a
   * ternary tree. In fact, this is the whole purpose of having the tree: doing
   * this search without having to test every single pattern. The number of
   * patterns for languages such as English ranges from 4000 to 10000. Thus,
   * doing thousands of string comparisons for each word to hyphenate would be
   * really slow without the tree. The tradeoff is memory, but using a ternary
   * tree instead of a trie almost halves the memory used by Lout or TeX.
   * It's also faster than using a hash table.
   * </p>
   *
   * @param word null terminated word to match
   * @param index start index from word
   * @param il interletter values array to update
   */
  protected void searchPatterns(char[] word, int index, byte[] il) {
    byte[] values;
    int i = index;
    char p, q;
    char sp = word[i];
    p = root;

    while (p > 0 && p < sc.length) {
      if (sc[p] == 0xFFFF) {
        if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
          values = getValues(eq[p]); // data pointer is in eq[]
          int j = index;
          for (int k = 0; k < values.length; k++) {
            if (j < il.length && values[k] > il[j]) {
              il[j] = values[k];
            }
            j++;
          }
        }
        return;
      }
      int d = sp - sc[p];
      if (d == 0) {
        if (sp == 0) {
          break;
        }
        sp = word[++i];
        p = eq[p];
        q = p;

        // look for a pattern ending at this position by searching for
        // the null char ( splitchar == 0 )
        while (q > 0 && q < sc.length) {
          if (sc[q] == 0xFFFF) { // stop at compressed branch
            break;
          }
          if (sc[q] == 0) {
            values = getValues(eq[q]);
            int j = index;
            for (int k = 0; k < values.length; k++) {
              if (j < il.length && values[k] > il[j]) {
                il[j] = values[k];
              }
              j++;
            }
            break;
          } else {
            q = lo[q];

            /**
             * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but
             * java chars are unsigned
             */
          }
        }
      } else {
        p = d < 0 ? lo[p] : hi[p];
      }
    }
  }

  /**
   * Hyphenate word and return a Hyphenation object.
   *
   * @param word the word to be hyphenated
   * @param remainCharCount Minimum number of characters allowed before the
   *        hyphenation point.
   * @param pushCharCount Minimum number of characters allowed after the
   *        hyphenation point.
   * @return a {@link Hyphenation Hyphenation} object representing the
   *         hyphenated word or null if word is not hyphenated.
   */
  public Hyphenation hyphenate(String word, int remainCharCount,
      int pushCharCount) {
    char[] w = word.toCharArray();
    return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
  }

  /**
   * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n
   * may be absent, the first n is at offset, the first l is at offset +
   * iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied
   * into word. In the first part of the routine len = w.length, in the second
   * part of the routine len = word.length.
   * Three indices are used:
   *   index(w), the index in w,
   *   index(word), the index in word,
   *   letterindex(word), the index in the letter part of word.
   * The following relations exist:
   *   index(w) = offset + i - 1
   *   index(word) = i - iIgnoreAtBeginning
   *   letterindex(word) = index(word) - 1 (see first loop).
   * It follows that:
   *   index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
   *   index(w) = letterindex(word) + offset + iIgnoreAtBeginning
   */

  /**
   * Hyphenate word and return an array of hyphenation points.
   *
   * @param w char array that contains the word
   * @param offset Offset to first character in word
   * @param len Length of word
   * @param remainCharCount Minimum number of characters allowed before the
   *        hyphenation point.
   * @param pushCharCount Minimum number of characters allowed after the
   *        hyphenation point.
   * @return a {@link Hyphenation Hyphenation} object representing the
   *         hyphenated word or null if word is not hyphenated.
   */
  public Hyphenation hyphenate(char[] w, int offset, int len,
      int remainCharCount, int pushCharCount) {
    int i;
    char[] word = new char[len + 3];

    // normalize word
    char[] c = new char[2];
    int iIgnoreAtBeginning = 0;
    int iLength = len;
    boolean bEndOfLetters = false;
    for (i = 1; i <= len; i++) {
      c[0] = w[offset + i - 1];
      int nc = classmap.find(c, 0);
      if (nc < 0) { // found a non-letter character ...
        if (i == (1 + iIgnoreAtBeginning)) {
          // ... before any letter character
          iIgnoreAtBeginning++;
        } else {
          // ... after a letter character
          bEndOfLetters = true;
        }
        iLength--;
      } else {
        if (!bEndOfLetters) {
          word[i - iIgnoreAtBeginning] = (char) nc;
        } else {
          return null;
        }
      }
    }
    len = iLength;
    if (len < (remainCharCount + pushCharCount)) {
      // word is too short to be hyphenated
      return null;
    }
    int[] result = new int[len + 1];
    int k = 0;

    // check exception list first
    String sw = new String(word, 1, len);
    if (stoplist.containsKey(sw)) {
      // assume only simple hyphens (Hyphen.pre = "-",
      // Hyphen.post = Hyphen.no = null)
      ArrayList hw = (ArrayList) stoplist.get(sw);
      int j = 0;
      for (i = 0; i < hw.size(); i++) {
        Object o = hw.get(i);
        // j = index(sw) = letterindex(word)?
        // result[k] = corresponding index(w)
        if (o instanceof String) {
          j += ((String) o).length();
          if (j >= remainCharCount && j < (len - pushCharCount)) {
            result[k++] = j + iIgnoreAtBeginning;
          }
        }
      }
    } else {
      // use algorithm to get hyphenation points
      word[0] = '.'; // word start marker
      word[len + 1] = '.'; // word end marker
      word[len + 2] = 0; // null terminated
      byte[] il = new byte[len + 3]; // initialized to zero
      for (i = 0; i < len + 1; i++) {
        searchPatterns(word, i, il);
      }

      // hyphenation points are located where interletter value is odd
      // i is letterindex(word),
      // i + 1 is index(word),
      // result[k] = corresponding index(w)
      for (i = 0; i < len; i++) {
        if (((il[i + 1] & 1) == 1) && i >= remainCharCount
            && i <= (len - pushCharCount)) {
          result[k++] = i + iIgnoreAtBeginning;
        }
      }
    }

    if (k > 0) {
      // trim result array
      int[] res = new int[k + 2];
      System.arraycopy(result, 0, res, 1, k);
      // We add the synthetic hyphenation points
      // at the beginning and end of the word
      res[0] = 0;
      res[k + 1] = len;
      return new Hyphenation(res);
    } else {
      return null;
    }
  }

  /**
   * Add a character class to the tree. It is used by
   * {@link PatternParser PatternParser} as callback to add character classes.
   * Character classes define the valid word characters for hyphenation. If a
   * word contains a character not defined in any of the classes, it is not
   * hyphenated. It also defines a way to normalize the characters in order to
   * compare them with the stored patterns. Usually pattern files use only lower
   * case characters, in this case a class for letter 'a', for example, should
   * be defined as "aA", the first character being the normalization char.
   */
  public void addClass(String chargroup) {
    if (chargroup.length() > 0) {
      char equivChar = chargroup.charAt(0);
      char[] key = new char[2];
      key[1] = 0;
      for (int i = 0; i < chargroup.length(); i++) {
        key[0] = chargroup.charAt(i);
        classmap.insert(key, 0, equivChar);
      }
    }
  }

  /**
   * Add an exception to the tree. It is used by
   * {@link PatternParser PatternParser} class as callback to store the
   * hyphenation exceptions.
   *
   * @param word normalized word
   * @param hyphenatedword a vector of alternating strings and
   *        {@link Hyphen hyphen} objects.
   */
  public void addException(String word, ArrayList hyphenatedword) {
    stoplist.put(word, hyphenatedword);
  }

  /**
   * Add a pattern to the tree. Mainly, to be used by
   * {@link PatternParser PatternParser} class as callback to add a pattern to
   * the tree.
   *
   * @param pattern the hyphenation pattern
   * @param ivalue interletter weight values indicating the desirability and
   *        priority of hyphenating at a given point within the pattern. It
   *        should contain only digit characters. (i.e. '0' to '9').
   */
  public void addPattern(String pattern, String ivalue) {
    int k = ivalues.find(ivalue);
    if (k <= 0) {
      k = packValues(ivalue);
      ivalues.insert(ivalue, (char) k);
    }
    insert(pattern, (char) k);
  }

  public void printStats() {
    System.out.println("Value space size = "
        + Integer.toString(vspace.length()));
    super.printStats();
  }
}
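A hedged end-to-end sketch of the tree on its own; "hyph_de.xml" is a placeholder pattern file, and remainCharCount/pushCharCount of 1 mirror what HyphenationCompoundWordTokenFilter passes to hyphenate().

import java.io.File;
import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;

public class HyphenationTreeDemo {
  public static void main(String[] args) throws Exception {
    HyphenationTree tree = new HyphenationTree();
    tree.loadPatterns(new File("hyph_de.xml")); // placeholder pattern file
    // Require at least 1 character before and after each break point.
    Hyphenation h = tree.hyphenate("Donaudampfschiff", 1, 1);
    if (h != null) {
      int[] points = h.getHyphenationPoints();
      // points[0] == 0 and points[points.length - 1] == word length are
      // the synthetic boundary points that hyphenate() adds.
      for (int i = 0; i < points.length; i++) {
        System.out.println(points[i]);
      }
    }
  }
}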
@@ -0,0 +1,55 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

import java.util.ArrayList;

/**
 * This interface is used to connect the XML pattern file parser to the
 * hyphenation tree.
 *
 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/) and has been slightly modified.
 */
public interface PatternConsumer {

  /**
   * Add a character class. A character class defines characters that are
   * considered equivalent for the purpose of hyphenation (e.g. "aA"). It
   * usually means to ignore case.
   *
   * @param chargroup character group
   */
  void addClass(String chargroup);

  /**
   * Add a hyphenation exception. An exception replaces the result obtained by
   * the algorithm for cases in which this fails or the user wants to provide
   * a custom hyphenation. A hyphenated word is a vector of alternating Strings
   * and {@link Hyphen Hyphen} instances.
   */
  void addException(String word, ArrayList hyphenatedword);

  /**
   * Add hyphenation patterns.
   *
   * @param pattern the pattern
   * @param values interletter values expressed as a string of digit characters.
   */
  void addPattern(String pattern, String values);

}
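Since PatternParser reports everything through this interface, a trivial consumer that just echoes the callbacks is a handy way to inspect a pattern file. The sketch below is illustrative and not part of this commit; "hyph_de.xml" is a placeholder filename.

import java.util.ArrayList;
import org.apache.lucene.analysis.compound.hyphenation.PatternConsumer;
import org.apache.lucene.analysis.compound.hyphenation.PatternParser;

public class EchoingConsumer implements PatternConsumer {
  public void addClass(String chargroup) {
    System.out.println("class: " + chargroup);
  }

  public void addException(String word, ArrayList hyphenatedword) {
    System.out.println("exception: " + word + " -> " + hyphenatedword);
  }

  public void addPattern(String pattern, String values) {
    System.out.println("pattern: " + pattern + " / " + values);
  }

  public static void main(String[] args) throws Exception {
    // Parse a pattern file and dump every callback to stdout.
    new PatternParser(new EchoingConsumer()).parse("hyph_de.xml");
  }
}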
@ -0,0 +1,518 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* $Id: PatternParser.java 426576 2006-07-28 15:44:37Z jeremias $ */

package org.apache.lucene.analysis.compound.hyphenation;

// SAX
import org.xml.sax.XMLReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.Attributes;

// Java
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.util.ArrayList;

import javax.xml.parsers.SAXParserFactory;

/**
 * A SAX document handler to read and parse hyphenation patterns from an XML
 * file.
 *
 * This class has been taken from the Apache FOP project
 * (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
 */
public class PatternParser extends DefaultHandler implements PatternConsumer {

  XMLReader parser;

  int currElement;

  PatternConsumer consumer;

  StringBuffer token;

  ArrayList exception;

  char hyphenChar;

  String errMsg;

  static final int ELEM_CLASSES = 1;

  static final int ELEM_EXCEPTIONS = 2;

  static final int ELEM_PATTERNS = 3;

  static final int ELEM_HYPHEN = 4;

  public PatternParser() throws HyphenationException {
    token = new StringBuffer();
    parser = createParser();
    parser.setContentHandler(this);
    parser.setErrorHandler(this);
    parser.setEntityResolver(this);
    hyphenChar = '-'; // default
  }

  public PatternParser(PatternConsumer consumer) throws HyphenationException {
    this();
    this.consumer = consumer;
  }

  public void setConsumer(PatternConsumer consumer) {
    this.consumer = consumer;
  }

  /**
   * Parses a hyphenation pattern file.
   *
   * @param filename the filename
   * @throws HyphenationException In case of an exception while parsing
   */
  public void parse(String filename) throws HyphenationException {
    parse(new File(filename));
  }

  /**
   * Parses a hyphenation pattern file.
   *
   * @param file the pattern file
   * @throws HyphenationException In case of an exception while parsing
   */
  public void parse(File file) throws HyphenationException {
    try {
      InputSource src = new InputSource(file.toURL().toExternalForm());
      parse(src);
    } catch (MalformedURLException e) {
      throw new HyphenationException("Error converting the File '" + file
          + "' to a URL: " + e.getMessage());
    }
  }

  /**
   * Parses a hyphenation pattern file.
   *
   * @param source the InputSource for the file
   * @throws HyphenationException In case of an exception while parsing
   */
  public void parse(InputSource source) throws HyphenationException {
    try {
      parser.parse(source);
    } catch (FileNotFoundException fnfe) {
      throw new HyphenationException("File not found: " + fnfe.getMessage());
    } catch (IOException ioe) {
      throw new HyphenationException(ioe.getMessage());
    } catch (SAXException e) {
      throw new HyphenationException(errMsg);
    }
  }

  /**
   * Creates a SAX parser using JAXP
   *
   * @return the created SAX parser
   */
  static XMLReader createParser() {
    try {
      SAXParserFactory factory = SAXParserFactory.newInstance();
      factory.setNamespaceAware(true);
      return factory.newSAXParser().getXMLReader();
    } catch (Exception e) {
      throw new RuntimeException("Couldn't create XMLReader: " + e.getMessage());
    }
  }

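  // Pulls the next whitespace-delimited token out of 'chars'. An incomplete
  // trailing token is buffered in the 'token' field, so a word split across
  // successive characters() callbacks is still assembled correctly; null is
  // returned once 'chars' holds only a partial token.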
  protected String readToken(StringBuffer chars) {
    String word;
    boolean space = false;
    int i;
    for (i = 0; i < chars.length(); i++) {
      if (Character.isWhitespace(chars.charAt(i))) {
        space = true;
      } else {
        break;
      }
    }
    if (space) {
      // chars.delete(0,i);
      for (int countr = i; countr < chars.length(); countr++) {
        chars.setCharAt(countr - i, chars.charAt(countr));
      }
      chars.setLength(chars.length() - i);
      if (token.length() > 0) {
        word = token.toString();
        token.setLength(0);
        return word;
      }
    }
    space = false;
    for (i = 0; i < chars.length(); i++) {
      if (Character.isWhitespace(chars.charAt(i))) {
        space = true;
        break;
      }
    }
    token.append(chars.toString().substring(0, i));
    // chars.delete(0,i);
    for (int countr = i; countr < chars.length(); countr++) {
      chars.setCharAt(countr - i, chars.charAt(countr));
    }
    chars.setLength(chars.length() - i);
    if (space) {
      word = token.toString();
      token.setLength(0);
      return word;
    }
    token.append(chars);
    return null;
  }

  protected static String getPattern(String word) {
    StringBuffer pat = new StringBuffer();
    int len = word.length();
    for (int i = 0; i < len; i++) {
      if (!Character.isDigit(word.charAt(i))) {
        pat.append(word.charAt(i));
      }
    }
    return pat.toString();
  }

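  // Splits an exception entry on hyphenChar: the plain text between hyphen
  // positions stays as Strings, while each hyphenChar occurrence becomes a
  // Hyphen object in the resulting list.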
  protected ArrayList normalizeException(ArrayList ex) {
    ArrayList res = new ArrayList();
    for (int i = 0; i < ex.size(); i++) {
      Object item = ex.get(i);
      if (item instanceof String) {
        String str = (String) item;
        StringBuffer buf = new StringBuffer();
        for (int j = 0; j < str.length(); j++) {
          char c = str.charAt(j);
          if (c != hyphenChar) {
            buf.append(c);
          } else {
            res.add(buf.toString());
            buf.setLength(0);
            char[] h = new char[1];
            h[0] = hyphenChar;
            // we use hyphenChar here, which is not necessarily
            // the one to be printed
            res.add(new Hyphen(new String(h), null, null));
          }
        }
        if (buf.length() > 0) {
          res.add(buf.toString());
        }
      } else {
        res.add(item);
      }
    }
    return res;
  }

  protected String getExceptionWord(ArrayList ex) {
    StringBuffer res = new StringBuffer();
    for (int i = 0; i < ex.size(); i++) {
      Object item = ex.get(i);
      if (item instanceof String) {
        res.append((String) item);
      } else {
        if (((Hyphen) item).noBreak != null) {
          res.append(((Hyphen) item).noBreak);
        }
      }
    }
    return res.toString();
  }

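  // Worked example (illustrative): for the TeX pattern ".ach4", getPattern()
  // above strips the digits and returns ".ach", while getInterletterValues()
  // below returns "00004": one digit per inter-letter position, where a
  // missing digit counts as 0 and the appended sentinel letter accounts for
  // the final position.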
  protected static String getInterletterValues(String pat) {
    StringBuffer il = new StringBuffer();
    String word = pat + "a"; // add dummy letter to serve as sentinel
    int len = word.length();
    for (int i = 0; i < len; i++) {
      char c = word.charAt(i);
      if (Character.isDigit(c)) {
        il.append(c);
        i++;
      } else {
        il.append('0');
      }
    }
    return il.toString();
  }

  //
  // EntityResolver methods
  //
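  // Every external entity reference (in practice, the DTD declared by the
  // pattern files) is answered with the bundled DTD generated below, so
  // parsing needs no network or file system access.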
  public InputSource resolveEntity(String publicId, String systemId)
      throws SAXException, IOException {
    return HyphenationDTDGenerator.generateDTD();
  }

  //
  // ContentHandler methods
  //

  /**
   * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
   *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
   */
  public void startElement(String uri, String local, String raw,
      Attributes attrs) {
    if (local.equals("hyphen-char")) {
      String h = attrs.getValue("value");
      if (h != null && h.length() == 1) {
        hyphenChar = h.charAt(0);
      }
    } else if (local.equals("classes")) {
      currElement = ELEM_CLASSES;
    } else if (local.equals("patterns")) {
      currElement = ELEM_PATTERNS;
    } else if (local.equals("exceptions")) {
      currElement = ELEM_EXCEPTIONS;
      exception = new ArrayList();
    } else if (local.equals("hyphen")) {
      if (token.length() > 0) {
        exception.add(token.toString());
      }
      exception.add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"),
          attrs.getValue("post")));
      currElement = ELEM_HYPHEN;
    }
    token.setLength(0);
  }

  /**
   * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
   *      java.lang.String, java.lang.String)
   */
  public void endElement(String uri, String local, String raw) {
    if (token.length() > 0) {
      String word = token.toString();
      switch (currElement) {
        case ELEM_CLASSES:
          consumer.addClass(word);
          break;
        case ELEM_EXCEPTIONS:
          exception.add(word);
          exception = normalizeException(exception);
          consumer.addException(getExceptionWord(exception),
              (ArrayList) exception.clone());
          break;
        case ELEM_PATTERNS:
          consumer.addPattern(getPattern(word), getInterletterValues(word));
          break;
        case ELEM_HYPHEN:
          // nothing to do
          break;
      }
      if (currElement != ELEM_HYPHEN) {
        token.setLength(0);
      }
    }
    if (currElement == ELEM_HYPHEN) {
      currElement = ELEM_EXCEPTIONS;
    } else {
      currElement = 0;
    }
  }

  /**
   * @see org.xml.sax.ContentHandler#characters(char[], int, int)
   */
  public void characters(char ch[], int start, int length) {
    StringBuffer chars = new StringBuffer(length);
    chars.append(ch, start, length);
    String word = readToken(chars);
    while (word != null) {
      // System.out.println("\"" + word + "\"");
      switch (currElement) {
        case ELEM_CLASSES:
          consumer.addClass(word);
          break;
        case ELEM_EXCEPTIONS:
          exception.add(word);
          exception = normalizeException(exception);
          consumer.addException(getExceptionWord(exception),
              (ArrayList) exception.clone());
          exception.clear();
          break;
        case ELEM_PATTERNS:
          consumer.addPattern(getPattern(word), getInterletterValues(word));
          break;
      }
      word = readToken(chars);
    }
  }

  //
  // ErrorHandler methods
  //

  /**
   * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
   */
  public void warning(SAXParseException ex) {
    errMsg = "[Warning] " + getLocationString(ex) + ": " + ex.getMessage();
  }

  /**
   * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
   */
  public void error(SAXParseException ex) {
    errMsg = "[Error] " + getLocationString(ex) + ": " + ex.getMessage();
  }

  /**
   * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
   */
  public void fatalError(SAXParseException ex) throws SAXException {
    errMsg = "[Fatal Error] " + getLocationString(ex) + ": " + ex.getMessage();
    throw ex;
  }

  /**
   * Returns a string of the location.
   */
  private String getLocationString(SAXParseException ex) {
    StringBuffer str = new StringBuffer();

    String systemId = ex.getSystemId();
    if (systemId != null) {
      int index = systemId.lastIndexOf('/');
      if (index != -1) {
        systemId = systemId.substring(index + 1);
      }
      str.append(systemId);
    }
    str.append(':');
    str.append(ex.getLineNumber());
    str.append(':');
    str.append(ex.getColumnNumber());

    return str.toString();
  } // getLocationString(SAXParseException):String

  // PatternConsumer implementation for testing purposes
  public void addClass(String c) {
    System.out.println("class: " + c);
  }

  public void addException(String w, ArrayList e) {
    System.out.println("exception: " + w + " : " + e.toString());
  }

  public void addPattern(String p, String v) {
    System.out.println("pattern: " + p + " : " + v);
  }

  public static void main(String[] args) throws Exception {
    if (args.length > 0) {
      PatternParser pp = new PatternParser();
      pp.setConsumer(pp);
      pp.parse(args[0]);
    }
  }
}

class HyphenationDTDGenerator {
  public static final String DTD_STRING =
    "<?xml version=\"1.0\" encoding=\"US-ASCII\"?>\n"+
    "<!--\n"+
    " Copyright 1999-2004 The Apache Software Foundation\n"+
    "\n"+
    " Licensed under the Apache License, Version 2.0 (the \"License\");\n"+
    " you may not use this file except in compliance with the License.\n"+
    " You may obtain a copy of the License at\n"+
    "\n"+
    " http://www.apache.org/licenses/LICENSE-2.0\n"+
    "\n"+
    " Unless required by applicable law or agreed to in writing, software\n"+
    " distributed under the License is distributed on an \"AS IS\" BASIS,\n"+
    " WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"+
    " See the License for the specific language governing permissions and\n"+
    " limitations under the License.\n"+
    "-->\n"+
    "<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->\n"+
    "\n"+
    "<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,\n"+
    " classes, exceptions?, patterns)>\n"+
    "\n"+
    "<!-- Hyphen character to be used in the exception list as shortcut for\n"+
    " <hyphen pre-break=\"-\"/>. Defaults to '-'\n"+
    "-->\n"+
    "<!ELEMENT hyphen-char EMPTY>\n"+
    "<!ATTLIST hyphen-char value CDATA #REQUIRED>\n"+
    "\n"+
    "<!-- Default minimum length in characters of hyphenated word fragments\n"+
    " before and after the line break. For some languages this is not\n"+
    " only for aesthetic purposes, wrong hyphens may be generated if this\n"+
    " is not accounted for.\n"+
    "-->\n"+
    "<!ELEMENT hyphen-min EMPTY>\n"+
    "<!ATTLIST hyphen-min before CDATA #REQUIRED>\n"+
    "<!ATTLIST hyphen-min after CDATA #REQUIRED>\n"+
    "\n"+
    "<!-- Character equivalent classes: space separated list of character groups, all\n"+
    " characters in a group are to be treated equivalent as far as\n"+
    " the hyphenation algorithm is concerned. The first character in a group\n"+
    " is the group's equivalent character. Patterns should only contain\n"+
    " first characters. It also defines word characters, i.e. a word that\n"+
    " contains characters not present in any of the classes is not hyphenated.\n"+
    "-->\n"+
    "<!ELEMENT classes (#PCDATA)>\n"+
    "\n"+
    "<!-- Hyphenation exceptions: space separated list of hyphenated words.\n"+
    " A hyphen is indicated by the hyphen tag, but you can use the\n"+
    " hyphen-char defined previously as shortcut. This is in cases\n"+
    " when the algorithm procedure finds wrong hyphens or you want\n"+
    " to provide your own hyphenation for some words.\n"+
    "-->\n"+
    "<!ELEMENT exceptions (#PCDATA|hyphen)* >\n"+
    "\n"+
    "<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'\n"+
    " characters as described before, between any two word characters a digit\n"+
    " in the range 0 to 9 may be specified. The absence of a digit is equivalent\n"+
    " to zero. The '.' character is reserved to indicate beginning or ending\n"+
    " of words. -->\n"+
    "<!ELEMENT patterns (#PCDATA)>\n"+
    "\n"+
    "<!-- A \"full hyphen\" equivalent to TeX's \\discretionary\n"+
    " with pre-break, post-break and no-break attributes.\n"+
    " To be used in the exceptions list, the hyphen character is not\n"+
    " automatically added -->\n"+
    "<!ELEMENT hyphen EMPTY>\n"+
    "<!ATTLIST hyphen pre CDATA #IMPLIED>\n"+
    "<!ATTLIST hyphen no CDATA #IMPLIED>\n"+
    "<!ATTLIST hyphen post CDATA #IMPLIED>\n";

  public static InputSource generateDTD() {
    return new InputSource(new StringReader(DTD_STRING));
  }
}

@ -0,0 +1,663 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.compound.hyphenation;

import java.util.Enumeration;
import java.util.Stack;
import java.io.Serializable;

/**
 * <h2>Ternary Search Tree.</h2>
 *
 * <p>
 * A ternary search tree is a hybrid between a binary tree and a digital search
 * tree (trie). Keys are limited to strings. A data value of type char is stored
 * in each leaf node. It can be used as an index (or pointer) to the data.
 * Branches that only contain one key are compressed to one node by storing a
 * pointer to the trailer substring of the key. This class is intended to serve
 * as base class or helper class to implement Dictionary collections or the
 * like. Ternary trees have some nice properties, such as the following: the
 * tree can be traversed in sorted order, partial matches (wildcard) can be
 * implemented, all keys within a given distance from the target can be
 * retrieved, etc. The storage requirements are higher than a binary tree but
 * a lot less than a trie. Performance is comparable with a hash table;
 * sometimes it outperforms a hash function (most of the time it can determine
 * a miss faster than a hash).
 * </p>
 *
 * <p>
 * The main purpose of this java port is to serve as a base for implementing
 * TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
 * requires from 5000 to 15000 hyphenation patterns which will be keys in this
 * tree. The string patterns are usually small (from 2 to 5 characters), but
 * each char in the tree is stored in a node. Thus memory usage is the main
 * concern. We will sacrifice 'elegance' to keep memory requirements to the
 * minimum. Using java's char type as pointer (yes, I know pointer is a
 * forbidden word in java) we can keep the size of the node to be just 8 bytes
 * (3 pointers and the data char). This gives room for about 65000 nodes. In my
 * tests the English patterns took 7694 nodes and the German patterns 10055
 * nodes, so I think we are safe.
 * </p>
 *
 * <p>
 * All said, this is a map with strings as keys and char as value. Pretty
 * limited! It can be extended to a general map by using the string
 * representation of an object and using the char value as an index to an array
 * that contains the object values.
 * </p>
 *
 * This class has been taken from the Apache FOP project
 * (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
 */

public class TernaryTree implements Cloneable, Serializable {

  /**
   * We use 4 arrays to represent a node. I guess I should have created a proper
   * node class, but somehow Knuth's pascal code made me forget we now have a
   * portable language with virtual memory management and automatic garbage
   * collection! And now it is kind of late; furthermore, if it ain't broken,
   * don't fix it.
   */

  /**
   * Pointer to low branch and to rest of the key when it is stored directly in
   * this node; we don't have unions in java!
   */
  protected char[] lo;

  /**
   * Pointer to high branch.
   */
  protected char[] hi;

  /**
   * Pointer to equal branch and to data when this node is a string terminator.
   */
  protected char[] eq;

  /**
   * <P>
   * The character stored in this node: splitchar. Two special values are
   * reserved:
   * </P>
   * <ul>
   * <li>0x0000 as string terminator</li>
   * <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
   * </ul>
   * <p>
   * This shouldn't be a problem if we give the usual semantics to strings since
   * 0xFFFF is guaranteed not to be a Unicode character.
   * </p>
   */
  protected char[] sc;

  /**
   * This vector holds the trailing of the keys when the branch is compressed.
   */
  protected CharVector kv;

  protected char root;

  protected char freenode;

  protected int length; // number of items in tree

  protected static final int BLOCK_SIZE = 2048; // allocation size for arrays

  TernaryTree() {
    init();
  }

  protected void init() {
    root = 0;
    freenode = 1;
    length = 0;
    lo = new char[BLOCK_SIZE];
    hi = new char[BLOCK_SIZE];
    eq = new char[BLOCK_SIZE];
    sc = new char[BLOCK_SIZE];
    kv = new CharVector();
  }

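  // For illustration: inserting "hello" into an empty tree creates one
  // compressed node whose lo pointer references the whole key stored in kv;
  // a later insert of "help" then peels the shared prefix off one node per
  // character, as handled by the 0xFFFF branch of the private insert() below.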
  /**
   * Branches are initially compressed, needing one node per key plus the size
   * of the string key. They are decompressed as needed when another key with
   * same prefix is inserted. This saves a lot of space, especially for long
   * keys.
   */
  public void insert(String key, char val) {
    // make sure we have enough room in the arrays
    int len = key.length() + 1; // maximum number of nodes that may be generated
    if (freenode + len > eq.length) {
      redimNodeArrays(eq.length + BLOCK_SIZE);
    }
    char strkey[] = new char[len--];
    key.getChars(0, len, strkey, 0);
    strkey[len] = 0;
    root = insert(root, strkey, 0, val);
  }

  public void insert(char[] key, int start, char val) {
    int len = strlen(key) + 1;
    if (freenode + len > eq.length) {
      redimNodeArrays(eq.length + BLOCK_SIZE);
    }
    root = insert(root, key, start, val);
  }

  /**
   * The actual insertion function, recursive version.
   */
  private char insert(char p, char[] key, int start, char val) {
    int len = strlen(key, start);
    if (p == 0) {
      // this means there is no branch; this node will start a new branch.
      // Instead of doing that, we store the key somewhere else and create
      // only one node with a pointer to the key
      p = freenode++;
      eq[p] = val; // holds data
      length++;
      hi[p] = 0;
      if (len > 0) {
        sc[p] = 0xFFFF; // indicates branch is compressed
        lo[p] = (char) kv.alloc(len + 1); // use 'lo' to hold pointer to key
        strcpy(kv.getArray(), lo[p], key, start);
      } else {
        sc[p] = 0;
        lo[p] = 0;
      }
      return p;
    }

    if (sc[p] == 0xFFFF) {
      // branch is compressed: need to decompress
      // this will generate garbage in the external key array
      // but we can do some garbage collection later
      char pp = freenode++;
      lo[pp] = lo[p]; // previous pointer to key
      eq[pp] = eq[p]; // previous pointer to data
      lo[p] = 0;
      if (len > 0) {
        sc[p] = kv.get(lo[pp]);
        eq[p] = pp;
        lo[pp]++;
        if (kv.get(lo[pp]) == 0) {
          // key completely decompressed, leaving garbage in key array
          lo[pp] = 0;
          sc[pp] = 0;
          hi[pp] = 0;
        } else {
          // we only got first char of key, rest is still there
          sc[pp] = 0xFFFF;
        }
      } else {
        // In this case we can save a node by swapping the new node
        // with the compressed node
        sc[pp] = 0xFFFF;
        hi[p] = pp;
        sc[p] = 0;
        eq[p] = val;
        length++;
        return p;
      }
    }
    char s = key[start];
    if (s < sc[p]) {
      lo[p] = insert(lo[p], key, start, val);
    } else if (s == sc[p]) {
      if (s != 0) {
        eq[p] = insert(eq[p], key, start + 1, val);
      } else {
        // key already in tree, overwrite data
        eq[p] = val;
      }
    } else {
      hi[p] = insert(hi[p], key, start, val);
    }
    return p;
  }

  /**
   * Compares 2 null terminated char arrays
   */
  public static int strcmp(char[] a, int startA, char[] b, int startB) {
    for (; a[startA] == b[startB]; startA++, startB++) {
      if (a[startA] == 0) {
        return 0;
      }
    }
    return a[startA] - b[startB];
  }

  /**
   * Compares a string with null terminated char array
   */
  public static int strcmp(String str, char[] a, int start) {
    int i, d, len = str.length();
    for (i = 0; i < len; i++) {
      d = (int) str.charAt(i) - a[start + i];
      if (d != 0) {
        return d;
      }
      if (a[start + i] == 0) {
        return d;
      }
    }
    if (a[start + i] != 0) {
      return (int) -a[start + i];
    }
    return 0;
  }

  public static void strcpy(char[] dst, int di, char[] src, int si) {
    while (src[si] != 0) {
      dst[di++] = src[si++];
    }
    dst[di] = 0;
  }

  public static int strlen(char[] a, int start) {
    int len = 0;
    for (int i = start; i < a.length && a[i] != 0; i++) {
      len++;
    }
    return len;
  }

  public static int strlen(char[] a) {
    return strlen(a, 0);
  }

  public int find(String key) {
    int len = key.length();
    char strkey[] = new char[len + 1];
    key.getChars(0, len, strkey, 0);
    strkey[len] = 0;

    return find(strkey, 0);
  }

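  // Walks the tree comparing split characters; when a compressed node
  // (sc == 0xFFFF) is reached, the remainder of the key is compared directly
  // against the stored tail. Returns the stored char value, or -1 if the key
  // is not present.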
  public int find(char[] key, int start) {
    int d;
    char p = root;
    int i = start;
    char c;

    while (p != 0) {
      if (sc[p] == 0xFFFF) {
        if (strcmp(key, i, kv.getArray(), lo[p]) == 0) {
          return eq[p];
        } else {
          return -1;
        }
      }
      c = key[i];
      d = c - sc[p];
      if (d == 0) {
        if (c == 0) {
          return eq[p];
        }
        i++;
        p = eq[p];
      } else if (d < 0) {
        p = lo[p];
      } else {
        p = hi[p];
      }
    }
    return -1;
  }

  public boolean knows(String key) {
    return (find(key) >= 0);
  }

  // redimension the arrays
  private void redimNodeArrays(int newsize) {
    int len = newsize < lo.length ? newsize : lo.length;
    char[] na = new char[newsize];
    System.arraycopy(lo, 0, na, 0, len);
    lo = na;
    na = new char[newsize];
    System.arraycopy(hi, 0, na, 0, len);
    hi = na;
    na = new char[newsize];
    System.arraycopy(eq, 0, na, 0, len);
    eq = na;
    na = new char[newsize];
    System.arraycopy(sc, 0, na, 0, len);
    sc = na;
  }

  public int size() {
    return length;
  }

  public Object clone() {
    TernaryTree t = new TernaryTree();
    t.lo = (char[]) this.lo.clone();
    t.hi = (char[]) this.hi.clone();
    t.eq = (char[]) this.eq.clone();
    t.sc = (char[]) this.sc.clone();
    t.kv = (CharVector) this.kv.clone();
    t.root = this.root;
    t.freenode = this.freenode;
    t.length = this.length;

    return t;
  }

  /**
   * Recursively insert the median first and then the median of the lower and
   * upper halves, and so on in order to get a balanced tree. The array of keys
   * is assumed to be sorted in ascending order.
   */
  protected void insertBalanced(String[] k, char[] v, int offset, int n) {
    int m;
    if (n < 1) {
      return;
    }
    m = n >> 1;

    insert(k[m + offset], v[m + offset]);
    insertBalanced(k, v, offset, m);

    insertBalanced(k, v, offset + m + 1, n - m - 1);
  }

  /**
   * Balance the tree for best search performance
   */
  public void balance() {
    // System.out.print("Before root splitchar = ");
    // System.out.println(sc[root]);

    int i = 0, n = length;
    String[] k = new String[n];
    char[] v = new char[n];
    Iterator iter = new Iterator();
    while (iter.hasMoreElements()) {
      v[i] = iter.getValue();
      k[i++] = (String) iter.nextElement();
    }
    init();
    insertBalanced(k, v, 0, n);

    // With uniform letter distribution sc[root] should be around 'm'
    // System.out.print("After root splitchar = ");
    // System.out.println(sc[root]);
  }

  /**
   * Each node stores a character (splitchar) which is part of some key(s). In a
   * compressed branch (one that only contains a single string key) the trailer
   * of the key which is not already in nodes is stored externally in the kv
   * array. As items are inserted, key substrings decrease. Some substrings may
   * completely disappear when the whole branch is totally decompressed. The
   * tree is traversed to find the key substrings actually used. In addition,
   * duplicate substrings are removed using a map (implemented with a
   * TernaryTree!).
   */
  public void trimToSize() {
    // first balance the tree for best performance
    balance();

    // redimension the node arrays
    redimNodeArrays(freenode);

    // ok, compact kv array
    CharVector kx = new CharVector();
    kx.alloc(1);
    TernaryTree map = new TernaryTree();
    compact(kx, map, root);
    kv = kx;
    kv.trimToSize();
  }

  private void compact(CharVector kx, TernaryTree map, char p) {
    int k;
    if (p == 0) {
      return;
    }
    if (sc[p] == 0xFFFF) {
      k = map.find(kv.getArray(), lo[p]);
      if (k < 0) {
        k = kx.alloc(strlen(kv.getArray(), lo[p]) + 1);
        strcpy(kx.getArray(), k, kv.getArray(), lo[p]);
        map.insert(kx.getArray(), k, (char) k);
      }
      lo[p] = (char) k;
    } else {
      compact(kx, map, lo[p]);
      if (sc[p] != 0) {
        compact(kx, map, eq[p]);
      }
      compact(kx, map, hi[p]);
    }
  }

  public Enumeration keys() {
    return new Iterator();
  }

  public class Iterator implements Enumeration {

    /**
     * current node index
     */
    int cur;

    /**
     * current key
     */
    String curkey;

    private class Item implements Cloneable {
      char parent;

      char child;

      public Item() {
        parent = 0;
        child = 0;
      }

      public Item(char p, char c) {
        parent = p;
        child = c;
      }

      public Object clone() {
        return new Item(parent, child);
      }

    }

    /**
     * Node stack
     */
    Stack ns;

    /**
     * key stack implemented with a StringBuffer
     */
    StringBuffer ks;

    public Iterator() {
      cur = -1;
      ns = new Stack();
      ks = new StringBuffer();
      rewind();
    }

    public void rewind() {
      ns.removeAllElements();
      ks.setLength(0);
      cur = root;
      run();
    }

    public Object nextElement() {
      String res = new String(curkey);
      cur = up();
      run();
      return res;
    }

    public char getValue() {
      if (cur >= 0) {
        return eq[cur];
      }
      return 0;
    }

    public boolean hasMoreElements() {
      return (cur != -1);
    }

    /**
     * traverse upwards
     */
    private int up() {
      Item i = new Item();
      int res = 0;

      if (ns.empty()) {
        return -1;
      }

      if (cur != 0 && sc[cur] == 0) {
        return lo[cur];
      }

      boolean climb = true;

      while (climb) {
        i = (Item) ns.pop();
        i.child++;
        switch (i.child) {
          case 1:
            if (sc[i.parent] != 0) {
              res = eq[i.parent];
              ns.push(i.clone());
              ks.append(sc[i.parent]);
            } else {
              i.child++;
              ns.push(i.clone());
              res = hi[i.parent];
            }
            climb = false;
            break;

          case 2:
            res = hi[i.parent];
            ns.push(i.clone());
            if (ks.length() > 0) {
              ks.setLength(ks.length() - 1); // pop
            }
            climb = false;
            break;

          default:
            if (ns.empty()) {
              return -1;
            }
            climb = true;
            break;
        }
      }
      return res;
    }

    /**
     * traverse the tree to find next key
     */
    private int run() {
      if (cur == -1) {
        return -1;
      }

      boolean leaf = false;
      while (true) {
        // first go down on low branch until leaf or compressed branch
        while (cur != 0) {
          if (sc[cur] == 0xFFFF) {
            leaf = true;
            break;
          }
          ns.push(new Item((char) cur, '\u0000'));
          if (sc[cur] == 0) {
            leaf = true;
            break;
          }
          cur = lo[cur];
        }
        if (leaf) {
          break;
        }
        // nothing found, go up one node and try again
        cur = up();
        if (cur == -1) {
          return -1;
        }
      }
      // The current node should be a data node and
      // the key should be in the key stack (at least partially)
      StringBuffer buf = new StringBuffer(ks.toString());
      if (sc[cur] == 0xFFFF) {
        int p = lo[cur];
        while (kv.get(p) != 0) {
          buf.append(kv.get(p++));
        }
      }
      curkey = buf.toString();
      return 0;
    }

  }

  public void printStats() {
    System.out.println("Number of keys = " + Integer.toString(length));
    System.out.println("Node count = " + Integer.toString(freenode));
    // System.out.println("Array length = " + Integer.toString(eq.length));
    System.out.println("Key Array length = " + Integer.toString(kv.length()));

    /*
     * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
     * System.out.print(kv.get(i)); else System.out.println("");
     * System.out.println("Keys:"); for(Enumeration enum = keys();
     * enum.hasMoreElements(); ) System.out.println(enum.nextElement());
     */

  }

  public static void main(String[] args) throws Exception {
    TernaryTree tt = new TernaryTree();
    tt.insert("Carlos", 'C');
    tt.insert("Car", 'r');
    tt.insert("palos", 'l');
    tt.insert("pa", 'p');
    tt.trimToSize();
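    // Expected output: 'r' for "Car" and 'C' for "Carlos"; "alto" is absent,
    // so find() returns -1 and the char cast prints an undefined character.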
    System.out.println((char) tt.find("Car"));
    System.out.println((char) tt.find("Carlos"));
    System.out.println((char) tt.find("alto"));
    tt.printStats();
  }

}

@ -0,0 +1,68 @@
<?xml version="1.0" encoding="US-ASCII"?>
<!--
  Copyright 1999-2004 The Apache Software Foundation

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ -->

<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?,
                           classes, exceptions?, patterns)>

<!-- Hyphen character to be used in the exception list as shortcut for
     <hyphen pre-break="-"/>. Defaults to '-'
-->
<!ELEMENT hyphen-char EMPTY>
<!ATTLIST hyphen-char value CDATA #REQUIRED>

<!-- Default minimum length in characters of hyphenated word fragments
     before and after the line break. For some languages this is not
     only for aesthetic purposes, wrong hyphens may be generated if this
     is not accounted for.
-->
<!ELEMENT hyphen-min EMPTY>
<!ATTLIST hyphen-min before CDATA #REQUIRED>
<!ATTLIST hyphen-min after CDATA #REQUIRED>

<!-- Character equivalent classes: space separated list of character groups, all
     characters in a group are to be treated equivalent as far as
     the hyphenation algorithm is concerned. The first character in a group
     is the group's equivalent character. Patterns should only contain
     first characters. It also defines word characters, i.e. a word that
     contains characters not present in any of the classes is not hyphenated.
-->
<!ELEMENT classes (#PCDATA)>

<!-- Hyphenation exceptions: space separated list of hyphenated words.
     A hyphen is indicated by the hyphen tag, but you can use the
     hyphen-char defined previously as shortcut. This is in cases
     when the algorithm procedure finds wrong hyphens or you want
     to provide your own hyphenation for some words.
-->
<!ELEMENT exceptions (#PCDATA|hyphen)* >

<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent'
     characters as described before, between any two word characters a digit
     in the range 0 to 9 may be specified. The absence of a digit is equivalent
     to zero. The '.' character is reserved to indicate beginning or ending
     of words. -->
<!ELEMENT patterns (#PCDATA)>

<!-- A "full hyphen" equivalent to TeX's \discretionary
     with pre-break, post-break and no-break attributes.
     To be used in the exceptions list, the hyphen character is not
     automatically added -->
<!ELEMENT hyphen EMPTY>
<!ATTLIST hyphen pre CDATA #IMPLIED>
<!ATTLIST hyphen no CDATA #IMPLIED>
<!ATTLIST hyphen post CDATA #IMPLIED>

@ -0,0 +1,10 @@
<html>
<head>
<title>Hyphenation code for the CompoundWordTokenFilter</title>
</head>
<body>
<p>
The code for the compound word hyphenation is taken from the <a href="http://xmlgraphics.apache.org/fop/">Apache FOP project</a>. All credits for the hyphenation code belong to them.
</p>
</body>
</html>

@ -0,0 +1,166 @@
<html>
<head>
<title>CompoundWordTokenFilter</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></meta>
</head>
<body>
A filter that decomposes the compound words found in many Germanic
languages into their word parts. This example shows what it does:
<table border="1">
<tr>
<th>Input token stream</th>
</tr>
<tr>
<td>Rindfleischüberwachungsgesetz Drahtschere abba</td>
</tr>
</table>
<br>
<table border="1">
<tr>
<th>Output token stream</th>
</tr>
<tr>
<td>(Rindfleischüberwachungsgesetz,0,29)</td>
</tr>
<tr>
<td>(Rind,0,4,posIncr=0)</td>
</tr>
<tr>
<td>(fleisch,4,11,posIncr=0)</td>
</tr>
<tr>
<td>(überwachung,11,22,posIncr=0)</td>
</tr>
<tr>
<td>(gesetz,23,29,posIncr=0)</td>
</tr>
<tr>
<td>(Drahtschere,30,41)</td>
</tr>
<tr>
<td>(Draht,30,35,posIncr=0)</td>
</tr>
<tr>
<td>(schere,35,41,posIncr=0)</td>
</tr>
<tr>
<td>(abba,42,46)</td>
</tr>
</table>

The input token is always preserved and the filters do not alter the case of
word parts. There are two variants of the filter available:
<ul>
<li><i>HyphenationCompoundWordTokenFilter</i>: uses a
hyphenation-grammar-based approach to find potential word parts of a
given word.</li>
<li><i>DictionaryCompoundWordTokenFilter</i>: uses a
brute-force dictionary-based approach to find the word parts of a given
word.</li>
</ul>

<h3>Compound word token filters</h3>
<h4>HyphenationCompoundWordTokenFilter</h4>
The {@link
org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
potential subwords that are worth checking against the dictionary. The
quality of the output tokens is directly connected to the quality of the
grammar file you use. For languages like German these are quite good.
<h5>Grammar file</h5>
Unfortunately we cannot bundle the hyphenation grammar files with Lucene
because they do not use an ASF-compatible license (they use the LaTeX
Project Public License instead). You can find the XML based grammar
files at the
<a href="http://offo.sourceforge.net/hyphenation/index.html">Objects
For Formatting Objects</a>
(OFFO) Sourceforge project (direct link to download the pattern files:
<a href="http://downloads.sourceforge.net/offo/offo-hyphenation.zip">http://downloads.sourceforge.net/offo/offo-hyphenation.zip</a>).
The files you need are in the subfolder <i>offo-hyphenation/hyph/</i>.
<br />
Credits for the hyphenation code go to the
<a href="http://xmlgraphics.apache.org/fop/">Apache FOP project</a>.
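<p>
Loading a grammar and building the hyphenator then takes two lines (a sketch,
using <i>de_DR.xml</i> as a stand-in for whichever pattern file you extracted;
the complete listing is in the Examples section below):
</p>
<pre>
  Reader reader = new FileReader("de_DR.xml");
  HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
      .getHyphenationTree(reader);
</pre>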

<h4>DictionaryCompoundWordTokenFilter</h4>
The {@link
org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter
DictionaryCompoundWordTokenFilter} uses a dictionary-only approach to
find subwords in a compound word. It is much slower than the filter that
uses the hyphenation grammars. You can use it as a starting point to
check whether your dictionary is good or not, because it is much simpler
in design.

<h3>Dictionary</h3>
The output quality of both token filters is directly connected to the
quality of the dictionary you use. Dictionaries are, of course, language
dependent. You should always use a dictionary that fits the text you want
to index. If you index medical text, for example, then you should use a
dictionary that contains medical words.
A good starting point for general text are the dictionaries you find at the
<a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice
dictionaries</a>
Wiki.
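<p>
Both filters take the dictionary as a plain String[] of words. A minimal
loading sketch (assuming a word list with one entry per line and a file
encoding that matches your reader; the name <i>dictionary.txt</i> is made up
for this example):
</p>
<pre>
  BufferedReader in = new BufferedReader(new FileReader("dictionary.txt"));
  ArrayList words = new ArrayList();
  String line;
  while ((line = in.readLine()) != null) {
    words.add(line.trim());
  }
  in.close();
  String[] dict = (String[]) words.toArray(new String[words.size()]);
</pre>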

<h3>Which variant should I use?</h3>
This decision matrix should help you:
<table border="1">
<tr>
<th>Token filter</th>
<th>Output quality</th>
<th>Performance</th>
</tr>
<tr>
<td>HyphenationCompoundWordTokenFilter</td>
<td>good, if the grammar file is good; acceptable otherwise</td>
<td>fast</td>
</tr>
<tr>
<td>DictionaryCompoundWordTokenFilter</td>
<td>good</td>
<td>slow</td>
</tr>
</table>
<h3>Examples</h3>
<pre>
  public void testHyphenationCompoundWordsDE() throws Exception {
    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
        "Aufgabe", "Überwachung" };

    Reader reader = new FileReader("de_DR.xml");

    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(reader);

    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader(
            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

    Token t;
    while ((t = tf.next()) != null) {
      System.out.println(t);
    }
  }

  public void testDumbCompoundWordsSE() throws Exception {
    String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
        "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
        "Sko", "Vind", "Rute", "Torkare", "Blad" };

    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(
            new StringReader(
                "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
        dict);
    Token t;
    while ((t = tf.next()) != null) {
      System.out.println(t);
    }
  }
</pre>
</body>
</html>

@ -0,0 +1,214 @@
package org.apache.lucene.analysis.compound;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;

import junit.framework.TestCase;

public class TestCompoundWordTokenFilter extends TestCase {
  private static String[] locations = {
      "http://dfn.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
      "http://surfnet.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
      "http://superb-west.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip",
      "http://superb-east.dl.sourceforge.net/sourceforge/offo/offo-hyphenation.zip"};

  private byte[] patternsFileContent;

  protected void setUp() throws Exception {
    super.setUp();
    getHyphenationPatternFileContents();
  }

  public void testHyphenationCompoundWordsDE() throws Exception {
    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
        "Aufgabe", "Überwachung" };

    Reader reader = getHyphenationReader("de_DR.xml");
    if (reader == null) {
      // we gracefully die if we have no reader
      return;
    }

    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(reader);

    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader(
            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    assertFiltersTo(tf, new String[] { "Rindfleischüberwachungsgesetz", "Rind",
        "fleisch", "überwachung", "gesetz", "Drahtschere", "Draht", "schere",
        "abba" }, new int[] { 0, 0, 4, 11, 23, 30, 30, 35, 42 }, new int[] {
        29, 4, 11, 22, 29, 41, 35, 41, 46 }, new int[] { 1, 0, 0, 0, 0, 1, 0,
        0, 1 });
  }

  public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
    String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
        "Aufgabe", "Überwachung", "Rindfleisch", "Überwachungsgesetz" };

    Reader reader = getHyphenationReader("de_DR.xml");
    if (reader == null) {
      // we gracefully die if we have no reader
      return;
    }

    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
        .getHyphenationTree(reader);

    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader(
            "Rindfleischüberwachungsgesetz")), hyphenator, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
    assertFiltersTo(tf, new String[] { "Rindfleischüberwachungsgesetz",
        "Rindfleisch", "fleisch", "überwachungsgesetz", "gesetz" }, new int[] {
        0, 0, 4, 11, 23 }, new int[] { 29, 11, 11, 29, 29 }, new int[] { 1, 0,
        0, 0, 0 });
  }

  public void testDumbCompoundWordsSE() throws Exception {
    String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
        "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
        "Sko", "Vind", "Rute", "Torkare", "Blad" };

    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(
            new StringReader(
                "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
        dict);

    assertFiltersTo(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
        "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
        "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
        "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
        "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
        "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
        "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
        "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
        17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
        77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
        137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
        28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
        87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
        155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 1 });
  }

  public void testDumbCompoundWordsSELongestMatch() throws Exception {
    String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
        "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
        "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };

    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("Basfiolsfodralmakaregesäll")),
        dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);

    assertFiltersTo(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
        "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
        14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
        0, 0 });
  }

  private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
      int[] endOffset, int[] posIncr) throws Exception {
    for (int i = 0; i < s.length; ++i) {
      Token t = tf.next();
      assertNotNull(t);
      assertEquals(s[i], new String(t.termBuffer(), 0, t.termLength()));
      assertEquals(startOffset[i], t.startOffset());
      assertEquals(endOffset[i], t.endOffset());
      assertEquals(posIncr[i], t.getPositionIncrement());
    }
    assertNull(tf.next());
  }

  private void getHyphenationPatternFileContents() {
    try {
      List urls = new LinkedList(Arrays.asList(locations));
      Collections.shuffle(urls);
      URL url = new URL((String) urls.get(0));
      InputStream in = url.openStream();
      byte[] buffer = new byte[1024];
      ByteArrayOutputStream out = new ByteArrayOutputStream();
      int count;

      while ((count = in.read(buffer)) != -1) {
        out.write(buffer, 0, count);
      }
      in.close();
      out.close();
      patternsFileContent = out.toByteArray();
    } catch (IOException e) {
      // we swallow all exceptions - the user might have no internet connection
    }
  }

  private Reader getHyphenationReader(String filename) throws Exception {
    if (patternsFileContent == null) {
      return null;
    }

    ZipInputStream zipstream = new ZipInputStream(new ByteArrayInputStream(
        patternsFileContent));

    ZipEntry entry;
    while ((entry = zipstream.getNextEntry()) != null) {
      if (entry.getName().equals("offo-hyphenation/hyph/" + filename)) {
        byte[] buffer = new byte[1024];
        ByteArrayOutputStream outstream = new ByteArrayOutputStream();
        int count;
        while ((count = zipstream.read(buffer)) != -1) {
          outstream.write(buffer, 0, count);
        }
        outstream.close();
        zipstream.close();
        return new StringReader(new String(outstream.toByteArray(),
            "ISO-8859-1"));
      }
    }
    // we should never get here
    return null;
  }
}