LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without a dictionary

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@946139 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-05-19 11:58:37 +00:00
parent c5ae13f39e
commit fe5f1aabcb
5 changed files with 106 additions and 8 deletions


@@ -160,6 +160,9 @@ New features
* LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words
when Version is set to 3.1 or higher. (Robert Muir)
* LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without dictionary.
(Thomas Peuss via Robert Muir)
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation


@@ -154,7 +154,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
this.maxSubwordSize=maxSubwordSize;
this.onlyLongestMatch=onlyLongestMatch;
- if (dictionary instanceof CharArraySet) {
+ if (dictionary==null || dictionary instanceof CharArraySet) {
this.dictionary = (CharArraySet) dictionary;
} else {
this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
@@ -181,6 +181,9 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
}
public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
if (dictionary == null) {
return null;
}
// is the below really case insensitive?
CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
addAllLowerCase(dict, Arrays.asList(dictionary));
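
The null passthrough added here lets callers skip the word list entirely. A minimal sketch of the resulting behavior follows; it is not part of the patch, and the class name and the Version.LUCENE_31 constant are illustrative assumptions.

  import java.util.Set;

  import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
  import org.apache.lucene.util.Version;

  // Illustrative sketch: with the null check above, a missing word list is
  // passed through as null instead of causing a NullPointerException; the
  // compound filters then treat a null dictionary as "accept every subword".
  public class NullDictionarySketch {
    public static void main(String[] args) {
      Set<?> dict = CompoundWordTokenFilterBase.makeDictionary(
          Version.LUCENE_31, (String[]) null);
      System.out.println(dict == null); // prints "true"
    }
  }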


@@ -153,6 +153,33 @@ public class HyphenationCompoundWordTokenFilter extends
this.hyphenator = hyphenator;
}
/**
* Create a HyphenationCompoundWordTokenFilter with no dictionary.
* <p>
* Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
* HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
* null, minWordSize, minSubwordSize, maxSubwordSize, false)}
*/
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
int maxSubwordSize) {
this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
maxSubwordSize, false);
}
/**
* Create a HyphenationCompoundWordTokenFilter with no dictionary.
* <p>
* Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
* HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
* DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}
*/
public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
HyphenationTree hyphenator) {
this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
DEFAULT_MAX_SUBWORD_SIZE);
}
/**
* Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
*
@@ -305,7 +332,7 @@ public class HyphenationCompoundWordTokenFilter extends
}
// check the dictionary
- if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+ if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
if (this.onlyLongestMatch) {
if (longestMatchToken != null) {
if (longestMatchToken.termLength() < partLength) {

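
For orientation, a minimal usage sketch of the new dictionary-less constructor follows; it is not part of the patch. The grammar file name and the sample input mirror the package documentation changed below, while the Version.LUCENE_31 constant and the omission of explicit reset()/close() calls are simplifying assumptions.

  import java.io.FileReader;
  import java.io.Reader;
  import java.io.StringReader;

  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.WhitespaceTokenizer;
  import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
  import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.util.Version;

  public class NoDictionaryUsageSketch {
    public static void main(String[] args) throws Exception {
      // load a hyphenation grammar file (see the package documentation for sources)
      Reader grammar = new FileReader("de_DR.xml");
      HyphenationTree hyphenator =
          HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);

      // new constructor: no dictionary, default word/subword size limits
      TokenStream ts = new HyphenationCompoundWordTokenFilter(
          Version.LUCENE_31,
          new WhitespaceTokenizer(Version.LUCENE_31,
              new StringReader("Rindfleischüberwachungsgesetz Drahtschere abba")),
          hyphenator);

      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      while (ts.incrementToken()) {
        System.out.println(term); // every hyphenation-derived subword is emitted
      }
    }
  }

With a dictionary supplied, only subwords found in it would be kept; this sketch deliberately exercises the new no-dictionary path.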

@@ -81,8 +81,9 @@ filter available:
The {@link
org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
- potential subwords that a worth to check against the dictionary. The
- quality of the output tokens is directly connected to the quality of the
+ potential subwords that are worth checking against the dictionary. It can also be used
+ without a dictionary, but it then produces a lot of "nonword" tokens.
+ The quality of the output tokens is directly connected to the quality of the
grammar file you use. For languages like German they are quite good.
<h5>Grammar file</h5>
Unfortunately we cannot bundle the hyphenation grammar files with Lucene
@@ -157,8 +158,24 @@ This decision matrix should help you:
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
- Token t;
- while ((t=tf.next())!=null) {
+ CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+ while (tf.incrementToken()) {
System.out.println(t);
}
}
public void testHyphenationCompoundWordsWithoutDictionaryDE() throws Exception {
Reader reader = new FileReader("de_DR.xml");
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
new WhitespaceTokenizer(new StringReader(
"Rindfleisch&uuml;berwachungsgesetz Drahtschere abba")), hyphenator);
CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
while (tf.incrementToken()) {
System.out.println(t);
}
}
@@ -173,8 +190,8 @@ This decision matrix should help you:
new StringReader(
"Bild&ouml;rr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glas&ouml;gonfodral Basfiolsfodral Basfiolsfodralmakareges&auml;ll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
dict);
- Token t;
- while ((t=tf.next())!=null) {
+ CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+ while (tf.incrementToken()) {
System.out.println(t);
}
}
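
As the documentation above notes, the filter now also works without a dictionary. A typical way to deploy it is wrapped in an Analyzer; the sketch below is not part of the patch, the class name is hypothetical, Version.LUCENE_31 is an assumed constant, and it uses the plain Analyzer.tokenStream(String, Reader) override style of this code base.

  import java.io.Reader;

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.WhitespaceTokenizer;
  import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
  import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
  import org.apache.lucene.util.Version;

  // Hypothetical analyzer that decompounds purely from a hyphenation grammar.
  public class HyphenationOnlyAnalyzer extends Analyzer {
    private final HyphenationTree hyphenator;

    public HyphenationOnlyAnalyzer(HyphenationTree hyphenator) {
      this.hyphenator = hyphenator;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
      // no dictionary argument: subwords come only from hyphenation points,
      // limited by the default min/max word and subword sizes
      return new HyphenationCompoundWordTokenFilter(
          Version.LUCENE_31,
          new WhitespaceTokenizer(Version.LUCENE_31, reader),
          hyphenator);
    }
  }

As the documentation warns, such an analyzer will index many "nonword" subword tokens; tightening the min/max subword sizes, as the test below demonstrates, is the main lever for controlling that.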


@@ -70,6 +70,54 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
}
/**
* With hyphenation-only, you can get a lot of nonsense tokens.
* This can be controlled with the min/max subword size.
*/
public void testHyphenationOnly() throws Exception {
Reader reader = getHyphenationReader();
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);
HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
2, 4);
// min=2, max=4
assertTokenStreamContents(tf,
new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
);
tf = new HyphenationCompoundWordTokenFilter(
TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
4, 6);
// min=4, max=6
assertTokenStreamContents(tf,
new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
);
tf = new HyphenationCompoundWordTokenFilter(
TEST_VERSION_CURRENT,
new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
hyphenator,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
4, 10);
// min=4, max=10
assertTokenStreamContents(tf,
new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
"sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
);
}
public void testDumbCompoundWordsSE() throws Exception {
String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",