mirror of https://github.com/apache/lucene.git
LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without a dictionary
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@946139 13f79535-47bb-0310-9956-ffa450edef68
parent c5ae13f39e
commit fe5f1aabcb

CHANGES.txt
@@ -160,6 +160,9 @@ New features
 * LUCENE-2463: Add a Greek inflectional stemmer. GreekAnalyzer will now stem words
   when Version is set to 3.1 or higher. (Robert Muir)
 
+* LUCENE-1287: Allow usage of HyphenationCompoundWordTokenFilter without a dictionary.
+  (Thomas Peuss via Robert Muir)
+
 Build
 
 * LUCENE-2124: Moved the JDK-based collation support from contrib/collation
CompoundWordTokenFilterBase.java
@@ -154,7 +154,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     this.maxSubwordSize=maxSubwordSize;
     this.onlyLongestMatch=onlyLongestMatch;
 
-    if (dictionary instanceof CharArraySet) {
+    if (dictionary==null || dictionary instanceof CharArraySet) {
       this.dictionary = (CharArraySet) dictionary;
     } else {
       this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
@@ -181,6 +181,9 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   }
 
   public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
+    if (dictionary == null) {
+      return null;
+    }
     // is the below really case insensitive?
     CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
     addAllLowerCase(dict, Arrays.asList(dictionary));
HyphenationCompoundWordTokenFilter.java
@@ -153,6 +153,33 @@ public class HyphenationCompoundWordTokenFilter extends
     this.hyphenator = hyphenator;
   }
 
+  /**
+   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
+   * <p>
+   * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
+   * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+   * null, minWordSize, minSubwordSize, maxSubwordSize, false)}
+   */
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+      HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
+      int maxSubwordSize) {
+    this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
+        maxSubwordSize, false);
+  }
+
+  /**
+   * Create a HyphenationCompoundWordTokenFilter with no dictionary.
+   * <p>
+   * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
+   * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
+   * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)}
+   */
+  public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
+      HyphenationTree hyphenator) {
+    this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
+        DEFAULT_MAX_SUBWORD_SIZE);
+  }
+
   /**
    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
    *
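Editor's note: the two convenience constructors above simply chain to the
pre-existing full constructor, passing a null dictionary and hard-wiring
onlyLongestMatch to false. A minimal sketch of the equivalent direct call
(variable names are illustrative, not from the patch):

    // What the no-dictionary convenience constructors expand to:
    // a null dictionary set means "accept every size-bounded subword".
    new HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
        (Set<?>) null,                                   // no dictionary
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
        CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
        false);                                          // onlyLongestMatch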
@@ -305,7 +332,7 @@ public class HyphenationCompoundWordTokenFilter extends
       }
 
       // check the dictionary
-      if (dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
+      if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
         if (this.onlyLongestMatch) {
           if (longestMatchToken != null) {
             if (longestMatchToken.termLength() < partLength) {
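Editor's note: the one-line change above is the heart of this patch. With a null
dictionary, every hyphenation-derived fragment that passes the min/max subword
bounds is emitted as a token. A hedged sketch of the new check in isolation
("accept" is a hypothetical helper written for illustration, not part of the class):

    // Hypothetical helper: a null dictionary accepts any candidate subword;
    // otherwise the candidate must be present in the dictionary.
    static boolean accept(CharArraySet dictionary, char[] lowerCaseTermBuffer,
        int start, int partLength) {
      return dictionary == null
          || dictionary.contains(lowerCaseTermBuffer, start, partLength);
    }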
package.html (org.apache.lucene.analysis.compound)
@@ -81,8 +81,9 @@ filter available:
 The {@link
 org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
 HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
-potential subwords that a worth to check against the dictionary. The
-quality of the output tokens is directly connected to the quality of the
+potential subwords that are worth checking against the dictionary. It can be used
+without a dictionary as well, but then produces a lot of "nonword" tokens.
+The quality of the output tokens is directly connected to the quality of the
 grammar file you use. For languages like German they are quite good.
 <h5>Grammar file</h5>
 Unfortunately we cannot bundle the hyphenation grammar files with Lucene
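Editor's note: as a concrete illustration of the dictionary-less usage described
above, a minimal sketch using the constructors added in this commit (the grammar
file name is a placeholder, Version.LUCENE_31 stands in for whatever matchVersion
you target, and the usual org.apache.lucene.analysis imports are assumed):

    // Decompose compound words using only a hyphenation grammar, no dictionary.
    Reader grammar = new FileReader("hyphenation.xml");  // placeholder file name
    HyphenationTree hyphenator =
        HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);

    TokenStream tf = new HyphenationCompoundWordTokenFilter(
        Version.LUCENE_31,
        new WhitespaceTokenizer(Version.LUCENE_31, new StringReader("basketballkurv")),
        hyphenator,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
        2, 4);  // tight subword bounds (min=2, max=4) reduce "nonword" noise

    CharTermAttribute term = tf.addAttribute(CharTermAttribute.class);
    while (tf.incrementToken()) {
      System.out.println(term);
    }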
@@ -157,8 +158,24 @@ This decision matrix should help you:
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 
-    Token t;
-    while ((t=tf.next())!=null) {
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
       System.out.println(t);
     }
   }
+
+  public void testHyphenationCompoundWordsWithoutDictionaryDE() throws Exception {
+    Reader reader = new FileReader("de_DR.xml");
+
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        new WhitespaceTokenizer(new StringReader(
+            "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator);
+
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
+      System.out.println(t);
+    }
+  }
@@ -173,8 +190,8 @@ This decision matrix should help you:
         new StringReader(
             "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
         dict);
-    Token t;
-    while ((t=tf.next())!=null) {
+    CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
+    while (tf.incrementToken()) {
       System.out.println(t);
     }
   }
TestCompoundWordTokenFilter.java
@@ -70,6 +70,54 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
 
   }
 
+  /**
+   * With hyphenation-only, you can get a lot of nonsense tokens.
+   * This can be controlled with the min/max subword size.
+   */
+  public void testHyphenationOnly() throws Exception {
+    Reader reader = getHyphenationReader();
+    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
+        .getHyphenationTree(reader);
+
+    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        2, 4);
+
+    // min=2, max=4
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
+    );
+
+    tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        4, 6);
+
+    // min=4, max=6
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
+    );
+
+    tf = new HyphenationCompoundWordTokenFilter(
+        TEST_VERSION_CURRENT,
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("basketballkurv")),
+        hyphenator,
+        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
+        4, 10);
+
+    // min=4, max=10
+    assertTokenStreamContents(tf,
+        new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
+            "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
+    );
+  }
+
   public void testDumbCompoundWordsSE() throws Exception {
     String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
         "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",