LUCENE-3969: don't allow negative subword params; Hyphenation relies upon this to filter out what appear to be bogus hyphenation points

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3969@1311257 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-04-09 14:31:25 +00:00
parent 24f8a9e627
commit ac393486e0
2 changed files with 11 additions and 0 deletions

View File

@ -82,8 +82,17 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
super(input);
this.tokens=new LinkedList<CompoundToken>();
if (minWordSize < 0) {
throw new IllegalArgumentException("minWordSize cannot be negative");
}
this.minWordSize=minWordSize;
if (minSubwordSize < 0) {
throw new IllegalArgumentException("minSubwordSize cannot be negative");
}
this.minSubwordSize=minSubwordSize;
if (maxSubwordSize < 0) {
throw new IllegalArgumentException("maxSubwordSize cannot be negative");
}
this.maxSubwordSize=maxSubwordSize;
this.onlyLongestMatch=onlyLongestMatch;
this.dictionary = dictionary;

View File

@ -191,6 +191,8 @@ public class HyphenationCompoundWordTokenFilter extends
// we only put subwords into the token stream
// that are at least minSubwordSize long
if (partLength < this.minSubwordSize) {
// nocommit/BOGUS/BROKEN/FUNKY/WACKO: somehow we have negative 'parts' according to the
// calculation above, and we rely upon minSubwordSize being >=0 to filter them out...
continue;
}