LUCENE-3523: Add WordBreakSpellChecker to suggest based on word combinations and/or breaks

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344318 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
James Dyer 2012-05-30 15:58:38 +00:00
parent a0e590a119
commit 642b882d5a
4 changed files with 735 additions and 0 deletions

View File

@ -4148,6 +4148,10 @@ New features
passed to not double RAM usage in the FieldCache. (Chris passed to not double RAM usage in the FieldCache. (Chris
Hostetter, Mark Miller, Mike McCandless) Hostetter, Mark Miller, Mike McCandless)
* LUCENE-3523: Added oal.search.spell.WordBreakSpellChecker, which
generates suggestions by combining two or more terms and/or
breaking terms into multiple words. See Javadocs for usage. (James Dyer)
Optimizations Optimizations
* LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing * LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

View File

@ -0,0 +1,17 @@
package org.apache.lucene.search.spell;
public class CombineSuggestion {
/**
* <p>The indexes from the passed-in array of terms used to make this word combination</p>
*/
public final int[] originalTermIndexes;
/**
* <p>The word combination suggestion</p>
*/
public final SuggestWord suggestion;
public CombineSuggestion (SuggestWord suggestion, int[] originalTermIndexes) {
this.suggestion = suggestion;
this.originalTermIndexes = originalTermIndexes;
}
}

View File

@ -0,0 +1,480 @@
package org.apache.lucene.search.spell;
import java.io.IOException;
import java.util.Comparator;
import java.util.PriorityQueue;
import java.util.Queue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.util.BytesRef;
/**
* <p>
* A spell checker whose sole function is to offer suggestions by combining
* multiple terms into one word and/or breaking terms into multiple words.
* </p>
*/
public class WordBreakSpellChecker {
private int minSuggestionFrequency = 1;
private int minBreakWordLength = 1;
private int maxCombineWordLength = 20;
private int maxChanges = 1;
private int maxEvaluations = 1000;
public static final Term SEPARATOR_TERM = new Term("", "");
public enum BreakSuggestionSortMethod {
/**
* <p>
* Sort by Number of word breaks, then by the Sum of all the component
* term's frequencies
* </p>
*/
NUM_CHANGES_THEN_SUMMED_FREQUENCY,
/**
* <p>
* Sort by Number of word breaks, then by the Maximum of all the component
* term's frequencies
* </p>
*/
NUM_CHANGES_THEN_MAX_FREQUENCY
}
/**
* <p>
* Generate suggestions by breaking the passed-in term into multiple words.
* The scores returned are equal to the number of word breaks needed so a
* lower score is generally preferred over a higher score.
* </p>
*
* @param term
* @param maxSuggestions
* @param ir
* @param suggestMode
* - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
* @param sortMethod
* - default =
* {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
* @return one or more arrays of words formed by breaking up the original term
* @throws IOException
*/
public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
IndexReader ir, SuggestMode suggestMode,
BreakSuggestionSortMethod sortMethod) throws IOException {
if (maxSuggestions < 1) {
return new SuggestWord[0][0];
}
if (suggestMode == null) {
suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
}
if (sortMethod == null) {
sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
}
int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
Comparator<SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator()
: new LengthThenSumFreqComparator();
Queue<SuggestWordArrayWrapper> suggestions = new PriorityQueue<SuggestWordArrayWrapper>(
queueInitialCapacity, queueComparator);
int origFreq = ir.docFreq(term);
if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
return new SuggestWord[0][];
}
int useMinSuggestionFrequency = minSuggestionFrequency;
if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
}
generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0,
sortMethod);
SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
for (int i = suggestions.size() - 1; i >= 0; i--) {
suggestionArray[i] = suggestions.remove().suggestWords;
}
return suggestionArray;
}
/**
* <p>
* Generate suggestions by combining one or more of the passed-in terms into
* single words. The returned {@link CombineSuggestion} contains both a
* {@link SuggestWord} and also an array detailing which passed-in terms were
* involved in creating this combination. The scores returned are equal to the
* number of word combinations needed, also one less than the length of the
* array {@link CombineSuggestion#originalTermIndexes}. Generally, a
* suggestion with a lower score is preferred over a higher score.
* </p>
* <p>
* To prevent two adjacent terms from being combined (for instance, if one is
* mandatory and the other is prohibited), separate the two terms with
* {@link WordBreakSpellChecker#SEPARATOR_TERM}
* </p>
* <p>
* When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
* suggestion will include at least one term not in the index.
* </p>
* <p>
* When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
* suggestion will have the same, or better frequency than the most-popular
* included term.
* </p>
*
* @param terms
* @param maxSuggestions
* @param ir
* @param suggestMode
* @return an array of words generated by combining original terms
* @throws IOException
*/
public CombineSuggestion[] suggestWordCombinations(Term[] terms,
int maxSuggestions, IndexReader ir, SuggestMode suggestMode)
throws IOException {
if (maxSuggestions < 1) {
return new CombineSuggestion[0];
}
int[] origFreqs = null;
if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
origFreqs = new int[terms.length];
for (int i = 0; i < terms.length; i++) {
origFreqs[i] = ir.docFreq(terms[i]);
}
}
int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<CombineSuggestionWrapper>(
queueInitialCapacity, queueComparator);
int thisTimeEvaluations = 0;
BytesRef reuse = new BytesRef();
for (int i = 0; i < terms.length - 1; i++) {
if (terms[i].equals(SEPARATOR_TERM)) {
continue;
}
int byteLength = terms[i].bytes().length;
if (byteLength > maxCombineWordLength) {
continue;
}
reuse.grow(byteLength);
reuse.length = byteLength;
System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
reuse.bytes, 0, byteLength);
int maxFreq = 0;
int minFreq = Integer.MAX_VALUE;
if (origFreqs != null) {
maxFreq = origFreqs[i];
minFreq = origFreqs[i];
}
for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
if (terms[j].equals(SEPARATOR_TERM)) {
break;
}
byteLength += terms[j].bytes().length;
if (byteLength > maxCombineWordLength) {
break;
}
if (origFreqs != null) {
maxFreq = Math.max(maxFreq, origFreqs[j]);
minFreq = Math.min(minFreq, origFreqs[j]);
}
reuse.grow(byteLength);
System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
reuse.bytes, reuse.length, terms[j].bytes().length);
reuse.length = byteLength;
Term combinedTerm = new Term(terms[0].field(), reuse);
int combinedTermFreq = ir.docFreq(combinedTerm);
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
|| combinedTermFreq >= maxFreq) {
if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX
|| minFreq == 0) {
if (combinedTermFreq >= minSuggestionFrequency) {
int[] origIndexes = new int[j - i + 1];
origIndexes[0] = i;
for (int k = 1; k < origIndexes.length; k++) {
origIndexes[k] = i + k;
}
SuggestWord word = new SuggestWord();
word.freq = combinedTermFreq;
word.score = origIndexes.length - 1;
word.string = combinedTerm.text();
CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(
new CombineSuggestion(word, origIndexes),
(origIndexes.length - 1));
suggestions.offer(suggestion);
if (suggestions.size() > maxSuggestions) {
suggestions.poll();
}
}
}
}
thisTimeEvaluations++;
if (thisTimeEvaluations == maxEvaluations) {
break;
}
}
}
CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions
.size()];
for (int i = suggestions.size() - 1; i >= 0; i--) {
combineSuggestions[i] = suggestions.remove().combineSuggestion;
}
return combineSuggestions;
}
private int generateBreakUpSuggestions(Term term, IndexReader ir,
int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency,
SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
int totalEvaluations, BreakSuggestionSortMethod sortMethod)
throws IOException {
int termLength = term.bytes().length;
int useMinBreakWordLength = minBreakWordLength;
if (useMinBreakWordLength < 1) {
useMinBreakWordLength = 1;
}
if (termLength <= (useMinBreakWordLength * 2)) {
return 0;
}
int thisTimeEvaluations = 0;
BytesRef termBytes = term.bytes().clone();
for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
.field());
if (leftWord.freq >= useMinSuggestionFrequency) {
SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
termLength - i, term.field());
if (rightWord.freq >= useMinSuggestionFrequency) {
SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
newSuggestion(prefix, leftWord, rightWord));
suggestions.offer(suggestion);
if (suggestions.size() > maxSuggestions) {
suggestions.poll();
}
}
int newNumberBreaks = numberBreaks + 1;
if (newNumberBreaks <= maxChanges) {
int evaluations = generateBreakUpSuggestions(new Term(term.field(),
rightWord.string), ir, newNumberBreaks, maxSuggestions,
useMinSuggestionFrequency, newPrefix(prefix, leftWord),
suggestions, totalEvaluations, sortMethod);
totalEvaluations += evaluations;
}
}
thisTimeEvaluations++;
totalEvaluations++;
if (totalEvaluations >= maxEvaluations) {
break;
}
}
return thisTimeEvaluations;
}
private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1];
System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length);
newPrefix[newPrefix.length - 1] = append;
return newPrefix;
}
private SuggestWord[] newSuggestion(SuggestWord[] prefix,
SuggestWord append1, SuggestWord append2) {
SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2];
int score = prefix.length + 1;
for (int i = 0; i < prefix.length; i++) {
SuggestWord word = new SuggestWord();
word.string = prefix[i].string;
word.freq = prefix[i].freq;
word.score = score;
newSuggestion[i] = word;
}
append1.score = score;
append2.score = score;
newSuggestion[newSuggestion.length - 2] = append1;
newSuggestion[newSuggestion.length - 1] = append2;
return newSuggestion;
}
private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
int offset, int length, String fieldname) throws IOException {
bytes.offset = offset;
bytes.length = length;
Term term = new Term(fieldname, bytes);
int freq = ir.docFreq(term);
SuggestWord word = new SuggestWord();
word.freq = freq;
word.score = 1;
word.string = term.text();
return word;
}
public int getMinSuggestionFrequency() {
return minSuggestionFrequency;
}
public int getMaxCombineWordLength() {
return maxCombineWordLength;
}
public int getMinBreakWordLength() {
return minBreakWordLength;
}
public int getMaxChanges() {
return maxChanges;
}
public int getMaxEvaluations() {
return maxEvaluations;
}
/**
* <p>
* The minimum frequency a term must have to be included as part of a
* suggestion. Default=1 Not applicable when used with
* {@link SuggestMode#SUGGEST_MORE_POPULAR}
* </p>
*
* @param minSuggestionFrequency
*/
public void setMinSuggestionFrequency(int minSuggestionFrequency) {
this.minSuggestionFrequency = minSuggestionFrequency;
}
/**
* <p>
* The maximum length of a suggestion made by combining 1 or more original
* terms. Default=20
* </p>
*
* @param maxCombineWordLength
*/
public void setMaxCombineWordLength(int maxCombineWordLength) {
this.maxCombineWordLength = maxCombineWordLength;
}
/**
* <p>
* The minimum length to break words down to. Default=1
* </p>
*
* @param minBreakWordLength
*/
public void setMinBreakWordLength(int minBreakWordLength) {
this.minBreakWordLength = minBreakWordLength;
}
/**
* <p>
* The maximum numbers of changes (word breaks or combinations) to make on the
* original term(s). Default=1
* </p>
*
* @param maxChanges
*/
public void setMaxChanges(int maxChanges) {
this.maxChanges = maxChanges;
}
/**
* <p>
* The maximum number of word combinations to evaluate. Default=1000. A higher
* value might improve result quality. A lower value might improve
* performance.
* </p>
*
* @param maxEvaluations
*/
public void setMaxEvaluations(int maxEvaluations) {
this.maxEvaluations = maxEvaluations;
}
private class LengthThenMaxFreqComparator implements
Comparator<SuggestWordArrayWrapper> {
@Override
public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
if (o1.suggestWords.length != o2.suggestWords.length) {
return o2.suggestWords.length - o1.suggestWords.length;
}
if (o1.freqMax != o2.freqMax) {
return o1.freqMax - o2.freqMax;
}
return 0;
}
}
private class LengthThenSumFreqComparator implements
Comparator<SuggestWordArrayWrapper> {
@Override
public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
if (o1.suggestWords.length != o2.suggestWords.length) {
return o2.suggestWords.length - o1.suggestWords.length;
}
if (o1.freqSum != o2.freqSum) {
return o1.freqSum - o2.freqSum;
}
return 0;
}
}
private class CombinationsThenFreqComparator implements
Comparator<CombineSuggestionWrapper> {
@Override
public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) {
if (o1.numCombinations != o2.numCombinations) {
return o2.numCombinations - o1.numCombinations;
}
if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) {
return o1.combineSuggestion.suggestion.freq
- o2.combineSuggestion.suggestion.freq;
}
return 0;
}
}
private class SuggestWordArrayWrapper {
final SuggestWord[] suggestWords;
final int freqMax;
final int freqSum;
SuggestWordArrayWrapper(SuggestWord[] suggestWords) {
this.suggestWords = suggestWords;
int aFreqSum = 0;
int aFreqMax = 0;
for (SuggestWord sw : suggestWords) {
aFreqSum += sw.freq;
aFreqMax = Math.max(aFreqMax, sw.freq);
}
this.freqSum = aFreqSum;
this.freqMax = aFreqMax;
}
}
private class CombineSuggestionWrapper {
final CombineSuggestion combineSuggestion;
final int numCombinations;
CombineSuggestionWrapper(CombineSuggestion combineSuggestion,
int numCombinations) {
this.combineSuggestion = combineSuggestion;
this.numCombinations = numCombinations;
}
}
}

View File

@ -0,0 +1,234 @@
package org.apache.lucene.search.spell;
import junit.framework.Assert;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
public class TestWordBreakSpellChecker extends LuceneTestCase {
private Directory dir = null;
@Override
public void setUp() throws Exception {
super.setUp();
dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true));
for (int i = 900; i < 1112; i++) {
Document doc = new Document();
String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", "");
doc.add(newField("numbers", num, TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
{
Document doc = new Document();
doc.add(newField("numbers", "thou hast sand betwixt thy toes", TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
{
Document doc = new Document();
doc.add(newField("numbers", "hundredeight eightyeight yeight", TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
{
Document doc = new Document();
doc.add(newField("numbers", "tres y cinco", TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
writer.commit();
writer.close();
}
@Override
public void tearDown() throws Exception {
if(dir!=null) {
dir.close();
dir = null;
}
super.tearDown();
}
public void testCombiningWords() throws Exception {
IndexReader ir = null;
try {
ir = DirectoryReader.open(dir);
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
{
Term[] terms = {
new Term("numbers", "one"),
new Term("numbers", "hun"),
new Term("numbers", "dred"),
new Term("numbers", "eight"),
new Term("numbers", "y"),
new Term("numbers", "eight"),
};
wbsp.setMaxChanges(3);
wbsp.setMaxCombineWordLength(20);
wbsp.setMinSuggestionFrequency(1);
CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS);
Assert.assertTrue(cs.length==5);
Assert.assertTrue(cs[0].originalTermIndexes.length==2);
Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
Assert.assertTrue(cs[0].suggestion.score==1);
Assert.assertTrue(cs[1].originalTermIndexes.length==2);
Assert.assertTrue(cs[1].originalTermIndexes[0]==3);
Assert.assertTrue(cs[1].originalTermIndexes[1]==4);
Assert.assertTrue(cs[1].suggestion.string.equals("eighty"));
Assert.assertTrue(cs[1].suggestion.score==1);
Assert.assertTrue(cs[2].originalTermIndexes.length==2);
Assert.assertTrue(cs[2].originalTermIndexes[0]==4);
Assert.assertTrue(cs[2].originalTermIndexes[1]==5);
Assert.assertTrue(cs[2].suggestion.string.equals("yeight"));
Assert.assertTrue(cs[2].suggestion.score==1);
for(int i=3 ; i<5 ; i++) {
Assert.assertTrue(cs[i].originalTermIndexes.length==3);
Assert.assertTrue(cs[i].suggestion.score==2);
Assert.assertTrue(
(cs[i].originalTermIndexes[0]==1 &&
cs[i].originalTermIndexes[1]==2 &&
cs[i].originalTermIndexes[2]==3 &&
cs[i].suggestion.string.equals("hundredeight")) ||
(cs[i].originalTermIndexes[0]==3 &&
cs[i].originalTermIndexes[1]==4 &&
cs[i].originalTermIndexes[2]==5 &&
cs[i].suggestion.string.equals("eightyeight"))
);
}
cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
Assert.assertTrue(cs.length==2);
Assert.assertTrue(cs[0].originalTermIndexes.length==2);
Assert.assertTrue(cs[0].suggestion.score==1);
Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
Assert.assertTrue(cs[0].suggestion.score==1);
Assert.assertTrue(cs[1].originalTermIndexes.length==3);
Assert.assertTrue(cs[1].suggestion.score==2);
Assert.assertTrue(cs[1].originalTermIndexes[0] == 1);
Assert.assertTrue(cs[1].originalTermIndexes[1] == 2);
Assert.assertTrue(cs[1].originalTermIndexes[2] == 3);
Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight"));
}
} catch(Exception e) {
throw e;
} finally {
try { ir.close(); } catch(Exception e1) { }
}
}
public void testBreakingWords() throws Exception {
IndexReader ir = null;
try {
ir = DirectoryReader.open(dir);
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
{
Term term = new Term("numbers", "ninetynine");
wbsp.setMaxChanges(1);
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==2);
Assert.assertTrue(sw[0][0].string.equals("ninety"));
Assert.assertTrue(sw[0][1].string.equals("nine"));
Assert.assertTrue(sw[0][0].score == 1);
Assert.assertTrue(sw[0][1].score == 1);
}
{
Term term = new Term("numbers", "onethousand");
wbsp.setMaxChanges(1);
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==2);
Assert.assertTrue(sw[0][0].string.equals("one"));
Assert.assertTrue(sw[0][1].string.equals("thousand"));
Assert.assertTrue(sw[0][0].score == 1);
Assert.assertTrue(sw[0][1].score == 1);
wbsp.setMaxChanges(2);
wbsp.setMinSuggestionFrequency(1);
sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==2);
wbsp.setMaxChanges(2);
wbsp.setMinSuggestionFrequency(2);
sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==2);
wbsp.setMaxChanges(2);
wbsp.setMinSuggestionFrequency(1);
sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==2);
Assert.assertTrue(sw[0].length==2);
Assert.assertTrue(sw[0][0].string.equals("one"));
Assert.assertTrue(sw[0][1].string.equals("thousand"));
Assert.assertTrue(sw[0][0].score == 1);
Assert.assertTrue(sw[0][1].score == 1);
Assert.assertTrue(sw[0][1].freq>1);
Assert.assertTrue(sw[0][0].freq>sw[0][1].freq);
Assert.assertTrue(sw[1].length==3);
Assert.assertTrue(sw[1][0].string.equals("one"));
Assert.assertTrue(sw[1][1].string.equals("thou"));
Assert.assertTrue(sw[1][2].string.equals("sand"));
Assert.assertTrue(sw[1][0].score == 2);
Assert.assertTrue(sw[1][1].score == 2);
Assert.assertTrue(sw[1][2].score == 2);
Assert.assertTrue(sw[1][0].freq>1);
Assert.assertTrue(sw[1][1].freq==1);
Assert.assertTrue(sw[1][2].freq==1);
}
{
Term term = new Term("numbers", "onethousandonehundredeleven");
wbsp.setMaxChanges(3);
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==0);
wbsp.setMaxChanges(4);
sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==5);
wbsp.setMaxChanges(5);
sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==2);
Assert.assertTrue(sw[0].length==5);
Assert.assertTrue(sw[0][1].string.equals("thousand"));
Assert.assertTrue(sw[1].length==6);
Assert.assertTrue(sw[1][1].string.equals("thou"));
Assert.assertTrue(sw[1][2].string.equals("sand"));
}
} catch(Exception e) {
throw e;
} finally {
try { ir.close(); } catch(Exception e1) { }
}
}
}