LUCENE-3523: Add WordBreakSpellChecker to suggest based on word combinations and/or breaks

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344318 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
James Dyer 2012-05-30 15:58:38 +00:00
parent a0e590a119
commit 642b882d5a
4 changed files with 735 additions and 0 deletions

View File

@ -4147,6 +4147,10 @@ New features
ValueSource, but takes care when composite (multi-segment) are
passed to not double RAM usage in the FieldCache. (Chris
Hostetter, Mark Miller, Mike McCandless)
* LUCENE-3523: Added oal.search.spell.WordBreakSpellChecker, which
generates suggestions by combining two or more terms and/or
breaking terms into multiple words. See Javadocs for usage. (James Dyer)
Optimizations

View File

@ -0,0 +1,17 @@
package org.apache.lucene.search.spell;
public class CombineSuggestion {
/**
* <p>The indexes from the passed-in array of terms used to make this word combination</p>
*/
public final int[] originalTermIndexes;
/**
* <p>The word combination suggestion</p>
*/
public final SuggestWord suggestion;
public CombineSuggestion (SuggestWord suggestion, int[] originalTermIndexes) {
this.suggestion = suggestion;
this.originalTermIndexes = originalTermIndexes;
}
}

View File

@ -0,0 +1,480 @@
package org.apache.lucene.search.spell;
import java.io.IOException;
import java.util.Comparator;
import java.util.PriorityQueue;
import java.util.Queue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.util.BytesRef;
/**
* <p>
* A spell checker whose sole function is to offer suggestions by combining
* multiple terms into one word and/or breaking terms into multiple words.
* </p>
*/
public class WordBreakSpellChecker {
private int minSuggestionFrequency = 1;
private int minBreakWordLength = 1;
private int maxCombineWordLength = 20;
private int maxChanges = 1;
private int maxEvaluations = 1000;
public static final Term SEPARATOR_TERM = new Term("", "");
public enum BreakSuggestionSortMethod {
/**
* <p>
* Sort by Number of word breaks, then by the Sum of all the component
* term's frequencies
* </p>
*/
NUM_CHANGES_THEN_SUMMED_FREQUENCY,
/**
* <p>
* Sort by Number of word breaks, then by the Maximum of all the component
* term's frequencies
* </p>
*/
NUM_CHANGES_THEN_MAX_FREQUENCY
}
/**
* <p>
* Generate suggestions by breaking the passed-in term into multiple words.
* The scores returned are equal to the number of word breaks needed so a
* lower score is generally preferred over a higher score.
* </p>
*
* @param term
* @param maxSuggestions
* @param ir
* @param suggestMode
* - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
* @param sortMethod
* - default =
* {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
* @return one or more arrays of words formed by breaking up the original term
* @throws IOException
*/
public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
IndexReader ir, SuggestMode suggestMode,
BreakSuggestionSortMethod sortMethod) throws IOException {
if (maxSuggestions < 1) {
return new SuggestWord[0][0];
}
if (suggestMode == null) {
suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
}
if (sortMethod == null) {
sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
}
int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
Comparator<SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator()
: new LengthThenSumFreqComparator();
Queue<SuggestWordArrayWrapper> suggestions = new PriorityQueue<SuggestWordArrayWrapper>(
queueInitialCapacity, queueComparator);
int origFreq = ir.docFreq(term);
if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
return new SuggestWord[0][];
}
int useMinSuggestionFrequency = minSuggestionFrequency;
if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
}
generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0,
sortMethod);
SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
for (int i = suggestions.size() - 1; i >= 0; i--) {
suggestionArray[i] = suggestions.remove().suggestWords;
}
return suggestionArray;
}
/**
* <p>
* Generate suggestions by combining one or more of the passed-in terms into
* single words. The returned {@link CombineSuggestion} contains both a
* {@link SuggestWord} and also an array detailing which passed-in terms were
* involved in creating this combination. The scores returned are equal to the
* number of word combinations needed, also one less than the length of the
* array {@link CombineSuggestion#originalTermIndexes}. Generally, a
* suggestion with a lower score is preferred over a higher score.
* </p>
* <p>
* To prevent two adjacent terms from being combined (for instance, if one is
* mandatory and the other is prohibited), separate the two terms with
* {@link WordBreakSpellChecker#SEPARATOR_TERM}
* </p>
* <p>
* When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
* suggestion will include at least one term not in the index.
* </p>
* <p>
* When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
* suggestion will have the same, or better frequency than the most-popular
* included term.
* </p>
*
* @param terms
* @param maxSuggestions
* @param ir
* @param suggestMode
* @return an array of words generated by combining original terms
* @throws IOException
*/
public CombineSuggestion[] suggestWordCombinations(Term[] terms,
int maxSuggestions, IndexReader ir, SuggestMode suggestMode)
throws IOException {
if (maxSuggestions < 1) {
return new CombineSuggestion[0];
}
int[] origFreqs = null;
if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
origFreqs = new int[terms.length];
for (int i = 0; i < terms.length; i++) {
origFreqs[i] = ir.docFreq(terms[i]);
}
}
int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<CombineSuggestionWrapper>(
queueInitialCapacity, queueComparator);
int thisTimeEvaluations = 0;
BytesRef reuse = new BytesRef();
for (int i = 0; i < terms.length - 1; i++) {
if (terms[i].equals(SEPARATOR_TERM)) {
continue;
}
int byteLength = terms[i].bytes().length;
if (byteLength > maxCombineWordLength) {
continue;
}
reuse.grow(byteLength);
reuse.length = byteLength;
System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
reuse.bytes, 0, byteLength);
int maxFreq = 0;
int minFreq = Integer.MAX_VALUE;
if (origFreqs != null) {
maxFreq = origFreqs[i];
minFreq = origFreqs[i];
}
for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
if (terms[j].equals(SEPARATOR_TERM)) {
break;
}
byteLength += terms[j].bytes().length;
if (byteLength > maxCombineWordLength) {
break;
}
if (origFreqs != null) {
maxFreq = Math.max(maxFreq, origFreqs[j]);
minFreq = Math.min(minFreq, origFreqs[j]);
}
reuse.grow(byteLength);
System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
reuse.bytes, reuse.length, terms[j].bytes().length);
reuse.length = byteLength;
Term combinedTerm = new Term(terms[0].field(), reuse);
int combinedTermFreq = ir.docFreq(combinedTerm);
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
|| combinedTermFreq >= maxFreq) {
if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX
|| minFreq == 0) {
if (combinedTermFreq >= minSuggestionFrequency) {
int[] origIndexes = new int[j - i + 1];
origIndexes[0] = i;
for (int k = 1; k < origIndexes.length; k++) {
origIndexes[k] = i + k;
}
SuggestWord word = new SuggestWord();
word.freq = combinedTermFreq;
word.score = origIndexes.length - 1;
word.string = combinedTerm.text();
CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(
new CombineSuggestion(word, origIndexes),
(origIndexes.length - 1));
suggestions.offer(suggestion);
if (suggestions.size() > maxSuggestions) {
suggestions.poll();
}
}
}
}
thisTimeEvaluations++;
if (thisTimeEvaluations == maxEvaluations) {
break;
}
}
}
CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions
.size()];
for (int i = suggestions.size() - 1; i >= 0; i--) {
combineSuggestions[i] = suggestions.remove().combineSuggestion;
}
return combineSuggestions;
}
private int generateBreakUpSuggestions(Term term, IndexReader ir,
int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency,
SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
int totalEvaluations, BreakSuggestionSortMethod sortMethod)
throws IOException {
int termLength = term.bytes().length;
int useMinBreakWordLength = minBreakWordLength;
if (useMinBreakWordLength < 1) {
useMinBreakWordLength = 1;
}
if (termLength <= (useMinBreakWordLength * 2)) {
return 0;
}
int thisTimeEvaluations = 0;
BytesRef termBytes = term.bytes().clone();
for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
.field());
if (leftWord.freq >= useMinSuggestionFrequency) {
SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
termLength - i, term.field());
if (rightWord.freq >= useMinSuggestionFrequency) {
SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
newSuggestion(prefix, leftWord, rightWord));
suggestions.offer(suggestion);
if (suggestions.size() > maxSuggestions) {
suggestions.poll();
}
}
int newNumberBreaks = numberBreaks + 1;
if (newNumberBreaks <= maxChanges) {
int evaluations = generateBreakUpSuggestions(new Term(term.field(),
rightWord.string), ir, newNumberBreaks, maxSuggestions,
useMinSuggestionFrequency, newPrefix(prefix, leftWord),
suggestions, totalEvaluations, sortMethod);
totalEvaluations += evaluations;
}
}
thisTimeEvaluations++;
totalEvaluations++;
if (totalEvaluations >= maxEvaluations) {
break;
}
}
return thisTimeEvaluations;
}
private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1];
System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length);
newPrefix[newPrefix.length - 1] = append;
return newPrefix;
}
private SuggestWord[] newSuggestion(SuggestWord[] prefix,
SuggestWord append1, SuggestWord append2) {
SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2];
int score = prefix.length + 1;
for (int i = 0; i < prefix.length; i++) {
SuggestWord word = new SuggestWord();
word.string = prefix[i].string;
word.freq = prefix[i].freq;
word.score = score;
newSuggestion[i] = word;
}
append1.score = score;
append2.score = score;
newSuggestion[newSuggestion.length - 2] = append1;
newSuggestion[newSuggestion.length - 1] = append2;
return newSuggestion;
}
private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
int offset, int length, String fieldname) throws IOException {
bytes.offset = offset;
bytes.length = length;
Term term = new Term(fieldname, bytes);
int freq = ir.docFreq(term);
SuggestWord word = new SuggestWord();
word.freq = freq;
word.score = 1;
word.string = term.text();
return word;
}
public int getMinSuggestionFrequency() {
return minSuggestionFrequency;
}
public int getMaxCombineWordLength() {
return maxCombineWordLength;
}
public int getMinBreakWordLength() {
return minBreakWordLength;
}
public int getMaxChanges() {
return maxChanges;
}
public int getMaxEvaluations() {
return maxEvaluations;
}
/**
* <p>
* The minimum frequency a term must have to be included as part of a
* suggestion. Default=1 Not applicable when used with
* {@link SuggestMode#SUGGEST_MORE_POPULAR}
* </p>
*
* @param minSuggestionFrequency
*/
public void setMinSuggestionFrequency(int minSuggestionFrequency) {
this.minSuggestionFrequency = minSuggestionFrequency;
}
/**
* <p>
* The maximum length of a suggestion made by combining 1 or more original
* terms. Default=20
* </p>
*
* @param maxCombineWordLength
*/
public void setMaxCombineWordLength(int maxCombineWordLength) {
this.maxCombineWordLength = maxCombineWordLength;
}
/**
* <p>
* The minimum length to break words down to. Default=1
* </p>
*
* @param minBreakWordLength
*/
public void setMinBreakWordLength(int minBreakWordLength) {
this.minBreakWordLength = minBreakWordLength;
}
/**
* <p>
* The maximum numbers of changes (word breaks or combinations) to make on the
* original term(s). Default=1
* </p>
*
* @param maxChanges
*/
public void setMaxChanges(int maxChanges) {
this.maxChanges = maxChanges;
}
/**
* <p>
* The maximum number of word combinations to evaluate. Default=1000. A higher
* value might improve result quality. A lower value might improve
* performance.
* </p>
*
* @param maxEvaluations
*/
public void setMaxEvaluations(int maxEvaluations) {
this.maxEvaluations = maxEvaluations;
}
private class LengthThenMaxFreqComparator implements
Comparator<SuggestWordArrayWrapper> {
@Override
public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
if (o1.suggestWords.length != o2.suggestWords.length) {
return o2.suggestWords.length - o1.suggestWords.length;
}
if (o1.freqMax != o2.freqMax) {
return o1.freqMax - o2.freqMax;
}
return 0;
}
}
private class LengthThenSumFreqComparator implements
Comparator<SuggestWordArrayWrapper> {
@Override
public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
if (o1.suggestWords.length != o2.suggestWords.length) {
return o2.suggestWords.length - o1.suggestWords.length;
}
if (o1.freqSum != o2.freqSum) {
return o1.freqSum - o2.freqSum;
}
return 0;
}
}
private class CombinationsThenFreqComparator implements
Comparator<CombineSuggestionWrapper> {
@Override
public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) {
if (o1.numCombinations != o2.numCombinations) {
return o2.numCombinations - o1.numCombinations;
}
if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) {
return o1.combineSuggestion.suggestion.freq
- o2.combineSuggestion.suggestion.freq;
}
return 0;
}
}
private class SuggestWordArrayWrapper {
final SuggestWord[] suggestWords;
final int freqMax;
final int freqSum;
SuggestWordArrayWrapper(SuggestWord[] suggestWords) {
this.suggestWords = suggestWords;
int aFreqSum = 0;
int aFreqMax = 0;
for (SuggestWord sw : suggestWords) {
aFreqSum += sw.freq;
aFreqMax = Math.max(aFreqMax, sw.freq);
}
this.freqSum = aFreqSum;
this.freqMax = aFreqMax;
}
}
private class CombineSuggestionWrapper {
final CombineSuggestion combineSuggestion;
final int numCombinations;
CombineSuggestionWrapper(CombineSuggestion combineSuggestion,
int numCombinations) {
this.combineSuggestion = combineSuggestion;
this.numCombinations = numCombinations;
}
}
}

View File

@ -0,0 +1,234 @@
package org.apache.lucene.search.spell;
import junit.framework.Assert;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
public class TestWordBreakSpellChecker extends LuceneTestCase {
private Directory dir = null;
@Override
public void setUp() throws Exception {
super.setUp();
dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true));
for (int i = 900; i < 1112; i++) {
Document doc = new Document();
String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", "");
doc.add(newField("numbers", num, TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
{
Document doc = new Document();
doc.add(newField("numbers", "thou hast sand betwixt thy toes", TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
{
Document doc = new Document();
doc.add(newField("numbers", "hundredeight eightyeight yeight", TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
{
Document doc = new Document();
doc.add(newField("numbers", "tres y cinco", TextField.TYPE_UNSTORED));
writer.addDocument(doc);
}
writer.commit();
writer.close();
}
@Override
public void tearDown() throws Exception {
if(dir!=null) {
dir.close();
dir = null;
}
super.tearDown();
}
public void testCombiningWords() throws Exception {
IndexReader ir = null;
try {
ir = DirectoryReader.open(dir);
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
{
Term[] terms = {
new Term("numbers", "one"),
new Term("numbers", "hun"),
new Term("numbers", "dred"),
new Term("numbers", "eight"),
new Term("numbers", "y"),
new Term("numbers", "eight"),
};
wbsp.setMaxChanges(3);
wbsp.setMaxCombineWordLength(20);
wbsp.setMinSuggestionFrequency(1);
CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS);
Assert.assertTrue(cs.length==5);
Assert.assertTrue(cs[0].originalTermIndexes.length==2);
Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
Assert.assertTrue(cs[0].suggestion.score==1);
Assert.assertTrue(cs[1].originalTermIndexes.length==2);
Assert.assertTrue(cs[1].originalTermIndexes[0]==3);
Assert.assertTrue(cs[1].originalTermIndexes[1]==4);
Assert.assertTrue(cs[1].suggestion.string.equals("eighty"));
Assert.assertTrue(cs[1].suggestion.score==1);
Assert.assertTrue(cs[2].originalTermIndexes.length==2);
Assert.assertTrue(cs[2].originalTermIndexes[0]==4);
Assert.assertTrue(cs[2].originalTermIndexes[1]==5);
Assert.assertTrue(cs[2].suggestion.string.equals("yeight"));
Assert.assertTrue(cs[2].suggestion.score==1);
for(int i=3 ; i<5 ; i++) {
Assert.assertTrue(cs[i].originalTermIndexes.length==3);
Assert.assertTrue(cs[i].suggestion.score==2);
Assert.assertTrue(
(cs[i].originalTermIndexes[0]==1 &&
cs[i].originalTermIndexes[1]==2 &&
cs[i].originalTermIndexes[2]==3 &&
cs[i].suggestion.string.equals("hundredeight")) ||
(cs[i].originalTermIndexes[0]==3 &&
cs[i].originalTermIndexes[1]==4 &&
cs[i].originalTermIndexes[2]==5 &&
cs[i].suggestion.string.equals("eightyeight"))
);
}
cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
Assert.assertTrue(cs.length==2);
Assert.assertTrue(cs[0].originalTermIndexes.length==2);
Assert.assertTrue(cs[0].suggestion.score==1);
Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
Assert.assertTrue(cs[0].suggestion.score==1);
Assert.assertTrue(cs[1].originalTermIndexes.length==3);
Assert.assertTrue(cs[1].suggestion.score==2);
Assert.assertTrue(cs[1].originalTermIndexes[0] == 1);
Assert.assertTrue(cs[1].originalTermIndexes[1] == 2);
Assert.assertTrue(cs[1].originalTermIndexes[2] == 3);
Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight"));
}
} catch(Exception e) {
throw e;
} finally {
try { ir.close(); } catch(Exception e1) { }
}
}
public void testBreakingWords() throws Exception {
IndexReader ir = null;
try {
ir = DirectoryReader.open(dir);
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
{
Term term = new Term("numbers", "ninetynine");
wbsp.setMaxChanges(1);
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==2);
Assert.assertTrue(sw[0][0].string.equals("ninety"));
Assert.assertTrue(sw[0][1].string.equals("nine"));
Assert.assertTrue(sw[0][0].score == 1);
Assert.assertTrue(sw[0][1].score == 1);
}
{
Term term = new Term("numbers", "onethousand");
wbsp.setMaxChanges(1);
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==2);
Assert.assertTrue(sw[0][0].string.equals("one"));
Assert.assertTrue(sw[0][1].string.equals("thousand"));
Assert.assertTrue(sw[0][0].score == 1);
Assert.assertTrue(sw[0][1].score == 1);
wbsp.setMaxChanges(2);
wbsp.setMinSuggestionFrequency(1);
sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==2);
wbsp.setMaxChanges(2);
wbsp.setMinSuggestionFrequency(2);
sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==2);
wbsp.setMaxChanges(2);
wbsp.setMinSuggestionFrequency(1);
sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==2);
Assert.assertTrue(sw[0].length==2);
Assert.assertTrue(sw[0][0].string.equals("one"));
Assert.assertTrue(sw[0][1].string.equals("thousand"));
Assert.assertTrue(sw[0][0].score == 1);
Assert.assertTrue(sw[0][1].score == 1);
Assert.assertTrue(sw[0][1].freq>1);
Assert.assertTrue(sw[0][0].freq>sw[0][1].freq);
Assert.assertTrue(sw[1].length==3);
Assert.assertTrue(sw[1][0].string.equals("one"));
Assert.assertTrue(sw[1][1].string.equals("thou"));
Assert.assertTrue(sw[1][2].string.equals("sand"));
Assert.assertTrue(sw[1][0].score == 2);
Assert.assertTrue(sw[1][1].score == 2);
Assert.assertTrue(sw[1][2].score == 2);
Assert.assertTrue(sw[1][0].freq>1);
Assert.assertTrue(sw[1][1].freq==1);
Assert.assertTrue(sw[1][2].freq==1);
}
{
Term term = new Term("numbers", "onethousandonehundredeleven");
wbsp.setMaxChanges(3);
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==0);
wbsp.setMaxChanges(4);
sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==1);
Assert.assertTrue(sw[0].length==5);
wbsp.setMaxChanges(5);
sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==2);
Assert.assertTrue(sw[0].length==5);
Assert.assertTrue(sw[0][1].string.equals("thousand"));
Assert.assertTrue(sw[1].length==6);
Assert.assertTrue(sw[1][1].string.equals("thou"));
Assert.assertTrue(sw[1][2].string.equals("sand"));
}
} catch(Exception e) {
throw e;
} finally {
try { ir.close(); } catch(Exception e1) { }
}
}
}