diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 07a9a8314bf..8e09c2c180f 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -4147,6 +4147,10 @@ New features ValueSource, but takes care when composite (multi-segment) are passed to not double RAM usage in the FieldCache. (Chris Hostetter, Mark Miller, Mike McCandless) + +* LUCENE-3523: Added oal.search.spell.WordBreakSpellChecker, which + generates suggestions by combining two or more terms and/or + breaking terms into multiple words. See Javadocs for usage. (James Dyer) Optimizations diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java new file mode 100644 index 00000000000..8029d282373 --- /dev/null +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java @@ -0,0 +1,17 @@ +package org.apache.lucene.search.spell; + +public class CombineSuggestion { + /** + *

The indexes from the passed-in array of terms used to make this word combination

+ */ + public final int[] originalTermIndexes; + /** + *

The word combination suggestion

+ */ + public final SuggestWord suggestion; + + public CombineSuggestion (SuggestWord suggestion, int[] originalTermIndexes) { + this.suggestion = suggestion; + this.originalTermIndexes = originalTermIndexes; + } +} diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java new file mode 100644 index 00000000000..e9b4298caea --- /dev/null +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java @@ -0,0 +1,480 @@ +package org.apache.lucene.search.spell; + +import java.io.IOException; +import java.util.Comparator; +import java.util.PriorityQueue; +import java.util.Queue; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.spell.SuggestMode; +import org.apache.lucene.util.BytesRef; + +/** + *

+ * A spell checker whose sole function is to offer suggestions by combining + * multiple terms into one word and/or breaking terms into multiple words. + *

+ */ +public class WordBreakSpellChecker { + private int minSuggestionFrequency = 1; + private int minBreakWordLength = 1; + private int maxCombineWordLength = 20; + private int maxChanges = 1; + private int maxEvaluations = 1000; + + public static final Term SEPARATOR_TERM = new Term("", ""); + + public enum BreakSuggestionSortMethod { + /** + *

+ * Sort by the number of word breaks, then by the sum of all the component + * terms' frequencies + *

+ */ + NUM_CHANGES_THEN_SUMMED_FREQUENCY, + /** + *

+ * Sort by the number of word breaks, then by the maximum of all the component + * terms' frequencies + *

+ */ + NUM_CHANGES_THEN_MAX_FREQUENCY + } + + /** + *

+ * Generate suggestions by breaking the passed-in term into multiple words. + * The scores returned are equal to the number of word breaks needed so a + * lower score is generally preferred over a higher score. + *

+ * + * @param term + * @param maxSuggestions + * @param ir + * @param suggestMode + * - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX} + * @param sortMethod + * - default = + * {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY} + * @return one or more arrays of words formed by breaking up the original term + * @throws IOException + */ + public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions, + IndexReader ir, SuggestMode suggestMode, + BreakSuggestionSortMethod sortMethod) throws IOException { + if (maxSuggestions < 1) { + return new SuggestWord[0][0]; + } + if (suggestMode == null) { + suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX; + } + if (sortMethod == null) { + sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY; + } + + int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions; + Comparator queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator() + : new LengthThenSumFreqComparator(); + Queue suggestions = new PriorityQueue( + queueInitialCapacity, queueComparator); + + int origFreq = ir.docFreq(term); + if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) { + return new SuggestWord[0][]; + } + + int useMinSuggestionFrequency = minSuggestionFrequency; + if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) { + useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq); + } + + generateBreakUpSuggestions(term, ir, 1, maxSuggestions, + useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0, + sortMethod); + + SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][]; + for (int i = suggestions.size() - 1; i >= 0; i--) { + suggestionArray[i] = suggestions.remove().suggestWords; + } + + return suggestionArray; + } + + /** + *

+ * Generate suggestions by combining two or more of the passed-in terms into + * single words. The returned {@link CombineSuggestion} contains both a + * {@link SuggestWord} and also an array detailing which passed-in terms were + * involved in creating this combination. The scores returned are equal to the + * number of word combinations needed, which is also one less than the length of the + * array {@link CombineSuggestion#originalTermIndexes}. Generally, a + * suggestion with a lower score is preferred over a higher score. + *

+ *

+ * To prevent two adjacent terms from being combined (for instance, if one is + * mandatory and the other is prohibited), separate the two terms with + * {@link WordBreakSpellChecker#SEPARATOR_TERM} + *

+ *

+ * When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each + * suggestion will include at least one term not in the index. + *

+ *

+ * When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each + * suggestion will have the same or better frequency than the most popular + * included term. + *

+ * + * @param terms + * @param maxSuggestions + * @param ir + * @param suggestMode + * @return an array of words generated by combining original terms + * @throws IOException + */ + public CombineSuggestion[] suggestWordCombinations(Term[] terms, + int maxSuggestions, IndexReader ir, SuggestMode suggestMode) + throws IOException { + if (maxSuggestions < 1) { + return new CombineSuggestion[0]; + } + + int[] origFreqs = null; + if (suggestMode != SuggestMode.SUGGEST_ALWAYS) { + origFreqs = new int[terms.length]; + for (int i = 0; i < terms.length; i++) { + origFreqs[i] = ir.docFreq(terms[i]); + } + } + + int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions; + Comparator queueComparator = new CombinationsThenFreqComparator(); + Queue suggestions = new PriorityQueue( + queueInitialCapacity, queueComparator); + + int thisTimeEvaluations = 0; + BytesRef reuse = new BytesRef(); + for (int i = 0; i < terms.length - 1; i++) { + if (terms[i].equals(SEPARATOR_TERM)) { + continue; + } + + int byteLength = terms[i].bytes().length; + if (byteLength > maxCombineWordLength) { + continue; + } + + reuse.grow(byteLength); + reuse.length = byteLength; + System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset, + reuse.bytes, 0, byteLength); + + int maxFreq = 0; + int minFreq = Integer.MAX_VALUE; + if (origFreqs != null) { + maxFreq = origFreqs[i]; + minFreq = origFreqs[i]; + } + + for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) { + if (terms[j].equals(SEPARATOR_TERM)) { + break; + } + byteLength += terms[j].bytes().length; + if (byteLength > maxCombineWordLength) { + break; + } + + if (origFreqs != null) { + maxFreq = Math.max(maxFreq, origFreqs[j]); + minFreq = Math.min(minFreq, origFreqs[j]); + } + + reuse.grow(byteLength); + System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset, + reuse.bytes, reuse.length, terms[j].bytes().length); + reuse.length = byteLength; + + Term combinedTerm = new Term(terms[0].field(), reuse); + int 
combinedTermFreq = ir.docFreq(combinedTerm); + + if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR + || combinedTermFreq >= maxFreq) { + if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX + || minFreq == 0) { + if (combinedTermFreq >= minSuggestionFrequency) { + int[] origIndexes = new int[j - i + 1]; + origIndexes[0] = i; + for (int k = 1; k < origIndexes.length; k++) { + origIndexes[k] = i + k; + } + SuggestWord word = new SuggestWord(); + word.freq = combinedTermFreq; + word.score = origIndexes.length - 1; + word.string = combinedTerm.text(); + CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper( + new CombineSuggestion(word, origIndexes), + (origIndexes.length - 1)); + suggestions.offer(suggestion); + if (suggestions.size() > maxSuggestions) { + suggestions.poll(); + } + } + } + } + thisTimeEvaluations++; + if (thisTimeEvaluations == maxEvaluations) { + break; + } + } + } + CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions + .size()]; + for (int i = suggestions.size() - 1; i >= 0; i--) { + combineSuggestions[i] = suggestions.remove().combineSuggestion; + } + return combineSuggestions; + } + + private int generateBreakUpSuggestions(Term term, IndexReader ir, + int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency, + SuggestWord[] prefix, Queue suggestions, + int totalEvaluations, BreakSuggestionSortMethod sortMethod) + throws IOException { + int termLength = term.bytes().length; + int useMinBreakWordLength = minBreakWordLength; + if (useMinBreakWordLength < 1) { + useMinBreakWordLength = 1; + } + if (termLength <= (useMinBreakWordLength * 2)) { + return 0; + } + + int thisTimeEvaluations = 0; + BytesRef termBytes = term.bytes().clone(); + for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) { + SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term + .field()); + + if (leftWord.freq >= useMinSuggestionFrequency) { + SuggestWord rightWord = 
generateSuggestWord(ir, termBytes, i, + termLength - i, term.field()); + if (rightWord.freq >= useMinSuggestionFrequency) { + SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper( + newSuggestion(prefix, leftWord, rightWord)); + suggestions.offer(suggestion); + if (suggestions.size() > maxSuggestions) { + suggestions.poll(); + } + } + + int newNumberBreaks = numberBreaks + 1; + if (newNumberBreaks <= maxChanges) { + int evaluations = generateBreakUpSuggestions(new Term(term.field(), + rightWord.string), ir, newNumberBreaks, maxSuggestions, + useMinSuggestionFrequency, newPrefix(prefix, leftWord), + suggestions, totalEvaluations, sortMethod); + totalEvaluations += evaluations; + } + } + thisTimeEvaluations++; + totalEvaluations++; + if (totalEvaluations >= maxEvaluations) { + break; + } + } + return thisTimeEvaluations; + } + + private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) { + SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1]; + System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length); + newPrefix[newPrefix.length - 1] = append; + return newPrefix; + } + + private SuggestWord[] newSuggestion(SuggestWord[] prefix, + SuggestWord append1, SuggestWord append2) { + SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2]; + int score = prefix.length + 1; + for (int i = 0; i < prefix.length; i++) { + SuggestWord word = new SuggestWord(); + word.string = prefix[i].string; + word.freq = prefix[i].freq; + word.score = score; + newSuggestion[i] = word; + } + append1.score = score; + append2.score = score; + newSuggestion[newSuggestion.length - 2] = append1; + newSuggestion[newSuggestion.length - 1] = append2; + return newSuggestion; + } + + private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes, + int offset, int length, String fieldname) throws IOException { + bytes.offset = offset; + bytes.length = length; + Term term = new Term(fieldname, bytes); + int freq = ir.docFreq(term); + 
SuggestWord word = new SuggestWord(); + word.freq = freq; + word.score = 1; + word.string = term.text(); + return word; + } + + public int getMinSuggestionFrequency() { + return minSuggestionFrequency; + } + + public int getMaxCombineWordLength() { + return maxCombineWordLength; + } + + public int getMinBreakWordLength() { + return minBreakWordLength; + } + + public int getMaxChanges() { + return maxChanges; + } + + public int getMaxEvaluations() { + return maxEvaluations; + } + + /** + *

+ * The minimum frequency a term must have to be included as part of a + * suggestion. Default=1. Not applicable when used with + * {@link SuggestMode#SUGGEST_MORE_POPULAR} + *

+ * + * @param minSuggestionFrequency + */ + public void setMinSuggestionFrequency(int minSuggestionFrequency) { + this.minSuggestionFrequency = minSuggestionFrequency; + } + + /** + *

+ * The maximum length of a suggestion made by combining two or more original + * terms. Default=20 + *

+ * + * @param maxCombineWordLength + */ + public void setMaxCombineWordLength(int maxCombineWordLength) { + this.maxCombineWordLength = maxCombineWordLength; + } + + /** + *

+ * The minimum length to break words down to. Default=1 + *

+ * + * @param minBreakWordLength + */ + public void setMinBreakWordLength(int minBreakWordLength) { + this.minBreakWordLength = minBreakWordLength; + } + + /** + *

+ * The maximum number of changes (word breaks or combinations) to make on the + * original term(s). Default=1 + *

+ * + * @param maxChanges + */ + public void setMaxChanges(int maxChanges) { + this.maxChanges = maxChanges; + } + + /** + *

+ * The maximum number of word combinations to evaluate. Default=1000. A higher + * value might improve result quality. A lower value might improve + * performance. + *

+ * + * @param maxEvaluations + */ + public void setMaxEvaluations(int maxEvaluations) { + this.maxEvaluations = maxEvaluations; + } + + private class LengthThenMaxFreqComparator implements + Comparator { + @Override + public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) { + if (o1.suggestWords.length != o2.suggestWords.length) { + return o2.suggestWords.length - o1.suggestWords.length; + } + if (o1.freqMax != o2.freqMax) { + return o1.freqMax - o2.freqMax; + } + return 0; + } + } + + private class LengthThenSumFreqComparator implements + Comparator { + @Override + public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) { + if (o1.suggestWords.length != o2.suggestWords.length) { + return o2.suggestWords.length - o1.suggestWords.length; + } + if (o1.freqSum != o2.freqSum) { + return o1.freqSum - o2.freqSum; + } + return 0; + } + } + + private class CombinationsThenFreqComparator implements + Comparator { + @Override + public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) { + if (o1.numCombinations != o2.numCombinations) { + return o2.numCombinations - o1.numCombinations; + } + if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) { + return o1.combineSuggestion.suggestion.freq + - o2.combineSuggestion.suggestion.freq; + } + return 0; + } + } + + private class SuggestWordArrayWrapper { + final SuggestWord[] suggestWords; + final int freqMax; + final int freqSum; + + SuggestWordArrayWrapper(SuggestWord[] suggestWords) { + this.suggestWords = suggestWords; + int aFreqSum = 0; + int aFreqMax = 0; + for (SuggestWord sw : suggestWords) { + aFreqSum += sw.freq; + aFreqMax = Math.max(aFreqMax, sw.freq); + } + this.freqSum = aFreqSum; + this.freqMax = aFreqMax; + } + } + + private class CombineSuggestionWrapper { + final CombineSuggestion combineSuggestion; + final int numCombinations; + + CombineSuggestionWrapper(CombineSuggestion combineSuggestion, + int numCombinations) { + 
this.combineSuggestion = combineSuggestion; + this.numCombinations = numCombinations; + } + } +} diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java new file mode 100644 index 00000000000..50c961798fe --- /dev/null +++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java @@ -0,0 +1,234 @@ +package org.apache.lucene.search.spell; + +import junit.framework.Assert; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.English; +import org.apache.lucene.util.LuceneTestCase; + +public class TestWordBreakSpellChecker extends LuceneTestCase { + private Directory dir = null; + + @Override + public void setUp() throws Exception { + super.setUp(); + dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true)); + + for (int i = 900; i < 1112; i++) { + Document doc = new Document(); + String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", ""); + doc.add(newField("numbers", num, TextField.TYPE_UNSTORED)); + writer.addDocument(doc); + } + + { + Document doc = new Document(); + doc.add(newField("numbers", "thou hast sand betwixt thy toes", TextField.TYPE_UNSTORED)); + writer.addDocument(doc); + } + { + Document doc = new Document(); + doc.add(newField("numbers", "hundredeight eightyeight yeight", TextField.TYPE_UNSTORED)); + 
writer.addDocument(doc); + } + { + Document doc = new Document(); + doc.add(newField("numbers", "tres y cinco", TextField.TYPE_UNSTORED)); + writer.addDocument(doc); + } + + writer.commit(); + writer.close(); + } + + @Override + public void tearDown() throws Exception { + if(dir!=null) { + dir.close(); + dir = null; + } + super.tearDown(); + } + public void testCombiningWords() throws Exception { + IndexReader ir = null; + try { + ir = DirectoryReader.open(dir); + WordBreakSpellChecker wbsp = new WordBreakSpellChecker(); + + { + Term[] terms = { + new Term("numbers", "one"), + new Term("numbers", "hun"), + new Term("numbers", "dred"), + new Term("numbers", "eight"), + new Term("numbers", "y"), + new Term("numbers", "eight"), + }; + wbsp.setMaxChanges(3); + wbsp.setMaxCombineWordLength(20); + wbsp.setMinSuggestionFrequency(1); + CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS); + Assert.assertTrue(cs.length==5); + + Assert.assertTrue(cs[0].originalTermIndexes.length==2); + Assert.assertTrue(cs[0].originalTermIndexes[0]==1); + Assert.assertTrue(cs[0].originalTermIndexes[1]==2); + Assert.assertTrue(cs[0].suggestion.string.equals("hundred")); + Assert.assertTrue(cs[0].suggestion.score==1); + + Assert.assertTrue(cs[1].originalTermIndexes.length==2); + Assert.assertTrue(cs[1].originalTermIndexes[0]==3); + Assert.assertTrue(cs[1].originalTermIndexes[1]==4); + Assert.assertTrue(cs[1].suggestion.string.equals("eighty")); + Assert.assertTrue(cs[1].suggestion.score==1); + + Assert.assertTrue(cs[2].originalTermIndexes.length==2); + Assert.assertTrue(cs[2].originalTermIndexes[0]==4); + Assert.assertTrue(cs[2].originalTermIndexes[1]==5); + Assert.assertTrue(cs[2].suggestion.string.equals("yeight")); + Assert.assertTrue(cs[2].suggestion.score==1); + + for(int i=3 ; i<5 ; i++) { + Assert.assertTrue(cs[i].originalTermIndexes.length==3); + Assert.assertTrue(cs[i].suggestion.score==2); + Assert.assertTrue( + 
(cs[i].originalTermIndexes[0]==1 && + cs[i].originalTermIndexes[1]==2 && + cs[i].originalTermIndexes[2]==3 && + cs[i].suggestion.string.equals("hundredeight")) || + (cs[i].originalTermIndexes[0]==3 && + cs[i].originalTermIndexes[1]==4 && + cs[i].originalTermIndexes[2]==5 && + cs[i].suggestion.string.equals("eightyeight")) + ); + } + + cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); + Assert.assertTrue(cs.length==2); + Assert.assertTrue(cs[0].originalTermIndexes.length==2); + Assert.assertTrue(cs[0].suggestion.score==1); + Assert.assertTrue(cs[0].originalTermIndexes[0]==1); + Assert.assertTrue(cs[0].originalTermIndexes[1]==2); + Assert.assertTrue(cs[0].suggestion.string.equals("hundred")); + Assert.assertTrue(cs[0].suggestion.score==1); + + Assert.assertTrue(cs[1].originalTermIndexes.length==3); + Assert.assertTrue(cs[1].suggestion.score==2); + Assert.assertTrue(cs[1].originalTermIndexes[0] == 1); + Assert.assertTrue(cs[1].originalTermIndexes[1] == 2); + Assert.assertTrue(cs[1].originalTermIndexes[2] == 3); + Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight")); + } + } catch(Exception e) { + throw e; + } finally { + try { ir.close(); } catch(Exception e1) { } + } + } + + public void testBreakingWords() throws Exception { + IndexReader ir = null; + try { + ir = DirectoryReader.open(dir); + WordBreakSpellChecker wbsp = new WordBreakSpellChecker(); + + { + Term term = new Term("numbers", "ninetynine"); + wbsp.setMaxChanges(1); + wbsp.setMinBreakWordLength(1); + wbsp.setMinSuggestionFrequency(1); + SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==1); + Assert.assertTrue(sw[0].length==2); + Assert.assertTrue(sw[0][0].string.equals("ninety")); + Assert.assertTrue(sw[0][1].string.equals("nine")); + Assert.assertTrue(sw[0][0].score == 1); + Assert.assertTrue(sw[0][1].score == 1); + } + 
{ + Term term = new Term("numbers", "onethousand"); + wbsp.setMaxChanges(1); + wbsp.setMinBreakWordLength(1); + wbsp.setMinSuggestionFrequency(1); + SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==1); + Assert.assertTrue(sw[0].length==2); + Assert.assertTrue(sw[0][0].string.equals("one")); + Assert.assertTrue(sw[0][1].string.equals("thousand")); + Assert.assertTrue(sw[0][0].score == 1); + Assert.assertTrue(sw[0][1].score == 1); + + wbsp.setMaxChanges(2); + wbsp.setMinSuggestionFrequency(1); + sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==1); + Assert.assertTrue(sw[0].length==2); + + wbsp.setMaxChanges(2); + wbsp.setMinSuggestionFrequency(2); + sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==1); + Assert.assertTrue(sw[0].length==2); + + wbsp.setMaxChanges(2); + wbsp.setMinSuggestionFrequency(1); + sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==2); + Assert.assertTrue(sw[0].length==2); + Assert.assertTrue(sw[0][0].string.equals("one")); + Assert.assertTrue(sw[0][1].string.equals("thousand")); + Assert.assertTrue(sw[0][0].score == 1); + Assert.assertTrue(sw[0][1].score == 1); + Assert.assertTrue(sw[0][1].freq>1); + Assert.assertTrue(sw[0][0].freq>sw[0][1].freq); + Assert.assertTrue(sw[1].length==3); + Assert.assertTrue(sw[1][0].string.equals("one")); + Assert.assertTrue(sw[1][1].string.equals("thou")); + Assert.assertTrue(sw[1][2].string.equals("sand")); + Assert.assertTrue(sw[1][0].score == 2); + Assert.assertTrue(sw[1][1].score == 2); + Assert.assertTrue(sw[1][2].score 
== 2); + Assert.assertTrue(sw[1][0].freq>1); + Assert.assertTrue(sw[1][1].freq==1); + Assert.assertTrue(sw[1][2].freq==1); + } + { + Term term = new Term("numbers", "onethousandonehundredeleven"); + wbsp.setMaxChanges(3); + wbsp.setMinBreakWordLength(1); + wbsp.setMinSuggestionFrequency(1); + SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==0); + + wbsp.setMaxChanges(4); + sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==1); + Assert.assertTrue(sw[0].length==5); + + wbsp.setMaxChanges(5); + sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==2); + Assert.assertTrue(sw[0].length==5); + Assert.assertTrue(sw[0][1].string.equals("thousand")); + Assert.assertTrue(sw[1].length==6); + Assert.assertTrue(sw[1][1].string.equals("thou")); + Assert.assertTrue(sw[1][2].string.equals("sand")); + } + + } catch(Exception e) { + throw e; + } finally { + try { ir.close(); } catch(Exception e1) { } + } + } + }