diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 07a9a8314bf..8e09c2c180f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -4147,6 +4147,10 @@ New features
ValueSource, but takes care when composite (multi-segment) are
passed to not double RAM usage in the FieldCache. (Chris
Hostetter, Mark Miller, Mike McCandless)
+
+* LUCENE-3523: Added oal.search.spell.WordBreakSpellChecker, which
+ generates suggestions by combining two or more terms and/or
+ breaking terms into multiple words. See Javadocs for usage. (James Dyer)
Optimizations
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java
new file mode 100644
index 00000000000..8029d282373
--- /dev/null
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/CombineSuggestion.java
@@ -0,0 +1,17 @@
+package org.apache.lucene.search.spell;
+
+public class CombineSuggestion {
+ /**
+ *
The indexes from the passed-in array of terms used to make this word combination
+ */
+ public final int[] originalTermIndexes;
+ /**
+ * The word combination suggestion
+ */
+ public final SuggestWord suggestion;
+
+ public CombineSuggestion (SuggestWord suggestion, int[] originalTermIndexes) {
+ this.suggestion = suggestion;
+ this.originalTermIndexes = originalTermIndexes;
+ }
+}
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
new file mode 100644
index 00000000000..e9b4298caea
--- /dev/null
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
@@ -0,0 +1,480 @@
+package org.apache.lucene.search.spell;
+
+import java.io.IOException;
+import java.util.Comparator;
+import java.util.PriorityQueue;
+import java.util.Queue;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spell.SuggestMode;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ *
+ * A spell checker whose sole function is to offer suggestions by combining
+ * multiple terms into one word and/or breaking terms into multiple words.
+ *
+ */
+public class WordBreakSpellChecker {
+ private int minSuggestionFrequency = 1;
+ private int minBreakWordLength = 1;
+ private int maxCombineWordLength = 20;
+ private int maxChanges = 1;
+ private int maxEvaluations = 1000;
+
+ public static final Term SEPARATOR_TERM = new Term("", "");
+
+ public enum BreakSuggestionSortMethod {
+ /**
+ *
+ * Sort by Number of word breaks, then by the Sum of all the component
+ * term's frequencies
+ *
+ */
+ NUM_CHANGES_THEN_SUMMED_FREQUENCY,
+ /**
+ *
+ * Sort by Number of word breaks, then by the Maximum of all the component
+ * term's frequencies
+ *
+ */
+ NUM_CHANGES_THEN_MAX_FREQUENCY
+ }
+
+ /**
+ *
+ * Generate suggestions by breaking the passed-in term into multiple words.
+ * The scores returned are equal to the number of word breaks needed so a
+ * lower score is generally preferred over a higher score.
+ *
+ *
+ * @param term
+ * @param maxSuggestions
+ * @param ir
+ * @param suggestMode
+ * - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
+ * @param sortMethod
+ * - default =
+ * {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
+ * @return one or more arrays of words formed by breaking up the original term
+ * @throws IOException
+ */
+ public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
+ IndexReader ir, SuggestMode suggestMode,
+ BreakSuggestionSortMethod sortMethod) throws IOException {
+ if (maxSuggestions < 1) {
+ return new SuggestWord[0][0];
+ }
+ if (suggestMode == null) {
+ suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
+ }
+ if (sortMethod == null) {
+ sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
+ }
+
+ int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
+ Comparator queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator()
+ : new LengthThenSumFreqComparator();
+ Queue suggestions = new PriorityQueue(
+ queueInitialCapacity, queueComparator);
+
+ int origFreq = ir.docFreq(term);
+ if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
+ return new SuggestWord[0][];
+ }
+
+ int useMinSuggestionFrequency = minSuggestionFrequency;
+ if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
+ useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
+ }
+
+ generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
+ useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0,
+ sortMethod);
+
+ SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
+ for (int i = suggestions.size() - 1; i >= 0; i--) {
+ suggestionArray[i] = suggestions.remove().suggestWords;
+ }
+
+ return suggestionArray;
+ }
+
+ /**
+ *
+ * Generate suggestions by combining one or more of the passed-in terms into
+ * single words. The returned {@link CombineSuggestion} contains both a
+ * {@link SuggestWord} and also an array detailing which passed-in terms were
+ * involved in creating this combination. The scores returned are equal to the
+ * number of word combinations needed, also one less than the length of the
+ * array {@link CombineSuggestion#originalTermIndexes}. Generally, a
+ * suggestion with a lower score is preferred over a higher score.
+ *
+ *
+ * To prevent two adjacent terms from being combined (for instance, if one is
+ * mandatory and the other is prohibited), separate the two terms with
+ * {@link WordBreakSpellChecker#SEPARATOR_TERM}
+ *
+ *
+ * When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
+ * suggestion will include at least one term not in the index.
+ *
+ *
+ * When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
+ * suggestion will have the same, or better frequency than the most-popular
+ * included term.
+ *
+ *
+ * @param terms
+ * @param maxSuggestions
+ * @param ir
+ * @param suggestMode
+ * @return an array of words generated by combining original terms
+ * @throws IOException
+ */
+ public CombineSuggestion[] suggestWordCombinations(Term[] terms,
+ int maxSuggestions, IndexReader ir, SuggestMode suggestMode)
+ throws IOException {
+ if (maxSuggestions < 1) {
+ return new CombineSuggestion[0];
+ }
+
+ int[] origFreqs = null;
+ if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
+ origFreqs = new int[terms.length];
+ for (int i = 0; i < terms.length; i++) {
+ origFreqs[i] = ir.docFreq(terms[i]);
+ }
+ }
+
+ int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
+ Comparator queueComparator = new CombinationsThenFreqComparator();
+ Queue suggestions = new PriorityQueue(
+ queueInitialCapacity, queueComparator);
+
+ int thisTimeEvaluations = 0;
+ BytesRef reuse = new BytesRef();
+ for (int i = 0; i < terms.length - 1; i++) {
+ if (terms[i].equals(SEPARATOR_TERM)) {
+ continue;
+ }
+
+ int byteLength = terms[i].bytes().length;
+ if (byteLength > maxCombineWordLength) {
+ continue;
+ }
+
+ reuse.grow(byteLength);
+ reuse.length = byteLength;
+ System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
+ reuse.bytes, 0, byteLength);
+
+ int maxFreq = 0;
+ int minFreq = Integer.MAX_VALUE;
+ if (origFreqs != null) {
+ maxFreq = origFreqs[i];
+ minFreq = origFreqs[i];
+ }
+
+ for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
+ if (terms[j].equals(SEPARATOR_TERM)) {
+ break;
+ }
+ byteLength += terms[j].bytes().length;
+ if (byteLength > maxCombineWordLength) {
+ break;
+ }
+
+ if (origFreqs != null) {
+ maxFreq = Math.max(maxFreq, origFreqs[j]);
+ minFreq = Math.min(minFreq, origFreqs[j]);
+ }
+
+ reuse.grow(byteLength);
+ System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
+ reuse.bytes, reuse.length, terms[j].bytes().length);
+ reuse.length = byteLength;
+
+ Term combinedTerm = new Term(terms[0].field(), reuse);
+ int combinedTermFreq = ir.docFreq(combinedTerm);
+
+ if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
+ || combinedTermFreq >= maxFreq) {
+ if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX
+ || minFreq == 0) {
+ if (combinedTermFreq >= minSuggestionFrequency) {
+ int[] origIndexes = new int[j - i + 1];
+ origIndexes[0] = i;
+ for (int k = 1; k < origIndexes.length; k++) {
+ origIndexes[k] = i + k;
+ }
+ SuggestWord word = new SuggestWord();
+ word.freq = combinedTermFreq;
+ word.score = origIndexes.length - 1;
+ word.string = combinedTerm.text();
+ CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(
+ new CombineSuggestion(word, origIndexes),
+ (origIndexes.length - 1));
+ suggestions.offer(suggestion);
+ if (suggestions.size() > maxSuggestions) {
+ suggestions.poll();
+ }
+ }
+ }
+ }
+ thisTimeEvaluations++;
+ if (thisTimeEvaluations == maxEvaluations) {
+ break;
+ }
+ }
+ }
+ CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions
+ .size()];
+ for (int i = suggestions.size() - 1; i >= 0; i--) {
+ combineSuggestions[i] = suggestions.remove().combineSuggestion;
+ }
+ return combineSuggestions;
+ }
+
+ private int generateBreakUpSuggestions(Term term, IndexReader ir,
+ int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency,
+ SuggestWord[] prefix, Queue suggestions,
+ int totalEvaluations, BreakSuggestionSortMethod sortMethod)
+ throws IOException {
+ int termLength = term.bytes().length;
+ int useMinBreakWordLength = minBreakWordLength;
+ if (useMinBreakWordLength < 1) {
+ useMinBreakWordLength = 1;
+ }
+ if (termLength <= (useMinBreakWordLength * 2)) {
+ return 0;
+ }
+
+ int thisTimeEvaluations = 0;
+ BytesRef termBytes = term.bytes().clone();
+ for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
+ SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
+ .field());
+
+ if (leftWord.freq >= useMinSuggestionFrequency) {
+ SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
+ termLength - i, term.field());
+ if (rightWord.freq >= useMinSuggestionFrequency) {
+ SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
+ newSuggestion(prefix, leftWord, rightWord));
+ suggestions.offer(suggestion);
+ if (suggestions.size() > maxSuggestions) {
+ suggestions.poll();
+ }
+ }
+
+ int newNumberBreaks = numberBreaks + 1;
+ if (newNumberBreaks <= maxChanges) {
+ int evaluations = generateBreakUpSuggestions(new Term(term.field(),
+ rightWord.string), ir, newNumberBreaks, maxSuggestions,
+ useMinSuggestionFrequency, newPrefix(prefix, leftWord),
+ suggestions, totalEvaluations, sortMethod);
+ totalEvaluations += evaluations;
+ }
+ }
+ thisTimeEvaluations++;
+ totalEvaluations++;
+ if (totalEvaluations >= maxEvaluations) {
+ break;
+ }
+ }
+ return thisTimeEvaluations;
+ }
+
+ private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
+ SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1];
+ System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length);
+ newPrefix[newPrefix.length - 1] = append;
+ return newPrefix;
+ }
+
+ private SuggestWord[] newSuggestion(SuggestWord[] prefix,
+ SuggestWord append1, SuggestWord append2) {
+ SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2];
+ int score = prefix.length + 1;
+ for (int i = 0; i < prefix.length; i++) {
+ SuggestWord word = new SuggestWord();
+ word.string = prefix[i].string;
+ word.freq = prefix[i].freq;
+ word.score = score;
+ newSuggestion[i] = word;
+ }
+ append1.score = score;
+ append2.score = score;
+ newSuggestion[newSuggestion.length - 2] = append1;
+ newSuggestion[newSuggestion.length - 1] = append2;
+ return newSuggestion;
+ }
+
+ private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
+ int offset, int length, String fieldname) throws IOException {
+ bytes.offset = offset;
+ bytes.length = length;
+ Term term = new Term(fieldname, bytes);
+ int freq = ir.docFreq(term);
+ SuggestWord word = new SuggestWord();
+ word.freq = freq;
+ word.score = 1;
+ word.string = term.text();
+ return word;
+ }
+
+ public int getMinSuggestionFrequency() {
+ return minSuggestionFrequency;
+ }
+
+ public int getMaxCombineWordLength() {
+ return maxCombineWordLength;
+ }
+
+ public int getMinBreakWordLength() {
+ return minBreakWordLength;
+ }
+
+ public int getMaxChanges() {
+ return maxChanges;
+ }
+
+ public int getMaxEvaluations() {
+ return maxEvaluations;
+ }
+
+ /**
+ *
+ * The minimum frequency a term must have to be included as part of a
+ * suggestion. Default=1 Not applicable when used with
+ * {@link SuggestMode#SUGGEST_MORE_POPULAR}
+ *
+ *
+ * @param minSuggestionFrequency
+ */
+ public void setMinSuggestionFrequency(int minSuggestionFrequency) {
+ this.minSuggestionFrequency = minSuggestionFrequency;
+ }
+
+ /**
+ *
+ * The maximum length of a suggestion made by combining 1 or more original
+ * terms. Default=20
+ *
+ *
+ * @param maxCombineWordLength
+ */
+ public void setMaxCombineWordLength(int maxCombineWordLength) {
+ this.maxCombineWordLength = maxCombineWordLength;
+ }
+
+ /**
+ *
+ * The minimum length to break words down to. Default=1
+ *
+ *
+ * @param minBreakWordLength
+ */
+ public void setMinBreakWordLength(int minBreakWordLength) {
+ this.minBreakWordLength = minBreakWordLength;
+ }
+
+ /**
+ *
+ * The maximum numbers of changes (word breaks or combinations) to make on the
+ * original term(s). Default=1
+ *
+ *
+ * @param maxChanges
+ */
+ public void setMaxChanges(int maxChanges) {
+ this.maxChanges = maxChanges;
+ }
+
+ /**
+ *
+ * The maximum number of word combinations to evaluate. Default=1000. A higher
+ * value might improve result quality. A lower value might improve
+ * performance.
+ *
+ *
+ * @param maxEvaluations
+ */
+ public void setMaxEvaluations(int maxEvaluations) {
+ this.maxEvaluations = maxEvaluations;
+ }
+
+ private class LengthThenMaxFreqComparator implements
+ Comparator {
+ @Override
+ public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
+ if (o1.suggestWords.length != o2.suggestWords.length) {
+ return o2.suggestWords.length - o1.suggestWords.length;
+ }
+ if (o1.freqMax != o2.freqMax) {
+ return o1.freqMax - o2.freqMax;
+ }
+ return 0;
+ }
+ }
+
+ private class LengthThenSumFreqComparator implements
+ Comparator {
+ @Override
+ public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
+ if (o1.suggestWords.length != o2.suggestWords.length) {
+ return o2.suggestWords.length - o1.suggestWords.length;
+ }
+ if (o1.freqSum != o2.freqSum) {
+ return o1.freqSum - o2.freqSum;
+ }
+ return 0;
+ }
+ }
+
+ private class CombinationsThenFreqComparator implements
+ Comparator {
+ @Override
+ public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) {
+ if (o1.numCombinations != o2.numCombinations) {
+ return o2.numCombinations - o1.numCombinations;
+ }
+ if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) {
+ return o1.combineSuggestion.suggestion.freq
+ - o2.combineSuggestion.suggestion.freq;
+ }
+ return 0;
+ }
+ }
+
+ private class SuggestWordArrayWrapper {
+ final SuggestWord[] suggestWords;
+ final int freqMax;
+ final int freqSum;
+
+ SuggestWordArrayWrapper(SuggestWord[] suggestWords) {
+ this.suggestWords = suggestWords;
+ int aFreqSum = 0;
+ int aFreqMax = 0;
+ for (SuggestWord sw : suggestWords) {
+ aFreqSum += sw.freq;
+ aFreqMax = Math.max(aFreqMax, sw.freq);
+ }
+ this.freqSum = aFreqSum;
+ this.freqMax = aFreqMax;
+ }
+ }
+
+ private class CombineSuggestionWrapper {
+ final CombineSuggestion combineSuggestion;
+ final int numCombinations;
+
+ CombineSuggestionWrapper(CombineSuggestion combineSuggestion,
+ int numCombinations) {
+ this.combineSuggestion = combineSuggestion;
+ this.numCombinations = numCombinations;
+ }
+ }
+}
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
new file mode 100644
index 00000000000..50c961798fe
--- /dev/null
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
@@ -0,0 +1,234 @@
+package org.apache.lucene.search.spell;
+
+import junit.framework.Assert;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestWordBreakSpellChecker extends LuceneTestCase {
+ private Directory dir = null;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ dir = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true));
+
+ for (int i = 900; i < 1112; i++) {
+ Document doc = new Document();
+ String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", "");
+ doc.add(newField("numbers", num, TextField.TYPE_UNSTORED));
+ writer.addDocument(doc);
+ }
+
+ {
+ Document doc = new Document();
+ doc.add(newField("numbers", "thou hast sand betwixt thy toes", TextField.TYPE_UNSTORED));
+ writer.addDocument(doc);
+ }
+ {
+ Document doc = new Document();
+ doc.add(newField("numbers", "hundredeight eightyeight yeight", TextField.TYPE_UNSTORED));
+ writer.addDocument(doc);
+ }
+ {
+ Document doc = new Document();
+ doc.add(newField("numbers", "tres y cinco", TextField.TYPE_UNSTORED));
+ writer.addDocument(doc);
+ }
+
+ writer.commit();
+ writer.close();
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ if(dir!=null) {
+ dir.close();
+ dir = null;
+ }
+ super.tearDown();
+ }
+ public void testCombiningWords() throws Exception {
+ IndexReader ir = null;
+ try {
+ ir = DirectoryReader.open(dir);
+ WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
+
+ {
+ Term[] terms = {
+ new Term("numbers", "one"),
+ new Term("numbers", "hun"),
+ new Term("numbers", "dred"),
+ new Term("numbers", "eight"),
+ new Term("numbers", "y"),
+ new Term("numbers", "eight"),
+ };
+ wbsp.setMaxChanges(3);
+ wbsp.setMaxCombineWordLength(20);
+ wbsp.setMinSuggestionFrequency(1);
+ CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS);
+ Assert.assertTrue(cs.length==5);
+
+ Assert.assertTrue(cs[0].originalTermIndexes.length==2);
+ Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
+ Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
+ Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
+ Assert.assertTrue(cs[0].suggestion.score==1);
+
+ Assert.assertTrue(cs[1].originalTermIndexes.length==2);
+ Assert.assertTrue(cs[1].originalTermIndexes[0]==3);
+ Assert.assertTrue(cs[1].originalTermIndexes[1]==4);
+ Assert.assertTrue(cs[1].suggestion.string.equals("eighty"));
+ Assert.assertTrue(cs[1].suggestion.score==1);
+
+ Assert.assertTrue(cs[2].originalTermIndexes.length==2);
+ Assert.assertTrue(cs[2].originalTermIndexes[0]==4);
+ Assert.assertTrue(cs[2].originalTermIndexes[1]==5);
+ Assert.assertTrue(cs[2].suggestion.string.equals("yeight"));
+ Assert.assertTrue(cs[2].suggestion.score==1);
+
+ for(int i=3 ; i<5 ; i++) {
+ Assert.assertTrue(cs[i].originalTermIndexes.length==3);
+ Assert.assertTrue(cs[i].suggestion.score==2);
+ Assert.assertTrue(
+ (cs[i].originalTermIndexes[0]==1 &&
+ cs[i].originalTermIndexes[1]==2 &&
+ cs[i].originalTermIndexes[2]==3 &&
+ cs[i].suggestion.string.equals("hundredeight")) ||
+ (cs[i].originalTermIndexes[0]==3 &&
+ cs[i].originalTermIndexes[1]==4 &&
+ cs[i].originalTermIndexes[2]==5 &&
+ cs[i].suggestion.string.equals("eightyeight"))
+ );
+ }
+
+ cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
+ Assert.assertTrue(cs.length==2);
+ Assert.assertTrue(cs[0].originalTermIndexes.length==2);
+ Assert.assertTrue(cs[0].suggestion.score==1);
+ Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
+ Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
+ Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
+ Assert.assertTrue(cs[0].suggestion.score==1);
+
+ Assert.assertTrue(cs[1].originalTermIndexes.length==3);
+ Assert.assertTrue(cs[1].suggestion.score==2);
+ Assert.assertTrue(cs[1].originalTermIndexes[0] == 1);
+ Assert.assertTrue(cs[1].originalTermIndexes[1] == 2);
+ Assert.assertTrue(cs[1].originalTermIndexes[2] == 3);
+ Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight"));
+ }
+ } catch(Exception e) {
+ throw e;
+ } finally {
+ try { ir.close(); } catch(Exception e1) { }
+ }
+ }
+
+ public void testBreakingWords() throws Exception {
+ IndexReader ir = null;
+ try {
+ ir = DirectoryReader.open(dir);
+ WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
+
+ {
+ Term term = new Term("numbers", "ninetynine");
+ wbsp.setMaxChanges(1);
+ wbsp.setMinBreakWordLength(1);
+ wbsp.setMinSuggestionFrequency(1);
+ SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+ Assert.assertTrue(sw.length==1);
+ Assert.assertTrue(sw[0].length==2);
+ Assert.assertTrue(sw[0][0].string.equals("ninety"));
+ Assert.assertTrue(sw[0][1].string.equals("nine"));
+ Assert.assertTrue(sw[0][0].score == 1);
+ Assert.assertTrue(sw[0][1].score == 1);
+ }
+ {
+ Term term = new Term("numbers", "onethousand");
+ wbsp.setMaxChanges(1);
+ wbsp.setMinBreakWordLength(1);
+ wbsp.setMinSuggestionFrequency(1);
+ SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+ Assert.assertTrue(sw.length==1);
+ Assert.assertTrue(sw[0].length==2);
+ Assert.assertTrue(sw[0][0].string.equals("one"));
+ Assert.assertTrue(sw[0][1].string.equals("thousand"));
+ Assert.assertTrue(sw[0][0].score == 1);
+ Assert.assertTrue(sw[0][1].score == 1);
+
+ wbsp.setMaxChanges(2);
+ wbsp.setMinSuggestionFrequency(1);
+ sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+ Assert.assertTrue(sw.length==1);
+ Assert.assertTrue(sw[0].length==2);
+
+ wbsp.setMaxChanges(2);
+ wbsp.setMinSuggestionFrequency(2);
+ sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+ Assert.assertTrue(sw.length==1);
+ Assert.assertTrue(sw[0].length==2);
+
+ wbsp.setMaxChanges(2);
+ wbsp.setMinSuggestionFrequency(1);
+ sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+ Assert.assertTrue(sw.length==2);
+ Assert.assertTrue(sw[0].length==2);
+ Assert.assertTrue(sw[0][0].string.equals("one"));
+ Assert.assertTrue(sw[0][1].string.equals("thousand"));
+ Assert.assertTrue(sw[0][0].score == 1);
+ Assert.assertTrue(sw[0][1].score == 1);
+ Assert.assertTrue(sw[0][1].freq>1);
+ Assert.assertTrue(sw[0][0].freq>sw[0][1].freq);
+ Assert.assertTrue(sw[1].length==3);
+ Assert.assertTrue(sw[1][0].string.equals("one"));
+ Assert.assertTrue(sw[1][1].string.equals("thou"));
+ Assert.assertTrue(sw[1][2].string.equals("sand"));
+ Assert.assertTrue(sw[1][0].score == 2);
+ Assert.assertTrue(sw[1][1].score == 2);
+ Assert.assertTrue(sw[1][2].score == 2);
+ Assert.assertTrue(sw[1][0].freq>1);
+ Assert.assertTrue(sw[1][1].freq==1);
+ Assert.assertTrue(sw[1][2].freq==1);
+ }
+ {
+ Term term = new Term("numbers", "onethousandonehundredeleven");
+ wbsp.setMaxChanges(3);
+ wbsp.setMinBreakWordLength(1);
+ wbsp.setMinSuggestionFrequency(1);
+ SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+ Assert.assertTrue(sw.length==0);
+
+ wbsp.setMaxChanges(4);
+ sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+ Assert.assertTrue(sw.length==1);
+ Assert.assertTrue(sw[0].length==5);
+
+ wbsp.setMaxChanges(5);
+ sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+ Assert.assertTrue(sw.length==2);
+ Assert.assertTrue(sw[0].length==5);
+ Assert.assertTrue(sw[0][1].string.equals("thousand"));
+ Assert.assertTrue(sw[1].length==6);
+ Assert.assertTrue(sw[1][1].string.equals("thou"));
+ Assert.assertTrue(sw[1][2].string.equals("sand"));
+ }
+
+ } catch(Exception e) {
+ throw e;
+ } finally {
+ try { ir.close(); } catch(Exception e1) { }
+ }
+ }
+ }