mirror of https://github.com/apache/lucene.git
LUCENE-3523: Add WordBreakSpellChecker to suggest based on word combinations and/or breaks
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344318 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a0e590a119
commit
642b882d5a
|
@ -4148,6 +4148,10 @@ New features
|
||||||
passed to not double RAM usage in the FieldCache. (Chris
|
passed to not double RAM usage in the FieldCache. (Chris
|
||||||
Hostetter, Mark Miller, Mike McCandless)
|
Hostetter, Mark Miller, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-3523: Added oal.search.spell.WordBreakSpellChecker, which
|
||||||
|
generates suggestions by combining two or more terms and/or
|
||||||
|
breaking terms into multiple words. See Javadocs for usage. (James Dyer)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
|
* LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
public class CombineSuggestion {
|
||||||
|
/**
|
||||||
|
* <p>The indexes from the passed-in array of terms used to make this word combination</p>
|
||||||
|
*/
|
||||||
|
public final int[] originalTermIndexes;
|
||||||
|
/**
|
||||||
|
* <p>The word combination suggestion</p>
|
||||||
|
*/
|
||||||
|
public final SuggestWord suggestion;
|
||||||
|
|
||||||
|
public CombineSuggestion (SuggestWord suggestion, int[] originalTermIndexes) {
|
||||||
|
this.suggestion = suggestion;
|
||||||
|
this.originalTermIndexes = originalTermIndexes;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,480 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.PriorityQueue;
|
||||||
|
import java.util.Queue;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.spell.SuggestMode;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* A spell checker whose sole function is to offer suggestions by combining
|
||||||
|
* multiple terms into one word and/or breaking terms into multiple words.
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
public class WordBreakSpellChecker {
|
||||||
|
private int minSuggestionFrequency = 1;
|
||||||
|
private int minBreakWordLength = 1;
|
||||||
|
private int maxCombineWordLength = 20;
|
||||||
|
private int maxChanges = 1;
|
||||||
|
private int maxEvaluations = 1000;
|
||||||
|
|
||||||
|
public static final Term SEPARATOR_TERM = new Term("", "");
|
||||||
|
|
||||||
|
public enum BreakSuggestionSortMethod {
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* Sort by Number of word breaks, then by the Sum of all the component
|
||||||
|
* term's frequencies
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
NUM_CHANGES_THEN_SUMMED_FREQUENCY,
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* Sort by Number of word breaks, then by the Maximum of all the component
|
||||||
|
* term's frequencies
|
||||||
|
* </p>
|
||||||
|
*/
|
||||||
|
NUM_CHANGES_THEN_MAX_FREQUENCY
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* Generate suggestions by breaking the passed-in term into multiple words.
|
||||||
|
* The scores returned are equal to the number of word breaks needed so a
|
||||||
|
* lower score is generally preferred over a higher score.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param term
|
||||||
|
* @param maxSuggestions
|
||||||
|
* @param ir
|
||||||
|
* @param suggestMode
|
||||||
|
* - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
|
||||||
|
* @param sortMethod
|
||||||
|
* - default =
|
||||||
|
* {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
|
||||||
|
* @return one or more arrays of words formed by breaking up the original term
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
|
||||||
|
IndexReader ir, SuggestMode suggestMode,
|
||||||
|
BreakSuggestionSortMethod sortMethod) throws IOException {
|
||||||
|
if (maxSuggestions < 1) {
|
||||||
|
return new SuggestWord[0][0];
|
||||||
|
}
|
||||||
|
if (suggestMode == null) {
|
||||||
|
suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
|
||||||
|
}
|
||||||
|
if (sortMethod == null) {
|
||||||
|
sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
|
||||||
|
}
|
||||||
|
|
||||||
|
int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
|
||||||
|
Comparator<SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator()
|
||||||
|
: new LengthThenSumFreqComparator();
|
||||||
|
Queue<SuggestWordArrayWrapper> suggestions = new PriorityQueue<SuggestWordArrayWrapper>(
|
||||||
|
queueInitialCapacity, queueComparator);
|
||||||
|
|
||||||
|
int origFreq = ir.docFreq(term);
|
||||||
|
if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
|
||||||
|
return new SuggestWord[0][];
|
||||||
|
}
|
||||||
|
|
||||||
|
int useMinSuggestionFrequency = minSuggestionFrequency;
|
||||||
|
if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
|
||||||
|
useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
|
||||||
|
}
|
||||||
|
|
||||||
|
generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
|
||||||
|
useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0,
|
||||||
|
sortMethod);
|
||||||
|
|
||||||
|
SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
|
||||||
|
for (int i = suggestions.size() - 1; i >= 0; i--) {
|
||||||
|
suggestionArray[i] = suggestions.remove().suggestWords;
|
||||||
|
}
|
||||||
|
|
||||||
|
return suggestionArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* Generate suggestions by combining one or more of the passed-in terms into
|
||||||
|
* single words. The returned {@link CombineSuggestion} contains both a
|
||||||
|
* {@link SuggestWord} and also an array detailing which passed-in terms were
|
||||||
|
* involved in creating this combination. The scores returned are equal to the
|
||||||
|
* number of word combinations needed, also one less than the length of the
|
||||||
|
* array {@link CombineSuggestion#originalTermIndexes}. Generally, a
|
||||||
|
* suggestion with a lower score is preferred over a higher score.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* To prevent two adjacent terms from being combined (for instance, if one is
|
||||||
|
* mandatory and the other is prohibited), separate the two terms with
|
||||||
|
* {@link WordBreakSpellChecker#SEPARATOR_TERM}
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
|
||||||
|
* suggestion will include at least one term not in the index.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
|
||||||
|
* suggestion will have the same, or better frequency than the most-popular
|
||||||
|
* included term.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param terms
|
||||||
|
* @param maxSuggestions
|
||||||
|
* @param ir
|
||||||
|
* @param suggestMode
|
||||||
|
* @return an array of words generated by combining original terms
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public CombineSuggestion[] suggestWordCombinations(Term[] terms,
|
||||||
|
int maxSuggestions, IndexReader ir, SuggestMode suggestMode)
|
||||||
|
throws IOException {
|
||||||
|
if (maxSuggestions < 1) {
|
||||||
|
return new CombineSuggestion[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
int[] origFreqs = null;
|
||||||
|
if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
|
||||||
|
origFreqs = new int[terms.length];
|
||||||
|
for (int i = 0; i < terms.length; i++) {
|
||||||
|
origFreqs[i] = ir.docFreq(terms[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
|
||||||
|
Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
|
||||||
|
Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<CombineSuggestionWrapper>(
|
||||||
|
queueInitialCapacity, queueComparator);
|
||||||
|
|
||||||
|
int thisTimeEvaluations = 0;
|
||||||
|
BytesRef reuse = new BytesRef();
|
||||||
|
for (int i = 0; i < terms.length - 1; i++) {
|
||||||
|
if (terms[i].equals(SEPARATOR_TERM)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int byteLength = terms[i].bytes().length;
|
||||||
|
if (byteLength > maxCombineWordLength) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
reuse.grow(byteLength);
|
||||||
|
reuse.length = byteLength;
|
||||||
|
System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
|
||||||
|
reuse.bytes, 0, byteLength);
|
||||||
|
|
||||||
|
int maxFreq = 0;
|
||||||
|
int minFreq = Integer.MAX_VALUE;
|
||||||
|
if (origFreqs != null) {
|
||||||
|
maxFreq = origFreqs[i];
|
||||||
|
minFreq = origFreqs[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
|
||||||
|
if (terms[j].equals(SEPARATOR_TERM)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
byteLength += terms[j].bytes().length;
|
||||||
|
if (byteLength > maxCombineWordLength) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (origFreqs != null) {
|
||||||
|
maxFreq = Math.max(maxFreq, origFreqs[j]);
|
||||||
|
minFreq = Math.min(minFreq, origFreqs[j]);
|
||||||
|
}
|
||||||
|
|
||||||
|
reuse.grow(byteLength);
|
||||||
|
System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
|
||||||
|
reuse.bytes, reuse.length, terms[j].bytes().length);
|
||||||
|
reuse.length = byteLength;
|
||||||
|
|
||||||
|
Term combinedTerm = new Term(terms[0].field(), reuse);
|
||||||
|
int combinedTermFreq = ir.docFreq(combinedTerm);
|
||||||
|
|
||||||
|
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
|
||||||
|
|| combinedTermFreq >= maxFreq) {
|
||||||
|
if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX
|
||||||
|
|| minFreq == 0) {
|
||||||
|
if (combinedTermFreq >= minSuggestionFrequency) {
|
||||||
|
int[] origIndexes = new int[j - i + 1];
|
||||||
|
origIndexes[0] = i;
|
||||||
|
for (int k = 1; k < origIndexes.length; k++) {
|
||||||
|
origIndexes[k] = i + k;
|
||||||
|
}
|
||||||
|
SuggestWord word = new SuggestWord();
|
||||||
|
word.freq = combinedTermFreq;
|
||||||
|
word.score = origIndexes.length - 1;
|
||||||
|
word.string = combinedTerm.text();
|
||||||
|
CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(
|
||||||
|
new CombineSuggestion(word, origIndexes),
|
||||||
|
(origIndexes.length - 1));
|
||||||
|
suggestions.offer(suggestion);
|
||||||
|
if (suggestions.size() > maxSuggestions) {
|
||||||
|
suggestions.poll();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
thisTimeEvaluations++;
|
||||||
|
if (thisTimeEvaluations == maxEvaluations) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions
|
||||||
|
.size()];
|
||||||
|
for (int i = suggestions.size() - 1; i >= 0; i--) {
|
||||||
|
combineSuggestions[i] = suggestions.remove().combineSuggestion;
|
||||||
|
}
|
||||||
|
return combineSuggestions;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int generateBreakUpSuggestions(Term term, IndexReader ir,
|
||||||
|
int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency,
|
||||||
|
SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
|
||||||
|
int totalEvaluations, BreakSuggestionSortMethod sortMethod)
|
||||||
|
throws IOException {
|
||||||
|
int termLength = term.bytes().length;
|
||||||
|
int useMinBreakWordLength = minBreakWordLength;
|
||||||
|
if (useMinBreakWordLength < 1) {
|
||||||
|
useMinBreakWordLength = 1;
|
||||||
|
}
|
||||||
|
if (termLength <= (useMinBreakWordLength * 2)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int thisTimeEvaluations = 0;
|
||||||
|
BytesRef termBytes = term.bytes().clone();
|
||||||
|
for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
|
||||||
|
SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
|
||||||
|
.field());
|
||||||
|
|
||||||
|
if (leftWord.freq >= useMinSuggestionFrequency) {
|
||||||
|
SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
|
||||||
|
termLength - i, term.field());
|
||||||
|
if (rightWord.freq >= useMinSuggestionFrequency) {
|
||||||
|
SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
|
||||||
|
newSuggestion(prefix, leftWord, rightWord));
|
||||||
|
suggestions.offer(suggestion);
|
||||||
|
if (suggestions.size() > maxSuggestions) {
|
||||||
|
suggestions.poll();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int newNumberBreaks = numberBreaks + 1;
|
||||||
|
if (newNumberBreaks <= maxChanges) {
|
||||||
|
int evaluations = generateBreakUpSuggestions(new Term(term.field(),
|
||||||
|
rightWord.string), ir, newNumberBreaks, maxSuggestions,
|
||||||
|
useMinSuggestionFrequency, newPrefix(prefix, leftWord),
|
||||||
|
suggestions, totalEvaluations, sortMethod);
|
||||||
|
totalEvaluations += evaluations;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
thisTimeEvaluations++;
|
||||||
|
totalEvaluations++;
|
||||||
|
if (totalEvaluations >= maxEvaluations) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return thisTimeEvaluations;
|
||||||
|
}
|
||||||
|
|
||||||
|
private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
|
||||||
|
SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1];
|
||||||
|
System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length);
|
||||||
|
newPrefix[newPrefix.length - 1] = append;
|
||||||
|
return newPrefix;
|
||||||
|
}
|
||||||
|
|
||||||
|
private SuggestWord[] newSuggestion(SuggestWord[] prefix,
|
||||||
|
SuggestWord append1, SuggestWord append2) {
|
||||||
|
SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2];
|
||||||
|
int score = prefix.length + 1;
|
||||||
|
for (int i = 0; i < prefix.length; i++) {
|
||||||
|
SuggestWord word = new SuggestWord();
|
||||||
|
word.string = prefix[i].string;
|
||||||
|
word.freq = prefix[i].freq;
|
||||||
|
word.score = score;
|
||||||
|
newSuggestion[i] = word;
|
||||||
|
}
|
||||||
|
append1.score = score;
|
||||||
|
append2.score = score;
|
||||||
|
newSuggestion[newSuggestion.length - 2] = append1;
|
||||||
|
newSuggestion[newSuggestion.length - 1] = append2;
|
||||||
|
return newSuggestion;
|
||||||
|
}
|
||||||
|
|
||||||
|
private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
|
||||||
|
int offset, int length, String fieldname) throws IOException {
|
||||||
|
bytes.offset = offset;
|
||||||
|
bytes.length = length;
|
||||||
|
Term term = new Term(fieldname, bytes);
|
||||||
|
int freq = ir.docFreq(term);
|
||||||
|
SuggestWord word = new SuggestWord();
|
||||||
|
word.freq = freq;
|
||||||
|
word.score = 1;
|
||||||
|
word.string = term.text();
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMinSuggestionFrequency() {
|
||||||
|
return minSuggestionFrequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMaxCombineWordLength() {
|
||||||
|
return maxCombineWordLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMinBreakWordLength() {
|
||||||
|
return minBreakWordLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMaxChanges() {
|
||||||
|
return maxChanges;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMaxEvaluations() {
|
||||||
|
return maxEvaluations;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* The minimum frequency a term must have to be included as part of a
|
||||||
|
* suggestion. Default=1 Not applicable when used with
|
||||||
|
* {@link SuggestMode#SUGGEST_MORE_POPULAR}
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param minSuggestionFrequency
|
||||||
|
*/
|
||||||
|
public void setMinSuggestionFrequency(int minSuggestionFrequency) {
|
||||||
|
this.minSuggestionFrequency = minSuggestionFrequency;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* The maximum length of a suggestion made by combining 1 or more original
|
||||||
|
* terms. Default=20
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param maxCombineWordLength
|
||||||
|
*/
|
||||||
|
public void setMaxCombineWordLength(int maxCombineWordLength) {
|
||||||
|
this.maxCombineWordLength = maxCombineWordLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* The minimum length to break words down to. Default=1
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param minBreakWordLength
|
||||||
|
*/
|
||||||
|
public void setMinBreakWordLength(int minBreakWordLength) {
|
||||||
|
this.minBreakWordLength = minBreakWordLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* The maximum numbers of changes (word breaks or combinations) to make on the
|
||||||
|
* original term(s). Default=1
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param maxChanges
|
||||||
|
*/
|
||||||
|
public void setMaxChanges(int maxChanges) {
|
||||||
|
this.maxChanges = maxChanges;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* The maximum number of word combinations to evaluate. Default=1000. A higher
|
||||||
|
* value might improve result quality. A lower value might improve
|
||||||
|
* performance.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @param maxEvaluations
|
||||||
|
*/
|
||||||
|
public void setMaxEvaluations(int maxEvaluations) {
|
||||||
|
this.maxEvaluations = maxEvaluations;
|
||||||
|
}
|
||||||
|
|
||||||
|
private class LengthThenMaxFreqComparator implements
|
||||||
|
Comparator<SuggestWordArrayWrapper> {
|
||||||
|
@Override
|
||||||
|
public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
|
||||||
|
if (o1.suggestWords.length != o2.suggestWords.length) {
|
||||||
|
return o2.suggestWords.length - o1.suggestWords.length;
|
||||||
|
}
|
||||||
|
if (o1.freqMax != o2.freqMax) {
|
||||||
|
return o1.freqMax - o2.freqMax;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class LengthThenSumFreqComparator implements
|
||||||
|
Comparator<SuggestWordArrayWrapper> {
|
||||||
|
@Override
|
||||||
|
public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
|
||||||
|
if (o1.suggestWords.length != o2.suggestWords.length) {
|
||||||
|
return o2.suggestWords.length - o1.suggestWords.length;
|
||||||
|
}
|
||||||
|
if (o1.freqSum != o2.freqSum) {
|
||||||
|
return o1.freqSum - o2.freqSum;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class CombinationsThenFreqComparator implements
|
||||||
|
Comparator<CombineSuggestionWrapper> {
|
||||||
|
@Override
|
||||||
|
public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) {
|
||||||
|
if (o1.numCombinations != o2.numCombinations) {
|
||||||
|
return o2.numCombinations - o1.numCombinations;
|
||||||
|
}
|
||||||
|
if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) {
|
||||||
|
return o1.combineSuggestion.suggestion.freq
|
||||||
|
- o2.combineSuggestion.suggestion.freq;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class SuggestWordArrayWrapper {
|
||||||
|
final SuggestWord[] suggestWords;
|
||||||
|
final int freqMax;
|
||||||
|
final int freqSum;
|
||||||
|
|
||||||
|
SuggestWordArrayWrapper(SuggestWord[] suggestWords) {
|
||||||
|
this.suggestWords = suggestWords;
|
||||||
|
int aFreqSum = 0;
|
||||||
|
int aFreqMax = 0;
|
||||||
|
for (SuggestWord sw : suggestWords) {
|
||||||
|
aFreqSum += sw.freq;
|
||||||
|
aFreqMax = Math.max(aFreqMax, sw.freq);
|
||||||
|
}
|
||||||
|
this.freqSum = aFreqSum;
|
||||||
|
this.freqMax = aFreqMax;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class CombineSuggestionWrapper {
|
||||||
|
final CombineSuggestion combineSuggestion;
|
||||||
|
final int numCombinations;
|
||||||
|
|
||||||
|
CombineSuggestionWrapper(CombineSuggestion combineSuggestion,
|
||||||
|
int numCombinations) {
|
||||||
|
this.combineSuggestion = combineSuggestion;
|
||||||
|
this.numCombinations = numCombinations;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,234 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
import junit.framework.Assert;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.MockTokenizer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.TextField;
|
||||||
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.English;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
public class TestWordBreakSpellChecker extends LuceneTestCase {
|
||||||
|
private Directory dir = null;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true));
|
||||||
|
|
||||||
|
for (int i = 900; i < 1112; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", "");
|
||||||
|
doc.add(newField("numbers", num, TextField.TYPE_UNSTORED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newField("numbers", "thou hast sand betwixt thy toes", TextField.TYPE_UNSTORED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newField("numbers", "hundredeight eightyeight yeight", TextField.TYPE_UNSTORED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newField("numbers", "tres y cinco", TextField.TYPE_UNSTORED));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.commit();
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
if(dir!=null) {
|
||||||
|
dir.close();
|
||||||
|
dir = null;
|
||||||
|
}
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
public void testCombiningWords() throws Exception {
|
||||||
|
IndexReader ir = null;
|
||||||
|
try {
|
||||||
|
ir = DirectoryReader.open(dir);
|
||||||
|
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
|
||||||
|
|
||||||
|
{
|
||||||
|
Term[] terms = {
|
||||||
|
new Term("numbers", "one"),
|
||||||
|
new Term("numbers", "hun"),
|
||||||
|
new Term("numbers", "dred"),
|
||||||
|
new Term("numbers", "eight"),
|
||||||
|
new Term("numbers", "y"),
|
||||||
|
new Term("numbers", "eight"),
|
||||||
|
};
|
||||||
|
wbsp.setMaxChanges(3);
|
||||||
|
wbsp.setMaxCombineWordLength(20);
|
||||||
|
wbsp.setMinSuggestionFrequency(1);
|
||||||
|
CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS);
|
||||||
|
Assert.assertTrue(cs.length==5);
|
||||||
|
|
||||||
|
Assert.assertTrue(cs[0].originalTermIndexes.length==2);
|
||||||
|
Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
|
||||||
|
Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
|
||||||
|
Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
|
||||||
|
Assert.assertTrue(cs[0].suggestion.score==1);
|
||||||
|
|
||||||
|
Assert.assertTrue(cs[1].originalTermIndexes.length==2);
|
||||||
|
Assert.assertTrue(cs[1].originalTermIndexes[0]==3);
|
||||||
|
Assert.assertTrue(cs[1].originalTermIndexes[1]==4);
|
||||||
|
Assert.assertTrue(cs[1].suggestion.string.equals("eighty"));
|
||||||
|
Assert.assertTrue(cs[1].suggestion.score==1);
|
||||||
|
|
||||||
|
Assert.assertTrue(cs[2].originalTermIndexes.length==2);
|
||||||
|
Assert.assertTrue(cs[2].originalTermIndexes[0]==4);
|
||||||
|
Assert.assertTrue(cs[2].originalTermIndexes[1]==5);
|
||||||
|
Assert.assertTrue(cs[2].suggestion.string.equals("yeight"));
|
||||||
|
Assert.assertTrue(cs[2].suggestion.score==1);
|
||||||
|
|
||||||
|
for(int i=3 ; i<5 ; i++) {
|
||||||
|
Assert.assertTrue(cs[i].originalTermIndexes.length==3);
|
||||||
|
Assert.assertTrue(cs[i].suggestion.score==2);
|
||||||
|
Assert.assertTrue(
|
||||||
|
(cs[i].originalTermIndexes[0]==1 &&
|
||||||
|
cs[i].originalTermIndexes[1]==2 &&
|
||||||
|
cs[i].originalTermIndexes[2]==3 &&
|
||||||
|
cs[i].suggestion.string.equals("hundredeight")) ||
|
||||||
|
(cs[i].originalTermIndexes[0]==3 &&
|
||||||
|
cs[i].originalTermIndexes[1]==4 &&
|
||||||
|
cs[i].originalTermIndexes[2]==5 &&
|
||||||
|
cs[i].suggestion.string.equals("eightyeight"))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
|
Assert.assertTrue(cs.length==2);
|
||||||
|
Assert.assertTrue(cs[0].originalTermIndexes.length==2);
|
||||||
|
Assert.assertTrue(cs[0].suggestion.score==1);
|
||||||
|
Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
|
||||||
|
Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
|
||||||
|
Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
|
||||||
|
Assert.assertTrue(cs[0].suggestion.score==1);
|
||||||
|
|
||||||
|
Assert.assertTrue(cs[1].originalTermIndexes.length==3);
|
||||||
|
Assert.assertTrue(cs[1].suggestion.score==2);
|
||||||
|
Assert.assertTrue(cs[1].originalTermIndexes[0] == 1);
|
||||||
|
Assert.assertTrue(cs[1].originalTermIndexes[1] == 2);
|
||||||
|
Assert.assertTrue(cs[1].originalTermIndexes[2] == 3);
|
||||||
|
Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight"));
|
||||||
|
}
|
||||||
|
} catch(Exception e) {
|
||||||
|
throw e;
|
||||||
|
} finally {
|
||||||
|
try { ir.close(); } catch(Exception e1) { }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBreakingWords() throws Exception {
|
||||||
|
IndexReader ir = null;
|
||||||
|
try {
|
||||||
|
ir = DirectoryReader.open(dir);
|
||||||
|
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
|
||||||
|
|
||||||
|
{
|
||||||
|
Term term = new Term("numbers", "ninetynine");
|
||||||
|
wbsp.setMaxChanges(1);
|
||||||
|
wbsp.setMinBreakWordLength(1);
|
||||||
|
wbsp.setMinSuggestionFrequency(1);
|
||||||
|
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==1);
|
||||||
|
Assert.assertTrue(sw[0].length==2);
|
||||||
|
Assert.assertTrue(sw[0][0].string.equals("ninety"));
|
||||||
|
Assert.assertTrue(sw[0][1].string.equals("nine"));
|
||||||
|
Assert.assertTrue(sw[0][0].score == 1);
|
||||||
|
Assert.assertTrue(sw[0][1].score == 1);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Term term = new Term("numbers", "onethousand");
|
||||||
|
wbsp.setMaxChanges(1);
|
||||||
|
wbsp.setMinBreakWordLength(1);
|
||||||
|
wbsp.setMinSuggestionFrequency(1);
|
||||||
|
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==1);
|
||||||
|
Assert.assertTrue(sw[0].length==2);
|
||||||
|
Assert.assertTrue(sw[0][0].string.equals("one"));
|
||||||
|
Assert.assertTrue(sw[0][1].string.equals("thousand"));
|
||||||
|
Assert.assertTrue(sw[0][0].score == 1);
|
||||||
|
Assert.assertTrue(sw[0][1].score == 1);
|
||||||
|
|
||||||
|
wbsp.setMaxChanges(2);
|
||||||
|
wbsp.setMinSuggestionFrequency(1);
|
||||||
|
sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==1);
|
||||||
|
Assert.assertTrue(sw[0].length==2);
|
||||||
|
|
||||||
|
wbsp.setMaxChanges(2);
|
||||||
|
wbsp.setMinSuggestionFrequency(2);
|
||||||
|
sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==1);
|
||||||
|
Assert.assertTrue(sw[0].length==2);
|
||||||
|
|
||||||
|
wbsp.setMaxChanges(2);
|
||||||
|
wbsp.setMinSuggestionFrequency(1);
|
||||||
|
sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==2);
|
||||||
|
Assert.assertTrue(sw[0].length==2);
|
||||||
|
Assert.assertTrue(sw[0][0].string.equals("one"));
|
||||||
|
Assert.assertTrue(sw[0][1].string.equals("thousand"));
|
||||||
|
Assert.assertTrue(sw[0][0].score == 1);
|
||||||
|
Assert.assertTrue(sw[0][1].score == 1);
|
||||||
|
Assert.assertTrue(sw[0][1].freq>1);
|
||||||
|
Assert.assertTrue(sw[0][0].freq>sw[0][1].freq);
|
||||||
|
Assert.assertTrue(sw[1].length==3);
|
||||||
|
Assert.assertTrue(sw[1][0].string.equals("one"));
|
||||||
|
Assert.assertTrue(sw[1][1].string.equals("thou"));
|
||||||
|
Assert.assertTrue(sw[1][2].string.equals("sand"));
|
||||||
|
Assert.assertTrue(sw[1][0].score == 2);
|
||||||
|
Assert.assertTrue(sw[1][1].score == 2);
|
||||||
|
Assert.assertTrue(sw[1][2].score == 2);
|
||||||
|
Assert.assertTrue(sw[1][0].freq>1);
|
||||||
|
Assert.assertTrue(sw[1][1].freq==1);
|
||||||
|
Assert.assertTrue(sw[1][2].freq==1);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Term term = new Term("numbers", "onethousandonehundredeleven");
|
||||||
|
wbsp.setMaxChanges(3);
|
||||||
|
wbsp.setMinBreakWordLength(1);
|
||||||
|
wbsp.setMinSuggestionFrequency(1);
|
||||||
|
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==0);
|
||||||
|
|
||||||
|
wbsp.setMaxChanges(4);
|
||||||
|
sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==1);
|
||||||
|
Assert.assertTrue(sw[0].length==5);
|
||||||
|
|
||||||
|
wbsp.setMaxChanges(5);
|
||||||
|
sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==2);
|
||||||
|
Assert.assertTrue(sw[0].length==5);
|
||||||
|
Assert.assertTrue(sw[0][1].string.equals("thousand"));
|
||||||
|
Assert.assertTrue(sw[1].length==6);
|
||||||
|
Assert.assertTrue(sw[1][1].string.equals("thou"));
|
||||||
|
Assert.assertTrue(sw[1][2].string.equals("sand"));
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch(Exception e) {
|
||||||
|
throw e;
|
||||||
|
} finally {
|
||||||
|
try { ir.close(); } catch(Exception e1) { }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue