mirror of https://github.com/apache/lucene.git
LUCENE-3523: Add WordBreakSpellChecker to suggest based on word combinations and/or breaks
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344318 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a0e590a119
commit
642b882d5a
|
@ -4147,6 +4147,10 @@ New features
|
|||
ValueSource, but takes care when composite (multi-segment) are
|
||||
passed to not double RAM usage in the FieldCache. (Chris
|
||||
Hostetter, Mark Miller, Mike McCandless)
|
||||
|
||||
* LUCENE-3523: Added oal.search.spell.WordBreakSpellChecker, which
|
||||
generates suggestions by combining two or more terms and/or
|
||||
breaking terms into multiple words. See Javadocs for usage. (James Dyer)
|
||||
|
||||
Optimizations
|
||||
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
public class CombineSuggestion {
|
||||
/**
|
||||
* <p>The indexes from the passed-in array of terms used to make this word combination</p>
|
||||
*/
|
||||
public final int[] originalTermIndexes;
|
||||
/**
|
||||
* <p>The word combination suggestion</p>
|
||||
*/
|
||||
public final SuggestWord suggestion;
|
||||
|
||||
public CombineSuggestion (SuggestWord suggestion, int[] originalTermIndexes) {
|
||||
this.suggestion = suggestion;
|
||||
this.originalTermIndexes = originalTermIndexes;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,480 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Queue;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.spell.SuggestMode;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A spell checker whose sole function is to offer suggestions by combining
|
||||
* multiple terms into one word and/or breaking terms into multiple words.
|
||||
* </p>
|
||||
*/
|
||||
public class WordBreakSpellChecker {
|
||||
private int minSuggestionFrequency = 1;
|
||||
private int minBreakWordLength = 1;
|
||||
private int maxCombineWordLength = 20;
|
||||
private int maxChanges = 1;
|
||||
private int maxEvaluations = 1000;
|
||||
|
||||
public static final Term SEPARATOR_TERM = new Term("", "");
|
||||
|
||||
public enum BreakSuggestionSortMethod {
|
||||
/**
|
||||
* <p>
|
||||
* Sort by Number of word breaks, then by the Sum of all the component
|
||||
* term's frequencies
|
||||
* </p>
|
||||
*/
|
||||
NUM_CHANGES_THEN_SUMMED_FREQUENCY,
|
||||
/**
|
||||
* <p>
|
||||
* Sort by Number of word breaks, then by the Maximum of all the component
|
||||
* term's frequencies
|
||||
* </p>
|
||||
*/
|
||||
NUM_CHANGES_THEN_MAX_FREQUENCY
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Generate suggestions by breaking the passed-in term into multiple words.
|
||||
* The scores returned are equal to the number of word breaks needed so a
|
||||
* lower score is generally preferred over a higher score.
|
||||
* </p>
|
||||
*
|
||||
* @param term
|
||||
* @param maxSuggestions
|
||||
* @param ir
|
||||
* @param suggestMode
|
||||
* - default = {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}
|
||||
* @param sortMethod
|
||||
* - default =
|
||||
* {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY}
|
||||
* @return one or more arrays of words formed by breaking up the original term
|
||||
* @throws IOException
|
||||
*/
|
||||
public SuggestWord[][] suggestWordBreaks(Term term, int maxSuggestions,
|
||||
IndexReader ir, SuggestMode suggestMode,
|
||||
BreakSuggestionSortMethod sortMethod) throws IOException {
|
||||
if (maxSuggestions < 1) {
|
||||
return new SuggestWord[0][0];
|
||||
}
|
||||
if (suggestMode == null) {
|
||||
suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
|
||||
}
|
||||
if (sortMethod == null) {
|
||||
sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY;
|
||||
}
|
||||
|
||||
int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
|
||||
Comparator<SuggestWordArrayWrapper> queueComparator = sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY ? new LengthThenMaxFreqComparator()
|
||||
: new LengthThenSumFreqComparator();
|
||||
Queue<SuggestWordArrayWrapper> suggestions = new PriorityQueue<SuggestWordArrayWrapper>(
|
||||
queueInitialCapacity, queueComparator);
|
||||
|
||||
int origFreq = ir.docFreq(term);
|
||||
if (origFreq > 0 && suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX) {
|
||||
return new SuggestWord[0][];
|
||||
}
|
||||
|
||||
int useMinSuggestionFrequency = minSuggestionFrequency;
|
||||
if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
|
||||
useMinSuggestionFrequency = (origFreq == 0 ? 1 : origFreq);
|
||||
}
|
||||
|
||||
generateBreakUpSuggestions(term, ir, 1, maxSuggestions,
|
||||
useMinSuggestionFrequency, new SuggestWord[0], suggestions, 0,
|
||||
sortMethod);
|
||||
|
||||
SuggestWord[][] suggestionArray = new SuggestWord[suggestions.size()][];
|
||||
for (int i = suggestions.size() - 1; i >= 0; i--) {
|
||||
suggestionArray[i] = suggestions.remove().suggestWords;
|
||||
}
|
||||
|
||||
return suggestionArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Generate suggestions by combining one or more of the passed-in terms into
|
||||
* single words. The returned {@link CombineSuggestion} contains both a
|
||||
* {@link SuggestWord} and also an array detailing which passed-in terms were
|
||||
* involved in creating this combination. The scores returned are equal to the
|
||||
* number of word combinations needed, also one less than the length of the
|
||||
* array {@link CombineSuggestion#originalTermIndexes}. Generally, a
|
||||
* suggestion with a lower score is preferred over a higher score.
|
||||
* </p>
|
||||
* <p>
|
||||
* To prevent two adjacent terms from being combined (for instance, if one is
|
||||
* mandatory and the other is prohibited), separate the two terms with
|
||||
* {@link WordBreakSpellChecker#SEPARATOR_TERM}
|
||||
* </p>
|
||||
* <p>
|
||||
* When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
|
||||
* suggestion will include at least one term not in the index.
|
||||
* </p>
|
||||
* <p>
|
||||
* When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
|
||||
* suggestion will have the same, or better frequency than the most-popular
|
||||
* included term.
|
||||
* </p>
|
||||
*
|
||||
* @param terms
|
||||
* @param maxSuggestions
|
||||
* @param ir
|
||||
* @param suggestMode
|
||||
* @return an array of words generated by combining original terms
|
||||
* @throws IOException
|
||||
*/
|
||||
public CombineSuggestion[] suggestWordCombinations(Term[] terms,
|
||||
int maxSuggestions, IndexReader ir, SuggestMode suggestMode)
|
||||
throws IOException {
|
||||
if (maxSuggestions < 1) {
|
||||
return new CombineSuggestion[0];
|
||||
}
|
||||
|
||||
int[] origFreqs = null;
|
||||
if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
|
||||
origFreqs = new int[terms.length];
|
||||
for (int i = 0; i < terms.length; i++) {
|
||||
origFreqs[i] = ir.docFreq(terms[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
|
||||
Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
|
||||
Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<CombineSuggestionWrapper>(
|
||||
queueInitialCapacity, queueComparator);
|
||||
|
||||
int thisTimeEvaluations = 0;
|
||||
BytesRef reuse = new BytesRef();
|
||||
for (int i = 0; i < terms.length - 1; i++) {
|
||||
if (terms[i].equals(SEPARATOR_TERM)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int byteLength = terms[i].bytes().length;
|
||||
if (byteLength > maxCombineWordLength) {
|
||||
continue;
|
||||
}
|
||||
|
||||
reuse.grow(byteLength);
|
||||
reuse.length = byteLength;
|
||||
System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
|
||||
reuse.bytes, 0, byteLength);
|
||||
|
||||
int maxFreq = 0;
|
||||
int minFreq = Integer.MAX_VALUE;
|
||||
if (origFreqs != null) {
|
||||
maxFreq = origFreqs[i];
|
||||
minFreq = origFreqs[i];
|
||||
}
|
||||
|
||||
for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
|
||||
if (terms[j].equals(SEPARATOR_TERM)) {
|
||||
break;
|
||||
}
|
||||
byteLength += terms[j].bytes().length;
|
||||
if (byteLength > maxCombineWordLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (origFreqs != null) {
|
||||
maxFreq = Math.max(maxFreq, origFreqs[j]);
|
||||
minFreq = Math.min(minFreq, origFreqs[j]);
|
||||
}
|
||||
|
||||
reuse.grow(byteLength);
|
||||
System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
|
||||
reuse.bytes, reuse.length, terms[j].bytes().length);
|
||||
reuse.length = byteLength;
|
||||
|
||||
Term combinedTerm = new Term(terms[0].field(), reuse);
|
||||
int combinedTermFreq = ir.docFreq(combinedTerm);
|
||||
|
||||
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
|
||||
|| combinedTermFreq >= maxFreq) {
|
||||
if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX
|
||||
|| minFreq == 0) {
|
||||
if (combinedTermFreq >= minSuggestionFrequency) {
|
||||
int[] origIndexes = new int[j - i + 1];
|
||||
origIndexes[0] = i;
|
||||
for (int k = 1; k < origIndexes.length; k++) {
|
||||
origIndexes[k] = i + k;
|
||||
}
|
||||
SuggestWord word = new SuggestWord();
|
||||
word.freq = combinedTermFreq;
|
||||
word.score = origIndexes.length - 1;
|
||||
word.string = combinedTerm.text();
|
||||
CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(
|
||||
new CombineSuggestion(word, origIndexes),
|
||||
(origIndexes.length - 1));
|
||||
suggestions.offer(suggestion);
|
||||
if (suggestions.size() > maxSuggestions) {
|
||||
suggestions.poll();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
thisTimeEvaluations++;
|
||||
if (thisTimeEvaluations == maxEvaluations) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions
|
||||
.size()];
|
||||
for (int i = suggestions.size() - 1; i >= 0; i--) {
|
||||
combineSuggestions[i] = suggestions.remove().combineSuggestion;
|
||||
}
|
||||
return combineSuggestions;
|
||||
}
|
||||
|
||||
private int generateBreakUpSuggestions(Term term, IndexReader ir,
|
||||
int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency,
|
||||
SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
|
||||
int totalEvaluations, BreakSuggestionSortMethod sortMethod)
|
||||
throws IOException {
|
||||
int termLength = term.bytes().length;
|
||||
int useMinBreakWordLength = minBreakWordLength;
|
||||
if (useMinBreakWordLength < 1) {
|
||||
useMinBreakWordLength = 1;
|
||||
}
|
||||
if (termLength <= (useMinBreakWordLength * 2)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int thisTimeEvaluations = 0;
|
||||
BytesRef termBytes = term.bytes().clone();
|
||||
for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
|
||||
SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
|
||||
.field());
|
||||
|
||||
if (leftWord.freq >= useMinSuggestionFrequency) {
|
||||
SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
|
||||
termLength - i, term.field());
|
||||
if (rightWord.freq >= useMinSuggestionFrequency) {
|
||||
SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
|
||||
newSuggestion(prefix, leftWord, rightWord));
|
||||
suggestions.offer(suggestion);
|
||||
if (suggestions.size() > maxSuggestions) {
|
||||
suggestions.poll();
|
||||
}
|
||||
}
|
||||
|
||||
int newNumberBreaks = numberBreaks + 1;
|
||||
if (newNumberBreaks <= maxChanges) {
|
||||
int evaluations = generateBreakUpSuggestions(new Term(term.field(),
|
||||
rightWord.string), ir, newNumberBreaks, maxSuggestions,
|
||||
useMinSuggestionFrequency, newPrefix(prefix, leftWord),
|
||||
suggestions, totalEvaluations, sortMethod);
|
||||
totalEvaluations += evaluations;
|
||||
}
|
||||
}
|
||||
thisTimeEvaluations++;
|
||||
totalEvaluations++;
|
||||
if (totalEvaluations >= maxEvaluations) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return thisTimeEvaluations;
|
||||
}
|
||||
|
||||
private SuggestWord[] newPrefix(SuggestWord[] oldPrefix, SuggestWord append) {
|
||||
SuggestWord[] newPrefix = new SuggestWord[oldPrefix.length + 1];
|
||||
System.arraycopy(oldPrefix, 0, newPrefix, 0, oldPrefix.length);
|
||||
newPrefix[newPrefix.length - 1] = append;
|
||||
return newPrefix;
|
||||
}
|
||||
|
||||
private SuggestWord[] newSuggestion(SuggestWord[] prefix,
|
||||
SuggestWord append1, SuggestWord append2) {
|
||||
SuggestWord[] newSuggestion = new SuggestWord[prefix.length + 2];
|
||||
int score = prefix.length + 1;
|
||||
for (int i = 0; i < prefix.length; i++) {
|
||||
SuggestWord word = new SuggestWord();
|
||||
word.string = prefix[i].string;
|
||||
word.freq = prefix[i].freq;
|
||||
word.score = score;
|
||||
newSuggestion[i] = word;
|
||||
}
|
||||
append1.score = score;
|
||||
append2.score = score;
|
||||
newSuggestion[newSuggestion.length - 2] = append1;
|
||||
newSuggestion[newSuggestion.length - 1] = append2;
|
||||
return newSuggestion;
|
||||
}
|
||||
|
||||
private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
|
||||
int offset, int length, String fieldname) throws IOException {
|
||||
bytes.offset = offset;
|
||||
bytes.length = length;
|
||||
Term term = new Term(fieldname, bytes);
|
||||
int freq = ir.docFreq(term);
|
||||
SuggestWord word = new SuggestWord();
|
||||
word.freq = freq;
|
||||
word.score = 1;
|
||||
word.string = term.text();
|
||||
return word;
|
||||
}
|
||||
|
||||
public int getMinSuggestionFrequency() {
|
||||
return minSuggestionFrequency;
|
||||
}
|
||||
|
||||
public int getMaxCombineWordLength() {
|
||||
return maxCombineWordLength;
|
||||
}
|
||||
|
||||
public int getMinBreakWordLength() {
|
||||
return minBreakWordLength;
|
||||
}
|
||||
|
||||
public int getMaxChanges() {
|
||||
return maxChanges;
|
||||
}
|
||||
|
||||
public int getMaxEvaluations() {
|
||||
return maxEvaluations;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* The minimum frequency a term must have to be included as part of a
|
||||
* suggestion. Default=1 Not applicable when used with
|
||||
* {@link SuggestMode#SUGGEST_MORE_POPULAR}
|
||||
* </p>
|
||||
*
|
||||
* @param minSuggestionFrequency
|
||||
*/
|
||||
public void setMinSuggestionFrequency(int minSuggestionFrequency) {
|
||||
this.minSuggestionFrequency = minSuggestionFrequency;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* The maximum length of a suggestion made by combining 1 or more original
|
||||
* terms. Default=20
|
||||
* </p>
|
||||
*
|
||||
* @param maxCombineWordLength
|
||||
*/
|
||||
public void setMaxCombineWordLength(int maxCombineWordLength) {
|
||||
this.maxCombineWordLength = maxCombineWordLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* The minimum length to break words down to. Default=1
|
||||
* </p>
|
||||
*
|
||||
* @param minBreakWordLength
|
||||
*/
|
||||
public void setMinBreakWordLength(int minBreakWordLength) {
|
||||
this.minBreakWordLength = minBreakWordLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* The maximum numbers of changes (word breaks or combinations) to make on the
|
||||
* original term(s). Default=1
|
||||
* </p>
|
||||
*
|
||||
* @param maxChanges
|
||||
*/
|
||||
public void setMaxChanges(int maxChanges) {
|
||||
this.maxChanges = maxChanges;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* The maximum number of word combinations to evaluate. Default=1000. A higher
|
||||
* value might improve result quality. A lower value might improve
|
||||
* performance.
|
||||
* </p>
|
||||
*
|
||||
* @param maxEvaluations
|
||||
*/
|
||||
public void setMaxEvaluations(int maxEvaluations) {
|
||||
this.maxEvaluations = maxEvaluations;
|
||||
}
|
||||
|
||||
private class LengthThenMaxFreqComparator implements
|
||||
Comparator<SuggestWordArrayWrapper> {
|
||||
@Override
|
||||
public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
|
||||
if (o1.suggestWords.length != o2.suggestWords.length) {
|
||||
return o2.suggestWords.length - o1.suggestWords.length;
|
||||
}
|
||||
if (o1.freqMax != o2.freqMax) {
|
||||
return o1.freqMax - o2.freqMax;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private class LengthThenSumFreqComparator implements
|
||||
Comparator<SuggestWordArrayWrapper> {
|
||||
@Override
|
||||
public int compare(SuggestWordArrayWrapper o1, SuggestWordArrayWrapper o2) {
|
||||
if (o1.suggestWords.length != o2.suggestWords.length) {
|
||||
return o2.suggestWords.length - o1.suggestWords.length;
|
||||
}
|
||||
if (o1.freqSum != o2.freqSum) {
|
||||
return o1.freqSum - o2.freqSum;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private class CombinationsThenFreqComparator implements
|
||||
Comparator<CombineSuggestionWrapper> {
|
||||
@Override
|
||||
public int compare(CombineSuggestionWrapper o1, CombineSuggestionWrapper o2) {
|
||||
if (o1.numCombinations != o2.numCombinations) {
|
||||
return o2.numCombinations - o1.numCombinations;
|
||||
}
|
||||
if (o1.combineSuggestion.suggestion.freq != o2.combineSuggestion.suggestion.freq) {
|
||||
return o1.combineSuggestion.suggestion.freq
|
||||
- o2.combineSuggestion.suggestion.freq;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private class SuggestWordArrayWrapper {
|
||||
final SuggestWord[] suggestWords;
|
||||
final int freqMax;
|
||||
final int freqSum;
|
||||
|
||||
SuggestWordArrayWrapper(SuggestWord[] suggestWords) {
|
||||
this.suggestWords = suggestWords;
|
||||
int aFreqSum = 0;
|
||||
int aFreqMax = 0;
|
||||
for (SuggestWord sw : suggestWords) {
|
||||
aFreqSum += sw.freq;
|
||||
aFreqMax = Math.max(aFreqMax, sw.freq);
|
||||
}
|
||||
this.freqSum = aFreqSum;
|
||||
this.freqMax = aFreqMax;
|
||||
}
|
||||
}
|
||||
|
||||
private class CombineSuggestionWrapper {
|
||||
final CombineSuggestion combineSuggestion;
|
||||
final int numCombinations;
|
||||
|
||||
CombineSuggestionWrapper(CombineSuggestion combineSuggestion,
|
||||
int numCombinations) {
|
||||
this.combineSuggestion = combineSuggestion;
|
||||
this.numCombinations = numCombinations;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,234 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortMethod;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.English;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestWordBreakSpellChecker extends LuceneTestCase {
|
||||
private Directory dir = null;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true));
|
||||
|
||||
for (int i = 900; i < 1112; i++) {
|
||||
Document doc = new Document();
|
||||
String num = English.intToEnglish(i).replaceAll("[-]", " ").replaceAll("[,]", "");
|
||||
doc.add(newField("numbers", num, TextField.TYPE_UNSTORED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
{
|
||||
Document doc = new Document();
|
||||
doc.add(newField("numbers", "thou hast sand betwixt thy toes", TextField.TYPE_UNSTORED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
{
|
||||
Document doc = new Document();
|
||||
doc.add(newField("numbers", "hundredeight eightyeight yeight", TextField.TYPE_UNSTORED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
{
|
||||
Document doc = new Document();
|
||||
doc.add(newField("numbers", "tres y cinco", TextField.TYPE_UNSTORED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
writer.commit();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
if(dir!=null) {
|
||||
dir.close();
|
||||
dir = null;
|
||||
}
|
||||
super.tearDown();
|
||||
}
|
||||
public void testCombiningWords() throws Exception {
|
||||
IndexReader ir = null;
|
||||
try {
|
||||
ir = DirectoryReader.open(dir);
|
||||
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
|
||||
|
||||
{
|
||||
Term[] terms = {
|
||||
new Term("numbers", "one"),
|
||||
new Term("numbers", "hun"),
|
||||
new Term("numbers", "dred"),
|
||||
new Term("numbers", "eight"),
|
||||
new Term("numbers", "y"),
|
||||
new Term("numbers", "eight"),
|
||||
};
|
||||
wbsp.setMaxChanges(3);
|
||||
wbsp.setMaxCombineWordLength(20);
|
||||
wbsp.setMinSuggestionFrequency(1);
|
||||
CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, 10, ir, SuggestMode.SUGGEST_ALWAYS);
|
||||
Assert.assertTrue(cs.length==5);
|
||||
|
||||
Assert.assertTrue(cs[0].originalTermIndexes.length==2);
|
||||
Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
|
||||
Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
|
||||
Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
|
||||
Assert.assertTrue(cs[0].suggestion.score==1);
|
||||
|
||||
Assert.assertTrue(cs[1].originalTermIndexes.length==2);
|
||||
Assert.assertTrue(cs[1].originalTermIndexes[0]==3);
|
||||
Assert.assertTrue(cs[1].originalTermIndexes[1]==4);
|
||||
Assert.assertTrue(cs[1].suggestion.string.equals("eighty"));
|
||||
Assert.assertTrue(cs[1].suggestion.score==1);
|
||||
|
||||
Assert.assertTrue(cs[2].originalTermIndexes.length==2);
|
||||
Assert.assertTrue(cs[2].originalTermIndexes[0]==4);
|
||||
Assert.assertTrue(cs[2].originalTermIndexes[1]==5);
|
||||
Assert.assertTrue(cs[2].suggestion.string.equals("yeight"));
|
||||
Assert.assertTrue(cs[2].suggestion.score==1);
|
||||
|
||||
for(int i=3 ; i<5 ; i++) {
|
||||
Assert.assertTrue(cs[i].originalTermIndexes.length==3);
|
||||
Assert.assertTrue(cs[i].suggestion.score==2);
|
||||
Assert.assertTrue(
|
||||
(cs[i].originalTermIndexes[0]==1 &&
|
||||
cs[i].originalTermIndexes[1]==2 &&
|
||||
cs[i].originalTermIndexes[2]==3 &&
|
||||
cs[i].suggestion.string.equals("hundredeight")) ||
|
||||
(cs[i].originalTermIndexes[0]==3 &&
|
||||
cs[i].originalTermIndexes[1]==4 &&
|
||||
cs[i].originalTermIndexes[2]==5 &&
|
||||
cs[i].suggestion.string.equals("eightyeight"))
|
||||
);
|
||||
}
|
||||
|
||||
cs = wbsp.suggestWordCombinations(terms, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||
Assert.assertTrue(cs.length==2);
|
||||
Assert.assertTrue(cs[0].originalTermIndexes.length==2);
|
||||
Assert.assertTrue(cs[0].suggestion.score==1);
|
||||
Assert.assertTrue(cs[0].originalTermIndexes[0]==1);
|
||||
Assert.assertTrue(cs[0].originalTermIndexes[1]==2);
|
||||
Assert.assertTrue(cs[0].suggestion.string.equals("hundred"));
|
||||
Assert.assertTrue(cs[0].suggestion.score==1);
|
||||
|
||||
Assert.assertTrue(cs[1].originalTermIndexes.length==3);
|
||||
Assert.assertTrue(cs[1].suggestion.score==2);
|
||||
Assert.assertTrue(cs[1].originalTermIndexes[0] == 1);
|
||||
Assert.assertTrue(cs[1].originalTermIndexes[1] == 2);
|
||||
Assert.assertTrue(cs[1].originalTermIndexes[2] == 3);
|
||||
Assert.assertTrue(cs[1].suggestion.string.equals("hundredeight"));
|
||||
}
|
||||
} catch(Exception e) {
|
||||
throw e;
|
||||
} finally {
|
||||
try { ir.close(); } catch(Exception e1) { }
|
||||
}
|
||||
}
|
||||
|
||||
public void testBreakingWords() throws Exception {
|
||||
IndexReader ir = null;
|
||||
try {
|
||||
ir = DirectoryReader.open(dir);
|
||||
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
|
||||
|
||||
{
|
||||
Term term = new Term("numbers", "ninetynine");
|
||||
wbsp.setMaxChanges(1);
|
||||
wbsp.setMinBreakWordLength(1);
|
||||
wbsp.setMinSuggestionFrequency(1);
|
||||
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==1);
|
||||
Assert.assertTrue(sw[0].length==2);
|
||||
Assert.assertTrue(sw[0][0].string.equals("ninety"));
|
||||
Assert.assertTrue(sw[0][1].string.equals("nine"));
|
||||
Assert.assertTrue(sw[0][0].score == 1);
|
||||
Assert.assertTrue(sw[0][1].score == 1);
|
||||
}
|
||||
{
|
||||
Term term = new Term("numbers", "onethousand");
|
||||
wbsp.setMaxChanges(1);
|
||||
wbsp.setMinBreakWordLength(1);
|
||||
wbsp.setMinSuggestionFrequency(1);
|
||||
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==1);
|
||||
Assert.assertTrue(sw[0].length==2);
|
||||
Assert.assertTrue(sw[0][0].string.equals("one"));
|
||||
Assert.assertTrue(sw[0][1].string.equals("thousand"));
|
||||
Assert.assertTrue(sw[0][0].score == 1);
|
||||
Assert.assertTrue(sw[0][1].score == 1);
|
||||
|
||||
wbsp.setMaxChanges(2);
|
||||
wbsp.setMinSuggestionFrequency(1);
|
||||
sw = wbsp.suggestWordBreaks(term, 1, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==1);
|
||||
Assert.assertTrue(sw[0].length==2);
|
||||
|
||||
wbsp.setMaxChanges(2);
|
||||
wbsp.setMinSuggestionFrequency(2);
|
||||
sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==1);
|
||||
Assert.assertTrue(sw[0].length==2);
|
||||
|
||||
wbsp.setMaxChanges(2);
|
||||
wbsp.setMinSuggestionFrequency(1);
|
||||
sw = wbsp.suggestWordBreaks(term, 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==2);
|
||||
Assert.assertTrue(sw[0].length==2);
|
||||
Assert.assertTrue(sw[0][0].string.equals("one"));
|
||||
Assert.assertTrue(sw[0][1].string.equals("thousand"));
|
||||
Assert.assertTrue(sw[0][0].score == 1);
|
||||
Assert.assertTrue(sw[0][1].score == 1);
|
||||
Assert.assertTrue(sw[0][1].freq>1);
|
||||
Assert.assertTrue(sw[0][0].freq>sw[0][1].freq);
|
||||
Assert.assertTrue(sw[1].length==3);
|
||||
Assert.assertTrue(sw[1][0].string.equals("one"));
|
||||
Assert.assertTrue(sw[1][1].string.equals("thou"));
|
||||
Assert.assertTrue(sw[1][2].string.equals("sand"));
|
||||
Assert.assertTrue(sw[1][0].score == 2);
|
||||
Assert.assertTrue(sw[1][1].score == 2);
|
||||
Assert.assertTrue(sw[1][2].score == 2);
|
||||
Assert.assertTrue(sw[1][0].freq>1);
|
||||
Assert.assertTrue(sw[1][1].freq==1);
|
||||
Assert.assertTrue(sw[1][2].freq==1);
|
||||
}
|
||||
{
|
||||
Term term = new Term("numbers", "onethousandonehundredeleven");
|
||||
wbsp.setMaxChanges(3);
|
||||
wbsp.setMinBreakWordLength(1);
|
||||
wbsp.setMinSuggestionFrequency(1);
|
||||
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==0);
|
||||
|
||||
wbsp.setMaxChanges(4);
|
||||
sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==1);
|
||||
Assert.assertTrue(sw[0].length==5);
|
||||
|
||||
wbsp.setMaxChanges(5);
|
||||
sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==2);
|
||||
Assert.assertTrue(sw[0].length==5);
|
||||
Assert.assertTrue(sw[0][1].string.equals("thousand"));
|
||||
Assert.assertTrue(sw[1].length==6);
|
||||
Assert.assertTrue(sw[1][1].string.equals("thou"));
|
||||
Assert.assertTrue(sw[1][2].string.equals("sand"));
|
||||
}
|
||||
|
||||
} catch(Exception e) {
|
||||
throw e;
|
||||
} finally {
|
||||
try { ir.close(); } catch(Exception e1) { }
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue