LUCENE-4587: fix WordBreakSpellChecker to handle non-latin characters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418437 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
James Dyer 2012-12-07 18:54:57 +00:00
parent 10bb87d0bf
commit 927531255d
3 changed files with 160 additions and 44 deletions

View File

@ -219,9 +219,13 @@ Bug Fixes
In addition, it wasn't possible to index a shape representing the entire In addition, it wasn't possible to index a shape representing the entire
globe. globe.
* LUCENE--4595: EnwikiContentSource had a thread safety problem (NPE) in * LUCENE-4595: EnwikiContentSource had a thread safety problem (NPE) in
'forever' mode (Doron Cohen) 'forever' mode (Doron Cohen)
* LUCENE-4587: fix WordBreakSpellChecker to not throw AIOOBE when presented
with 2-char codepoints, and to correctly break/combine terms containing
non-latin characters. (James Dyer, Andreas Hubold)
Optimizations Optimizations
* LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and * LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and

View File

@ -180,35 +180,32 @@ public class WordBreakSpellChecker {
queueInitialCapacity, queueComparator); queueInitialCapacity, queueComparator);
int thisTimeEvaluations = 0; int thisTimeEvaluations = 0;
BytesRef reuse = new BytesRef();
for (int i = 0; i < terms.length - 1; i++) { for (int i = 0; i < terms.length - 1; i++) {
if (terms[i].equals(SEPARATOR_TERM)) { if (terms[i].equals(SEPARATOR_TERM)) {
continue; continue;
} }
String leftTermText = terms[i].text();
int byteLength = terms[i].bytes().length; int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
if (byteLength > maxCombineWordLength) { if (leftTermLength > maxCombineWordLength) {
continue; continue;
} }
reuse.grow(byteLength);
reuse.length = byteLength;
System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
reuse.bytes, 0, byteLength);
int maxFreq = 0; int maxFreq = 0;
int minFreq = Integer.MAX_VALUE; int minFreq = Integer.MAX_VALUE;
if (origFreqs != null) { if (origFreqs != null) {
maxFreq = origFreqs[i]; maxFreq = origFreqs[i];
minFreq = origFreqs[i]; minFreq = origFreqs[i];
} }
String combinedTermText = leftTermText;
int combinedLength = leftTermLength;
for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) { for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
if (terms[j].equals(SEPARATOR_TERM)) { if (terms[j].equals(SEPARATOR_TERM)) {
break; break;
} }
byteLength += terms[j].bytes().length; String rightTermText = terms[j].text();
if (byteLength > maxCombineWordLength) { int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
combinedTermText += rightTermText;
combinedLength +=rightTermLength;
if (combinedLength > maxCombineWordLength) {
break; break;
} }
@ -216,13 +213,8 @@ public class WordBreakSpellChecker {
maxFreq = Math.max(maxFreq, origFreqs[j]); maxFreq = Math.max(maxFreq, origFreqs[j]);
minFreq = Math.min(minFreq, origFreqs[j]); minFreq = Math.min(minFreq, origFreqs[j]);
} }
reuse.grow(byteLength); Term combinedTerm = new Term(terms[0].field(), combinedTermText);
System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
reuse.bytes, reuse.length, terms[j].bytes().length);
reuse.length = byteLength;
Term combinedTerm = new Term(terms[0].field(), reuse);
int combinedTermFreq = ir.docFreq(combinedTerm); int combinedTermFreq = ir.docFreq(combinedTerm);
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
@ -268,24 +260,25 @@ public class WordBreakSpellChecker {
SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions, SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
int totalEvaluations, BreakSuggestionSortMethod sortMethod) int totalEvaluations, BreakSuggestionSortMethod sortMethod)
throws IOException { throws IOException {
int termLength = term.bytes().length; String termText = term.text();
int termLength = termText.codePointCount(0, termText.length());
int useMinBreakWordLength = minBreakWordLength; int useMinBreakWordLength = minBreakWordLength;
if (useMinBreakWordLength < 1) { if (useMinBreakWordLength < 1) {
useMinBreakWordLength = 1; useMinBreakWordLength = 1;
} }
if (termLength <= (useMinBreakWordLength * 2)) { if (termLength < (useMinBreakWordLength * 2)) {
return 0; return 0;
} }
int thisTimeEvaluations = 0; int thisTimeEvaluations = 0;
BytesRef termBytes = term.bytes().clone(); for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) { int end = termText.offsetByCodePoints(0, i);
SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term String leftText = termText.substring(0, end);
.field()); String rightText = termText.substring(end);
SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
if (leftWord.freq >= useMinSuggestionFrequency) { if (leftWord.freq >= useMinSuggestionFrequency) {
SuggestWord rightWord = generateSuggestWord(ir, termBytes, i, SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
termLength - i, term.field());
if (rightWord.freq >= useMinSuggestionFrequency) { if (rightWord.freq >= useMinSuggestionFrequency) {
SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper( SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
newSuggestion(prefix, leftWord, rightWord)); newSuggestion(prefix, leftWord, rightWord));
@ -293,8 +286,7 @@ public class WordBreakSpellChecker {
if (suggestions.size() > maxSuggestions) { if (suggestions.size() > maxSuggestions) {
suggestions.poll(); suggestions.poll();
} }
} }
int newNumberBreaks = numberBreaks + 1; int newNumberBreaks = numberBreaks + 1;
if (newNumberBreaks <= maxChanges) { if (newNumberBreaks <= maxChanges) {
int evaluations = generateBreakUpSuggestions(new Term(term.field(), int evaluations = generateBreakUpSuggestions(new Term(term.field(),
@ -304,6 +296,7 @@ public class WordBreakSpellChecker {
totalEvaluations += evaluations; totalEvaluations += evaluations;
} }
} }
thisTimeEvaluations++; thisTimeEvaluations++;
totalEvaluations++; totalEvaluations++;
if (totalEvaluations >= maxEvaluations) { if (totalEvaluations >= maxEvaluations) {
@ -338,16 +331,13 @@ public class WordBreakSpellChecker {
return newSuggestion; return newSuggestion;
} }
private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes, private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
int offset, int length, String fieldname) throws IOException { Term term = new Term(fieldname, text);
bytes.offset = offset;
bytes.length = length;
Term term = new Term(fieldname, bytes);
int freq = ir.docFreq(term); int freq = ir.docFreq(term);
SuggestWord word = new SuggestWord(); SuggestWord word = new SuggestWord();
word.freq = freq; word.freq = freq;
word.score = 1; word.score = 1;
word.string = term.text(); word.string = text;
return word; return word;
} }

View File

@ -17,6 +17,10 @@ package org.apache.lucene.search.spell;
* limitations under the License. * limitations under the License.
*/ */
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import junit.framework.Assert; import junit.framework.Assert;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
@ -31,6 +35,7 @@ import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortM
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English; import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class TestWordBreakSpellChecker extends LuceneTestCase { public class TestWordBreakSpellChecker extends LuceneTestCase {
private Directory dir = null; private Directory dir = null;
@ -151,8 +156,7 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
} finally { } finally {
try { ir.close(); } catch(Exception e1) { } try { ir.close(); } catch(Exception e1) { }
} }
} }
public void testBreakingWords() throws Exception { public void testBreakingWords() throws Exception {
IndexReader ir = null; IndexReader ir = null;
try { try {
@ -241,6 +245,15 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
Assert.assertTrue(sw[1][1].string.equals("thou")); Assert.assertTrue(sw[1][1].string.equals("thou"));
Assert.assertTrue(sw[1][2].string.equals("sand")); Assert.assertTrue(sw[1][2].string.equals("sand"));
} }
{
//make sure we can handle 2-char codepoints
Term term = new Term("numbers", "\uD864\uDC79");
wbsp.setMaxChanges(1);
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Assert.assertTrue(sw.length==0);
}
} catch(Exception e) { } catch(Exception e) {
throw e; throw e;
@ -248,4 +261,113 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
try { ir.close(); } catch(Exception e1) { } try { ir.close(); } catch(Exception e1) { }
} }
} }
/**
 * Randomized round-trip test for word breaking and word combining.
 * <p>
 * Indexes a random number of random strings (either "simple" or full-Unicode),
 * each accepted by {@code goodTestString}. Every original string is split in two
 * at a random code point boundary: the two halves are indexed space-separated
 * in the "random_break" field and the unsplit original is indexed in the
 * "random_combine" field. The test then asserts that
 * <ul>
 * <li>{@code suggestWordBreaks} on the original returns the indexed split, and</li>
 * <li>{@code suggestWordCombinations} on the two halves returns the original.</li>
 * </ul>
 * Splitting is done with code point offsets so that a surrogate pair is never
 * cut in half.
 *
 * @throws Exception if indexing, searching or an assertion fails
 */
public void testRandom() throws Exception {
  int numDocs = _TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER),
      (100 * RANDOM_MULTIPLIER));
  Directory dir = null;
  RandomIndexWriter writer = null;
  IndexReader ir = null;
  try {
    dir = newDirectory();
    writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(),
        MockTokenizer.WHITESPACE, false));
    int maxLength = _TestUtil.nextInt(random(), 5, 50);
    List<String> originals = new ArrayList<String>(numDocs);
    List<String[]> breaks = new ArrayList<String[]>(numDocs);
    for (int i = 0; i < numDocs; i++) {
      // Keep drawing until the generator yields a usable string; the empty
      // initial value is never "good", so at least one draw always happens.
      String orig = "";
      if (random().nextBoolean()) {
        while (!goodTestString(orig)) {
          orig = _TestUtil.randomSimpleString(random(), maxLength);
        }
      } else {
        while (!goodTestString(orig)) {
          orig = _TestUtil.randomUnicodeString(random(), maxLength);
        }
      }
      originals.add(orig);

      // goodTestString guarantees at least 2 code points, so the range
      // [1, totalLength - 1] is never empty and both halves are non-empty.
      int totalLength = orig.codePointCount(0, orig.length());
      int breakAt = orig.offsetByCodePoints(0,
          _TestUtil.nextInt(random(), 1, totalLength - 1));
      String[] broken = new String[2];
      broken[0] = orig.substring(0, breakAt);
      broken[1] = orig.substring(breakAt);
      breaks.add(broken);

      Document doc = new Document();
      doc.add(newTextField("random_break", broken[0] + " " + broken[1],
          Field.Store.NO));
      doc.add(newTextField("random_combine", orig, Field.Store.NO));
      writer.addDocument(doc);
    }
    writer.commit();
    writer.close();
    // Already closed successfully: clear the reference so the cleanup in the
    // finally block does not close the writer a second time.
    writer = null;

    ir = DirectoryReader.open(dir);
    WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
    wbsp.setMaxChanges(1);
    wbsp.setMinBreakWordLength(1);
    wbsp.setMinSuggestionFrequency(1);
    wbsp.setMaxCombineWordLength(maxLength);
    for (int i = 0; i < originals.size(); i++) {
      String orig = originals.get(i);
      String left = breaks.get(i)[0];
      String right = breaks.get(i)[1];
      {
        // Breaking the original must offer the exact split that was indexed.
        Term term = new Term("random_break", orig);
        SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(),
            ir, SuggestMode.SUGGEST_ALWAYS,
            BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
        boolean found = false;
        for (SuggestWord[] sw1 : sw) {
          Assert.assertEquals(2, sw1.length);
          if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
            found = true;
          }
        }
        Assert.assertTrue("Failed getting break suggestions\n >Original: "
            + orig + "\n >Left: " + left + "\n >Right: " + right, found);
      }
      {
        // Combining the two halves must offer the original that was indexed.
        Term[] terms = {new Term("random_combine", left),
            new Term("random_combine", right)};
        CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms,
            originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
        boolean found = false;
        for (CombineSuggestion cs1 : cs) {
          Assert.assertEquals(2, cs1.originalTermIndexes.length);
          if (cs1.suggestion.string.equals(left + right)) {
            found = true;
          }
        }
        Assert.assertTrue("Failed getting combine suggestions\n >Original: "
            + orig + "\n >Left: " + left + "\n >Right: " + right, found);
      }
    }
  } finally {
    // Best-effort cleanup: a failure while closing is deliberately ignored so
    // that it cannot mask the exception or assertion error that got us here.
    if (ir != null) {
      try {
        ir.close();
      } catch (Exception ignored) {
        // intentionally ignored, see above
      }
    }
    if (writer != null) {
      try {
        writer.close();
      } catch (Exception ignored) {
        // intentionally ignored, see above
      }
    }
    if (dir != null) {
      try {
        dir.close();
      } catch (Exception ignored) {
        // intentionally ignored, see above
      }
    }
  }
}
// Matches a single space, tab, carriage return or line feed anywhere in the
// input. Used by goodTestString to reject random strings containing any of
// these characters. NOTE(review): judging by the name this is meant to mirror
// the characters MockTokenizer.WHITESPACE splits on -- confirm against
// MockTokenizer if that definition ever changes.
private static final Pattern mockTokenizerWhitespacePattern = Pattern
.compile("[ \\t\\r\\n]");
/**
 * Tells whether a randomly generated string is usable as test input.
 * <p>
 * A string is rejected when it has fewer than two code points (it could not be
 * split into two non-empty halves) or when it contains a character matched by
 * {@code mockTokenizerWhitespacePattern}.
 *
 * @param s the candidate string; must not be null
 * @return {@code true} if {@code s} has at least two code points and none of
 *         the pattern's whitespace characters, {@code false} otherwise
 */
private boolean goodTestString(String s) {
  final int codePoints = s.codePointCount(0, s.length());
  if (codePoints < 2) {
    return false;
  }
  return !mockTokenizerWhitespacePattern.matcher(s).find();
}
} }