mirror of https://github.com/apache/lucene.git
LUCENE-4587: fix WordBreakSpellChecker to handle non-latin characters
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418437 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
10bb87d0bf
commit
927531255d
|
@ -219,9 +219,13 @@ Bug Fixes
|
||||||
In addition, it wasn't possible to index a shape representing the entire
|
In addition, it wasn't possible to index a shape representing the entire
|
||||||
globe.
|
globe.
|
||||||
|
|
||||||
* LUCENE-4595: EnwikiContentSource had a thread safety problem (NPE) in
|
* LUCENE-4595: EnwikiContentSource had a thread safety problem (NPE) in
|
||||||
'forever' mode (Doron Cohen)
|
'forever' mode (Doron Cohen)
|
||||||
|
|
||||||
|
* LUCENE-4587: fix WordBreakSpellChecker to not throw AIOOBE when presented
|
||||||
|
with 2-char codepoints, and to correctly break/combine terms containing
|
||||||
|
non-latin characters. (James Dyer, Andreas Hubold)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and
|
* LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and
|
||||||
|
|
|
@ -180,35 +180,32 @@ public class WordBreakSpellChecker {
|
||||||
queueInitialCapacity, queueComparator);
|
queueInitialCapacity, queueComparator);
|
||||||
|
|
||||||
int thisTimeEvaluations = 0;
|
int thisTimeEvaluations = 0;
|
||||||
BytesRef reuse = new BytesRef();
|
|
||||||
for (int i = 0; i < terms.length - 1; i++) {
|
for (int i = 0; i < terms.length - 1; i++) {
|
||||||
if (terms[i].equals(SEPARATOR_TERM)) {
|
if (terms[i].equals(SEPARATOR_TERM)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
String leftTermText = terms[i].text();
|
||||||
int byteLength = terms[i].bytes().length;
|
int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
|
||||||
if (byteLength > maxCombineWordLength) {
|
if (leftTermLength > maxCombineWordLength) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
reuse.grow(byteLength);
|
|
||||||
reuse.length = byteLength;
|
|
||||||
System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
|
|
||||||
reuse.bytes, 0, byteLength);
|
|
||||||
|
|
||||||
int maxFreq = 0;
|
int maxFreq = 0;
|
||||||
int minFreq = Integer.MAX_VALUE;
|
int minFreq = Integer.MAX_VALUE;
|
||||||
if (origFreqs != null) {
|
if (origFreqs != null) {
|
||||||
maxFreq = origFreqs[i];
|
maxFreq = origFreqs[i];
|
||||||
minFreq = origFreqs[i];
|
minFreq = origFreqs[i];
|
||||||
}
|
}
|
||||||
|
String combinedTermText = leftTermText;
|
||||||
|
int combinedLength = leftTermLength;
|
||||||
for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
|
for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
|
||||||
if (terms[j].equals(SEPARATOR_TERM)) {
|
if (terms[j].equals(SEPARATOR_TERM)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
byteLength += terms[j].bytes().length;
|
String rightTermText = terms[j].text();
|
||||||
if (byteLength > maxCombineWordLength) {
|
int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
|
||||||
|
combinedTermText += rightTermText;
|
||||||
|
combinedLength +=rightTermLength;
|
||||||
|
if (combinedLength > maxCombineWordLength) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -217,12 +214,7 @@ public class WordBreakSpellChecker {
|
||||||
minFreq = Math.min(minFreq, origFreqs[j]);
|
minFreq = Math.min(minFreq, origFreqs[j]);
|
||||||
}
|
}
|
||||||
|
|
||||||
reuse.grow(byteLength);
|
Term combinedTerm = new Term(terms[0].field(), combinedTermText);
|
||||||
System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
|
|
||||||
reuse.bytes, reuse.length, terms[j].bytes().length);
|
|
||||||
reuse.length = byteLength;
|
|
||||||
|
|
||||||
Term combinedTerm = new Term(terms[0].field(), reuse);
|
|
||||||
int combinedTermFreq = ir.docFreq(combinedTerm);
|
int combinedTermFreq = ir.docFreq(combinedTerm);
|
||||||
|
|
||||||
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
|
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
|
||||||
|
@ -268,24 +260,25 @@ public class WordBreakSpellChecker {
|
||||||
SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
|
SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
|
||||||
int totalEvaluations, BreakSuggestionSortMethod sortMethod)
|
int totalEvaluations, BreakSuggestionSortMethod sortMethod)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
int termLength = term.bytes().length;
|
String termText = term.text();
|
||||||
|
int termLength = termText.codePointCount(0, termText.length());
|
||||||
int useMinBreakWordLength = minBreakWordLength;
|
int useMinBreakWordLength = minBreakWordLength;
|
||||||
if (useMinBreakWordLength < 1) {
|
if (useMinBreakWordLength < 1) {
|
||||||
useMinBreakWordLength = 1;
|
useMinBreakWordLength = 1;
|
||||||
}
|
}
|
||||||
if (termLength <= (useMinBreakWordLength * 2)) {
|
if (termLength < (useMinBreakWordLength * 2)) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int thisTimeEvaluations = 0;
|
int thisTimeEvaluations = 0;
|
||||||
BytesRef termBytes = term.bytes().clone();
|
for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
|
||||||
for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
|
int end = termText.offsetByCodePoints(0, i);
|
||||||
SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
|
String leftText = termText.substring(0, end);
|
||||||
.field());
|
String rightText = termText.substring(end);
|
||||||
|
SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
|
||||||
|
|
||||||
if (leftWord.freq >= useMinSuggestionFrequency) {
|
if (leftWord.freq >= useMinSuggestionFrequency) {
|
||||||
SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
|
SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
|
||||||
termLength - i, term.field());
|
|
||||||
if (rightWord.freq >= useMinSuggestionFrequency) {
|
if (rightWord.freq >= useMinSuggestionFrequency) {
|
||||||
SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
|
SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
|
||||||
newSuggestion(prefix, leftWord, rightWord));
|
newSuggestion(prefix, leftWord, rightWord));
|
||||||
|
@ -294,7 +287,6 @@ public class WordBreakSpellChecker {
|
||||||
suggestions.poll();
|
suggestions.poll();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int newNumberBreaks = numberBreaks + 1;
|
int newNumberBreaks = numberBreaks + 1;
|
||||||
if (newNumberBreaks <= maxChanges) {
|
if (newNumberBreaks <= maxChanges) {
|
||||||
int evaluations = generateBreakUpSuggestions(new Term(term.field(),
|
int evaluations = generateBreakUpSuggestions(new Term(term.field(),
|
||||||
|
@ -304,6 +296,7 @@ public class WordBreakSpellChecker {
|
||||||
totalEvaluations += evaluations;
|
totalEvaluations += evaluations;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
thisTimeEvaluations++;
|
thisTimeEvaluations++;
|
||||||
totalEvaluations++;
|
totalEvaluations++;
|
||||||
if (totalEvaluations >= maxEvaluations) {
|
if (totalEvaluations >= maxEvaluations) {
|
||||||
|
@ -338,16 +331,13 @@ public class WordBreakSpellChecker {
|
||||||
return newSuggestion;
|
return newSuggestion;
|
||||||
}
|
}
|
||||||
|
|
||||||
private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
|
private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
|
||||||
int offset, int length, String fieldname) throws IOException {
|
Term term = new Term(fieldname, text);
|
||||||
bytes.offset = offset;
|
|
||||||
bytes.length = length;
|
|
||||||
Term term = new Term(fieldname, bytes);
|
|
||||||
int freq = ir.docFreq(term);
|
int freq = ir.docFreq(term);
|
||||||
SuggestWord word = new SuggestWord();
|
SuggestWord word = new SuggestWord();
|
||||||
word.freq = freq;
|
word.freq = freq;
|
||||||
word.score = 1;
|
word.score = 1;
|
||||||
word.string = term.text();
|
word.string = text;
|
||||||
return word;
|
return word;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,10 @@ package org.apache.lucene.search.spell;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import junit.framework.Assert;
|
import junit.framework.Assert;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
|
@ -31,6 +35,7 @@ import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortM
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.English;
|
import org.apache.lucene.util.English;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
public class TestWordBreakSpellChecker extends LuceneTestCase {
|
public class TestWordBreakSpellChecker extends LuceneTestCase {
|
||||||
private Directory dir = null;
|
private Directory dir = null;
|
||||||
|
@ -152,7 +157,6 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
|
||||||
try { ir.close(); } catch(Exception e1) { }
|
try { ir.close(); } catch(Exception e1) { }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBreakingWords() throws Exception {
|
public void testBreakingWords() throws Exception {
|
||||||
IndexReader ir = null;
|
IndexReader ir = null;
|
||||||
try {
|
try {
|
||||||
|
@ -241,6 +245,15 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
|
||||||
Assert.assertTrue(sw[1][1].string.equals("thou"));
|
Assert.assertTrue(sw[1][1].string.equals("thou"));
|
||||||
Assert.assertTrue(sw[1][2].string.equals("sand"));
|
Assert.assertTrue(sw[1][2].string.equals("sand"));
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
//make sure we can handle 2-char codepoints
|
||||||
|
Term term = new Term("numbers", "\uD864\uDC79");
|
||||||
|
wbsp.setMaxChanges(1);
|
||||||
|
wbsp.setMinBreakWordLength(1);
|
||||||
|
wbsp.setMinSuggestionFrequency(1);
|
||||||
|
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
Assert.assertTrue(sw.length==0);
|
||||||
|
}
|
||||||
|
|
||||||
} catch(Exception e) {
|
} catch(Exception e) {
|
||||||
throw e;
|
throw e;
|
||||||
|
@ -248,4 +261,113 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
|
||||||
try { ir.close(); } catch(Exception e1) { }
|
try { ir.close(); } catch(Exception e1) { }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
public void testRandom() throws Exception {
|
||||||
|
int numDocs = _TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER),
|
||||||
|
(100 * RANDOM_MULTIPLIER));
|
||||||
|
Directory dir = null;
|
||||||
|
RandomIndexWriter writer = null;
|
||||||
|
IndexReader ir = null;
|
||||||
|
try {
|
||||||
|
dir = newDirectory();
|
||||||
|
writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(),
|
||||||
|
MockTokenizer.WHITESPACE, false));
|
||||||
|
int maxLength = _TestUtil.nextInt(random(), 5, 50);
|
||||||
|
List<String> originals = new ArrayList<String>(numDocs);
|
||||||
|
List<String[]> breaks = new ArrayList<String[]>(numDocs);
|
||||||
|
for (int i = 0; i < numDocs; i++) {
|
||||||
|
String orig = "";
|
||||||
|
if (random().nextBoolean()) {
|
||||||
|
while (!goodTestString(orig)) {
|
||||||
|
orig = _TestUtil.randomSimpleString(random(), maxLength);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
while (!goodTestString(orig)) {
|
||||||
|
orig = _TestUtil.randomUnicodeString(random(), maxLength);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
originals.add(orig);
|
||||||
|
int totalLength = orig.codePointCount(0, orig.length());
|
||||||
|
int breakAt = orig.offsetByCodePoints(0,
|
||||||
|
_TestUtil.nextInt(random(), 1, totalLength - 1));
|
||||||
|
String[] broken = new String[2];
|
||||||
|
broken[0] = orig.substring(0, breakAt);
|
||||||
|
broken[1] = orig.substring(breakAt);
|
||||||
|
breaks.add(broken);
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("random_break", broken[0] + " " + broken[1],
|
||||||
|
Field.Store.NO));
|
||||||
|
doc.add(newTextField("random_combine", orig, Field.Store.NO));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
writer.commit();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
ir = DirectoryReader.open(dir);
|
||||||
|
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
|
||||||
|
wbsp.setMaxChanges(1);
|
||||||
|
wbsp.setMinBreakWordLength(1);
|
||||||
|
wbsp.setMinSuggestionFrequency(1);
|
||||||
|
wbsp.setMaxCombineWordLength(maxLength);
|
||||||
|
for (int i = 0; i < originals.size(); i++) {
|
||||||
|
String orig = originals.get(i);
|
||||||
|
String left = breaks.get(i)[0];
|
||||||
|
String right = breaks.get(i)[1];
|
||||||
|
{
|
||||||
|
Term term = new Term("random_break", orig);
|
||||||
|
|
||||||
|
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(),
|
||||||
|
ir, SuggestMode.SUGGEST_ALWAYS,
|
||||||
|
BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||||
|
boolean failed = true;
|
||||||
|
for (SuggestWord[] sw1 : sw) {
|
||||||
|
Assert.assertTrue(sw1.length == 2);
|
||||||
|
if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
|
||||||
|
failed = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Assert.assertFalse("Failed getting break suggestions\n >Original: "
|
||||||
|
+ orig + "\n >Left: " + left + "\n >Right: " + right, failed);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Term[] terms = {new Term("random_combine", left),
|
||||||
|
new Term("random_combine", right)};
|
||||||
|
CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms,
|
||||||
|
originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
|
||||||
|
boolean failed = true;
|
||||||
|
for (CombineSuggestion cs1 : cs) {
|
||||||
|
Assert.assertTrue(cs1.originalTermIndexes.length == 2);
|
||||||
|
if (cs1.suggestion.string.equals(left + right)) {
|
||||||
|
failed = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Assert.assertFalse("Failed getting combine suggestions\n >Original: "
|
||||||
|
+ orig + "\n >Left: " + left + "\n >Right: " + right, failed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw e;
|
||||||
|
} finally {
|
||||||
|
try {
|
||||||
|
ir.close();
|
||||||
|
} catch (Exception e1) {}
|
||||||
|
try {
|
||||||
|
writer.close();
|
||||||
|
} catch (Exception e1) {}
|
||||||
|
try {
|
||||||
|
dir.close();
|
||||||
|
} catch (Exception e1) {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final Pattern mockTokenizerWhitespacePattern = Pattern
|
||||||
|
.compile("[ \\t\\r\\n]");
|
||||||
|
|
||||||
|
private boolean goodTestString(String s) {
|
||||||
|
if (s.codePointCount(0, s.length()) < 2
|
||||||
|
|| mockTokenizerWhitespacePattern.matcher(s).find()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue