mirror of https://github.com/apache/lucene.git
LUCENE-4587: fix WordBreakSpellChecker to handle non-latin characters
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418437 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
10bb87d0bf
commit
927531255d
|
@ -219,9 +219,13 @@ Bug Fixes
|
|||
In addition, it wasn't possible to index a shape representing the entire
|
||||
globe.
|
||||
|
||||
* LUCENE--4595: EnwikiContentSource had a thread safety problem (NPE) in
|
||||
* LUCENE-4595: EnwikiContentSource had a thread safety problem (NPE) in
|
||||
'forever' mode (Doron Cohen)
|
||||
|
||||
* LUCENE-4587: fix WordBreakSpellChecker to not throw AIOOBE when presented
|
||||
with 2-char codepoints, and to correctly break/combine terms containing
|
||||
non-latin characters. (James Dyer, Andreas Hubold)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and
|
||||
|
|
|
@ -180,35 +180,32 @@ public class WordBreakSpellChecker {
|
|||
queueInitialCapacity, queueComparator);
|
||||
|
||||
int thisTimeEvaluations = 0;
|
||||
BytesRef reuse = new BytesRef();
|
||||
for (int i = 0; i < terms.length - 1; i++) {
|
||||
if (terms[i].equals(SEPARATOR_TERM)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int byteLength = terms[i].bytes().length;
|
||||
if (byteLength > maxCombineWordLength) {
|
||||
continue;
|
||||
}
|
||||
|
||||
reuse.grow(byteLength);
|
||||
reuse.length = byteLength;
|
||||
System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
|
||||
reuse.bytes, 0, byteLength);
|
||||
|
||||
}
|
||||
String leftTermText = terms[i].text();
|
||||
int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
|
||||
if (leftTermLength > maxCombineWordLength) {
|
||||
continue;
|
||||
}
|
||||
int maxFreq = 0;
|
||||
int minFreq = Integer.MAX_VALUE;
|
||||
if (origFreqs != null) {
|
||||
maxFreq = origFreqs[i];
|
||||
minFreq = origFreqs[i];
|
||||
}
|
||||
|
||||
}
|
||||
String combinedTermText = leftTermText;
|
||||
int combinedLength = leftTermLength;
|
||||
for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
|
||||
if (terms[j].equals(SEPARATOR_TERM)) {
|
||||
break;
|
||||
}
|
||||
byteLength += terms[j].bytes().length;
|
||||
if (byteLength > maxCombineWordLength) {
|
||||
String rightTermText = terms[j].text();
|
||||
int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
|
||||
combinedTermText += rightTermText;
|
||||
combinedLength +=rightTermLength;
|
||||
if (combinedLength > maxCombineWordLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -216,13 +213,8 @@ public class WordBreakSpellChecker {
|
|||
maxFreq = Math.max(maxFreq, origFreqs[j]);
|
||||
minFreq = Math.min(minFreq, origFreqs[j]);
|
||||
}
|
||||
|
||||
reuse.grow(byteLength);
|
||||
System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
|
||||
reuse.bytes, reuse.length, terms[j].bytes().length);
|
||||
reuse.length = byteLength;
|
||||
|
||||
Term combinedTerm = new Term(terms[0].field(), reuse);
|
||||
|
||||
Term combinedTerm = new Term(terms[0].field(), combinedTermText);
|
||||
int combinedTermFreq = ir.docFreq(combinedTerm);
|
||||
|
||||
if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
|
||||
|
@ -268,24 +260,25 @@ public class WordBreakSpellChecker {
|
|||
SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
|
||||
int totalEvaluations, BreakSuggestionSortMethod sortMethod)
|
||||
throws IOException {
|
||||
int termLength = term.bytes().length;
|
||||
String termText = term.text();
|
||||
int termLength = termText.codePointCount(0, termText.length());
|
||||
int useMinBreakWordLength = minBreakWordLength;
|
||||
if (useMinBreakWordLength < 1) {
|
||||
useMinBreakWordLength = 1;
|
||||
}
|
||||
if (termLength <= (useMinBreakWordLength * 2)) {
|
||||
if (termLength < (useMinBreakWordLength * 2)) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
int thisTimeEvaluations = 0;
|
||||
BytesRef termBytes = term.bytes().clone();
|
||||
for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
|
||||
SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
|
||||
.field());
|
||||
for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
|
||||
int end = termText.offsetByCodePoints(0, i);
|
||||
String leftText = termText.substring(0, end);
|
||||
String rightText = termText.substring(end);
|
||||
SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
|
||||
|
||||
if (leftWord.freq >= useMinSuggestionFrequency) {
|
||||
SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
|
||||
termLength - i, term.field());
|
||||
SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
|
||||
if (rightWord.freq >= useMinSuggestionFrequency) {
|
||||
SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
|
||||
newSuggestion(prefix, leftWord, rightWord));
|
||||
|
@ -293,8 +286,7 @@ public class WordBreakSpellChecker {
|
|||
if (suggestions.size() > maxSuggestions) {
|
||||
suggestions.poll();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
int newNumberBreaks = numberBreaks + 1;
|
||||
if (newNumberBreaks <= maxChanges) {
|
||||
int evaluations = generateBreakUpSuggestions(new Term(term.field(),
|
||||
|
@ -304,6 +296,7 @@ public class WordBreakSpellChecker {
|
|||
totalEvaluations += evaluations;
|
||||
}
|
||||
}
|
||||
|
||||
thisTimeEvaluations++;
|
||||
totalEvaluations++;
|
||||
if (totalEvaluations >= maxEvaluations) {
|
||||
|
@ -338,16 +331,13 @@ public class WordBreakSpellChecker {
|
|||
return newSuggestion;
|
||||
}
|
||||
|
||||
private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
|
||||
int offset, int length, String fieldname) throws IOException {
|
||||
bytes.offset = offset;
|
||||
bytes.length = length;
|
||||
Term term = new Term(fieldname, bytes);
|
||||
private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
|
||||
Term term = new Term(fieldname, text);
|
||||
int freq = ir.docFreq(term);
|
||||
SuggestWord word = new SuggestWord();
|
||||
word.freq = freq;
|
||||
word.score = 1;
|
||||
word.string = term.text();
|
||||
word.string = text;
|
||||
return word;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,6 +17,10 @@ package org.apache.lucene.search.spell;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
|
@ -31,6 +35,7 @@ import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortM
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.English;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
|
||||
public class TestWordBreakSpellChecker extends LuceneTestCase {
|
||||
private Directory dir = null;
|
||||
|
@ -151,8 +156,7 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
|
|||
} finally {
|
||||
try { ir.close(); } catch(Exception e1) { }
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
public void testBreakingWords() throws Exception {
|
||||
IndexReader ir = null;
|
||||
try {
|
||||
|
@ -241,6 +245,15 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
|
|||
Assert.assertTrue(sw[1][1].string.equals("thou"));
|
||||
Assert.assertTrue(sw[1][2].string.equals("sand"));
|
||||
}
|
||||
{
|
||||
//make sure we can handle 2-char codepoints
|
||||
Term term = new Term("numbers", "\uD864\uDC79");
|
||||
wbsp.setMaxChanges(1);
|
||||
wbsp.setMinBreakWordLength(1);
|
||||
wbsp.setMinSuggestionFrequency(1);
|
||||
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
Assert.assertTrue(sw.length==0);
|
||||
}
|
||||
|
||||
} catch(Exception e) {
|
||||
throw e;
|
||||
|
@ -248,4 +261,113 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
|
|||
try { ir.close(); } catch(Exception e1) { }
|
||||
}
|
||||
}
|
||||
public void testRandom() throws Exception {
|
||||
int numDocs = _TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER),
|
||||
(100 * RANDOM_MULTIPLIER));
|
||||
Directory dir = null;
|
||||
RandomIndexWriter writer = null;
|
||||
IndexReader ir = null;
|
||||
try {
|
||||
dir = newDirectory();
|
||||
writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(),
|
||||
MockTokenizer.WHITESPACE, false));
|
||||
int maxLength = _TestUtil.nextInt(random(), 5, 50);
|
||||
List<String> originals = new ArrayList<String>(numDocs);
|
||||
List<String[]> breaks = new ArrayList<String[]>(numDocs);
|
||||
for (int i = 0; i < numDocs; i++) {
|
||||
String orig = "";
|
||||
if (random().nextBoolean()) {
|
||||
while (!goodTestString(orig)) {
|
||||
orig = _TestUtil.randomSimpleString(random(), maxLength);
|
||||
}
|
||||
} else {
|
||||
while (!goodTestString(orig)) {
|
||||
orig = _TestUtil.randomUnicodeString(random(), maxLength);
|
||||
}
|
||||
}
|
||||
originals.add(orig);
|
||||
int totalLength = orig.codePointCount(0, orig.length());
|
||||
int breakAt = orig.offsetByCodePoints(0,
|
||||
_TestUtil.nextInt(random(), 1, totalLength - 1));
|
||||
String[] broken = new String[2];
|
||||
broken[0] = orig.substring(0, breakAt);
|
||||
broken[1] = orig.substring(breakAt);
|
||||
breaks.add(broken);
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("random_break", broken[0] + " " + broken[1],
|
||||
Field.Store.NO));
|
||||
doc.add(newTextField("random_combine", orig, Field.Store.NO));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
writer.commit();
|
||||
writer.close();
|
||||
|
||||
ir = DirectoryReader.open(dir);
|
||||
WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
|
||||
wbsp.setMaxChanges(1);
|
||||
wbsp.setMinBreakWordLength(1);
|
||||
wbsp.setMinSuggestionFrequency(1);
|
||||
wbsp.setMaxCombineWordLength(maxLength);
|
||||
for (int i = 0; i < originals.size(); i++) {
|
||||
String orig = originals.get(i);
|
||||
String left = breaks.get(i)[0];
|
||||
String right = breaks.get(i)[1];
|
||||
{
|
||||
Term term = new Term("random_break", orig);
|
||||
|
||||
SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(),
|
||||
ir, SuggestMode.SUGGEST_ALWAYS,
|
||||
BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
|
||||
boolean failed = true;
|
||||
for (SuggestWord[] sw1 : sw) {
|
||||
Assert.assertTrue(sw1.length == 2);
|
||||
if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
|
||||
failed = false;
|
||||
}
|
||||
}
|
||||
Assert.assertFalse("Failed getting break suggestions\n >Original: "
|
||||
+ orig + "\n >Left: " + left + "\n >Right: " + right, failed);
|
||||
}
|
||||
{
|
||||
Term[] terms = {new Term("random_combine", left),
|
||||
new Term("random_combine", right)};
|
||||
CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms,
|
||||
originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
|
||||
boolean failed = true;
|
||||
for (CombineSuggestion cs1 : cs) {
|
||||
Assert.assertTrue(cs1.originalTermIndexes.length == 2);
|
||||
if (cs1.suggestion.string.equals(left + right)) {
|
||||
failed = false;
|
||||
}
|
||||
}
|
||||
Assert.assertFalse("Failed getting combine suggestions\n >Original: "
|
||||
+ orig + "\n >Left: " + left + "\n >Right: " + right, failed);
|
||||
}
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
throw e;
|
||||
} finally {
|
||||
try {
|
||||
ir.close();
|
||||
} catch (Exception e1) {}
|
||||
try {
|
||||
writer.close();
|
||||
} catch (Exception e1) {}
|
||||
try {
|
||||
dir.close();
|
||||
} catch (Exception e1) {}
|
||||
}
|
||||
}
|
||||
|
||||
private static final Pattern mockTokenizerWhitespacePattern = Pattern
|
||||
.compile("[ \\t\\r\\n]");
|
||||
|
||||
private boolean goodTestString(String s) {
|
||||
if (s.codePointCount(0, s.length()) < 2
|
||||
|| mockTokenizerWhitespacePattern.matcher(s).find()) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue