From 927531255d487775072fff543a14fbb72985d82a Mon Sep 17 00:00:00 2001 From: James Dyer Date: Fri, 7 Dec 2012 18:54:57 +0000 Subject: [PATCH] LUCENE-4587: fix WordBreakSpellChecker to handle non-latin characters git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418437 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 +- .../search/spell/WordBreakSpellChecker.java | 72 +++++----- .../spell/TestWordBreakSpellChecker.java | 126 +++++++++++++++++- 3 files changed, 160 insertions(+), 44 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index d6a7d92d3e7..89f3b4decea 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -219,9 +219,13 @@ Bug Fixes In addition, it wasn't possible to index a shape representing the entire globe. -* LUCENE--4595: EnwikiContentSource had a thread safety problem (NPE) in +* LUCENE-4595: EnwikiContentSource had a thread safety problem (NPE) in 'forever' mode (Doron Cohen) +* LUCENE-4587: fix WordBreakSpellChecker to not throw AIOOBE when presented + with 2-char codepoints, and to correctly break/combine terms containing + non-latin characters. (James Dyer, Andreas Hubold) + Optimizations * LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java index 50e4cd867be..30f26c95c39 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java @@ -180,35 +180,32 @@ public class WordBreakSpellChecker { queueInitialCapacity, queueComparator); int thisTimeEvaluations = 0; - BytesRef reuse = new BytesRef(); for (int i = 0; i < terms.length - 1; i++) { if (terms[i].equals(SEPARATOR_TERM)) { continue; - } - - int byteLength = terms[i].bytes().length; - if (byteLength > maxCombineWordLength) { - continue; - } - - reuse.grow(byteLength); - reuse.length = byteLength; - System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset, - reuse.bytes, 0, byteLength); - + } + String leftTermText = terms[i].text(); + int leftTermLength = leftTermText.codePointCount(0, leftTermText.length()); + if (leftTermLength > maxCombineWordLength) { + continue; + } int maxFreq = 0; int minFreq = Integer.MAX_VALUE; if (origFreqs != null) { maxFreq = origFreqs[i]; minFreq = origFreqs[i]; - } - + } + String combinedTermText = leftTermText; + int combinedLength = leftTermLength; for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) { if (terms[j].equals(SEPARATOR_TERM)) { break; } - byteLength += terms[j].bytes().length; - if (byteLength > maxCombineWordLength) { + String rightTermText = terms[j].text(); + int rightTermLength = rightTermText.codePointCount(0, rightTermText.length()); + combinedTermText += rightTermText; + combinedLength +=rightTermLength; + if (combinedLength > maxCombineWordLength) { break; } @@ -216,13 +213,8 @@ public class WordBreakSpellChecker { maxFreq = Math.max(maxFreq, origFreqs[j]); minFreq = Math.min(minFreq, origFreqs[j]); } - - reuse.grow(byteLength); - System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset, - reuse.bytes, reuse.length, terms[j].bytes().length); - reuse.length = byteLength; - - Term combinedTerm = new Term(terms[0].field(), reuse); + + Term combinedTerm = new Term(terms[0].field(), combinedTermText); int combinedTermFreq = ir.docFreq(combinedTerm); if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR @@ -268,24 +260,25 @@ public class WordBreakSpellChecker { SuggestWord[] prefix, Queue suggestions, int totalEvaluations, BreakSuggestionSortMethod sortMethod) throws IOException { - int termLength = term.bytes().length; + String termText = term.text(); + int termLength = termText.codePointCount(0, termText.length()); int useMinBreakWordLength = minBreakWordLength; if (useMinBreakWordLength < 1) { useMinBreakWordLength = 1; } - if (termLength <= (useMinBreakWordLength * 2)) { + if (termLength < (useMinBreakWordLength * 2)) { return 0; - } + } int thisTimeEvaluations = 0; - BytesRef termBytes = term.bytes().clone(); - for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) { - SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term - .field()); + for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) { + int end = termText.offsetByCodePoints(0, i); + String leftText = termText.substring(0, end); + String rightText = termText.substring(end); + SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText); if (leftWord.freq >= useMinSuggestionFrequency) { - SuggestWord rightWord = generateSuggestWord(ir, termBytes, i, - termLength - i, term.field()); + SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText); if (rightWord.freq >= useMinSuggestionFrequency) { SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper( newSuggestion(prefix, leftWord, rightWord)); @@ -293,8 +286,7 @@ public class WordBreakSpellChecker { if (suggestions.size() > maxSuggestions) { suggestions.poll(); } - } - + } int newNumberBreaks = numberBreaks + 1; if (newNumberBreaks <= maxChanges) { int evaluations = generateBreakUpSuggestions(new Term(term.field(), @@ -304,6 +296,7 @@ public class WordBreakSpellChecker { totalEvaluations += evaluations; } } + thisTimeEvaluations++; totalEvaluations++; if (totalEvaluations >= maxEvaluations) { @@ -338,16 +331,13 @@ public class WordBreakSpellChecker { return newSuggestion; } - private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes, - int offset, int length, String fieldname) throws IOException { - bytes.offset = offset; - bytes.length = length; - Term term = new Term(fieldname, bytes); + private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException { + Term term = new Term(fieldname, text); int freq = ir.docFreq(term); SuggestWord word = new SuggestWord(); word.freq = freq; word.score = 1; - word.string = term.text(); + word.string = text; return word; } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java index 36e94119e11..58a8807eef0 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java @@ -17,6 +17,10 @@ package org.apache.lucene.search.spell; * limitations under the License. */ +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + import junit.framework.Assert; import org.apache.lucene.analysis.MockAnalyzer; @@ -31,6 +35,7 @@ import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortM import org.apache.lucene.store.Directory; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util._TestUtil; public class TestWordBreakSpellChecker extends LuceneTestCase { private Directory dir = null; @@ -151,8 +156,7 @@ public class TestWordBreakSpellChecker extends LuceneTestCase { } finally { try { ir.close(); } catch(Exception e1) { } } - } - + } public void testBreakingWords() throws Exception { IndexReader ir = null; try { @@ -241,6 +245,15 @@ public class TestWordBreakSpellChecker extends LuceneTestCase { Assert.assertTrue(sw[1][1].string.equals("thou")); Assert.assertTrue(sw[1][2].string.equals("sand")); } + { + //make sure we can handle 2-char codepoints + Term term = new Term("numbers", "\uD864\uDC79"); + wbsp.setMaxChanges(1); + wbsp.setMinBreakWordLength(1); + wbsp.setMinSuggestionFrequency(1); + SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + Assert.assertTrue(sw.length==0); + } } catch(Exception e) { throw e; @@ -248,4 +261,113 @@ public class TestWordBreakSpellChecker extends LuceneTestCase { try { ir.close(); } catch(Exception e1) { } } } + public void testRandom() throws Exception { + int numDocs = _TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER), + (100 * RANDOM_MULTIPLIER)); + Directory dir = null; + RandomIndexWriter writer = null; + IndexReader ir = null; + try { + dir = newDirectory(); + writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(), + MockTokenizer.WHITESPACE, false)); + int maxLength = _TestUtil.nextInt(random(), 5, 50); + List originals = new ArrayList(numDocs); + List breaks = new ArrayList(numDocs); + for (int i = 0; i < numDocs; i++) { + String orig = ""; + if (random().nextBoolean()) { + while (!goodTestString(orig)) { + orig = _TestUtil.randomSimpleString(random(), maxLength); + } + } else { + while (!goodTestString(orig)) { + orig = _TestUtil.randomUnicodeString(random(), maxLength); + } + } + originals.add(orig); + int totalLength = orig.codePointCount(0, orig.length()); + int breakAt = orig.offsetByCodePoints(0, + _TestUtil.nextInt(random(), 1, totalLength - 1)); + String[] broken = new String[2]; + broken[0] = orig.substring(0, breakAt); + broken[1] = orig.substring(breakAt); + breaks.add(broken); + Document doc = new Document(); + doc.add(newTextField("random_break", broken[0] + " " + broken[1], + Field.Store.NO)); + doc.add(newTextField("random_combine", orig, Field.Store.NO)); + writer.addDocument(doc); + } + writer.commit(); + writer.close(); + + ir = DirectoryReader.open(dir); + WordBreakSpellChecker wbsp = new WordBreakSpellChecker(); + wbsp.setMaxChanges(1); + wbsp.setMinBreakWordLength(1); + wbsp.setMinSuggestionFrequency(1); + wbsp.setMaxCombineWordLength(maxLength); + for (int i = 0; i < originals.size(); i++) { + String orig = originals.get(i); + String left = breaks.get(i)[0]; + String right = breaks.get(i)[1]; + { + Term term = new Term("random_break", orig); + + SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(), + ir, SuggestMode.SUGGEST_ALWAYS, + BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + boolean failed = true; + for (SuggestWord[] sw1 : sw) { + Assert.assertTrue(sw1.length == 2); + if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) { + failed = false; + } + } + Assert.assertFalse("Failed getting break suggestions\n >Original: " + + orig + "\n >Left: " + left + "\n >Right: " + right, failed); + } + { + Term[] terms = {new Term("random_combine", left), + new Term("random_combine", right)}; + CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms, + originals.size(), ir, SuggestMode.SUGGEST_ALWAYS); + boolean failed = true; + for (CombineSuggestion cs1 : cs) { + Assert.assertTrue(cs1.originalTermIndexes.length == 2); + if (cs1.suggestion.string.equals(left + right)) { + failed = false; + } + } + Assert.assertFalse("Failed getting combine suggestions\n >Original: " + + orig + "\n >Left: " + left + "\n >Right: " + right, failed); + } + } + + } catch (Exception e) { + throw e; + } finally { + try { + ir.close(); + } catch (Exception e1) {} + try { + writer.close(); + } catch (Exception e1) {} + try { + dir.close(); + } catch (Exception e1) {} + } + } + + private static final Pattern mockTokenizerWhitespacePattern = Pattern + .compile("[ \\t\\r\\n]"); + + private boolean goodTestString(String s) { + if (s.codePointCount(0, s.length()) < 2 + || mockTokenizerWhitespacePattern.matcher(s).find()) { + return false; + } + return true; + } }