LUCENE-4587: fix WordBreakSpellChecker to handle non-latin characters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418437 13f79535-47bb-0310-9956-ffa450edef68
2012-12-07 18:54:57 +00:00 · 2012-12-07 18:54:57 +00:00 · 927531255d
parent 10bb87d0bf
commit 927531255d
3 changed files with 160 additions and 44 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -219,9 +219,13 @@ Bug Fixes
  In addition, it wasn't possible to index a shape representing the entire
  globe.

-* LUCENE--4595: EnwikiContentSource had a thread safety problem (NPE) in 
+* LUCENE-4595: EnwikiContentSource had a thread safety problem (NPE) in 
  'forever' mode (Doron Cohen)

+* LUCENE-4587: fix WordBreakSpellChecker to not throw AIOOBE when presented
+  with 2-char codepoints, and to correctly break/combine terms containing
+  non-latin characters. (James Dyer, Andreas Hubold)
+
 Optimizations

 * LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
@ -180,35 +180,32 @@ public class WordBreakSpellChecker {
        queueInitialCapacity, queueComparator);
    
    int thisTimeEvaluations = 0;
-    BytesRef reuse = new BytesRef();
    for (int i = 0; i < terms.length - 1; i++) {
      if (terms[i].equals(SEPARATOR_TERM)) {
        continue;
      }      
-      
-      int byteLength = terms[i].bytes().length;
-      if (byteLength > maxCombineWordLength) {
+      String leftTermText = terms[i].text();
+      int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
+      if (leftTermLength > maxCombineWordLength) {
       continue;
      } 
-      
-      reuse.grow(byteLength);
-      reuse.length = byteLength;
-      System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
-          reuse.bytes, 0, byteLength);
-      
      int maxFreq = 0;
      int minFreq = Integer.MAX_VALUE;
      if (origFreqs != null) {
        maxFreq = origFreqs[i];
        minFreq = origFreqs[i];
      } 
-      
+      String combinedTermText = leftTermText;
+      int combinedLength = leftTermLength;
      for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
        if (terms[j].equals(SEPARATOR_TERM)) {
          break;
        }
-        byteLength += terms[j].bytes().length;
-        if (byteLength > maxCombineWordLength) {
+        String rightTermText = terms[j].text();
+        int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
+        combinedTermText += rightTermText;
+        combinedLength +=rightTermLength;
+        if (combinedLength > maxCombineWordLength) {
          break;
        }
        
@ -217,12 +214,7 @@ public class WordBreakSpellChecker {
          minFreq = Math.min(minFreq, origFreqs[j]);
        }
                
-        reuse.grow(byteLength);
-        System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
-            reuse.bytes, reuse.length, terms[j].bytes().length);
-        reuse.length = byteLength;
-        
-        Term combinedTerm = new Term(terms[0].field(), reuse);
+        Term combinedTerm = new Term(terms[0].field(), combinedTermText);
        int combinedTermFreq = ir.docFreq(combinedTerm);
        
        if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
@ -268,24 +260,25 @@ public class WordBreakSpellChecker {
      SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
      int totalEvaluations, BreakSuggestionSortMethod sortMethod)
      throws IOException {
-    int termLength = term.bytes().length;
+    String termText = term.text();
+    int termLength = termText.codePointCount(0, termText.length());
    int useMinBreakWordLength = minBreakWordLength;
    if (useMinBreakWordLength < 1) {
      useMinBreakWordLength = 1;
    }
-    if (termLength <= (useMinBreakWordLength * 2)) {
+    if (termLength < (useMinBreakWordLength * 2)) {
      return 0;
    }    
    
    int thisTimeEvaluations = 0;
-    BytesRef termBytes = term.bytes().clone();
-    for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
-      SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
-          .field());
+    for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
+      int end = termText.offsetByCodePoints(0, i);
+      String leftText = termText.substring(0, end);
+      String rightText = termText.substring(end);
+      SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
      
      if (leftWord.freq >= useMinSuggestionFrequency) {
-        SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
-            termLength - i, term.field());
+        SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
        if (rightWord.freq >= useMinSuggestionFrequency) {
          SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
              newSuggestion(prefix, leftWord, rightWord));
@ -294,7 +287,6 @@ public class WordBreakSpellChecker {
            suggestions.poll();
          }
        }        
-        
        int newNumberBreaks = numberBreaks + 1;
        if (newNumberBreaks <= maxChanges) {
          int evaluations = generateBreakUpSuggestions(new Term(term.field(),
@ -304,6 +296,7 @@ public class WordBreakSpellChecker {
          totalEvaluations += evaluations;
        }
      }
+      
      thisTimeEvaluations++;
      totalEvaluations++;
      if (totalEvaluations >= maxEvaluations) {
@ -338,16 +331,13 @@ public class WordBreakSpellChecker {
    return newSuggestion;
  }
  
-  private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
-      int offset, int length, String fieldname) throws IOException {
-    bytes.offset = offset;
-    bytes.length = length;
-    Term term = new Term(fieldname, bytes);
+  private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
+    Term term = new Term(fieldname, text);
    int freq = ir.docFreq(term);
    SuggestWord word = new SuggestWord();
    word.freq = freq;
    word.score = 1;
-    word.string = term.text();
+    word.string = text;
    return word;
  }
  
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
@ -17,6 +17,10 @@ package org.apache.lucene.search.spell;
 * limitations under the License.
 */

+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
 import junit.framework.Assert;

 import org.apache.lucene.analysis.MockAnalyzer;
@ -31,6 +35,7 @@ import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortM
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;

 public class TestWordBreakSpellChecker extends LuceneTestCase {
  private Directory dir = null;
@ -152,7 +157,6 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
      try { ir.close(); } catch(Exception e1) { }
    }    
  }  
-  
  public void testBreakingWords() throws Exception {
    IndexReader ir = null;
    try {
@ -241,6 +245,15 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
        Assert.assertTrue(sw[1][1].string.equals("thou"));
        Assert.assertTrue(sw[1][2].string.equals("sand"));
      }
+      {
+        //make sure we can handle 2-char codepoints
+        Term term = new Term("numbers", "\uD864\uDC79");
+        wbsp.setMaxChanges(1);
+        wbsp.setMinBreakWordLength(1);
+        wbsp.setMinSuggestionFrequency(1);
+        SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+        Assert.assertTrue(sw.length==0);        
+      }
      
    } catch(Exception e) {
      throw e;
@ -248,4 +261,113 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
      try { ir.close(); } catch(Exception e1) { }
    }    
  }
+  public void testRandom() throws Exception {
+    int numDocs = _TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER),
+        (100 * RANDOM_MULTIPLIER));
+    Directory dir = null;
+    RandomIndexWriter writer = null;
+    IndexReader ir = null;
+    try {
+      dir = newDirectory();
+      writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(),
+          MockTokenizer.WHITESPACE, false));
+      int maxLength = _TestUtil.nextInt(random(), 5, 50);
+      List<String> originals = new ArrayList<String>(numDocs);
+      List<String[]> breaks = new ArrayList<String[]>(numDocs);
+      for (int i = 0; i < numDocs; i++) {
+        String orig = "";
+        if (random().nextBoolean()) {
+          while (!goodTestString(orig)) {
+            orig = _TestUtil.randomSimpleString(random(), maxLength);
+          }
+        } else {
+          while (!goodTestString(orig)) {
+            orig = _TestUtil.randomUnicodeString(random(), maxLength);
+          }
+        }
+        originals.add(orig);
+        int totalLength = orig.codePointCount(0, orig.length());
+        int breakAt = orig.offsetByCodePoints(0,
+            _TestUtil.nextInt(random(), 1, totalLength - 1));
+        String[] broken = new String[2];
+        broken[0] = orig.substring(0, breakAt);
+        broken[1] = orig.substring(breakAt);
+        breaks.add(broken);
+        Document doc = new Document();
+        doc.add(newTextField("random_break", broken[0] + " " + broken[1],
+            Field.Store.NO));
+        doc.add(newTextField("random_combine", orig, Field.Store.NO));
+        writer.addDocument(doc);
+      }
+      writer.commit();
+      writer.close();
+      
+      ir = DirectoryReader.open(dir);
+      WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
+      wbsp.setMaxChanges(1);
+      wbsp.setMinBreakWordLength(1);
+      wbsp.setMinSuggestionFrequency(1);
+      wbsp.setMaxCombineWordLength(maxLength);
+      for (int i = 0; i < originals.size(); i++) {
+        String orig = originals.get(i);
+        String left = breaks.get(i)[0];
+        String right = breaks.get(i)[1];
+        {
+          Term term = new Term("random_break", orig);
+          
+          SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(),
+              ir, SuggestMode.SUGGEST_ALWAYS,
+              BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+          boolean failed = true;
+          for (SuggestWord[] sw1 : sw) {
+            Assert.assertTrue(sw1.length == 2);
+            if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
+              failed = false;
+            }
+          }
+          Assert.assertFalse("Failed getting break suggestions\n >Original: "
+              + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
+        }
+        {
+          Term[] terms = {new Term("random_combine", left),
+              new Term("random_combine", right)};
+          CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms,
+              originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
+          boolean failed = true;
+          for (CombineSuggestion cs1 : cs) {
+            Assert.assertTrue(cs1.originalTermIndexes.length == 2);
+            if (cs1.suggestion.string.equals(left + right)) {
+              failed = false;
+            }
+          }
+          Assert.assertFalse("Failed getting combine suggestions\n >Original: "
+              + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
+        }
+      }
+      
+    } catch (Exception e) {
+      throw e;
+    } finally {
+      try {
+        ir.close();
+      } catch (Exception e1) {}
+      try {
+        writer.close();
+      } catch (Exception e1) {}
+      try {
+        dir.close();
+      } catch (Exception e1) {}
+    }
+  }
+  
+  private static final Pattern mockTokenizerWhitespacePattern = Pattern
+      .compile("[ \\t\\r\\n]");
+  
+  private boolean goodTestString(String s) {
+    if (s.codePointCount(0, s.length()) < 2
+        || mockTokenizerWhitespacePattern.matcher(s).find()) {
+      return false;
+    }
+    return true;
+  }
 }