LUCENE-4587: fix WordBreakSpellChecker to handle non-latin characters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418437 13f79535-47bb-0310-9956-ffa450edef68
2012-12-07 18:54:57 +00:00 · 2012-12-07 18:54:57 +00:00 · 927531255d
parent 10bb87d0bf
commit 927531255d
3 changed files with 160 additions and 44 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -219,9 +219,13 @@ Bug Fixes
  In addition, it wasn't possible to index a shape representing the entire
  globe.
-* LUCENE--4595: EnwikiContentSource had a thread safety problem (NPE) in 
+* LUCENE-4595: EnwikiContentSource had a thread safety problem (NPE) in 
  'forever' mode (Doron Cohen)
 * LUCENE-4587: fix WordBreakSpellChecker to not throw AIOOBE when presented
  with 2-char codepoints, and to correctly break/combine terms containing
  non-latin characters. (James Dyer, Andreas Hubold)
 Optimizations
 * LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
@ -180,35 +180,32 @@ public class WordBreakSpellChecker {
        queueInitialCapacity, queueComparator);
    int thisTimeEvaluations = 0;
    BytesRef reuse = new BytesRef();
    for (int i = 0; i < terms.length - 1; i++) {
      if (terms[i].equals(SEPARATOR_TERM)) {
        continue;
      }      
-      
+      String leftTermText = terms[i].text();
-      int byteLength = terms[i].bytes().length;
+      int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
-      if (byteLength > maxCombineWordLength) {
+      if (leftTermLength > maxCombineWordLength) {
       continue;
      } 
      reuse.grow(byteLength);
      reuse.length = byteLength;
      System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
          reuse.bytes, 0, byteLength);
      int maxFreq = 0;
      int minFreq = Integer.MAX_VALUE;
      if (origFreqs != null) {
        maxFreq = origFreqs[i];
        minFreq = origFreqs[i];
      } 
-      
+      String combinedTermText = leftTermText;
      int combinedLength = leftTermLength;
      for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
        if (terms[j].equals(SEPARATOR_TERM)) {
          break;
        }
-        byteLength += terms[j].bytes().length;
+        String rightTermText = terms[j].text();
-        if (byteLength > maxCombineWordLength) {
+        int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
        combinedTermText += rightTermText;
        combinedLength +=rightTermLength;
        if (combinedLength > maxCombineWordLength) {
          break;
        }
@ -217,12 +214,7 @@ public class WordBreakSpellChecker {
          minFreq = Math.min(minFreq, origFreqs[j]);
        }
-        reuse.grow(byteLength);
+        Term combinedTerm = new Term(terms[0].field(), combinedTermText);
        System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
            reuse.bytes, reuse.length, terms[j].bytes().length);
        reuse.length = byteLength;
        Term combinedTerm = new Term(terms[0].field(), reuse);
        int combinedTermFreq = ir.docFreq(combinedTerm);
        if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
@ -268,24 +260,25 @@ public class WordBreakSpellChecker {
      SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
      int totalEvaluations, BreakSuggestionSortMethod sortMethod)
      throws IOException {
-    int termLength = term.bytes().length;
+    String termText = term.text();
    int termLength = termText.codePointCount(0, termText.length());
    int useMinBreakWordLength = minBreakWordLength;
    if (useMinBreakWordLength < 1) {
      useMinBreakWordLength = 1;
    }
-    if (termLength <= (useMinBreakWordLength * 2)) {
+    if (termLength < (useMinBreakWordLength * 2)) {
      return 0;
    }    
    int thisTimeEvaluations = 0;
-    BytesRef termBytes = term.bytes().clone();
+    for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
-    for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
+      int end = termText.offsetByCodePoints(0, i);
-      SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
+      String leftText = termText.substring(0, end);
-          .field());
+      String rightText = termText.substring(end);
      SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
      if (leftWord.freq >= useMinSuggestionFrequency) {
-        SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
+        SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
            termLength - i, term.field());
        if (rightWord.freq >= useMinSuggestionFrequency) {
          SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
              newSuggestion(prefix, leftWord, rightWord));
@ -294,7 +287,6 @@ public class WordBreakSpellChecker {
            suggestions.poll();
          }
        }        
        int newNumberBreaks = numberBreaks + 1;
        if (newNumberBreaks <= maxChanges) {
          int evaluations = generateBreakUpSuggestions(new Term(term.field(),
@ -304,6 +296,7 @@ public class WordBreakSpellChecker {
          totalEvaluations += evaluations;
        }
      }
      thisTimeEvaluations++;
      totalEvaluations++;
      if (totalEvaluations >= maxEvaluations) {
@ -338,16 +331,13 @@ public class WordBreakSpellChecker {
    return newSuggestion;
  }
-  private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
+  private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
-      int offset, int length, String fieldname) throws IOException {
+    Term term = new Term(fieldname, text);
    bytes.offset = offset;
    bytes.length = length;
    Term term = new Term(fieldname, bytes);
    int freq = ir.docFreq(term);
    SuggestWord word = new SuggestWord();
    word.freq = freq;
    word.score = 1;
-    word.string = term.text();
+    word.string = text;
    return word;
  }
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
@ -17,6 +17,10 @@ package org.apache.lucene.search.spell;
 * limitations under the License.
 */
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Pattern;
 import junit.framework.Assert;
 import org.apache.lucene.analysis.MockAnalyzer;
@ -31,6 +35,7 @@ import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortM
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;
 public class TestWordBreakSpellChecker extends LuceneTestCase {
  private Directory dir = null;
@ -152,7 +157,6 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
      try { ir.close(); } catch(Exception e1) { }
    }    
  }  
  public void testBreakingWords() throws Exception {
    IndexReader ir = null;
    try {
@ -241,6 +245,15 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
        Assert.assertTrue(sw[1][1].string.equals("thou"));
        Assert.assertTrue(sw[1][2].string.equals("sand"));
      }
      {
        //make sure we can handle 2-char codepoints
        Term term = new Term("numbers", "\uD864\uDC79");
        wbsp.setMaxChanges(1);
        wbsp.setMinBreakWordLength(1);
        wbsp.setMinSuggestionFrequency(1);
        SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
        Assert.assertTrue(sw.length==0);        
      }
    } catch(Exception e) {
      throw e;
@ -248,4 +261,113 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
      try { ir.close(); } catch(Exception e1) { }
    }    
  }
  /**
   * Randomized round-trip test for word breaking and word combining.
   *
   * <p>For each generated document a random string is split at a random
   * code point boundary. The two halves are indexed (whitespace separated)
   * in the {@code random_break} field and the unsplit string is indexed in
   * the {@code random_combine} field. The test then verifies that:
   * <ul>
   *   <li>{@code suggestWordBreaks} on the unsplit string, against
   *       {@code random_break}, yields a suggestion whose two words are
   *       exactly the indexed halves;</li>
   *   <li>{@code suggestWordCombinations} on the two halves, against
   *       {@code random_combine}, yields a suggestion equal to the unsplit
   *       string.</li>
   * </ul>
   * Half of the strings (on average) come from
   * {@code _TestUtil.randomUnicodeString}; the rest come from
   * {@code _TestUtil.randomSimpleString}.
   *
   * @throws Exception if indexing or searching fails
   */
  public void testRandom() throws Exception {
    int numDocs = _TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER),
        (100 * RANDOM_MULTIPLIER));
    Directory dir = null;
    RandomIndexWriter writer = null;
    IndexReader ir = null;
    try {
      dir = newDirectory();
      writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(),
          MockTokenizer.WHITESPACE, false));
      int maxLength = _TestUtil.nextInt(random(), 5, 50);
      List<String> originals = new ArrayList<String>(numDocs);
      List<String[]> breaks = new ArrayList<String[]>(numDocs);
      for (int i = 0; i < numDocs; i++) {
        // Keep drawing until the string passes goodTestString(); the empty
        // initial value always fails, so at least one draw happens.
        String orig = "";
        if (random().nextBoolean()) {
          while (!goodTestString(orig)) {
            orig = _TestUtil.randomSimpleString(random(), maxLength);
          }
        } else {
          while (!goodTestString(orig)) {
            orig = _TestUtil.randomUnicodeString(random(), maxLength);
          }
        }
        originals.add(orig);
        // Split on a code point boundary so a surrogate pair is never cut
        // in half; both halves contain at least one code point.
        int totalLength = orig.codePointCount(0, orig.length());
        int breakAt = orig.offsetByCodePoints(0,
            _TestUtil.nextInt(random(), 1, totalLength - 1));
        String[] broken = new String[2];
        broken[0] = orig.substring(0, breakAt);
        broken[1] = orig.substring(breakAt);
        breaks.add(broken);
        Document doc = new Document();
        doc.add(newTextField("random_break", broken[0] + " " + broken[1],
            Field.Store.NO));
        doc.add(newTextField("random_combine", orig, Field.Store.NO));
        writer.addDocument(doc);
      }
      writer.commit();
      writer.close();
      // Already closed successfully; do not close it again during cleanup.
      writer = null;

      ir = DirectoryReader.open(dir);
      WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
      wbsp.setMaxChanges(1);
      wbsp.setMinBreakWordLength(1);
      wbsp.setMinSuggestionFrequency(1);
      wbsp.setMaxCombineWordLength(maxLength);
      for (int i = 0; i < originals.size(); i++) {
        String orig = originals.get(i);
        String left = breaks.get(i)[0];
        String right = breaks.get(i)[1];
        {
          // Breaking the unsplit string must rediscover the indexed halves.
          Term term = new Term("random_break", orig);
          SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(),
              ir, SuggestMode.SUGGEST_ALWAYS,
              BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
          boolean failed = true;
          for (SuggestWord[] sw1 : sw) {
            Assert.assertEquals(2, sw1.length);
            if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
              failed = false;
            }
          }
          Assert.assertFalse("Failed getting break suggestions\n >Original: "
              + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
        }
        {
          // Combining the two halves must rediscover the unsplit string.
          Term[] terms = {new Term("random_combine", left),
              new Term("random_combine", right)};
          CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms,
              originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
          boolean failed = true;
          for (CombineSuggestion cs1 : cs) {
            Assert.assertEquals(2, cs1.originalTermIndexes.length);
            if (cs1.suggestion.string.equals(left + right)) {
              failed = false;
            }
          }
          Assert.assertFalse("Failed getting combine suggestions\n >Original: "
              + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
        }
      }
    } finally {
      // Best-effort cleanup: a failure while closing must not mask the
      // original test failure, so close exceptions are deliberately ignored.
      if (ir != null) {
        try {
          ir.close();
        } catch (Exception ignored) {}
      }
      if (writer != null) {
        try {
          writer.close();
        } catch (Exception ignored) {}
      }
      if (dir != null) {
        try {
          dir.close();
        } catch (Exception ignored) {}
      }
    }
  }
  /**
   * Matches a single space, tab, carriage return or line feed.
   * NOTE(review): presumably these are the characters the whitespace
   * MockTokenizer splits on -- confirm against MockTokenizer.
   */
  private static final Pattern mockTokenizerWhitespacePattern =
      Pattern.compile("[ \\t\\r\\n]");

  /**
   * Tells whether {@code s} is usable as a random test string: it must
   * contain at least two code points and none of the characters matched by
   * {@link #mockTokenizerWhitespacePattern}.
   *
   * @param s candidate string
   * @return {@code true} if {@code s} is acceptable, {@code false} otherwise
   */
  private boolean goodTestString(String s) {
    int codePoints = s.codePointCount(0, s.length());
    boolean longEnough = codePoints >= 2;
    return longEnough && !mockTokenizerWhitespacePattern.matcher(s).find();
  }
 }