From 927531255d487775072fff543a14fbb72985d82a Mon Sep 17 00:00:00 2001
From: James Dyer <jdyer@apache.org>
Date: Fri, 7 Dec 2012 18:54:57 +0000
Subject: [PATCH] LUCENE-4587: fix WordBreakSpellChecker to handle non-latin
 characters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418437 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                            |   6 +-
 .../search/spell/WordBreakSpellChecker.java   |  72 +++++-----
 .../spell/TestWordBreakSpellChecker.java      | 126 +++++++++++++++++-
 3 files changed, 160 insertions(+), 44 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index d6a7d92d3e7..89f3b4decea 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -219,9 +219,13 @@ Bug Fixes
   In addition, it wasn't possible to index a shape representing the entire
   globe.
 
-* LUCENE--4595: EnwikiContentSource had a thread safety problem (NPE) in 
+* LUCENE-4595: EnwikiContentSource had a thread safety problem (NPE) in 
   'forever' mode (Doron Cohen)
 
+* LUCENE-4587: fix WordBreakSpellChecker to not throw AIOOBE when presented
+  with 2-char codepoints, and to correctly break/combine terms containing
+  non-latin characters. (James Dyer, Andreas Hubold)
+
 Optimizations
 
 * LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
index 50e4cd867be..30f26c95c39 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
@@ -180,35 +180,32 @@ public class WordBreakSpellChecker {
         queueInitialCapacity, queueComparator);
     
     int thisTimeEvaluations = 0;
-    BytesRef reuse = new BytesRef();
     for (int i = 0; i < terms.length - 1; i++) {
       if (terms[i].equals(SEPARATOR_TERM)) {
         continue;
-      }
-      
-      int byteLength = terms[i].bytes().length;
-      if (byteLength > maxCombineWordLength) {
-        continue;
-      }
-      
-      reuse.grow(byteLength);
-      reuse.length = byteLength;
-      System.arraycopy(terms[i].bytes().bytes, terms[i].bytes().offset,
-          reuse.bytes, 0, byteLength);
-      
+      }      
+      String leftTermText = terms[i].text();
+      int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
+      if (leftTermLength > maxCombineWordLength) {
+       continue;
+      } 
       int maxFreq = 0;
       int minFreq = Integer.MAX_VALUE;
       if (origFreqs != null) {
         maxFreq = origFreqs[i];
         minFreq = origFreqs[i];
-      }
-      
+      } 
+      String combinedTermText = leftTermText;
+      int combinedLength = leftTermLength;
       for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
         if (terms[j].equals(SEPARATOR_TERM)) {
           break;
         }
-        byteLength += terms[j].bytes().length;
-        if (byteLength > maxCombineWordLength) {
+        String rightTermText = terms[j].text();
+        int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
+        combinedTermText += rightTermText;
+        combinedLength +=rightTermLength;
+        if (combinedLength > maxCombineWordLength) {
           break;
         }
         
@@ -216,13 +213,8 @@ public class WordBreakSpellChecker {
           maxFreq = Math.max(maxFreq, origFreqs[j]);
           minFreq = Math.min(minFreq, origFreqs[j]);
         }
-        
-        reuse.grow(byteLength);
-        System.arraycopy(terms[j].bytes().bytes, terms[j].bytes().offset,
-            reuse.bytes, reuse.length, terms[j].bytes().length);
-        reuse.length = byteLength;
-        
-        Term combinedTerm = new Term(terms[0].field(), reuse);
+                
+        Term combinedTerm = new Term(terms[0].field(), combinedTermText);
         int combinedTermFreq = ir.docFreq(combinedTerm);
         
         if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR
@@ -268,24 +260,25 @@ public class WordBreakSpellChecker {
       SuggestWord[] prefix, Queue<SuggestWordArrayWrapper> suggestions,
       int totalEvaluations, BreakSuggestionSortMethod sortMethod)
       throws IOException {
-    int termLength = term.bytes().length;
+    String termText = term.text();
+    int termLength = termText.codePointCount(0, termText.length());
     int useMinBreakWordLength = minBreakWordLength;
     if (useMinBreakWordLength < 1) {
       useMinBreakWordLength = 1;
     }
-    if (termLength <= (useMinBreakWordLength * 2)) {
+    if (termLength < (useMinBreakWordLength * 2)) {
       return 0;
-    }
+    }    
     
     int thisTimeEvaluations = 0;
-    BytesRef termBytes = term.bytes().clone();
-    for (int i = useMinBreakWordLength; i < (termLength - useMinBreakWordLength); i++) {
-      SuggestWord leftWord = generateSuggestWord(ir, termBytes, 0, i, term
-          .field());
+    for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
+      int end = termText.offsetByCodePoints(0, i);
+      String leftText = termText.substring(0, end);
+      String rightText = termText.substring(end);
+      SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
       
       if (leftWord.freq >= useMinSuggestionFrequency) {
-        SuggestWord rightWord = generateSuggestWord(ir, termBytes, i,
-            termLength - i, term.field());
+        SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
         if (rightWord.freq >= useMinSuggestionFrequency) {
           SuggestWordArrayWrapper suggestion = new SuggestWordArrayWrapper(
               newSuggestion(prefix, leftWord, rightWord));
@@ -293,8 +286,7 @@ public class WordBreakSpellChecker {
           if (suggestions.size() > maxSuggestions) {
             suggestions.poll();
           }
-        }
-        
+        }        
         int newNumberBreaks = numberBreaks + 1;
         if (newNumberBreaks <= maxChanges) {
           int evaluations = generateBreakUpSuggestions(new Term(term.field(),
@@ -304,6 +296,7 @@ public class WordBreakSpellChecker {
           totalEvaluations += evaluations;
         }
       }
+      
       thisTimeEvaluations++;
       totalEvaluations++;
       if (totalEvaluations >= maxEvaluations) {
@@ -338,16 +331,13 @@ public class WordBreakSpellChecker {
     return newSuggestion;
   }
   
-  private SuggestWord generateSuggestWord(IndexReader ir, BytesRef bytes,
-      int offset, int length, String fieldname) throws IOException {
-    bytes.offset = offset;
-    bytes.length = length;
-    Term term = new Term(fieldname, bytes);
+  private SuggestWord generateSuggestWord(IndexReader ir, String fieldname, String text) throws IOException {
+    Term term = new Term(fieldname, text);
     int freq = ir.docFreq(term);
     SuggestWord word = new SuggestWord();
     word.freq = freq;
     word.score = 1;
-    word.string = term.text();
+    word.string = text;
     return word;
   }
   
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
index 36e94119e11..58a8807eef0 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
@@ -17,6 +17,10 @@ package org.apache.lucene.search.spell;
  * limitations under the License.
  */
 
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
 import junit.framework.Assert;
 
 import org.apache.lucene.analysis.MockAnalyzer;
@@ -31,6 +35,7 @@ import org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortM
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
 
 public class TestWordBreakSpellChecker extends LuceneTestCase {
   private Directory dir = null;
@@ -151,8 +156,7 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
     } finally {
       try { ir.close(); } catch(Exception e1) { }
     }    
-  }
-  
+  }  
   public void testBreakingWords() throws Exception {
     IndexReader ir = null;
     try {
@@ -241,6 +245,15 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
         Assert.assertTrue(sw[1][1].string.equals("thou"));
         Assert.assertTrue(sw[1][2].string.equals("sand"));
       }
+      {
+        //make sure we can handle 2-char codepoints
+        Term term = new Term("numbers", "\uD864\uDC79");
+        wbsp.setMaxChanges(1);
+        wbsp.setMinBreakWordLength(1);
+        wbsp.setMinSuggestionFrequency(1);
+        SuggestWord[][] sw = wbsp.suggestWordBreaks(term, 5, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+        Assert.assertTrue(sw.length==0);        
+      }
       
     } catch(Exception e) {
       throw e;
@@ -248,4 +261,113 @@ public class TestWordBreakSpellChecker extends LuceneTestCase {
       try { ir.close(); } catch(Exception e1) { }
     }    
   }
+  public void testRandom() throws Exception {
+    int numDocs = _TestUtil.nextInt(random(), (10 * RANDOM_MULTIPLIER),
+        (100 * RANDOM_MULTIPLIER));
+    Directory dir = null;
+    RandomIndexWriter writer = null;
+    IndexReader ir = null;
+    try {
+      dir = newDirectory();
+      writer = new RandomIndexWriter(random(), dir, new MockAnalyzer(random(),
+          MockTokenizer.WHITESPACE, false));
+      int maxLength = _TestUtil.nextInt(random(), 5, 50);
+      List<String> originals = new ArrayList<String>(numDocs);
+      List<String[]> breaks = new ArrayList<String[]>(numDocs);
+      for (int i = 0; i < numDocs; i++) {
+        String orig = "";
+        if (random().nextBoolean()) {
+          while (!goodTestString(orig)) {
+            orig = _TestUtil.randomSimpleString(random(), maxLength);
+          }
+        } else {
+          while (!goodTestString(orig)) {
+            orig = _TestUtil.randomUnicodeString(random(), maxLength);
+          }
+        }
+        originals.add(orig);
+        int totalLength = orig.codePointCount(0, orig.length());
+        int breakAt = orig.offsetByCodePoints(0,
+            _TestUtil.nextInt(random(), 1, totalLength - 1));
+        String[] broken = new String[2];
+        broken[0] = orig.substring(0, breakAt);
+        broken[1] = orig.substring(breakAt);
+        breaks.add(broken);
+        Document doc = new Document();
+        doc.add(newTextField("random_break", broken[0] + " " + broken[1],
+            Field.Store.NO));
+        doc.add(newTextField("random_combine", orig, Field.Store.NO));
+        writer.addDocument(doc);
+      }
+      writer.commit();
+      writer.close();
+      
+      ir = DirectoryReader.open(dir);
+      WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
+      wbsp.setMaxChanges(1);
+      wbsp.setMinBreakWordLength(1);
+      wbsp.setMinSuggestionFrequency(1);
+      wbsp.setMaxCombineWordLength(maxLength);
+      for (int i = 0; i < originals.size(); i++) {
+        String orig = originals.get(i);
+        String left = breaks.get(i)[0];
+        String right = breaks.get(i)[1];
+        {
+          Term term = new Term("random_break", orig);
+          
+          SuggestWord[][] sw = wbsp.suggestWordBreaks(term, originals.size(),
+              ir, SuggestMode.SUGGEST_ALWAYS,
+              BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+          boolean failed = true;
+          for (SuggestWord[] sw1 : sw) {
+            Assert.assertTrue(sw1.length == 2);
+            if (sw1[0].string.equals(left) && sw1[1].string.equals(right)) {
+              failed = false;
+            }
+          }
+          Assert.assertFalse("Failed getting break suggestions\n >Original: "
+              + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
+        }
+        {
+          Term[] terms = {new Term("random_combine", left),
+              new Term("random_combine", right)};
+          CombineSuggestion[] cs = wbsp.suggestWordCombinations(terms,
+              originals.size(), ir, SuggestMode.SUGGEST_ALWAYS);
+          boolean failed = true;
+          for (CombineSuggestion cs1 : cs) {
+            Assert.assertTrue(cs1.originalTermIndexes.length == 2);
+            if (cs1.suggestion.string.equals(left + right)) {
+              failed = false;
+            }
+          }
+          Assert.assertFalse("Failed getting combine suggestions\n >Original: "
+              + orig + "\n >Left: " + left + "\n >Right: " + right, failed);
+        }
+      }
+      
+    } catch (Exception e) {
+      throw e;
+    } finally {
+      try {
+        ir.close();
+      } catch (Exception e1) {}
+      try {
+        writer.close();
+      } catch (Exception e1) {}
+      try {
+        dir.close();
+      } catch (Exception e1) {}
+    }
+  }
+  
+  private static final Pattern mockTokenizerWhitespacePattern = Pattern
+      .compile("[ \\t\\r\\n]");
+  
+  private boolean goodTestString(String s) {
+    if (s.codePointCount(0, s.length()) < 2
+        || mockTokenizerWhitespacePattern.matcher(s).find()) {
+      return false;
+    }
+    return true;
+  }
  }