From f21ac2f58cba6657ff4b059f544589e411951d91 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Sun, 23 Oct 2011 14:55:25 +0000
Subject: [PATCH] LUCENE-3301: add workaround for jre breakiterator bugs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1187900 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/contrib/CHANGES.txt                    |   4 +
 .../lucene/analysis/th/ThaiWordFilter.java    |   8 +-
 .../analysis/util/CharArrayIterator.java      | 175 ++++++++++++++++++
 .../analysis/util/TestCharArrayIterator.java  | 162 ++++++++++++++++
 4 files changed, 344 insertions(+), 5 deletions(-)
 create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java
 create mode 100644 modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java

diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index 12ea75c612e..64d161182f9 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -125,6 +125,10 @@ Bug Fixes
    to retrieve top groups for any BlockJoinQuery after the first (Mark
    Harwood, Mike McCandless)
 
+ * LUCENE-3301: Added a workaround for buggy BreakIterator implementations in
+   Java that crash on certain inputs containing supplementary characters.
+   (Robert Muir)
+
 API Changes
  
  * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
index 5f3b7c79988..67397512e1e 100644
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
@@ -20,7 +20,6 @@ import java.io.IOException;
 import java.lang.Character.UnicodeBlock;
 import java.text.BreakIterator;
 import java.util.Locale;
-import javax.swing.text.Segment;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.util.CharArrayIterator;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
 
@@ -56,7 +56,7 @@ public final class ThaiWordFilter extends TokenFilter {
     DBBI_AVAILABLE = proto.isBoundary(4);
   }
   private final BreakIterator breaker = (BreakIterator) proto.clone();
-  private final Segment charIterator = new Segment();
+  private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance();
   
   private final boolean handlePosIncr;
   
@@ -113,9 +113,7 @@ public final class ThaiWordFilter extends TokenFilter {
     }
     
     // reinit CharacterIterator
-    charIterator.array = clonedTermAtt.buffer();
-    charIterator.offset = 0;
-    charIterator.count = clonedTermAtt.length();
+    charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
     breaker.setText(charIterator);
     int end = breaker.next();
     if (end != BreakIterator.DONE) {
diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java
new file mode 100644
index 00000000000..4d1447947bc
--- /dev/null
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java
@@ -0,0 +1,175 @@
+package org.apache.lucene.analysis.util;
+
+import java.text.BreakIterator; // javadoc
+import java.text.CharacterIterator;
+import java.util.Locale;
+
+/** 
+ * A CharacterIterator used internally for use with {@link BreakIterator}
+ * @lucene.internal
+ */
+public abstract class CharArrayIterator implements CharacterIterator {
+  private char array[];
+  private int start;
+  private int index;
+  private int length;
+  private int limit;
+
+  public char [] getText() {
+    return array;
+  }
+  
+  public int getStart() {
+    return start;
+  }
+  
+  public int getLength() {
+    return length;
+  }
+  
+  /**
+   * Set a new region of text to be examined by this iterator
+   * 
+   * @param array text buffer to examine
+   * @param start offset into buffer
+   * @param length maximum length to examine
+   */
+  public void setText(final char array[], int start, int length) {
+    this.array = array;
+    this.start = start;
+    this.index = start;
+    this.length = length;
+    this.limit = start + length;
+  }
+
+  public char current() {
+    return (index == limit) ? DONE : jreBugWorkaround(array[index]);
+  }
+  
+  protected abstract char jreBugWorkaround(char ch);
+
+  public char first() {
+    index = start;
+    return current();
+  }
+
+  public int getBeginIndex() {
+    return 0;
+  }
+
+  public int getEndIndex() {
+    return length;
+  }
+
+  public int getIndex() {
+    return index - start;
+  }
+
+  public char last() {
+    index = (limit == start) ? limit : limit - 1;
+    return current();
+  }
+
+  public char next() {
+    if (++index >= limit) {
+      index = limit;
+      return DONE;
+    } else {
+      return current();
+    }
+  }
+
+  public char previous() {
+    if (--index < start) {
+      index = start;
+      return DONE;
+    } else {
+      return current();
+    }
+  }
+
+  public char setIndex(int position) {
+    if (position < getBeginIndex() || position > getEndIndex())
+      throw new IllegalArgumentException("Illegal Position: " + position);
+    index = start + position;
+    return current();
+  }
+  
+  @Override
+  public Object clone() {
+    try {
+      return super.clone();
+    } catch (CloneNotSupportedException e) {
+      // CharacterIterator does not allow you to throw CloneNotSupported
+      throw new RuntimeException(e);
+    }
+  }
+  
+  /**
+   * Create a new CharArrayIterator that works around JRE bugs
+   * in a manner suitable for {@link BreakIterator#getSentenceInstance()}
+   */
+  public static CharArrayIterator newSentenceInstance() {
+    if (HAS_BUGGY_BREAKITERATORS) {
+      return new CharArrayIterator() {
+        // work around this for now by lying about all surrogates to 
+        // the sentence tokenizer, instead we treat them all as 
+        // SContinue so we won't break around them.
+        @Override
+        protected char jreBugWorkaround(char ch) {
+          return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
+        }
+      };
+    } else {
+      return new CharArrayIterator() {
+        // no bugs
+        @Override
+        protected char jreBugWorkaround(char ch) {
+          return ch;
+        }
+      };
+    }
+  }
+  
+  /**
+   * Create a new CharArrayIterator that works around JRE bugs
+   * in a manner suitable for {@link BreakIterator#getWordInstance()}
+   */
+  public static CharArrayIterator newWordInstance() {
+    if (HAS_BUGGY_BREAKITERATORS) {
+      return new CharArrayIterator() {
+        // work around this for now by lying about all surrogates to the word, 
+        // instead we treat them all as ALetter so we won't break around them.
+        @Override
+        protected char jreBugWorkaround(char ch) {
+          return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch;
+        }
+      };
+    } else {
+      return new CharArrayIterator() {
+        // no bugs
+        @Override
+        protected char jreBugWorkaround(char ch) {
+          return ch;
+        }
+      };
+    }
+  }
+  
+  /**
+   * True if this JRE has a buggy BreakIterator implementation
+   */
+  public static final boolean HAS_BUGGY_BREAKITERATORS;
+  static {
+    boolean v;
+    try {
+      BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
+      bi.setText("\udb40\udc53");
+      bi.next();
+      v = false;
+    } catch (Exception e) {
+      v = true;
+    }
+    HAS_BUGGY_BREAKITERATORS = v;
+  }
+}
diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java
new file mode 100644
index 00000000000..bfcab81eaf5
--- /dev/null
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java
@@ -0,0 +1,162 @@
+package org.apache.lucene.analysis.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+import java.util.Locale;
+
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+
+public class TestCharArrayIterator extends LuceneTestCase {
+  
+  public void testWordInstance() {
+    doTests(CharArrayIterator.newWordInstance());
+  }
+  
+  public void testConsumeWordInstance() {
+    BreakIterator bi = BreakIterator.getWordInstance();
+    CharArrayIterator ci = CharArrayIterator.newWordInstance();
+    for (int i = 0; i < 10000; i++) {
+      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
+      ci.setText(text, 0, text.length);
+      consume(bi, ci);
+    }
+  }
+  
+  /* run this to test if your JRE is buggy
+  public void testWordInstanceJREBUG() {
+    BreakIterator bi = BreakIterator.getWordInstance();
+    Segment ci = new Segment();
+    for (int i = 0; i < 10000; i++) {
+      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
+      ci.array = text;
+      ci.offset = 0;
+      ci.count = text.length;
+      consume(bi, ci);
+    }
+  }
+  */
+  
+  public void testSentenceInstance() {
+    doTests(CharArrayIterator.newSentenceInstance());
+  }
+  
+  public void testConsumeSentenceInstance() {
+    BreakIterator bi = BreakIterator.getSentenceInstance();
+    CharArrayIterator ci = CharArrayIterator.newSentenceInstance();
+    for (int i = 0; i < 10000; i++) {
+      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
+      ci.setText(text, 0, text.length);
+      consume(bi, ci);
+    }
+  }
+  
+  /* run this to test if your JRE is buggy
+  public void testSentenceInstanceJREBUG() {
+    BreakIterator bi = BreakIterator.getSentenceInstance();
+    Segment ci = new Segment();
+    for (int i = 0; i < 10000; i++) {
+      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
+      ci.array = text;
+      ci.offset = 0;
+      ci.count = text.length;
+      consume(bi, ci);
+    }
+  }
+  */
+  
+  private void doTests(CharArrayIterator ci) {
+    // basics
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    assertEquals(0, ci.getBeginIndex());
+    assertEquals(7, ci.getEndIndex());
+    assertEquals(0, ci.getIndex());
+    assertEquals('t', ci.current());
+    assertEquals('e', ci.next());
+    assertEquals('g', ci.last());
+    assertEquals('n', ci.previous());
+    assertEquals('t', ci.first());
+    assertEquals(CharacterIterator.DONE, ci.previous());
+    
+    // first()
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    ci.next();
+    // Sets the position to getBeginIndex() and returns the character at that position. 
+    assertEquals('t', ci.first());
+    assertEquals(ci.getBeginIndex(), ci.getIndex());
+    // or DONE if the text is empty
+    ci.setText(new char[] {}, 0, 0);
+    assertEquals(CharacterIterator.DONE, ci.first());
+    
+    // last()
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) 
+    // and returns the character at that position. 
+    assertEquals('g', ci.last());
+    assertEquals(ci.getIndex(), ci.getEndIndex() - 1);
+    // or DONE if the text is empty
+    ci.setText(new char[] {}, 0, 0);
+    assertEquals(CharacterIterator.DONE, ci.last());
+    assertEquals(ci.getEndIndex(), ci.getIndex());
+    
+    // current()
+    // Gets the character at the current position (as returned by getIndex()). 
+    ci.setText("testing".toCharArray(), 0, "testing".length());
+    assertEquals('t', ci.current());
+    ci.last();
+    ci.next();
+    // or DONE if the current position is off the end of the text.
+    assertEquals(CharacterIterator.DONE, ci.current());
+    
+    // next()
+    ci.setText("te".toCharArray(), 0, 2);
+    // Increments the iterator's index by one and returns the character at the new index.
+    assertEquals('e', ci.next());
+    assertEquals(1, ci.getIndex());
+    // or DONE if the new position is off the end of the text range.
+    assertEquals(CharacterIterator.DONE, ci.next());
+    assertEquals(ci.getEndIndex(), ci.getIndex());
+    
+    // setIndex()
+    ci.setText("test".toCharArray(), 0, "test".length());
+    try {
+      ci.setIndex(5);
+      fail();
+    } catch (Exception e) {
+      assertTrue(e instanceof IllegalArgumentException);
+    }
+    
+    // clone()
+    char text[] = "testing".toCharArray();
+    ci.setText(text, 0, text.length);
+    ci.next();
+    CharArrayIterator ci2 = (CharArrayIterator) ci.clone();
+    assertEquals(ci.getIndex(), ci2.getIndex());
+    assertEquals(ci.next(), ci2.next());
+    assertEquals(ci.last(), ci2.last());
+  }
+  
+  private void consume(BreakIterator bi, CharacterIterator ci) {
+    bi.setText(ci);
+    while (bi.next() != BreakIterator.DONE)
+      ;
+  }
+}