LUCENE-3301: add workaround for jre breakiterator bugs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1187900 13f79535-47bb-0310-9956-ffa450edef68
2011-10-23 14:55:25 +00:00 · 2011-10-23 14:55:25 +00:00 · f21ac2f58c
parent 18f75faac9
commit f21ac2f58c
4 changed files with 344 additions and 5 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -125,6 +125,10 @@ Bug Fixes
   to retrieve top groups for any BlockJoinQuery after the first (Mark
   Harwood, Mike McCandless)
 * LUCENE-3301: Added a workaround for buggy BreakIterator implementations in
   Java that crash on certain inputs containing supplementary characters.
   (Robert Muir)
 API Changes
 * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java
@ -20,7 +20,6 @@ import java.io.IOException;
 import java.lang.Character.UnicodeBlock;
 import java.text.BreakIterator;
 import java.util.Locale;
 import javax.swing.text.Segment;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.util.CharArrayIterator;
 import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.Version;
@ -56,7 +56,7 @@ public final class ThaiWordFilter extends TokenFilter {
    DBBI_AVAILABLE = proto.isBoundary(4);
  }
  private final BreakIterator breaker = (BreakIterator) proto.clone();
-  private final Segment charIterator = new Segment();
+  private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance();
  private final boolean handlePosIncr;
@ -113,9 +113,7 @@ public final class ThaiWordFilter extends TokenFilter {
    }
    // reinit CharacterIterator
-    charIterator.array = clonedTermAtt.buffer();
+    charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
    charIterator.offset = 0;
    charIterator.count = clonedTermAtt.length();
    breaker.setText(charIterator);
    int end = breaker.next();
    if (end != BreakIterator.DONE) {
--- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java
+++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java
@ -0,0 +1,175 @@
 package org.apache.lucene.analysis.util;
 import java.text.BreakIterator; // javadoc
 import java.text.CharacterIterator;
 import java.util.Locale;
 /** 
 * A CharacterIterator used internally for use with {@link BreakIterator}
 * @lucene.internal
 */
 public abstract class CharArrayIterator implements CharacterIterator {
  private char array[];
  private int start;
  private int index;
  private int length;
  private int limit;
  public char [] getText() {
    return array;
  }
  public int getStart() {
    return start;
  }
  public int getLength() {
    return length;
  }
  /**
   * Set a new region of text to be examined by this iterator
   * 
   * @param array text buffer to examine
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  public void setText(final char array[], int start, int length) {
    this.array = array;
    this.start = start;
    this.index = start;
    this.length = length;
    this.limit = start + length;
  }
  public char current() {
    return (index == limit) ? DONE : jreBugWorkaround(array[index]);
  }
  protected abstract char jreBugWorkaround(char ch);
  public char first() {
    index = start;
    return current();
  }
  public int getBeginIndex() {
    return 0;
  }
  public int getEndIndex() {
    return length;
  }
  public int getIndex() {
    return index - start;
  }
  public char last() {
    index = (limit == start) ? limit : limit - 1;
    return current();
  }
  public char next() {
    if (++index >= limit) {
      index = limit;
      return DONE;
    } else {
      return current();
    }
  }
  public char previous() {
    if (--index < start) {
      index = start;
      return DONE;
    } else {
      return current();
    }
  }
  public char setIndex(int position) {
    if (position < getBeginIndex() || position > getEndIndex())
      throw new IllegalArgumentException("Illegal Position: " + position);
    index = start + position;
    return current();
  }
  @Override
  public Object clone() {
    try {
      return super.clone();
    } catch (CloneNotSupportedException e) {
      // CharacterIterator does not allow you to throw CloneNotSupported
      throw new RuntimeException(e);
    }
  }
  /**
   * Create a new CharArrayIterator that works around JRE bugs
   * in a manner suitable for {@link BreakIterator#getSentenceInstance()}
   */
  public static CharArrayIterator newSentenceInstance() {
    if (HAS_BUGGY_BREAKITERATORS) {
      return new CharArrayIterator() {
        // work around this for now by lying about all surrogates to 
        // the sentence tokenizer, instead we treat them all as 
        // SContinue so we won't break around them.
        @Override
        protected char jreBugWorkaround(char ch) {
          return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
        }
      };
    } else {
      return new CharArrayIterator() {
        // no bugs
        @Override
        protected char jreBugWorkaround(char ch) {
          return ch;
        }
      };
    }
  }
  /**
   * Create a new CharArrayIterator that works around JRE bugs
   * in a manner suitable for {@link BreakIterator#getWordInstance()}
   */
  public static CharArrayIterator newWordInstance() {
    if (HAS_BUGGY_BREAKITERATORS) {
      return new CharArrayIterator() {
        // work around this for now by lying about all surrogates to the word, 
        // instead we treat them all as ALetter so we won't break around them.
        @Override
        protected char jreBugWorkaround(char ch) {
          return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch;
        }
      };
    } else {
      return new CharArrayIterator() {
        // no bugs
        @Override
        protected char jreBugWorkaround(char ch) {
          return ch;
        }
      };
    }
  }
  /**
   * True if this JRE has a buggy BreakIterator implementation
   */
  public static final boolean HAS_BUGGY_BREAKITERATORS;
  static {
    boolean v;
    try {
      BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
      bi.setText("\udb40\udc53");
      bi.next();
      v = false;
    } catch (Exception e) {
      v = true;
    }
    HAS_BUGGY_BREAKITERATORS = v;
  }
 }
--- a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java
+++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java
@ -0,0 +1,162 @@
 package org.apache.lucene.analysis.util;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.text.BreakIterator;
 import java.text.CharacterIterator;
 import java.util.Locale;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util._TestUtil;
 public class TestCharArrayIterator extends LuceneTestCase {
  public void testWordInstance() {
    doTests(CharArrayIterator.newWordInstance());
  }
  public void testConsumeWordInstance() {
    BreakIterator bi = BreakIterator.getWordInstance();
    CharArrayIterator ci = CharArrayIterator.newWordInstance();
    for (int i = 0; i < 10000; i++) {
      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
      ci.setText(text, 0, text.length);
      consume(bi, ci);
    }
  }
  /* run this to test if your JRE is buggy
  public void testWordInstanceJREBUG() {
    BreakIterator bi = BreakIterator.getWordInstance();
    Segment ci = new Segment();
    for (int i = 0; i < 10000; i++) {
      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
      ci.array = text;
      ci.offset = 0;
      ci.count = text.length;
      consume(bi, ci);
    }
  }
  */
  public void testSentenceInstance() {
    doTests(CharArrayIterator.newSentenceInstance());
  }
  public void testConsumeSentenceInstance() {
    BreakIterator bi = BreakIterator.getSentenceInstance();
    CharArrayIterator ci = CharArrayIterator.newSentenceInstance();
    for (int i = 0; i < 10000; i++) {
      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
      ci.setText(text, 0, text.length);
      consume(bi, ci);
    }
  }
  /* run this to test if your JRE is buggy
  public void testSentenceInstanceJREBUG() {
    BreakIterator bi = BreakIterator.getSentenceInstance();
    Segment ci = new Segment();
    for (int i = 0; i < 10000; i++) {
      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
      ci.array = text;
      ci.offset = 0;
      ci.count = text.length;
      consume(bi, ci);
    }
  }
  */
  private void doTests(CharArrayIterator ci) {
    // basics
    ci.setText("testing".toCharArray(), 0, "testing".length());
    assertEquals(0, ci.getBeginIndex());
    assertEquals(7, ci.getEndIndex());
    assertEquals(0, ci.getIndex());
    assertEquals('t', ci.current());
    assertEquals('e', ci.next());
    assertEquals('g', ci.last());
    assertEquals('n', ci.previous());
    assertEquals('t', ci.first());
    assertEquals(CharacterIterator.DONE, ci.previous());
    // first()
    ci.setText("testing".toCharArray(), 0, "testing".length());
    ci.next();
    // Sets the position to getBeginIndex() and returns the character at that position. 
    assertEquals('t', ci.first());
    assertEquals(ci.getBeginIndex(), ci.getIndex());
    // or DONE if the text is empty
    ci.setText(new char[] {}, 0, 0);
    assertEquals(CharacterIterator.DONE, ci.first());
    // last()
    ci.setText("testing".toCharArray(), 0, "testing".length());
    // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) 
    // and returns the character at that position. 
    assertEquals('g', ci.last());
    assertEquals(ci.getIndex(), ci.getEndIndex() - 1);
    // or DONE if the text is empty
    ci.setText(new char[] {}, 0, 0);
    assertEquals(CharacterIterator.DONE, ci.last());
    assertEquals(ci.getEndIndex(), ci.getIndex());
    // current()
    // Gets the character at the current position (as returned by getIndex()). 
    ci.setText("testing".toCharArray(), 0, "testing".length());
    assertEquals('t', ci.current());
    ci.last();
    ci.next();
    // or DONE if the current position is off the end of the text.
    assertEquals(CharacterIterator.DONE, ci.current());
    // next()
    ci.setText("te".toCharArray(), 0, 2);
    // Increments the iterator's index by one and returns the character at the new index.
    assertEquals('e', ci.next());
    assertEquals(1, ci.getIndex());
    // or DONE if the new position is off the end of the text range.
    assertEquals(CharacterIterator.DONE, ci.next());
    assertEquals(ci.getEndIndex(), ci.getIndex());
    // setIndex()
    ci.setText("test".toCharArray(), 0, "test".length());
    try {
      ci.setIndex(5);
      fail();
    } catch (Exception e) {
      assertTrue(e instanceof IllegalArgumentException);
    }
    // clone()
    char text[] = "testing".toCharArray();
    ci.setText(text, 0, text.length);
    ci.next();
    CharArrayIterator ci2 = (CharArrayIterator) ci.clone();
    assertEquals(ci.getIndex(), ci2.getIndex());
    assertEquals(ci.next(), ci2.next());
    assertEquals(ci.last(), ci2.last());
  }
  private void consume(BreakIterator bi, CharacterIterator ci) {
    bi.setText(ci);
    while (bi.next() != BreakIterator.DONE)
      ;
  }
 }