From f21ac2f58cba6657ff4b059f544589e411951d91 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 23 Oct 2011 14:55:25 +0000 Subject: [PATCH] LUCENE-3301: add workaround for jre breakiterator bugs git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1187900 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 4 + .../lucene/analysis/th/ThaiWordFilter.java | 8 +- .../analysis/util/CharArrayIterator.java | 175 ++++++++++++++++++ .../analysis/util/TestCharArrayIterator.java | 162 ++++++++++++++++ 4 files changed, 344 insertions(+), 5 deletions(-) create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java create mode 100644 modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 12ea75c612e..64d161182f9 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -125,6 +125,10 @@ Bug Fixes to retrieve top groups for any BlockJoinQuery after the first (Mark Harwood, Mike McCandless) + * LUCENE-3301: Added a workaround for buggy BreakIterator implementations in + Java that crash on certain inputs containing supplementary characters. + (Robert Muir) + API Changes * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java index 5f3b7c79988..67397512e1e 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/th/ThaiWordFilter.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.lang.Character.UnicodeBlock; import java.text.BreakIterator; import java.util.Locale; -import javax.swing.text.Segment; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; @@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.util.CharArrayIterator; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; @@ -56,7 +56,7 @@ public final class ThaiWordFilter extends TokenFilter { DBBI_AVAILABLE = proto.isBoundary(4); } private final BreakIterator breaker = (BreakIterator) proto.clone(); - private final Segment charIterator = new Segment(); + private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance(); private final boolean handlePosIncr; @@ -113,9 +113,7 @@ public final class ThaiWordFilter extends TokenFilter { } // reinit CharacterIterator - charIterator.array = clonedTermAtt.buffer(); - charIterator.offset = 0; - charIterator.count = clonedTermAtt.length(); + charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length()); breaker.setText(charIterator); int end = breaker.next(); if (end != BreakIterator.DONE) { diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java new file mode 100644 index 00000000000..4d1447947bc --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/CharArrayIterator.java @@ -0,0 +1,175 @@ +package org.apache.lucene.analysis.util; + +import java.text.BreakIterator; // javadoc +import java.text.CharacterIterator; +import java.util.Locale; + +/** + * A CharacterIterator used internally for use with {@link BreakIterator} + * @lucene.internal + */ +public abstract class CharArrayIterator implements CharacterIterator { + private char array[]; + private int start; + private int index; + private int length; + private int limit; + + public char [] getText() { + return array; + } + + public int getStart() { + return start; + } + + public int getLength() { + return length; + } + + /** + * Set a new region of text to be examined by this iterator + * + * @param array text buffer to examine + * @param start offset into buffer + * @param length maximum length to examine + */ + public void setText(final char array[], int start, int length) { + this.array = array; + this.start = start; + this.index = start; + this.length = length; + this.limit = start + length; + } + + public char current() { + return (index == limit) ? DONE : jreBugWorkaround(array[index]); + } + + protected abstract char jreBugWorkaround(char ch); + + public char first() { + index = start; + return current(); + } + + public int getBeginIndex() { + return 0; + } + + public int getEndIndex() { + return length; + } + + public int getIndex() { + return index - start; + } + + public char last() { + index = (limit == start) ? limit : limit - 1; + return current(); + } + + public char next() { + if (++index >= limit) { + index = limit; + return DONE; + } else { + return current(); + } + } + + public char previous() { + if (--index < start) { + index = start; + return DONE; + } else { + return current(); + } + } + + public char setIndex(int position) { + if (position < getBeginIndex() || position > getEndIndex()) + throw new IllegalArgumentException("Illegal Position: " + position); + index = start + position; + return current(); + } + + @Override + public Object clone() { + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + // CharacterIterator does not allow you to throw CloneNotSupported + throw new RuntimeException(e); + } + } + + /** + * Create a new CharArrayIterator that works around JRE bugs + * in a manner suitable for {@link BreakIterator#getSentenceInstance()} + */ + public static CharArrayIterator newSentenceInstance() { + if (HAS_BUGGY_BREAKITERATORS) { + return new CharArrayIterator() { + // work around this for now by lying about all surrogates to + // the sentence tokenizer, instead we treat them all as + // SContinue so we won't break around them. + @Override + protected char jreBugWorkaround(char ch) { + return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch; + } + }; + } else { + return new CharArrayIterator() { + // no bugs + @Override + protected char jreBugWorkaround(char ch) { + return ch; + } + }; + } + } + + /** + * Create a new CharArrayIterator that works around JRE bugs + * in a manner suitable for {@link BreakIterator#getWordInstance()} + */ + public static CharArrayIterator newWordInstance() { + if (HAS_BUGGY_BREAKITERATORS) { + return new CharArrayIterator() { + // work around this for now by lying about all surrogates to the word, + // instead we treat them all as ALetter so we won't break around them. + @Override + protected char jreBugWorkaround(char ch) { + return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch; + } + }; + } else { + return new CharArrayIterator() { + // no bugs + @Override + protected char jreBugWorkaround(char ch) { + return ch; + } + }; + } + } + + /** + * True if this JRE has a buggy BreakIterator implementation + */ + public static final boolean HAS_BUGGY_BREAKITERATORS; + static { + boolean v; + try { + BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); + bi.setText("\udb40\udc53"); + bi.next(); + v = false; + } catch (Exception e) { + v = true; + } + HAS_BUGGY_BREAKITERATORS = v; + } +} diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java new file mode 100644 index 00000000000..bfcab81eaf5 --- /dev/null +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/util/TestCharArrayIterator.java @@ -0,0 +1,162 @@ +package org.apache.lucene.analysis.util; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.BreakIterator; +import java.text.CharacterIterator; +import java.util.Locale; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util._TestUtil; + +public class TestCharArrayIterator extends LuceneTestCase { + + public void testWordInstance() { + doTests(CharArrayIterator.newWordInstance()); + } + + public void testConsumeWordInstance() { + BreakIterator bi = BreakIterator.getWordInstance(); + CharArrayIterator ci = CharArrayIterator.newWordInstance(); + for (int i = 0; i < 10000; i++) { + char text[] = _TestUtil.randomUnicodeString(random).toCharArray(); + ci.setText(text, 0, text.length); + consume(bi, ci); + } + } + + /* run this to test if your JRE is buggy + public void testWordInstanceJREBUG() { + BreakIterator bi = BreakIterator.getWordInstance(); + Segment ci = new Segment(); + for (int i = 0; i < 10000; i++) { + char text[] = _TestUtil.randomUnicodeString(random).toCharArray(); + ci.array = text; + ci.offset = 0; + ci.count = text.length; + consume(bi, ci); + } + } + */ + + public void testSentenceInstance() { + doTests(CharArrayIterator.newSentenceInstance()); + } + + public void testConsumeSentenceInstance() { + BreakIterator bi = BreakIterator.getSentenceInstance(); + CharArrayIterator ci = CharArrayIterator.newSentenceInstance(); + for (int i = 0; i < 10000; i++) { + char text[] = _TestUtil.randomUnicodeString(random).toCharArray(); + ci.setText(text, 0, text.length); + consume(bi, ci); + } + } + + /* run this to test if your JRE is buggy + public void testSentenceInstanceJREBUG() { + BreakIterator bi = BreakIterator.getSentenceInstance(); + Segment ci = new Segment(); + for (int i = 0; i < 10000; i++) { + char text[] = _TestUtil.randomUnicodeString(random).toCharArray(); + ci.array = text; + ci.offset = 0; + ci.count = text.length; + consume(bi, ci); + } + } + */ + + private void doTests(CharArrayIterator ci) { + // basics + ci.setText("testing".toCharArray(), 0, "testing".length()); + assertEquals(0, ci.getBeginIndex()); + assertEquals(7, ci.getEndIndex()); + assertEquals(0, ci.getIndex()); + assertEquals('t', ci.current()); + assertEquals('e', ci.next()); + assertEquals('g', ci.last()); + assertEquals('n', ci.previous()); + assertEquals('t', ci.first()); + assertEquals(CharacterIterator.DONE, ci.previous()); + + // first() + ci.setText("testing".toCharArray(), 0, "testing".length()); + ci.next(); + // Sets the position to getBeginIndex() and returns the character at that position. + assertEquals('t', ci.first()); + assertEquals(ci.getBeginIndex(), ci.getIndex()); + // or DONE if the text is empty + ci.setText(new char[] {}, 0, 0); + assertEquals(CharacterIterator.DONE, ci.first()); + + // last() + ci.setText("testing".toCharArray(), 0, "testing".length()); + // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty) + // and returns the character at that position. + assertEquals('g', ci.last()); + assertEquals(ci.getIndex(), ci.getEndIndex() - 1); + // or DONE if the text is empty + ci.setText(new char[] {}, 0, 0); + assertEquals(CharacterIterator.DONE, ci.last()); + assertEquals(ci.getEndIndex(), ci.getIndex()); + + // current() + // Gets the character at the current position (as returned by getIndex()). + ci.setText("testing".toCharArray(), 0, "testing".length()); + assertEquals('t', ci.current()); + ci.last(); + ci.next(); + // or DONE if the current position is off the end of the text. + assertEquals(CharacterIterator.DONE, ci.current()); + + // next() + ci.setText("te".toCharArray(), 0, 2); + // Increments the iterator's index by one and returns the character at the new index. + assertEquals('e', ci.next()); + assertEquals(1, ci.getIndex()); + // or DONE if the new position is off the end of the text range. + assertEquals(CharacterIterator.DONE, ci.next()); + assertEquals(ci.getEndIndex(), ci.getIndex()); + + // setIndex() + ci.setText("test".toCharArray(), 0, "test".length()); + try { + ci.setIndex(5); + fail(); + } catch (Exception e) { + assertTrue(e instanceof IllegalArgumentException); + } + + // clone() + char text[] = "testing".toCharArray(); + ci.setText(text, 0, text.length); + ci.next(); + CharArrayIterator ci2 = (CharArrayIterator) ci.clone(); + assertEquals(ci.getIndex(), ci2.getIndex()); + assertEquals(ci.next(), ci2.next()); + assertEquals(ci.last(), ci2.last()); + } + + private void consume(BreakIterator bi, CharacterIterator ci) { + bi.setText(ci); + while (bi.next() != BreakIterator.DONE) + ; + } +}