LUCENE-3301: add workaround for jre breakiterator bugs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1187900 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-10-23 14:55:25 +00:00
parent 18f75faac9
commit f21ac2f58c
4 changed files with 344 additions and 5 deletions

View File

@ -125,6 +125,10 @@ Bug Fixes
to retrieve top groups for any BlockJoinQuery after the first (Mark to retrieve top groups for any BlockJoinQuery after the first (Mark
Harwood, Mike McCandless) Harwood, Mike McCandless)
* LUCENE-3301: Added a workaround for buggy BreakIterator implementations in
Java that crash on certain inputs containing supplementary characters.
(Robert Muir)
API Changes API Changes
* LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy

View File

@ -20,7 +20,6 @@ import java.io.IOException;
import java.lang.Character.UnicodeBlock; import java.lang.Character.UnicodeBlock;
import java.text.BreakIterator; import java.text.BreakIterator;
import java.util.Locale; import java.util.Locale;
import javax.swing.text.Segment;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArrayIterator;
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -56,7 +56,7 @@ public final class ThaiWordFilter extends TokenFilter {
DBBI_AVAILABLE = proto.isBoundary(4); DBBI_AVAILABLE = proto.isBoundary(4);
} }
private final BreakIterator breaker = (BreakIterator) proto.clone(); private final BreakIterator breaker = (BreakIterator) proto.clone();
private final Segment charIterator = new Segment(); private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance();
private final boolean handlePosIncr; private final boolean handlePosIncr;
@ -113,9 +113,7 @@ public final class ThaiWordFilter extends TokenFilter {
} }
// reinit CharacterIterator // reinit CharacterIterator
charIterator.array = clonedTermAtt.buffer(); charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
charIterator.offset = 0;
charIterator.count = clonedTermAtt.length();
breaker.setText(charIterator); breaker.setText(charIterator);
int end = breaker.next(); int end = breaker.next();
if (end != BreakIterator.DONE) { if (end != BreakIterator.DONE) {

View File

@ -0,0 +1,175 @@
package org.apache.lucene.analysis.util;
import java.text.BreakIterator; // javadoc
import java.text.CharacterIterator;
import java.util.Locale;
/**
 * A {@link CharacterIterator} over a region of a {@code char[]}, used
 * internally to feed text to a {@link BreakIterator}.
 * <p>
 * Positions exposed through the {@link CharacterIterator} methods are relative
 * to the start of the region: {@link #getBeginIndex()} is always 0 and
 * {@link #getEndIndex()} is the region length. Every character read from the
 * buffer is passed through {@link #jreBugWorkaround(char)} before it is returned.
 * @lucene.internal
 */
public abstract class CharArrayIterator implements CharacterIterator {
  /**
   * True if this JRE has a buggy BreakIterator implementation
   */
  public static final boolean HAS_BUGGY_BREAKITERATORS;

  static {
    // Probe the JRE once: run a sentence BreakIterator over a single
    // supplementary character and see whether it throws.
    boolean buggy;
    try {
      BreakIterator probe = BreakIterator.getSentenceInstance(Locale.US);
      probe.setText("\udb40\udc53");
      probe.next();
      buggy = false;
    } catch (Exception e) {
      buggy = true;
    }
    HAS_BUGGY_BREAKITERATORS = buggy;
  }

  /** Backing buffer; not copied. */
  private char[] array;
  /** Absolute offset in {@link #array} of the first character of the region. */
  private int start;
  /** Absolute offset in {@link #array} of the current position. */
  private int index;
  /** Number of characters in the region. */
  private int length;
  /** Absolute offset one past the last character, i.e. {@code start + length}. */
  private int limit;

  /** Returns the buffer most recently passed to {@link #setText} (the same array, not a copy). */
  public char[] getText() {
    return array;
  }

  /** Returns the offset into the buffer at which the region starts. */
  public int getStart() {
    return start;
  }

  /** Returns the length of the region. */
  public int getLength() {
    return length;
  }

  /**
   * Set a new region of text to be examined by this iterator.
   * The current position is moved to the beginning of the region.
   *
   * @param array text buffer to examine
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  public void setText(final char array[], int start, int length) {
    this.array = array;
    this.start = start;
    this.length = length;
    this.limit = start + length;
    this.index = start;
  }

  /**
   * Hook applied to every character read from the buffer, so that
   * implementations can substitute characters a buggy JRE cannot handle.
   *
   * @param ch the character stored in the buffer
   * @return the character to report to the caller
   */
  protected abstract char jreBugWorkaround(char ch);

  /** Returns the (possibly substituted) character at the current position, or DONE at the end. */
  @Override
  public char current() {
    if (index == limit) {
      return DONE;
    }
    return jreBugWorkaround(array[index]);
  }

  /** Moves to the beginning of the region and returns the character there. */
  @Override
  public char first() {
    index = start;
    return current();
  }

  /** Moves to the last character (or to the end index if the region is empty) and returns it. */
  @Override
  public char last() {
    if (limit == start) {
      index = limit;
    } else {
      index = limit - 1;
    }
    return current();
  }

  /** Advances by one; clamps at the end index and returns DONE once the end is reached. */
  @Override
  public char next() {
    index++;
    if (index < limit) {
      return current();
    }
    index = limit;
    return DONE;
  }

  /** Steps back by one; clamps at the beginning and returns DONE if already there. */
  @Override
  public char previous() {
    index--;
    if (index >= start) {
      return current();
    }
    index = start;
    return DONE;
  }

  /**
   * Moves to the given region-relative position.
   *
   * @throws IllegalArgumentException if position is outside
   *         {@code [getBeginIndex(), getEndIndex()]}
   */
  @Override
  public char setIndex(int position) {
    final boolean inRange = position >= getBeginIndex() && position <= getEndIndex();
    if (!inRange) {
      throw new IllegalArgumentException("Illegal Position: " + position);
    }
    index = start + position;
    return current();
  }

  /** Always 0: positions are relative to the start of the region. */
  @Override
  public int getBeginIndex() {
    return 0;
  }

  /** The region length: positions are relative to the start of the region. */
  @Override
  public int getEndIndex() {
    return length;
  }

  /** The current position, relative to the start of the region. */
  @Override
  public int getIndex() {
    return index - start;
  }

  @Override
  public Object clone() {
    try {
      return super.clone();
    } catch (CloneNotSupportedException e) {
      // CharacterIterator does not allow you to throw CloneNotSupported
      throw new RuntimeException(e);
    }
  }

  /**
   * Create a new CharArrayIterator that works around JRE bugs
   * in a manner suitable for {@link BreakIterator#getSentenceInstance()}
   */
  public static CharArrayIterator newSentenceInstance() {
    // work around this for now by lying about all surrogates to
    // the sentence tokenizer, instead we treat them all as
    // SContinue so we won't break around them.
    return newInstance((char) 0x002C);
  }

  /**
   * Create a new CharArrayIterator that works around JRE bugs
   * in a manner suitable for {@link BreakIterator#getWordInstance()}
   */
  public static CharArrayIterator newWordInstance() {
    // work around this for now by lying about all surrogates to the word,
    // instead we treat them all as ALetter so we won't break around them.
    return newInstance((char) 0x0041);
  }

  /**
   * Builds the iterator matching this JRE: a pass-through when the
   * BreakIterators are fine, otherwise one that reports
   * {@code surrogateStandIn} in place of every surrogate code unit.
   */
  private static CharArrayIterator newInstance(final char surrogateStandIn) {
    if (!HAS_BUGGY_BREAKITERATORS) {
      return new CharArrayIterator() {
        // no bugs
        @Override
        protected char jreBugWorkaround(char ch) {
          return ch;
        }
      };
    }
    return new CharArrayIterator() {
      @Override
      protected char jreBugWorkaround(char ch) {
        if (ch >= Character.MIN_SURROGATE && ch <= Character.MAX_SURROGATE) {
          return surrogateStandIn;
        }
        return ch;
      }
    };
  }
}

View File

@ -0,0 +1,162 @@
package org.apache.lucene.analysis.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.util.Locale;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util._TestUtil;
/**
 * Tests for {@link CharArrayIterator}: checks the {@link CharacterIterator}
 * contract for both factory flavours, and checks that the JRE word and sentence
 * BreakIterators can consume random unicode text through the iterator.
 */
public class TestCharArrayIterator extends LuceneTestCase {

  /** Number of random strings fed to each BreakIterator. */
  private static final int RANDOM_ROUNDS = 10000;

  public void testWordInstance() {
    doTests(CharArrayIterator.newWordInstance());
  }

  public void testConsumeWordInstance() {
    consumeRandomText(BreakIterator.getWordInstance(), CharArrayIterator.newWordInstance());
  }

  /* run this to test if your JRE is buggy
  public void testWordInstanceJREBUG() {
    BreakIterator bi = BreakIterator.getWordInstance();
    Segment ci = new Segment();
    for (int i = 0; i < 10000; i++) {
      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
      ci.array = text;
      ci.offset = 0;
      ci.count = text.length;
      consume(bi, ci);
    }
  }
  */

  public void testSentenceInstance() {
    doTests(CharArrayIterator.newSentenceInstance());
  }

  public void testConsumeSentenceInstance() {
    consumeRandomText(BreakIterator.getSentenceInstance(), CharArrayIterator.newSentenceInstance());
  }

  /* run this to test if your JRE is buggy
  public void testSentenceInstanceJREBUG() {
    BreakIterator bi = BreakIterator.getSentenceInstance();
    Segment ci = new Segment();
    for (int i = 0; i < 10000; i++) {
      char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
      ci.array = text;
      ci.offset = 0;
      ci.count = text.length;
      consume(bi, ci);
    }
  }
  */

  /** Runs the CharacterIterator contract checks against the given iterator. */
  private void doTests(CharArrayIterator ci) {
    // basics
    reset(ci, "testing");
    assertEquals(0, ci.getBeginIndex());
    assertEquals(7, ci.getEndIndex());
    assertEquals(0, ci.getIndex());
    assertEquals('t', ci.current());
    assertEquals('e', ci.next());
    assertEquals('g', ci.last());
    assertEquals('n', ci.previous());
    assertEquals('t', ci.first());
    assertEquals(CharacterIterator.DONE, ci.previous());

    // first()
    reset(ci, "testing");
    ci.next();
    // Sets the position to getBeginIndex() and returns the character at that position.
    assertEquals('t', ci.first());
    assertEquals(ci.getBeginIndex(), ci.getIndex());
    // or DONE if the text is empty
    ci.setText(new char[] {}, 0, 0);
    assertEquals(CharacterIterator.DONE, ci.first());

    // last()
    reset(ci, "testing");
    // Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty)
    // and returns the character at that position.
    assertEquals('g', ci.last());
    assertEquals(ci.getIndex(), ci.getEndIndex() - 1);
    // or DONE if the text is empty
    ci.setText(new char[] {}, 0, 0);
    assertEquals(CharacterIterator.DONE, ci.last());
    assertEquals(ci.getEndIndex(), ci.getIndex());

    // current()
    // Gets the character at the current position (as returned by getIndex()).
    reset(ci, "testing");
    assertEquals('t', ci.current());
    ci.last();
    ci.next();
    // or DONE if the current position is off the end of the text.
    assertEquals(CharacterIterator.DONE, ci.current());

    // next()
    ci.setText("te".toCharArray(), 0, 2);
    // Increments the iterator's index by one and returns the character at the new index.
    assertEquals('e', ci.next());
    assertEquals(1, ci.getIndex());
    // or DONE if the new position is off the end of the text range.
    assertEquals(CharacterIterator.DONE, ci.next());
    assertEquals(ci.getEndIndex(), ci.getIndex());

    // setIndex()
    reset(ci, "test");
    try {
      // one past getEndIndex() must be rejected
      ci.setIndex(5);
      fail();
    } catch (Exception e) {
      assertTrue(e instanceof IllegalArgumentException);
    }

    // clone()
    char[] chars = "testing".toCharArray();
    ci.setText(chars, 0, chars.length);
    ci.next();
    CharArrayIterator copy = (CharArrayIterator) ci.clone();
    assertEquals(ci.getIndex(), copy.getIndex());
    assertEquals(ci.next(), copy.next());
    assertEquals(ci.last(), copy.last());
  }

  /** Points the iterator at the whole of {@code text}. */
  private static void reset(CharArrayIterator ci, String text) {
    ci.setText(text.toCharArray(), 0, text.length());
  }

  /** Feeds RANDOM_ROUNDS random unicode strings through {@code bi} by way of {@code ci}. */
  private void consumeRandomText(BreakIterator bi, CharArrayIterator ci) {
    for (int round = 0; round < RANDOM_ROUNDS; round++) {
      char[] chars = _TestUtil.randomUnicodeString(random).toCharArray();
      ci.setText(chars, 0, chars.length);
      consume(bi, ci);
    }
  }

  /** Walks every boundary the BreakIterator finds in the iterator's text. */
  private void consume(BreakIterator bi, CharacterIterator ci) {
    bi.setText(ci);
    int boundary = bi.next();
    while (boundary != BreakIterator.DONE) {
      boundary = bi.next();
    }
  }
}