mirror of https://github.com/apache/lucene.git
LUCENE-3301: add workaround for jre breakiterator bugs
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1187900 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
18f75faac9
commit
f21ac2f58c
|
@ -125,6 +125,10 @@ Bug Fixes
|
||||||
to retrieve top groups for any BlockJoinQuery after the first (Mark
|
to retrieve top groups for any BlockJoinQuery after the first (Mark
|
||||||
Harwood, Mike McCandless)
|
Harwood, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-3301: Added a workaround for buggy BreakIterator implementations in
|
||||||
|
Java that crash on certain inputs containing supplementary characters.
|
||||||
|
(Robert Muir)
|
||||||
|
|
||||||
API Changes
|
API Changes
|
||||||
|
|
||||||
* LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
|
* LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
|
||||||
|
|
|
@ -20,7 +20,6 @@ import java.io.IOException;
|
||||||
import java.lang.Character.UnicodeBlock;
|
import java.lang.Character.UnicodeBlock;
|
||||||
import java.text.BreakIterator;
|
import java.text.BreakIterator;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import javax.swing.text.Segment;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
@ -28,6 +27,7 @@ import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.util.CharArrayIterator;
|
||||||
import org.apache.lucene.util.AttributeSource;
|
import org.apache.lucene.util.AttributeSource;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ public final class ThaiWordFilter extends TokenFilter {
|
||||||
DBBI_AVAILABLE = proto.isBoundary(4);
|
DBBI_AVAILABLE = proto.isBoundary(4);
|
||||||
}
|
}
|
||||||
private final BreakIterator breaker = (BreakIterator) proto.clone();
|
private final BreakIterator breaker = (BreakIterator) proto.clone();
|
||||||
private final Segment charIterator = new Segment();
|
private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance();
|
||||||
|
|
||||||
private final boolean handlePosIncr;
|
private final boolean handlePosIncr;
|
||||||
|
|
||||||
|
@ -113,9 +113,7 @@ public final class ThaiWordFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
// reinit CharacterIterator
|
// reinit CharacterIterator
|
||||||
charIterator.array = clonedTermAtt.buffer();
|
charIterator.setText(clonedTermAtt.buffer(), 0, clonedTermAtt.length());
|
||||||
charIterator.offset = 0;
|
|
||||||
charIterator.count = clonedTermAtt.length();
|
|
||||||
breaker.setText(charIterator);
|
breaker.setText(charIterator);
|
||||||
int end = breaker.next();
|
int end = breaker.next();
|
||||||
if (end != BreakIterator.DONE) {
|
if (end != BreakIterator.DONE) {
|
||||||
|
|
|
@ -0,0 +1,175 @@
|
||||||
|
package org.apache.lucene.analysis.util;
|
||||||
|
|
||||||
|
import java.text.BreakIterator; // javadoc
|
||||||
|
import java.text.CharacterIterator;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A CharacterIterator used internally for use with {@link BreakIterator}
|
||||||
|
* @lucene.internal
|
||||||
|
*/
|
||||||
|
public abstract class CharArrayIterator implements CharacterIterator {
|
||||||
|
private char array[];
|
||||||
|
private int start;
|
||||||
|
private int index;
|
||||||
|
private int length;
|
||||||
|
private int limit;
|
||||||
|
|
||||||
|
public char [] getText() {
|
||||||
|
return array;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getStart() {
|
||||||
|
return start;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getLength() {
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set a new region of text to be examined by this iterator
|
||||||
|
*
|
||||||
|
* @param array text buffer to examine
|
||||||
|
* @param start offset into buffer
|
||||||
|
* @param length maximum length to examine
|
||||||
|
*/
|
||||||
|
public void setText(final char array[], int start, int length) {
|
||||||
|
this.array = array;
|
||||||
|
this.start = start;
|
||||||
|
this.index = start;
|
||||||
|
this.length = length;
|
||||||
|
this.limit = start + length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public char current() {
|
||||||
|
return (index == limit) ? DONE : jreBugWorkaround(array[index]);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract char jreBugWorkaround(char ch);
|
||||||
|
|
||||||
|
public char first() {
|
||||||
|
index = start;
|
||||||
|
return current();
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getBeginIndex() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getEndIndex() {
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getIndex() {
|
||||||
|
return index - start;
|
||||||
|
}
|
||||||
|
|
||||||
|
public char last() {
|
||||||
|
index = (limit == start) ? limit : limit - 1;
|
||||||
|
return current();
|
||||||
|
}
|
||||||
|
|
||||||
|
public char next() {
|
||||||
|
if (++index >= limit) {
|
||||||
|
index = limit;
|
||||||
|
return DONE;
|
||||||
|
} else {
|
||||||
|
return current();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public char previous() {
|
||||||
|
if (--index < start) {
|
||||||
|
index = start;
|
||||||
|
return DONE;
|
||||||
|
} else {
|
||||||
|
return current();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public char setIndex(int position) {
|
||||||
|
if (position < getBeginIndex() || position > getEndIndex())
|
||||||
|
throw new IllegalArgumentException("Illegal Position: " + position);
|
||||||
|
index = start + position;
|
||||||
|
return current();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Object clone() {
|
||||||
|
try {
|
||||||
|
return super.clone();
|
||||||
|
} catch (CloneNotSupportedException e) {
|
||||||
|
// CharacterIterator does not allow you to throw CloneNotSupported
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new CharArrayIterator that works around JRE bugs
|
||||||
|
* in a manner suitable for {@link BreakIterator#getSentenceInstance()}
|
||||||
|
*/
|
||||||
|
public static CharArrayIterator newSentenceInstance() {
|
||||||
|
if (HAS_BUGGY_BREAKITERATORS) {
|
||||||
|
return new CharArrayIterator() {
|
||||||
|
// work around this for now by lying about all surrogates to
|
||||||
|
// the sentence tokenizer, instead we treat them all as
|
||||||
|
// SContinue so we won't break around them.
|
||||||
|
@Override
|
||||||
|
protected char jreBugWorkaround(char ch) {
|
||||||
|
return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return new CharArrayIterator() {
|
||||||
|
// no bugs
|
||||||
|
@Override
|
||||||
|
protected char jreBugWorkaround(char ch) {
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new CharArrayIterator that works around JRE bugs
|
||||||
|
* in a manner suitable for {@link BreakIterator#getWordInstance()}
|
||||||
|
*/
|
||||||
|
public static CharArrayIterator newWordInstance() {
|
||||||
|
if (HAS_BUGGY_BREAKITERATORS) {
|
||||||
|
return new CharArrayIterator() {
|
||||||
|
// work around this for now by lying about all surrogates to the word,
|
||||||
|
// instead we treat them all as ALetter so we won't break around them.
|
||||||
|
@Override
|
||||||
|
protected char jreBugWorkaround(char ch) {
|
||||||
|
return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return new CharArrayIterator() {
|
||||||
|
// no bugs
|
||||||
|
@Override
|
||||||
|
protected char jreBugWorkaround(char ch) {
|
||||||
|
return ch;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* True if this JRE has a buggy BreakIterator implementation
|
||||||
|
*/
|
||||||
|
public static final boolean HAS_BUGGY_BREAKITERATORS;
|
||||||
|
static {
|
||||||
|
boolean v;
|
||||||
|
try {
|
||||||
|
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
|
||||||
|
bi.setText("\udb40\udc53");
|
||||||
|
bi.next();
|
||||||
|
v = false;
|
||||||
|
} catch (Exception e) {
|
||||||
|
v = true;
|
||||||
|
}
|
||||||
|
HAS_BUGGY_BREAKITERATORS = v;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,162 @@
|
||||||
|
package org.apache.lucene.analysis.util;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.text.BreakIterator;
|
||||||
|
import java.text.CharacterIterator;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
import org.apache.lucene.util._TestUtil;
|
||||||
|
|
||||||
|
public class TestCharArrayIterator extends LuceneTestCase {
|
||||||
|
|
||||||
|
public void testWordInstance() {
|
||||||
|
doTests(CharArrayIterator.newWordInstance());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testConsumeWordInstance() {
|
||||||
|
BreakIterator bi = BreakIterator.getWordInstance();
|
||||||
|
CharArrayIterator ci = CharArrayIterator.newWordInstance();
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
|
||||||
|
ci.setText(text, 0, text.length);
|
||||||
|
consume(bi, ci);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* run this to test if your JRE is buggy
|
||||||
|
public void testWordInstanceJREBUG() {
|
||||||
|
BreakIterator bi = BreakIterator.getWordInstance();
|
||||||
|
Segment ci = new Segment();
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
|
||||||
|
ci.array = text;
|
||||||
|
ci.offset = 0;
|
||||||
|
ci.count = text.length;
|
||||||
|
consume(bi, ci);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
public void testSentenceInstance() {
|
||||||
|
doTests(CharArrayIterator.newSentenceInstance());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testConsumeSentenceInstance() {
|
||||||
|
BreakIterator bi = BreakIterator.getSentenceInstance();
|
||||||
|
CharArrayIterator ci = CharArrayIterator.newSentenceInstance();
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
|
||||||
|
ci.setText(text, 0, text.length);
|
||||||
|
consume(bi, ci);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* run this to test if your JRE is buggy
|
||||||
|
public void testSentenceInstanceJREBUG() {
|
||||||
|
BreakIterator bi = BreakIterator.getSentenceInstance();
|
||||||
|
Segment ci = new Segment();
|
||||||
|
for (int i = 0; i < 10000; i++) {
|
||||||
|
char text[] = _TestUtil.randomUnicodeString(random).toCharArray();
|
||||||
|
ci.array = text;
|
||||||
|
ci.offset = 0;
|
||||||
|
ci.count = text.length;
|
||||||
|
consume(bi, ci);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
private void doTests(CharArrayIterator ci) {
|
||||||
|
// basics
|
||||||
|
ci.setText("testing".toCharArray(), 0, "testing".length());
|
||||||
|
assertEquals(0, ci.getBeginIndex());
|
||||||
|
assertEquals(7, ci.getEndIndex());
|
||||||
|
assertEquals(0, ci.getIndex());
|
||||||
|
assertEquals('t', ci.current());
|
||||||
|
assertEquals('e', ci.next());
|
||||||
|
assertEquals('g', ci.last());
|
||||||
|
assertEquals('n', ci.previous());
|
||||||
|
assertEquals('t', ci.first());
|
||||||
|
assertEquals(CharacterIterator.DONE, ci.previous());
|
||||||
|
|
||||||
|
// first()
|
||||||
|
ci.setText("testing".toCharArray(), 0, "testing".length());
|
||||||
|
ci.next();
|
||||||
|
// Sets the position to getBeginIndex() and returns the character at that position.
|
||||||
|
assertEquals('t', ci.first());
|
||||||
|
assertEquals(ci.getBeginIndex(), ci.getIndex());
|
||||||
|
// or DONE if the text is empty
|
||||||
|
ci.setText(new char[] {}, 0, 0);
|
||||||
|
assertEquals(CharacterIterator.DONE, ci.first());
|
||||||
|
|
||||||
|
// last()
|
||||||
|
ci.setText("testing".toCharArray(), 0, "testing".length());
|
||||||
|
// Sets the position to getEndIndex()-1 (getEndIndex() if the text is empty)
|
||||||
|
// and returns the character at that position.
|
||||||
|
assertEquals('g', ci.last());
|
||||||
|
assertEquals(ci.getIndex(), ci.getEndIndex() - 1);
|
||||||
|
// or DONE if the text is empty
|
||||||
|
ci.setText(new char[] {}, 0, 0);
|
||||||
|
assertEquals(CharacterIterator.DONE, ci.last());
|
||||||
|
assertEquals(ci.getEndIndex(), ci.getIndex());
|
||||||
|
|
||||||
|
// current()
|
||||||
|
// Gets the character at the current position (as returned by getIndex()).
|
||||||
|
ci.setText("testing".toCharArray(), 0, "testing".length());
|
||||||
|
assertEquals('t', ci.current());
|
||||||
|
ci.last();
|
||||||
|
ci.next();
|
||||||
|
// or DONE if the current position is off the end of the text.
|
||||||
|
assertEquals(CharacterIterator.DONE, ci.current());
|
||||||
|
|
||||||
|
// next()
|
||||||
|
ci.setText("te".toCharArray(), 0, 2);
|
||||||
|
// Increments the iterator's index by one and returns the character at the new index.
|
||||||
|
assertEquals('e', ci.next());
|
||||||
|
assertEquals(1, ci.getIndex());
|
||||||
|
// or DONE if the new position is off the end of the text range.
|
||||||
|
assertEquals(CharacterIterator.DONE, ci.next());
|
||||||
|
assertEquals(ci.getEndIndex(), ci.getIndex());
|
||||||
|
|
||||||
|
// setIndex()
|
||||||
|
ci.setText("test".toCharArray(), 0, "test".length());
|
||||||
|
try {
|
||||||
|
ci.setIndex(5);
|
||||||
|
fail();
|
||||||
|
} catch (Exception e) {
|
||||||
|
assertTrue(e instanceof IllegalArgumentException);
|
||||||
|
}
|
||||||
|
|
||||||
|
// clone()
|
||||||
|
char text[] = "testing".toCharArray();
|
||||||
|
ci.setText(text, 0, text.length);
|
||||||
|
ci.next();
|
||||||
|
CharArrayIterator ci2 = (CharArrayIterator) ci.clone();
|
||||||
|
assertEquals(ci.getIndex(), ci2.getIndex());
|
||||||
|
assertEquals(ci.next(), ci2.next());
|
||||||
|
assertEquals(ci.last(), ci2.last());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void consume(BreakIterator bi, CharacterIterator ci) {
|
||||||
|
bi.setText(ci);
|
||||||
|
while (bi.next() != BreakIterator.DONE)
|
||||||
|
;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue