mirror of https://github.com/apache/lucene.git
LUCENE-3896: CharacterUtils.fill must call Reader.read again if it only got a single high surrogate char on the first read
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1303374 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1d642b3cd7
commit
595744089a
|
@ -1,10 +1,5 @@
|
||||||
package org.apache.lucene.analysis.util;
|
package org.apache.lucene.analysis.util;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Reader;
|
|
||||||
|
|
||||||
import org.apache.lucene.util.Version;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
@ -22,6 +17,11 @@ import org.apache.lucene.util.Version;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@link CharacterUtils} provides a unified interface to Character-related
|
* {@link CharacterUtils} provides a unified interface to Character-related
|
||||||
* operations to implement backwards compatible character operations based on a
|
* operations to implement backwards compatible character operations based on a
|
||||||
|
@ -121,8 +121,9 @@ public abstract class CharacterUtils {
|
||||||
* @return a new {@link CharacterBuffer} instance.
|
* @return a new {@link CharacterBuffer} instance.
|
||||||
*/
|
*/
|
||||||
public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
|
public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
|
||||||
if(bufferSize < 2)
|
if (bufferSize < 2) {
|
||||||
throw new IllegalArgumentException("buffersize must be >= 2");
|
throw new IllegalArgumentException("buffersize must be >= 2");
|
||||||
|
}
|
||||||
return new CharacterBuffer(new char[bufferSize], 0, 0);
|
return new CharacterBuffer(new char[bufferSize], 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -159,7 +160,7 @@ public abstract class CharacterUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final int codePointAt(final char[] chars, final int offset) {
|
public int codePointAt(final char[] chars, final int offset) {
|
||||||
return Character.codePointAt(chars, offset);
|
return Character.codePointAt(chars, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -177,21 +178,51 @@ public abstract class CharacterUtils {
|
||||||
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
|
public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
|
||||||
final char[] charBuffer = buffer.buffer;
|
final char[] charBuffer = buffer.buffer;
|
||||||
buffer.offset = 0;
|
buffer.offset = 0;
|
||||||
charBuffer[0] = buffer.lastTrailingHighSurrogate;
|
final int offset;
|
||||||
final int offset = buffer.lastTrailingHighSurrogate == 0 ? 0 : 1;
|
|
||||||
buffer.lastTrailingHighSurrogate = 0;
|
// Install the previously saved ending high surrogate:
|
||||||
final int read = reader.read(charBuffer, offset, charBuffer.length
|
if (buffer.lastTrailingHighSurrogate != 0) {
|
||||||
- offset);
|
charBuffer[0] = buffer.lastTrailingHighSurrogate;
|
||||||
|
offset = 1;
|
||||||
|
} else {
|
||||||
|
offset = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
final int read = reader.read(charBuffer,
|
||||||
|
offset,
|
||||||
|
charBuffer.length - offset);
|
||||||
if (read == -1) {
|
if (read == -1) {
|
||||||
buffer.length = offset;
|
buffer.length = offset;
|
||||||
|
buffer.lastTrailingHighSurrogate = 0;
|
||||||
return offset != 0;
|
return offset != 0;
|
||||||
}
|
}
|
||||||
|
assert read > 0;
|
||||||
buffer.length = read + offset;
|
buffer.length = read + offset;
|
||||||
// special case if the read returns 0 and the lastTrailingHighSurrogate was set
|
|
||||||
|
// If we read only a single char, and that char was a
|
||||||
|
// high surrogate, read again:
|
||||||
|
if (buffer.length == 1
|
||||||
|
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
|
||||||
|
final int read2 = reader.read(charBuffer,
|
||||||
|
1,
|
||||||
|
charBuffer.length - 1);
|
||||||
|
if (read2 == -1) {
|
||||||
|
// NOTE: mal-formed input (ended on a high
|
||||||
|
// surrogate)! Consumer must deal with it...
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
assert read2 > 0;
|
||||||
|
|
||||||
|
buffer.length += read2;
|
||||||
|
}
|
||||||
|
|
||||||
if (buffer.length > 1
|
if (buffer.length > 1
|
||||||
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
|
&& Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
|
||||||
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
|
buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
|
||||||
|
} else {
|
||||||
|
buffer.lastTrailingHighSurrogate = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -201,7 +232,7 @@ public abstract class CharacterUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final int codePointAt(final char[] chars, final int offset) {
|
public int codePointAt(final char[] chars, final int offset) {
|
||||||
return chars[offset];
|
return chars[offset];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -238,7 +269,9 @@ public abstract class CharacterUtils {
|
||||||
private final char[] buffer;
|
private final char[] buffer;
|
||||||
private int offset;
|
private int offset;
|
||||||
private int length;
|
private int length;
|
||||||
private char lastTrailingHighSurrogate = 0;
|
// NOTE: not private so outer class can access without
|
||||||
|
// $access methods:
|
||||||
|
char lastTrailingHighSurrogate;
|
||||||
|
|
||||||
CharacterBuffer(char[] buffer, int offset, int length) {
|
CharacterBuffer(char[] buffer, int offset, int length) {
|
||||||
this.buffer = buffer;
|
this.buffer = buffer;
|
||||||
|
|
Loading…
Reference in New Issue