LUCENE-9191: ensure LineFileDocs random seeking effort does not seek into the middle of a multi-byte UTF-8 encoded Unicode character

This commit is contained in:
Mike McCandless 2020-05-04 13:29:00 -04:00
parent b81083142c
commit 1783c4ad47
1 changed files with 9 additions and 8 deletions

View File

@ -88,7 +88,6 @@ public class LineFileDocs implements Closeable {
// true if, once the if/else block below has run, the InputStream still has not been randomly seeked:
boolean needSkip;
boolean skipFirstLineFragment = false;
long size = 0L, seekTo = 0L;
if (is == null) {
@ -109,8 +108,15 @@ public class LineFileDocs implements Closeable {
channel.position(seekTo);
is = Channels.newInputStream(channel);
// we (likely) seeked to the middle of a line:
skipFirstLineFragment = true;
// read up to the next newline char; otherwise we may hit a "java.nio.charset.MalformedInputException: Input length = 1"
// exception in readLine() below, because we seeked partway through a multi-byte (UTF-8 encoded)
// Unicode character:
if (seekTo > 0L) {
int b;
do {
b = is.read();
} while (b >= 0 && b != 13 && b != 10);
}
needSkip = false;
}
@ -169,11 +175,6 @@ public class LineFileDocs implements Closeable {
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
if (skipFirstLineFragment) {
// read until end of line:
reader.readLine();
}
}
public synchronized void reset() throws IOException {