CSV-278: Reuse Buffers in Lexer for Delimiter Detection (#162)

* CSV-278: Reuse Buffers in Lexer for Delimiter Detection

* Remove erroneous tab character

* Reduce change set with fewer formatting changes

* Reduce change set with fewer formatting changes
This commit is contained in:
belugabehr 2021-07-15 09:41:22 -04:00 committed by GitHub
parent a4e005fdf5
commit 3ac702b190
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 12 deletions

View File

@ -132,6 +132,21 @@ final class ExtendedBufferedReader extends BufferedReader {
*/
char[] lookAhead(final int n) throws IOException {
    // Hand a freshly allocated buffer of the requested size to the array-based
    // overload, which fills it and returns that same array.
    return lookAhead(new char[n]);
}
/**
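* Populates the buffer with the next {@code buf.length} characters in the
* current reader without consuming them. The next call to {@link #read()} will
* still return the next value. This doesn't affect line number or last
* character. If fewer than {@code buf.length} characters remain in the reader,
* only the leading part of the buffer is overwritten and the remaining elements
* keep whatever values they held before the call.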
*
* @param buf the buffer to fill for the look ahead.
* @return the buffer itself
* @throws IOException If an I/O error occurs
*/
char[] lookAhead(final char[] buf) throws IOException {
final int n = buf.length;
super.mark(n);
super.read(buf, 0, n);
super.reset();

View File

@ -49,6 +49,8 @@ final class Lexer implements Closeable {
private static final char DISABLED = '\ufffe';
private final char[] delimiter;
private final char[] delimiterBuf;
private final char[] escapeDelimiterBuf;
private final char escape;
private final char quoteChar;
private final char commentStart;
@ -68,6 +70,8 @@ final class Lexer implements Closeable {
this.commentStart = mapNullToDisabled(format.getCommentMarker());
this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces();
this.ignoreEmptyLines = format.getIgnoreEmptyLines();
this.delimiterBuf = new char[delimiter.length - 1];
this.escapeDelimiterBuf = new char[2 * delimiter.length - 1];
}
/**
@ -112,7 +116,7 @@ final class Lexer implements Closeable {
}
/**
* Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#lookAhead(int)}
* Determine whether the next characters constitute a delimiter through {@link ExtendedBufferedReader#lookAhead(char[])}.
*
* @param ch
* the current character.
@ -126,14 +130,13 @@ final class Lexer implements Closeable {
if (delimiter.length == 1) {
return true;
}
final int len = delimiter.length - 1;
final char[] buf = reader.lookAhead(len);
for (int i = 0; i < len; i++) {
if (buf[i] != delimiter[i+1]) {
reader.lookAhead(delimiterBuf);
for (int i = 0; i < delimiterBuf.length; i++) {
if (delimiterBuf[i] != delimiter[i+1]) {
return false;
}
}
final int count = reader.read(buf, 0, len);
final int count = reader.read(delimiterBuf, 0, delimiterBuf.length);
return count != END_OF_STREAM;
}
@ -156,7 +159,7 @@ final class Lexer implements Closeable {
}
/**
* Tests if the next characters constitute a escape delimiter through {@link ExtendedBufferedReader#lookAhead(int)}.
* Tests if the next characters constitute an escape delimiter through {@link ExtendedBufferedReader#lookAhead(char[])}.
*
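* For example, for delimiter "[|]" and escape '!', return true if the next characters constitute "[!|!]"
* (the look-ahead starts at the first delimiter character, so the whole escaped sequence reads "![!|!]"
* when it is preceded by the escape character).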
*
@ -164,17 +167,16 @@ final class Lexer implements Closeable {
* @throws IOException If an I/O error occurs.
*/
boolean isEscapeDelimiter() throws IOException {
final int len = 2 * delimiter.length - 1;
final char[] buf = reader.lookAhead(len);
if (buf[0] != delimiter[0]) {
reader.lookAhead(escapeDelimiterBuf);
if (escapeDelimiterBuf[0] != delimiter[0]) {
return false;
}
for (int i = 1; i < delimiter.length; i++) {
if (buf[2 * i] != delimiter[i] || buf[2 * i - 1] != escape) {
if (escapeDelimiterBuf[2 * i] != delimiter[i] || escapeDelimiterBuf[2 * i - 1] != escape) {
return false;
}
}
final int count = reader.read(buf, 0, len);
final int count = reader.read(escapeDelimiterBuf, 0, escapeDelimiterBuf.length);
return count != END_OF_STREAM;
}