mirror of
https://github.com/apache/commons-csv.git
synced 2025-02-28 05:49:04 +00:00
SANDBOX-206: fix whitespace handling w/ escaping, add an option to not remove trailing whitespace
git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@609327 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b55fb21d78
commit
086f434320
@ -399,47 +399,39 @@ public class CSVParser {
|
||||
* @throws IOException on stream access error
|
||||
*/
|
||||
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
|
||||
wsBuf.clear();
|
||||
for (;;) {
|
||||
if (isEndOfLine(c)) {
|
||||
// end of record
|
||||
tkn.type = TT_EORECORD;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
break;
|
||||
} else if (isEndOfFile(c)) {
|
||||
// end of file
|
||||
tkn.type = TT_EOF;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
break;
|
||||
} else if (c == strategy.getDelimiter()) {
|
||||
// end of token
|
||||
tkn.type = TT_TOKEN;
|
||||
tkn.isReady = true;
|
||||
return tkn;
|
||||
break;
|
||||
} else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
|
||||
// interpret unicode escaped chars (like \u0070 -> p)
|
||||
tkn.content.append((char) unicodeEscapeLexer(c));
|
||||
} else if (isWhitespace(c)) {
|
||||
// gather whitespaces
|
||||
// (as long as they are not at the beginning of a token)
|
||||
if (tkn.content.length() > 0) {
|
||||
wsBuf.append((char) c);
|
||||
}
|
||||
} else if (c == strategy.getEscape()) {
|
||||
tkn.content.append((char)readEscape(c));
|
||||
} else {
|
||||
// prepend whitespaces (if we have)
|
||||
if (wsBuf.length() > 0) {
|
||||
tkn.content.append(wsBuf);
|
||||
wsBuf.clear();
|
||||
}
|
||||
tkn.content.append((char) c);
|
||||
}
|
||||
// get the next char
|
||||
if (!tkn.isReady) {
|
||||
c = in.read();
|
||||
}
|
||||
|
||||
c = in.read();
|
||||
}
|
||||
|
||||
if (strategy.getIgnoreTrailingWhitespaces()) {
|
||||
tkn.content.trimTrailingWhitespace();
|
||||
}
|
||||
|
||||
return tkn;
|
||||
}
|
||||
|
||||
|
||||
|
@ -30,6 +30,7 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
private char commentStart;
|
||||
private char escape;
|
||||
private boolean ignoreLeadingWhitespaces;
|
||||
private boolean ignoreTrailingWhitespaces;
|
||||
private boolean interpretUnicodeEscapes;
|
||||
private boolean ignoreEmptyLines;
|
||||
|
||||
@ -40,9 +41,9 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
public static char COMMENTS_DISABLED = (char)-2;
|
||||
public static char ESCAPE_DISABLED = (char)-2;
|
||||
|
||||
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
|
||||
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false);
|
||||
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true);
|
||||
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, true, false, true);
|
||||
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false, false);
|
||||
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, true, false, true);
|
||||
|
||||
|
||||
public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
|
||||
@ -67,6 +68,7 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
char commentStart,
|
||||
char escape,
|
||||
boolean ignoreLeadingWhitespace,
|
||||
boolean ignoreTrailingWhitespace,
|
||||
boolean interpretUnicodeEscapes,
|
||||
boolean ignoreEmptyLines)
|
||||
{
|
||||
@ -75,6 +77,7 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
setCommentStart(commentStart);
|
||||
setEscape(escape);
|
||||
setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
|
||||
setIgnoreTrailingWhitespaces(ignoreTrailingWhitespace);
|
||||
setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
|
||||
setIgnoreEmptyLines(ignoreEmptyLines);
|
||||
}
|
||||
@ -88,7 +91,7 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
boolean interpretUnicodeEscapes,
|
||||
boolean ignoreEmptyLines)
|
||||
{
|
||||
this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,interpretUnicodeEscapes,ignoreEmptyLines);
|
||||
this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,true,interpretUnicodeEscapes,ignoreEmptyLines);
|
||||
}
|
||||
|
||||
|
||||
@ -108,6 +111,9 @@ public class CSVStrategy implements Cloneable, Serializable {
|
||||
public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; }
|
||||
public boolean getIgnoreLeadingWhitespaces() { return this.ignoreLeadingWhitespaces; }
|
||||
|
||||
public void setIgnoreTrailingWhitespaces(boolean ignoreTrailingWhitespaces) { this.ignoreTrailingWhitespaces = ignoreTrailingWhitespaces; }
|
||||
public boolean getIgnoreTrailingWhitespaces() { return this.ignoreTrailingWhitespaces; }
|
||||
|
||||
public void setUnicodeEscapeInterpretation(boolean interpretUnicodeEscapes) { this.interpretUnicodeEscapes = interpretUnicodeEscapes; }
|
||||
public boolean getUnicodeEscapeInterpretation() { return this.interpretUnicodeEscapes; }
|
||||
|
||||
|
@ -24,7 +24,7 @@ package org.apache.commons.csv;
|
||||
* grows as necessary.
|
||||
* This class is not thread safe.
|
||||
*
|
||||
* @author Ortwin Glück
|
||||
* @author Ortwin Glück
|
||||
*/
|
||||
public class CharBuffer {
|
||||
private char[] c;
|
||||
@ -65,7 +65,7 @@ public class CharBuffer {
|
||||
public int length() {
|
||||
return length;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the current capacity of the buffer.
|
||||
* @return the maximum number of characters that can be stored in this buffer without
|
||||
@ -74,6 +74,7 @@ public class CharBuffer {
|
||||
public int capacity() {
|
||||
return c.length;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Appends the contents of <code>cb</code> to the end of this CharBuffer.
|
||||
@ -142,6 +143,15 @@ public class CharBuffer {
|
||||
c = newc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes trailing whitespace.
|
||||
*/
|
||||
public void trimTrailingWhitespace() {
|
||||
while (length>0 && Character.isWhitespace(c[length-1])) {
|
||||
length--;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the contents of the buffer as a char[]. The returned array may
|
||||
* be the internal array of the buffer, so the caller must take care when
|
||||
@ -156,7 +166,14 @@ public class CharBuffer {
|
||||
System.arraycopy(c, 0, chars, 0, length);
|
||||
return chars;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the character at the specified position.
|
||||
*/
|
||||
public char charAt(int pos) {
|
||||
return c[pos];
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts the contents of the buffer into a StringBuffer.
|
||||
* This method involves copying the new data once!
|
||||
|
@ -485,6 +485,8 @@ public class CSVParserTest extends TestCase {
|
||||
+ "/,,/,\n" // 5) separator escaped
|
||||
+ "//,//\n" // 6) escape escaped
|
||||
+ "'//','//'\n" // 7) escape escaped in encapsulation
|
||||
+ " 8 , \"quoted \"\" /\" // string\" \n" // don't eat spaces
|
||||
+ "9, /\n \n" // escaped newline
|
||||
+ "";
|
||||
String[][] res = {
|
||||
{ "one", "two", "three" }, // 0
|
||||
@ -495,10 +497,12 @@ public class CSVParserTest extends TestCase {
|
||||
{ ",", "," }, // 5
|
||||
{ "/", "/" }, // 6
|
||||
{ "/", "/" }, // 7
|
||||
{ " 8 ", " \"quoted \"\" \" / string\" " },
|
||||
{ "9", " \n " },
|
||||
};
|
||||
|
||||
|
||||
CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);
|
||||
CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',false,false,true,true);
|
||||
|
||||
CSVParser parser = new CSVParser(new StringReader(code), strategy);
|
||||
System.out.println("---------\n" + code + "\n-------------");
|
||||
@ -513,6 +517,7 @@ public class CSVParserTest extends TestCase {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void testUnicodeEscape() throws IOException {
|
||||
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
|
||||
CSVParser parser = new CSVParser(new StringReader(code));
|
||||
|
Loading…
x
Reference in New Issue
Block a user