SANDBOX-206: fix whitespace handling w/ escaping, add an option to not remove trailing whitespace

git-svn-id: https://svn.apache.org/repos/asf/commons/sandbox/csv/trunk@609327 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2008-01-06 15:13:09 +00:00
parent b55fb21d78
commit 086f434320
4 changed files with 47 additions and 27 deletions

View File

@ -399,47 +399,39 @@ public class CSVParser {
* @throws IOException on stream access error
*/
private Token simpleTokenLexer(Token tkn, int c) throws IOException {
    // Lexes a non-encapsulated token that starts with character c.
    // Characters are appended to tkn.content exactly as they are read, so
    // whitespace keeps its position relative to escaped characters.
    // The loop ends on the first terminator: end of line, end of file or
    // the strategy's delimiter. Each one sets tkn.type and tkn.isReady.
    for (;;) {
        if (isEndOfLine(c)) {
            // end of record
            tkn.type = TT_EORECORD;
            tkn.isReady = true;
            break;
        } else if (isEndOfFile(c)) {
            // end of file
            tkn.type = TT_EOF;
            tkn.isReady = true;
            break;
        } else if (c == strategy.getDelimiter()) {
            // end of token
            tkn.type = TT_TOKEN;
            tkn.isReady = true;
            break;
        } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
            // interpret unicode escaped chars (like \u0070 -> p)
            tkn.content.append((char) unicodeEscapeLexer(c));
        } else if (c == strategy.getEscape()) {
            // escape character: readEscape yields the character to store
            tkn.content.append((char) readEscape(c));
        } else {
            // ordinary character, whitespace included
            tkn.content.append((char) c);
        }
        // advance exactly one character per iteration
        c = in.read();
    }

    // Trailing whitespace is removed once, after the whole token has been
    // collected, and only when the strategy asks for it.
    if (strategy.getIgnoreTrailingWhitespaces()) {
        tkn.content.trimTrailingWhitespace();
    }

    return tkn;
}

View File

@ -30,6 +30,7 @@ public class CSVStrategy implements Cloneable, Serializable {
private char commentStart;
private char escape;
private boolean ignoreLeadingWhitespaces;
private boolean ignoreTrailingWhitespaces;
private boolean interpretUnicodeEscapes;
private boolean ignoreEmptyLines;
@ -40,9 +41,9 @@ public class CSVStrategy implements Cloneable, Serializable {
// Sentinel passed as the commentStart argument by the predefined strategies.
public static char COMMENTS_DISABLED = (char)-2;
// Sentinel passed as the escape argument by the predefined strategies.
// NOTE(review): same value as COMMENTS_DISABLED.
public static char ESCAPE_DISABLED = (char)-2;

// Predefined strategies. Arguments, in order: delimiter, encapsulator,
// commentStart, escape, ignoreLeadingWhitespace, ignoreTrailingWhitespace,
// interpretUnicodeEscapes, ignoreEmptyLines.
public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, true, false, true);
public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false, false);
public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, true, false, true);
public CSVStrategy(char delimiter, char encapsulator, char commentStart) {
@ -67,6 +68,7 @@ public class CSVStrategy implements Cloneable, Serializable {
char commentStart,
char escape,
boolean ignoreLeadingWhitespace,
boolean ignoreTrailingWhitespace,
boolean interpretUnicodeEscapes,
boolean ignoreEmptyLines)
{
@ -75,6 +77,7 @@ public class CSVStrategy implements Cloneable, Serializable {
setCommentStart(commentStart);
setEscape(escape);
setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace);
setIgnoreTrailingWhitespaces(ignoreTrailingWhitespace);
setUnicodeEscapeInterpretation(interpretUnicodeEscapes);
setIgnoreEmptyLines(ignoreEmptyLines);
}
@ -88,7 +91,7 @@ public class CSVStrategy implements Cloneable, Serializable {
boolean interpretUnicodeEscapes,
boolean ignoreEmptyLines)
{
this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,interpretUnicodeEscapes,ignoreEmptyLines);
this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,true,interpretUnicodeEscapes,ignoreEmptyLines);
}
@ -108,6 +111,9 @@ public class CSVStrategy implements Cloneable, Serializable {
/**
 * Stores the leading-whitespace flag of this strategy.
 *
 * @param ignoreLeadingWhitespaces the new flag value
 */
public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) {
    this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces;
}
/**
 * Reads the leading-whitespace flag of this strategy.
 *
 * @return the value last stored by {@code setIgnoreLeadingWhitespaces}
 */
public boolean getIgnoreLeadingWhitespaces() {
    return ignoreLeadingWhitespaces;
}
/**
 * Stores the trailing-whitespace flag of this strategy.
 *
 * @param ignoreTrailingWhitespaces the new flag value
 */
public void setIgnoreTrailingWhitespaces(boolean ignoreTrailingWhitespaces) {
    this.ignoreTrailingWhitespaces = ignoreTrailingWhitespaces;
}
/**
 * Reads the trailing-whitespace flag of this strategy.
 *
 * @return the value last stored by {@code setIgnoreTrailingWhitespaces}
 */
public boolean getIgnoreTrailingWhitespaces() {
    return ignoreTrailingWhitespaces;
}
/**
 * Stores the flag that controls interpretation of unicode escapes.
 *
 * @param interpretUnicodeEscapes the new flag value
 */
public void setUnicodeEscapeInterpretation(boolean interpretUnicodeEscapes) {
    this.interpretUnicodeEscapes = interpretUnicodeEscapes;
}
/**
 * Reads the flag that controls interpretation of unicode escapes.
 *
 * @return the value last stored by {@code setUnicodeEscapeInterpretation}
 */
public boolean getUnicodeEscapeInterpretation() {
    return interpretUnicodeEscapes;
}

View File

@ -24,7 +24,7 @@ package org.apache.commons.csv;
* grows as necessary.
* This class is not thread safe.
*
* @author Ortwin Glück
*/
public class CharBuffer {
private char[] c;
@ -65,7 +65,7 @@ public class CharBuffer {
public int length() {
    // number of characters currently held, as tracked by the length field
    final int used = this.length;
    return used;
}
/**
* Returns the current capacity of the buffer.
* @return the maximum number of characters that can be stored in this buffer without
@ -74,6 +74,7 @@ public class CharBuffer {
public int capacity() {
    // the capacity is the size of the backing array
    final char[] backing = this.c;
    return backing.length;
}
/**
* Appends the contents of <code>cb</code> to the end of this CharBuffer.
@ -142,6 +143,15 @@ public class CharBuffer {
c = newc;
}
/**
 * Shrinks the buffer so that it no longer ends in whitespace.
 * Only the logical length is reduced; the backing array is left untouched.
 * Whitespace is determined by {@link Character#isWhitespace(char)}.
 */
public void trimTrailingWhitespace() {
    int end = length;
    // walk backwards past every whitespace character at the tail
    for (; end > 0; end--) {
        if (!Character.isWhitespace(c[end - 1])) {
            break;
        }
    }
    length = end;
}
/**
* Returns the contents of the buffer as a char[]. The returned array may
* be the internal array of the buffer, so the caller must take care when
@ -156,7 +166,14 @@ public class CharBuffer {
System.arraycopy(c, 0, chars, 0, length);
return chars;
}
/**
 * Returns the character at the specified position.
 *
 * @param pos zero-based index into the current content of the buffer
 * @return the character stored at {@code pos}
 * @throws ArrayIndexOutOfBoundsException if {@code pos} is negative or
 *         not less than {@link #length()}
 */
public char charAt(int pos) {
    // Slots between length and the capacity can still hold characters that
    // were logically removed (e.g. by trimTrailingWhitespace()), so they
    // must not be readable. A negative pos is rejected by the array access.
    if (pos >= length) {
        throw new ArrayIndexOutOfBoundsException(pos);
    }
    return c[pos];
}
/**
* Converts the contents of the buffer into a StringBuffer.
* This method involves copying the new data once!

View File

@ -485,6 +485,8 @@ public class CSVParserTest extends TestCase {
+ "/,,/,\n" // 5) separator escaped
+ "//,//\n" // 6) escape escaped
+ "'//','//'\n" // 7) escape escaped in encapsulation
+ " 8 , \"quoted \"\" /\" // string\" \n" // don't eat spaces
+ "9, /\n \n" // escaped newline
+ "";
String[][] res = {
{ "one", "two", "three" }, // 0
@ -495,10 +497,12 @@ public class CSVParserTest extends TestCase {
{ ",", "," }, // 5
{ "/", "/" }, // 6
{ "/", "/" }, // 7
{ " 8 ", " \"quoted \"\" \" / string\" " },
{ "9", " \n " },
};
CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true);
CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',false,false,true,true);
CSVParser parser = new CSVParser(new StringReader(code), strategy);
System.out.println("---------\n" + code + "\n-------------");
@ -513,6 +517,7 @@ public class CSVParserTest extends TestCase {
}
public void testUnicodeEscape() throws IOException {
String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063";
CSVParser parser = new CSVParser(new StringReader(code));