diff --git a/src/java/org/apache/commons/csv/CSVParser.java b/src/java/org/apache/commons/csv/CSVParser.java index a6535f03..03fa2e9e 100644 --- a/src/java/org/apache/commons/csv/CSVParser.java +++ b/src/java/org/apache/commons/csv/CSVParser.java @@ -399,47 +399,39 @@ public class CSVParser { * @throws IOException on stream access error */ private Token simpleTokenLexer(Token tkn, int c) throws IOException { - wsBuf.clear(); for (;;) { if (isEndOfLine(c)) { // end of record tkn.type = TT_EORECORD; tkn.isReady = true; - return tkn; + break; } else if (isEndOfFile(c)) { // end of file tkn.type = TT_EOF; tkn.isReady = true; - return tkn; + break; } else if (c == strategy.getDelimiter()) { // end of token tkn.type = TT_TOKEN; tkn.isReady = true; - return tkn; + break; } else if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') { // interpret unicode escaped chars (like \u0070 -> p) tkn.content.append((char) unicodeEscapeLexer(c)); - } else if (isWhitespace(c)) { - // gather whitespaces - // (as long as they are not at the beginning of a token) - if (tkn.content.length() > 0) { - wsBuf.append((char) c); - } } else if (c == strategy.getEscape()) { tkn.content.append((char)readEscape(c)); } else { - // prepend whitespaces (if we have) - if (wsBuf.length() > 0) { - tkn.content.append(wsBuf); - wsBuf.clear(); - } tkn.content.append((char) c); } - // get the next char - if (!tkn.isReady) { - c = in.read(); - } + + c = in.read(); } + + if (strategy.getIgnoreTrailingWhitespaces()) { + tkn.content.trimTrailingWhitespace(); + } + + return tkn; } diff --git a/src/java/org/apache/commons/csv/CSVStrategy.java b/src/java/org/apache/commons/csv/CSVStrategy.java index 9ef30315..df9e3c89 100644 --- a/src/java/org/apache/commons/csv/CSVStrategy.java +++ b/src/java/org/apache/commons/csv/CSVStrategy.java @@ -30,6 +30,7 @@ public class CSVStrategy implements Cloneable, Serializable { private char commentStart; private char escape; private boolean 
ignoreLeadingWhitespaces; + private boolean ignoreTrailingWhitespaces; private boolean interpretUnicodeEscapes; private boolean ignoreEmptyLines; @@ -40,9 +41,9 @@ public class CSVStrategy implements Cloneable, Serializable { public static char COMMENTS_DISABLED = (char)-2; public static char ESCAPE_DISABLED = (char)-2; - public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true); - public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false); - public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, false, true); + public static CSVStrategy DEFAULT_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, true, false, true); + public static CSVStrategy EXCEL_STRATEGY = new CSVStrategy(',', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, false, false, false, false); + public static CSVStrategy TDF_STRATEGY = new CSVStrategy(' ', '"', COMMENTS_DISABLED, ESCAPE_DISABLED, true, true, false, true); public CSVStrategy(char delimiter, char encapsulator, char commentStart) { @@ -67,6 +68,7 @@ public class CSVStrategy implements Cloneable, Serializable { char commentStart, char escape, boolean ignoreLeadingWhitespace, + boolean ignoreTrailingWhitespace, boolean interpretUnicodeEscapes, boolean ignoreEmptyLines) { @@ -75,6 +77,7 @@ public class CSVStrategy implements Cloneable, Serializable { setCommentStart(commentStart); setEscape(escape); setIgnoreLeadingWhitespaces(ignoreLeadingWhitespace); + setIgnoreTrailingWhitespaces(ignoreTrailingWhitespace); setUnicodeEscapeInterpretation(interpretUnicodeEscapes); setIgnoreEmptyLines(ignoreEmptyLines); } @@ -88,7 +91,7 @@ public class CSVStrategy implements Cloneable, Serializable { boolean interpretUnicodeEscapes, boolean ignoreEmptyLines) { - 
this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,interpretUnicodeEscapes,ignoreEmptyLines); + this(delimiter,encapsulator,commentStart,CSVStrategy.ESCAPE_DISABLED,ignoreLeadingWhitespace,true,interpretUnicodeEscapes,ignoreEmptyLines); } @@ -108,6 +111,9 @@ public class CSVStrategy implements Cloneable, Serializable { public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; } public boolean getIgnoreLeadingWhitespaces() { return this.ignoreLeadingWhitespaces; } + public void setIgnoreTrailingWhitespaces(boolean ignoreTrailingWhitespaces) { this.ignoreTrailingWhitespaces = ignoreTrailingWhitespaces; } + public boolean getIgnoreTrailingWhitespaces() { return this.ignoreTrailingWhitespaces; } + public void setUnicodeEscapeInterpretation(boolean interpretUnicodeEscapes) { this.interpretUnicodeEscapes = interpretUnicodeEscapes; } public boolean getUnicodeEscapeInterpretation() { return this.interpretUnicodeEscapes; } diff --git a/src/java/org/apache/commons/csv/CharBuffer.java b/src/java/org/apache/commons/csv/CharBuffer.java index b8e03cd0..a50cde0d 100644 --- a/src/java/org/apache/commons/csv/CharBuffer.java +++ b/src/java/org/apache/commons/csv/CharBuffer.java @@ -24,7 +24,7 @@ package org.apache.commons.csv; * grows as necessary. * This class is not thread safe. * - * @author Ortwin Glück + * @author Ortwin Glück */ public class CharBuffer { private char[] c; @@ -65,7 +65,7 @@ public class CharBuffer { public int length() { return length; } - + /** * Returns the current capacity of the buffer. * @return the maximum number of characters that can be stored in this buffer without @@ -74,6 +74,7 @@ public class CharBuffer { public int capacity() { return c.length; } + /** * Appends the contents of cb to the end of this CharBuffer. @@ -142,6 +143,15 @@ public class CharBuffer { c = newc; } + /** + * Removes trailing whitespace. 
+ */ + public void trimTrailingWhitespace() { + while (length>0 && Character.isWhitespace(c[length-1])) { + length--; + } + } + /** * Returns the contents of the buffer as a char[]. The returned array may * be the internal array of the buffer, so the caller must take care when @@ -156,7 +166,14 @@ public class CharBuffer { System.arraycopy(c, 0, chars, 0, length); return chars; } - + + /** + * Returns the character at the specified position. + */ + public char charAt(int pos) { + return c[pos]; + } + /** * Converts the contents of the buffer into a StringBuffer. * This method involves copying the new data once! diff --git a/src/test/org/apache/commons/csv/CSVParserTest.java b/src/test/org/apache/commons/csv/CSVParserTest.java index a95ff7c8..46b0c024 100644 --- a/src/test/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/org/apache/commons/csv/CSVParserTest.java @@ -485,6 +485,8 @@ public class CSVParserTest extends TestCase { + "/,,/,\n" // 5) separator escaped + "//,//\n" // 6) escape escaped + "'//','//'\n" // 7) escape escaped in encapsulation + + " 8 , \"quoted \"\" /\" // string\" \n" // don't eat spaces + + "9, /\n \n" // escaped newline + ""; String[][] res = { { "one", "two", "three" }, // 0 @@ -495,10 +497,12 @@ public class CSVParserTest extends TestCase { { ",", "," }, // 5 { "/", "/" }, // 6 { "/", "/" }, // 7 + { " 8 ", " \"quoted \"\" \" / string\" " }, + { "9", " \n " }, }; - CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',true,true,true); + CSVStrategy strategy = new CSVStrategy(',','\'',CSVStrategy.COMMENTS_DISABLED,'/',false,false,true,true); CSVParser parser = new CSVParser(new StringReader(code), strategy); System.out.println("---------\n" + code + "\n-------------"); @@ -513,6 +517,7 @@ public class CSVParserTest extends TestCase { } + public void testUnicodeEscape() throws IOException { String code = "abc,\\u0070\\u0075\\u0062\\u006C\\u0069\\u0063"; CSVParser parser = new CSVParser(new 
StringReader(code));