diff --git a/pom.xml b/pom.xml index f04501c1..bd5ec200 100644 --- a/pom.xml +++ b/pom.xml @@ -240,25 +240,25 @@ - src/test/resources/csv-167/sample1.csv - src/test/resources/CSV-198/optd_por_public.csv - src/test/resources/CSV-213/999751170.patch.csv - src/test/resources/CSVFileParser/bom.csv - src/test/resources/CSVFileParser/test.csv - src/test/resources/CSVFileParser/test_default.txt - src/test/resources/CSVFileParser/test_default_comment.txt - src/test/resources/CSVFileParser/test_rfc4180.txt - src/test/resources/CSVFileParser/test_rfc4180_trim.txt - src/test/resources/CSVFileParser/testCSV85.csv - src/test/resources/CSVFileParser/testCSV85_default.txt - src/test/resources/CSVFileParser/testCSV85_ignoreEmpty.txt + src/test/resources/org/apache/commons/csv/csv-167/sample1.csv + src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv + src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv + src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv + src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv + src/test/resources/org/apache/commons/csv/CSVFileParser/test_default.txt + src/test/resources/org/apache/commons/csv/CSVFileParser/test_default_comment.txt + src/test/resources/org/apache/commons/csv/CSVFileParser/test_rfc4180.txt + src/test/resources/org/apache/commons/csv/CSVFileParser/test_rfc4180_trim.txt + src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85.csv + src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85_default.txt + src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85_ignoreEmpty.txt - src/test/resources/ferc.gov/contract.txt - src/test/resources/ferc.gov/transaction.txt + src/test/resources/org/apache/commons/csv/ferc.gov/contract.txt + src/test/resources/org/apache/commons/csv/ferc.gov/transaction.txt src/test/resources/**/*.bin - src/test/resources/CSV-259/sample.txt - src/test/resources/CSVFileParser/testCSV246.csv - 
src/test/resources/CSVFileParser/testCSV246_checkWithNoComment.txt + src/test/resources/org/apache/commons/csv/CSV-259/sample.txt + src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV246.csv + src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV246_checkWithNoComment.txt diff --git a/src/test/java/org/apache/commons/csv/CSVFileParserTest.java b/src/test/java/org/apache/commons/csv/CSVFileParserTest.java index 413ef617..2628f529 100644 --- a/src/test/java/org/apache/commons/csv/CSVFileParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVFileParserTest.java @@ -40,7 +40,7 @@ import org.junit.jupiter.params.provider.MethodSource; */ public class CSVFileParserTest { - private static final File BASE = new File("src/test/resources/CSVFileParser"); + private static final File BASE = new File("src/test/resources/org/apache/commons/csv/CSVFileParser"); private String readTestData(final BufferedReader reader) throws IOException { String line; @@ -134,7 +134,7 @@ public class CSVFileParserTest { assertEquals(line, format.toString(), testFile.getName() + " Expected format "); // Now parse the file and compare against the expected results - final URL resource = ClassLoader.getSystemResource("CSVFileParser/" + split[0]); + final URL resource = ClassLoader.getSystemResource("org/apache/commons/csv/CSVFileParser/" + split[0]); try (final CSVParser parser = CSVParser.parse(resource, Charset.forName("UTF-8"), format)) { for (final CSVRecord record : parser) { String parsed = Arrays.toString(record.values()); diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java index 87e8a5a4..d058cc42 100644 --- a/src/test/java/org/apache/commons/csv/CSVParserTest.java +++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java @@ -178,7 +178,7 @@ public class CSVParserTest { @Test @Disabled("CSV-107") public void testBOM() throws IOException { - final URL url = 
ClassLoader.getSystemClassLoader().getResource("CSVFileParser/bom.csv"); + final URL url = ClassLoader.getSystemClassLoader().getResource("org/apache/commons/csv/CSVFileParser/bom.csv"); try (final CSVParser parser = CSVParser.parse(url, Charset.forName(UTF_8_NAME), CSVFormat.EXCEL.withHeader())) { for (final CSVRecord record : parser) { final String string = record.get("Date"); @@ -190,7 +190,7 @@ public class CSVParserTest { @Test public void testBOMInputStream_ParserWithInputStream() throws IOException { - try (final BOMInputStream inputStream = createBOMInputStream("CSVFileParser/bom.csv"); + try (final BOMInputStream inputStream = createBOMInputStream("org/apache/commons/csv/CSVFileParser/bom.csv"); final CSVParser parser = CSVParser.parse(inputStream, UTF_8, CSVFormat.EXCEL.withHeader())) { for (final CSVRecord record : parser) { final String string = record.get("Date"); @@ -202,7 +202,9 @@ public class CSVParserTest { @Test public void testBOMInputStream_ParserWithReader() throws IOException { - try (final Reader reader = new InputStreamReader(createBOMInputStream("CSVFileParser/bom.csv"), UTF_8_NAME); + try ( + final Reader reader = new InputStreamReader( + createBOMInputStream("org/apache/commons/csv/CSVFileParser/bom.csv"), UTF_8_NAME); final CSVParser parser = new CSVParser(reader, CSVFormat.EXCEL.withHeader())) { for (final CSVRecord record : parser) { final String string = record.get("Date"); @@ -214,7 +216,9 @@ public class CSVParserTest { @Test public void testBOMInputStream_parseWithReader() throws IOException { - try (final Reader reader = new InputStreamReader(createBOMInputStream("CSVFileParser/bom.csv"), UTF_8_NAME); + try ( + final Reader reader = new InputStreamReader( + createBOMInputStream("org/apache/commons/csv/CSVFileParser/bom.csv"), UTF_8_NAME); final CSVParser parser = CSVParser.parse(reader, CSVFormat.EXCEL.withHeader())) { for (final CSVRecord record : parser) { final String string = record.get("Date"); @@ -941,7 +945,7 @@ public 
class CSVParserTest { @Test public void testParse() throws Exception { final ClassLoader loader = ClassLoader.getSystemClassLoader(); - final URL url = loader.getResource("CSVFileParser/test.csv"); + final URL url = loader.getResource("org/apache/commons/csv/CSVFileParser/test.csv"); final CSVFormat format = CSVFormat.DEFAULT.withHeader("A", "B", "C", "D"); final Charset charset = StandardCharsets.UTF_8; diff --git a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java index d3c56ee2..c7e305ee 100644 --- a/src/test/java/org/apache/commons/csv/CSVPrinterTest.java +++ b/src/test/java/org/apache/commons/csv/CSVPrinterTest.java @@ -315,7 +315,7 @@ public class CSVPrinterTest { @Test public void testCSV259() throws IOException { final StringWriter sw = new StringWriter(); - final Reader reader = new FileReader("src/test/resources/CSV-259/sample.txt"); + final Reader reader = new FileReader("src/test/resources/org/apache/commons/csv/CSV-259/sample.txt"); try (final CSVPrinter printer = new CSVPrinter(sw, CSVFormat.DEFAULT.withEscape('!').withQuote(null))) { printer.print(reader); assertEquals("x!,y!,z", sw.toString()); diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv167Test.java b/src/test/java/org/apache/commons/csv/issues/JiraCsv167Test.java index 0e6e49fb..5b8a20e5 100644 --- a/src/test/java/org/apache/commons/csv/issues/JiraCsv167Test.java +++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv167Test.java @@ -83,7 +83,7 @@ public class JiraCsv167Test { } private Reader getTestInput() { - final InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream("csv-167/sample1.csv"); + final InputStream is = ClassLoader.getSystemClassLoader().getResourceAsStream("org/apache/commons/csv/csv-167/sample1.csv"); return new InputStreamReader(is); } } diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv198Test.java 
b/src/test/java/org/apache/commons/csv/issues/JiraCsv198Test.java index f97c48d0..307610a2 100644 --- a/src/test/java/org/apache/commons/csv/issues/JiraCsv198Test.java +++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv198Test.java @@ -34,7 +34,7 @@ public class JiraCsv198Test { @Test public void test() throws UnsupportedEncodingException, IOException { - final InputStream pointsOfReference = getClass().getResourceAsStream("/CSV-198/optd_por_public.csv"); + final InputStream pointsOfReference = getClass().getResourceAsStream("/org/apache/commons/csv/CSV-198/optd_por_public.csv"); assertNotNull(pointsOfReference); try (@SuppressWarnings("resource") CSVParser parser = CSV_FORMAT.parse(new InputStreamReader(pointsOfReference, "UTF-8"))) { diff --git a/src/test/java/org/apache/commons/csv/issues/JiraCsv248Test.java b/src/test/java/org/apache/commons/csv/issues/JiraCsv248Test.java index bf640b6d..1005e835 100644 --- a/src/test/java/org/apache/commons/csv/issues/JiraCsv248Test.java +++ b/src/test/java/org/apache/commons/csv/issues/JiraCsv248Test.java @@ -75,6 +75,6 @@ public class JiraCsv248Test { } private static InputStream getTestInput() { - return ClassLoader.getSystemClassLoader().getResourceAsStream("CSV-248/csvRecord.bin"); + return ClassLoader.getSystemClassLoader().getResourceAsStream("org/apache/commons/csv/CSV-248/csvRecord.bin"); } } diff --git a/src/test/resources/CSV-198/optd_por_public.csv b/src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv similarity index 100% rename from src/test/resources/CSV-198/optd_por_public.csv rename to src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv diff --git a/src/test/resources/CSV-213/999751170.patch.csv b/src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv similarity index 100% rename from src/test/resources/CSV-213/999751170.patch.csv rename to src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv diff --git 
a/src/test/resources/CSV-248/csvRecord.bin b/src/test/resources/org/apache/commons/csv/CSV-248/csvRecord.bin similarity index 100% rename from src/test/resources/CSV-248/csvRecord.bin rename to src/test/resources/org/apache/commons/csv/CSV-248/csvRecord.bin diff --git a/src/test/resources/CSV-259/sample.txt b/src/test/resources/org/apache/commons/csv/CSV-259/sample.txt similarity index 100% rename from src/test/resources/CSV-259/sample.txt rename to src/test/resources/org/apache/commons/csv/CSV-259/sample.txt diff --git a/src/test/resources/CSVFileParser/README.txt b/src/test/resources/org/apache/commons/csv/CSVFileParser/README.txt similarity index 100% rename from src/test/resources/CSVFileParser/README.txt rename to src/test/resources/org/apache/commons/csv/CSVFileParser/README.txt diff --git a/src/test/resources/CSVFileParser/bom.csv b/src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv similarity index 100% rename from src/test/resources/CSVFileParser/bom.csv rename to src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv diff --git a/src/test/resources/CSVFileParser/test.csv b/src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv similarity index 93% rename from src/test/resources/CSVFileParser/test.csv rename to src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv index ebdb9525..93101ed3 100644 --- a/src/test/resources/CSVFileParser/test.csv +++ b/src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv @@ -1,16 +1,16 @@ -A,B,C,"D" -# plain values -a,b,c,d -# spaces before and after - e ,f , g,h -# quoted: with spaces before and after -" i ", " j " , " k "," l " -# empty values -,,, -# empty quoted values -"","","","" -# 3 empty lines - - - -# EOF on next line +A,B,C,"D" +# plain values +a,b,c,d +# spaces before and after + e ,f , g,h +# quoted: with spaces before and after +" i ", " j " , " k "," l " +# empty values +,,, +# empty quoted values +"","","","" +# 3 empty lines + + + +# EOF on next line diff 
--git a/src/test/resources/CSVFileParser/testCSV246.csv b/src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV246.csv similarity index 100% rename from src/test/resources/CSVFileParser/testCSV246.csv rename to src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV246.csv diff --git a/src/test/resources/CSVFileParser/testCSV246_checkWithNoComment.txt b/src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV246_checkWithNoComment.txt similarity index 100% rename from src/test/resources/CSVFileParser/testCSV246_checkWithNoComment.txt rename to src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV246_checkWithNoComment.txt diff --git a/src/test/resources/CSVFileParser/testCSV85.csv b/src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85.csv similarity index 91% rename from src/test/resources/CSVFileParser/testCSV85.csv rename to src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85.csv index b1baab30..69bb80e3 100644 --- a/src/test/resources/CSVFileParser/testCSV85.csv +++ b/src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85.csv @@ -1,9 +1,9 @@ -# Comment 1 -a,b,c,e,f -# Very Long -# Comment 2 -g,h,i,j,k -# Very Long - -# Comment 3 +# Comment 1 +a,b,c,e,f +# Very Long +# Comment 2 +g,h,i,j,k +# Very Long + +# Comment 3 l,m,n,o,p \ No newline at end of file diff --git a/src/test/resources/CSVFileParser/testCSV85_default.txt b/src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85_default.txt similarity index 100% rename from src/test/resources/CSVFileParser/testCSV85_default.txt rename to src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85_default.txt diff --git a/src/test/resources/CSVFileParser/testCSV85_ignoreEmpty.txt b/src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85_ignoreEmpty.txt similarity index 100% rename from src/test/resources/CSVFileParser/testCSV85_ignoreEmpty.txt rename to 
src/test/resources/org/apache/commons/csv/CSVFileParser/testCSV85_ignoreEmpty.txt diff --git a/src/test/resources/CSVFileParser/test_default.txt b/src/test/resources/org/apache/commons/csv/CSVFileParser/test_default.txt similarity index 100% rename from src/test/resources/CSVFileParser/test_default.txt rename to src/test/resources/org/apache/commons/csv/CSVFileParser/test_default.txt diff --git a/src/test/resources/CSVFileParser/test_default_comment.txt b/src/test/resources/org/apache/commons/csv/CSVFileParser/test_default_comment.txt similarity index 100% rename from src/test/resources/CSVFileParser/test_default_comment.txt rename to src/test/resources/org/apache/commons/csv/CSVFileParser/test_default_comment.txt diff --git a/src/test/resources/CSVFileParser/test_rfc4180.txt b/src/test/resources/org/apache/commons/csv/CSVFileParser/test_rfc4180.txt similarity index 100% rename from src/test/resources/CSVFileParser/test_rfc4180.txt rename to src/test/resources/org/apache/commons/csv/CSVFileParser/test_rfc4180.txt diff --git a/src/test/resources/CSVFileParser/test_rfc4180_trim.txt b/src/test/resources/org/apache/commons/csv/CSVFileParser/test_rfc4180_trim.txt similarity index 100% rename from src/test/resources/CSVFileParser/test_rfc4180_trim.txt rename to src/test/resources/org/apache/commons/csv/CSVFileParser/test_rfc4180_trim.txt diff --git a/src/test/resources/org/apache/commons/csv/CSVFormat.java b/src/test/resources/org/apache/commons/csv/CSVFormat.java new file mode 100644 index 00000000..c00f993f --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSVFormat.java @@ -0,0 +1,2330 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.csv; + +import static org.apache.commons.csv.Constants.BACKSLASH; +import static org.apache.commons.csv.Constants.COMMA; +import static org.apache.commons.csv.Constants.COMMENT; +import static org.apache.commons.csv.Constants.CR; +import static org.apache.commons.csv.Constants.CRLF; +import static org.apache.commons.csv.Constants.DOUBLE_QUOTE_CHAR; +import static org.apache.commons.csv.Constants.EMPTY; +import static org.apache.commons.csv.Constants.LF; +import static org.apache.commons.csv.Constants.PIPE; +import static org.apache.commons.csv.Constants.SP; +import static org.apache.commons.csv.Constants.TAB; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Reader; +import java.io.Serializable; +import java.io.StringWriter; +import java.io.Writer; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +/** + * Specifies the format of a CSV file and parses input. + * + *

Using predefined formats

+ * + *

+ * You can use one of the predefined formats: + *

+ * + * + * + *

+ * For example: + *

+ * + *
+ * CSVParser parser = CSVFormat.EXCEL.parse(reader);
+ * 
+ * + *

+ * The {@link CSVParser} provides static methods to parse other input types, for example: + *

+ * + *
+ * CSVParser parser = CSVParser.parse(file, StandardCharsets.US_ASCII, CSVFormat.EXCEL);
+ * 
+ * + *

Defining formats

+ * + *

+ * You can extend a format by calling the {@code with} methods. For example: + *

+ * + *
+ * CSVFormat.EXCEL.withNullString("N/A").withIgnoreSurroundingSpaces(true);
+ * 
+ * + *

Defining column names

+ * + *

+ * To define the column names you want to use to access records, write: + *

+ * + *
+ * CSVFormat.EXCEL.withHeader("Col1", "Col2", "Col3");
+ * 
+ * + *

+ * Calling {@link #withHeader(String...)} lets you use the given names to address values in a {@link CSVRecord}, and + * assumes that your CSV source does not contain a first record that also defines column names. + * + * If it does, then you are overriding this metadata with your names and you should skip the first record by calling + * {@link #withSkipHeaderRecord(boolean)} with {@code true}. + *

+ * + *

Parsing

+ * + *

+ * You can use a format directly to parse a reader. For example, to parse an Excel file with column headers, write: + *

+ * + *
+ * Reader in = ...;
+ * CSVFormat.EXCEL.withHeader("Col1", "Col2", "Col3").parse(in);
+ * 
+ * + *

+ * For other input types, like resources, files, and URLs, use the static methods on {@link CSVParser}. + *

+ * + *

Referencing columns safely

+ * + *

+ * If your source contains a header record, you can simplify your code and safely reference columns, by using + * {@link #withHeader(String...)} with no arguments: + *

+ * + *
+ * CSVFormat.EXCEL.withHeader();
+ * 
+ * + *

+ * This causes the parser to read the first record and use its values as column names. + * + * Then, call one of the {@link CSVRecord} get methods that take a String column name argument: + *

+ * + *
+ * String value = record.get("Col1");
+ * 
+ * + *

+ * This makes your code impervious to changes in column order in the CSV file. + *

+ * + *

Notes

+ * + *

+ * This class is immutable. + *

+ */ +public final class CSVFormat implements Serializable { + + /** + * Predefines formats. + * + * @since 1.2 + */ + public enum Predefined { + + /** + * @see CSVFormat#DEFAULT + */ + Default(CSVFormat.DEFAULT), + + /** + * @see CSVFormat#EXCEL + */ + Excel(CSVFormat.EXCEL), + + /** + * @see CSVFormat#INFORMIX_UNLOAD + * @since 1.3 + */ + InformixUnload(CSVFormat.INFORMIX_UNLOAD), + + /** + * @see CSVFormat#INFORMIX_UNLOAD_CSV + * @since 1.3 + */ + InformixUnloadCsv(CSVFormat.INFORMIX_UNLOAD_CSV), + + /** + * @see CSVFormat#MONGODB_CSV + * @since 1.7 + */ + MongoDBCsv(CSVFormat.MONGODB_CSV), + + /** + * @see CSVFormat#MONGODB_TSV + * @since 1.7 + */ + MongoDBTsv(CSVFormat.MONGODB_TSV), + + /** + * @see CSVFormat#MYSQL + */ + MySQL(CSVFormat.MYSQL), + + /** + * @see CSVFormat#ORACLE + */ + Oracle(CSVFormat.ORACLE), + + /** + * @see CSVFormat#POSTGRESQL_CSV + * @since 1.5 + */ + PostgreSQLCsv(CSVFormat.POSTGRESQL_CSV), + + /** + * @see CSVFormat#POSTGRESQL_CSV + */ + PostgreSQLText(CSVFormat.POSTGRESQL_TEXT), + + /** + * @see CSVFormat#RFC4180 + */ + RFC4180(CSVFormat.RFC4180), + + /** + * @see CSVFormat#TDF + */ + TDF(CSVFormat.TDF); + + private final CSVFormat format; + + Predefined(final CSVFormat format) { + this.format = format; + } + + /** + * Gets the format. + * + * @return the format. + */ + public CSVFormat getFormat() { + return format; + } + } + + /** + * Standard Comma Separated Value format, as for {@link #RFC4180} but allowing empty lines. + * + *

+ * Settings are: + *

+ * + * + * @see Predefined#Default + */ + public static final CSVFormat DEFAULT = new CSVFormat(COMMA, DOUBLE_QUOTE_CHAR, null, null, null, false, true, CRLF, + null, null, null, false, false, false, false, false, false, true); + + /** + * Excel file format (using a comma as the value delimiter). Note that the actual value delimiter used by Excel is + * locale dependent, it might be necessary to customize this format to accommodate to your regional settings. + * + *

+ * For example for parsing or generating a CSV file on a French system the following format will be used: + *

+ * + *
+     * CSVFormat fmt = CSVFormat.EXCEL.withDelimiter(';');
+     * 
+ * + *

+ * Settings are: + *

+ * + *

+ * Note: This is currently like {@link #RFC4180} plus {@link #withAllowMissingColumnNames(boolean) + * withAllowMissingColumnNames(true)} and {@link #withIgnoreEmptyLines(boolean) withIgnoreEmptyLines(false)}. + *

+ * + * @see Predefined#Excel + */ + // @formatter:off + public static final CSVFormat EXCEL = DEFAULT + .withIgnoreEmptyLines(false) + .withAllowMissingColumnNames(); + // @formatter:on + + /** + * Default Informix CSV UNLOAD format used by the {@code UNLOAD TO file_name} operation. + * + *

+ * This is a pipe-delimited format with a LF character as the line separator. The quote character is + * {@code '"'} and special characters are escaped with {@code '\'}. No NULL string is set. + *

+ * + *

+ * Settings are: + *

+ * + * + * @see Predefined#MySQL + * @see + * http://www.ibm.com/support/knowledgecenter/SSBJG3_2.5.0/com.ibm.gen_busug.doc/c_fgl_InOutSql_UNLOAD.htm + * @since 1.3 + */ + // @formatter:off + public static final CSVFormat INFORMIX_UNLOAD = DEFAULT + .withDelimiter(PIPE) + .withEscape(BACKSLASH) + .withQuote(DOUBLE_QUOTE_CHAR) + .withRecordSeparator(LF); + // @formatter:on + + /** + * Default Informix CSV UNLOAD format used by the {@code UNLOAD TO file_name} operation (escaping is disabled.) + * + *

+ * This is a comma-delimited format with a LF character as the line separator. The quote character is + * {@code '"'}, escaping is disabled, and no NULL string is set. + *

+ * + *

+ * Settings are: + *

+ * + * + * @see Predefined#MySQL + * @see + * http://www.ibm.com/support/knowledgecenter/SSBJG3_2.5.0/com.ibm.gen_busug.doc/c_fgl_InOutSql_UNLOAD.htm + * @since 1.3 + */ + // @formatter:off + public static final CSVFormat INFORMIX_UNLOAD_CSV = DEFAULT + .withDelimiter(COMMA) + .withQuote(DOUBLE_QUOTE_CHAR) + .withRecordSeparator(LF); + // @formatter:on + + /** + * Default MongoDB CSV format used by the {@code mongoexport} operation. + *

+ * Parsing is not supported yet. + *

+ * + *

+ * This is a comma-delimited format. Values are double quoted only if needed and special characters are escaped with + * {@code '"'}. A header line with field names is expected. + *

+ * + *

+ * Settings are: + *

+ * + * + * @see Predefined#MongoDBCsv + * @see MongoDB mongoexport command + * documentation + * @since 1.7 + */ + // @formatter:off + public static final CSVFormat MONGODB_CSV = DEFAULT + .withDelimiter(COMMA) + .withEscape(DOUBLE_QUOTE_CHAR) + .withQuote(DOUBLE_QUOTE_CHAR) + .withQuoteMode(QuoteMode.MINIMAL) + .withSkipHeaderRecord(false); + // @formatter:off + + /** + * Default MongoDB TSV format used by the {@code mongoexport} operation. + *

+ * Parsing is not supported yet. + *

+ * + *

+ * This is a tab-delimited format. Values are double quoted only if needed and special + * characters are escaped with {@code '"'}. A header line with field names is expected. + *

+ * + *

+ * Settings are: + *

+ * + * + * @see Predefined#MongoDBCsv + * @see MongoDB mongoexport command + * documentation + * @since 1.7 + */ + // @formatter:off + public static final CSVFormat MONGODB_TSV = DEFAULT + .withDelimiter(TAB) + .withEscape(DOUBLE_QUOTE_CHAR) + .withQuote(DOUBLE_QUOTE_CHAR) + .withQuoteMode(QuoteMode.MINIMAL) + .withSkipHeaderRecord(false); + // @formatter:off + + /** + * Default MySQL format used by the {@code SELECT INTO OUTFILE} and {@code LOAD DATA INFILE} operations. + * + *

+ * This is a tab-delimited format with a LF character as the line separator. Values are not quoted and special + * characters are escaped with {@code '\'}. The default NULL string is {@code "\\N"}. + *

+ * + *

+ * Settings are: + *

+ * + * + * @see Predefined#MySQL + * @see http://dev.mysql.com/doc/refman/5.1/en/load + * -data.html + */ + // @formatter:off + public static final CSVFormat MYSQL = DEFAULT + .withDelimiter(TAB) + .withEscape(BACKSLASH) + .withIgnoreEmptyLines(false) + .withQuote(null) + .withRecordSeparator(LF) + .withNullString("\\N") + .withQuoteMode(QuoteMode.ALL_NON_NULL); + // @formatter:off + + /** + * Default Oracle format used by the SQL*Loader utility. + * + *

+ * This is a comma-delimited format with the system line separator character as the record separator. Values are + * double quoted when needed and special characters are escaped with {@code '\'}. The default NULL string is + * {@code "\\N"}. Values are trimmed. + *

+ * + *

+ * Settings are: + *

+ * + * + * @see Predefined#Oracle + * @see Oracle CSV Format Specification + * @since 1.6 + */ + // @formatter:off + public static final CSVFormat ORACLE = DEFAULT + .withDelimiter(COMMA) + .withEscape(BACKSLASH) + .withIgnoreEmptyLines(false) + .withQuote(DOUBLE_QUOTE_CHAR) + .withNullString("\\N") + .withTrim() + .withSystemRecordSeparator() + .withQuoteMode(QuoteMode.MINIMAL); + // @formatter:off + + /** + * Default PostgreSQL CSV format used by the {@code COPY} operation. + * + *

+ * This is a comma-delimited format with a LF character as the line separator. Values are double quoted and special + * characters are escaped with {@code '"'}. The default NULL string is {@code ""}. + *

+ * + *

+ * Settings are: + *

+ * + * + * @see Predefined#MySQL + * @see PostgreSQL COPY command + * documentation + * @since 1.5 + */ + // @formatter:off + public static final CSVFormat POSTGRESQL_CSV = DEFAULT + .withDelimiter(COMMA) + .withEscape(DOUBLE_QUOTE_CHAR) + .withIgnoreEmptyLines(false) + .withQuote(DOUBLE_QUOTE_CHAR) + .withRecordSeparator(LF) + .withNullString(EMPTY) + .withQuoteMode(QuoteMode.ALL_NON_NULL); + // @formatter:off + + /** + * Default PostgreSQL text format used by the {@code COPY} operation. + * + *

+ * This is a tab-delimited format with a LF character as the line separator. Values are double quoted and special + * characters are escaped with {@code '\'}. The default NULL string is {@code "\\N"}. + *

+ * + *

+ * Settings are: + *

+ * + * + * @see Predefined#MySQL + * @see PostgreSQL COPY command + * documentation + * @since 1.5 + */ + // @formatter:off + public static final CSVFormat POSTGRESQL_TEXT = DEFAULT + .withDelimiter(TAB) + .withEscape(BACKSLASH) + .withIgnoreEmptyLines(false) + .withQuote(DOUBLE_QUOTE_CHAR) + .withRecordSeparator(LF) + .withNullString("\\N") + .withQuoteMode(QuoteMode.ALL_NON_NULL); + // @formatter:off + + /** + * Comma separated format as defined by RFC 4180. + * + *

+ * Settings are: + *

+ * + * + * @see Predefined#RFC4180 + */ + public static final CSVFormat RFC4180 = DEFAULT.withIgnoreEmptyLines(false); + + private static final long serialVersionUID = 1L; + + /** + * Tab-delimited format. + * + *

+ * Settings are: + *

+ * + * + * @see Predefined#TDF + */ + // @formatter:off + public static final CSVFormat TDF = DEFAULT + .withDelimiter(TAB) + .withIgnoreSurroundingSpaces(); + // @formatter:on + + /** + * Returns true if the given character is a line break character. + * + * @param c + * the character to check + * + * @return true if {@code c} is a line break character + */ + private static boolean isLineBreak(final char c) { + return c == LF || c == CR; + } + + /** + * Returns true if the given character is a line break character. + * + * @param c + * the character to check, may be null + * + * @return true if {@code c} is a line break character (and not null) + */ + private static boolean isLineBreak(final Character c) { + return c != null && isLineBreak(c.charValue()); + } + + /** + * Creates a new CSV format with the specified delimiter. + * + *

+ * Use this method if you want to create a CSVFormat from scratch. All fields but the delimiter will be initialized + * with null/false. + *

+ * + * @param delimiter + * the char used for value separation, must not be a line break character + * @return a new CSV format. + * @throws IllegalArgumentException + * if the delimiter is a line break character + * + * @see #DEFAULT + * @see #RFC4180 + * @see #MYSQL + * @see #EXCEL + * @see #TDF + */ + public static CSVFormat newFormat(final char delimiter) { + return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false, + false, false, false, false, true); + } + + /** + * Gets one of the predefined formats from {@link CSVFormat.Predefined}. + * + * @param format + * name + * @return one of the predefined formats + * @since 1.2 + */ + public static CSVFormat valueOf(final String format) { + return CSVFormat.Predefined.valueOf(format).getFormat(); + } + + private final boolean allowDuplicateHeaderNames; + + private final boolean allowMissingColumnNames; + + private final boolean autoFlush; + + private final Character commentMarker; // null if commenting is disabled + + private final char delimiter; + + private final Character escapeCharacter; // null if escaping is disabled + + private final String[] header; // array of header column names + + private final String[] headerComments; // array of header comment lines + + private final boolean ignoreEmptyLines; + + private final boolean ignoreHeaderCase; // should ignore header names case + + private final boolean ignoreSurroundingSpaces; // Should leading/trailing spaces be ignored around values? + + private final String nullString; // the string to be used for null values + + private final Character quoteCharacter; // null if quoting is disabled + + private final String quotedNullString; + + private final QuoteMode quoteMode; + + private final String recordSeparator; // for outputs + + private final boolean skipHeaderRecord; + + private final boolean trailingDelimiter; + + private final boolean trim; + + /** + * Creates a customized CSV format. 
+ * + * @param delimiter + * the char used for value separation, must not be a line break character + * @param quoteChar + * the Character used as value encapsulation marker, may be {@code null} to disable + * @param quoteMode + * the quote mode + * @param commentStart + * the Character used for comment identification, may be {@code null} to disable + * @param escape + * the Character used to escape special characters in values, may be {@code null} to disable + * @param ignoreSurroundingSpaces + * {@code true} when whitespaces enclosing values should be ignored + * @param ignoreEmptyLines + * {@code true} when the parser should skip empty lines + * @param recordSeparator + * the line separator to use for output + * @param nullString + * the line separator to use for output + * @param headerComments + * the comments to be printed by the Printer before the actual CSV data + * @param header + * the header + * @param skipHeaderRecord + * TODO + * @param allowMissingColumnNames + * TODO + * @param ignoreHeaderCase + * TODO + * @param trim + * TODO + * @param trailingDelimiter + * TODO + * @param autoFlush + * @throws IllegalArgumentException + * if the delimiter is a line break character + */ + private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMode quoteMode, + final Character commentStart, final Character escape, final boolean ignoreSurroundingSpaces, + final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, + final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, + final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim, + final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames) { + this.delimiter = delimiter; + this.quoteCharacter = quoteChar; + this.quoteMode = quoteMode; + this.commentMarker = commentStart; + this.escapeCharacter = escape; + this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; + 
this.allowMissingColumnNames = allowMissingColumnNames; + this.ignoreEmptyLines = ignoreEmptyLines; + this.recordSeparator = recordSeparator; + this.nullString = nullString; + this.headerComments = toStringArray(headerComments); + this.header = header == null ? null : header.clone(); + this.skipHeaderRecord = skipHeaderRecord; + this.ignoreHeaderCase = ignoreHeaderCase; + this.trailingDelimiter = trailingDelimiter; + this.trim = trim; + this.autoFlush = autoFlush; + this.quotedNullString = quoteCharacter + nullString + quoteCharacter; + this.allowDuplicateHeaderNames = allowDuplicateHeaderNames; + validate(); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + + final CSVFormat other = (CSVFormat) obj; + if (delimiter != other.delimiter) { + return false; + } + if (trailingDelimiter != other.trailingDelimiter) { + return false; + } + if (autoFlush != other.autoFlush) { + return false; + } + if (trim != other.trim) { + return false; + } + if (allowMissingColumnNames != other.allowMissingColumnNames) { + return false; + } + if (allowDuplicateHeaderNames != other.allowDuplicateHeaderNames) { + return false; + } + if (ignoreHeaderCase != other.ignoreHeaderCase) { + return false; + } + if (quoteMode != other.quoteMode) { + return false; + } + if (quoteCharacter == null) { + if (other.quoteCharacter != null) { + return false; + } + } else if (!quoteCharacter.equals(other.quoteCharacter)) { + return false; + } + if (commentMarker == null) { + if (other.commentMarker != null) { + return false; + } + } else if (!commentMarker.equals(other.commentMarker)) { + return false; + } + if (escapeCharacter == null) { + if (other.escapeCharacter != null) { + return false; + } + } else if (!escapeCharacter.equals(other.escapeCharacter)) { + return false; + } + if (nullString == null) { + if (other.nullString != null) { + return false; + 
} + } else if (!nullString.equals(other.nullString)) { + return false; + } + if (!Arrays.equals(header, other.header)) { + return false; + } + if (ignoreSurroundingSpaces != other.ignoreSurroundingSpaces) { + return false; + } + if (ignoreEmptyLines != other.ignoreEmptyLines) { + return false; + } + if (skipHeaderRecord != other.skipHeaderRecord) { + return false; + } + if (recordSeparator == null) { + if (other.recordSeparator != null) { + return false; + } + } else if (!recordSeparator.equals(other.recordSeparator)) { + return false; + } + if (!Arrays.equals(headerComments, other.headerComments)) { + return false; + } + return true; + } + + /** + * Formats the specified values. + * + * @param values + * the values to format + * @return the formatted values + */ + public String format(final Object... values) { + final StringWriter out = new StringWriter(); + try (CSVPrinter csvPrinter = new CSVPrinter(out, this)) { + csvPrinter.printRecord(values); + String res = out.toString(); + int len = recordSeparator != null ? res.length() - recordSeparator.length() : res.length(); + return res.substring(0, len); + } catch (final IOException e) { + // should not happen because a StringWriter does not do IO. + throw new IllegalStateException(e); + } + } + + /** + * Returns true if and only if duplicate names are allowed in the headers. + * + * @return whether duplicate header names are allowed + * @since 1.7 + */ + public boolean getAllowDuplicateHeaderNames() { + return allowDuplicateHeaderNames; + } + + /** + * Specifies whether missing column names are allowed when parsing the header line. + * + * @return {@code true} if missing column names are allowed when parsing the header line, {@code false} to throw an + * {@link IllegalArgumentException}. + */ + public boolean getAllowMissingColumnNames() { + return allowMissingColumnNames; + } + + /** + * Returns whether to flush on close. + * + * @return whether to flush on close. 
+ * @since 1.6 + */ + public boolean getAutoFlush() { + return autoFlush; + } + + /** + * Returns the character marking the start of a line comment. + * + * @return the comment start marker, may be {@code null} + */ + public Character getCommentMarker() { + return commentMarker; + } + + /** + * Returns the character delimiting the values (typically ';', ',' or '\t'). + * + * @return the delimiter character + */ + public char getDelimiter() { + return delimiter; + } + + /** + * Returns the escape character. + * + * @return the escape character, may be {@code null} + */ + public Character getEscapeCharacter() { + return escapeCharacter; + } + + /** + * Returns a copy of the header array. + * + * @return a copy of the header array; {@code null} if disabled, the empty array if to be read from the file + */ + public String[] getHeader() { + return header != null ? header.clone() : null; + } + + /** + * Returns a copy of the header comment array. + * + * @return a copy of the header comment array; {@code null} if disabled. + */ + public String[] getHeaderComments() { + return headerComments != null ? headerComments.clone() : null; + } + + /** + * Specifies whether empty lines between records are ignored when parsing input. + * + * @return {@code true} if empty lines between records are ignored, {@code false} if they are turned into empty + * records. + */ + public boolean getIgnoreEmptyLines() { + return ignoreEmptyLines; + } + + /** + * Specifies whether header names will be accessed ignoring case. + * + * @return {@code true} if header names cases are ignored, {@code false} if they are case sensitive. + * @since 1.3 + */ + public boolean getIgnoreHeaderCase() { + return ignoreHeaderCase; + } + + /** + * Specifies whether spaces around values are ignored when parsing input. + * + * @return {@code true} if spaces around values are ignored, {@code false} if they are treated as part of the value. 
+ */ + public boolean getIgnoreSurroundingSpaces() { + return ignoreSurroundingSpaces; + } + + /** + * Gets the String to convert to and from {@code null}. + * + * + * @return the String to convert to and from {@code null}. No substitution occurs if {@code null} + */ + public String getNullString() { + return nullString; + } + + /** + * Returns the character used to encapsulate values containing special characters. + * + * @return the quoteChar character, may be {@code null} + */ + public Character getQuoteCharacter() { + return quoteCharacter; + } + + /** + * Returns the quote policy output fields. + * + * @return the quote policy + */ + public QuoteMode getQuoteMode() { + return quoteMode; + } + + /** + * Returns the record separator delimiting output records. + * + * @return the record separator + */ + public String getRecordSeparator() { + return recordSeparator; + } + + /** + * Returns whether to skip the header record. + * + * @return whether to skip the header record. + */ + public boolean getSkipHeaderRecord() { + return skipHeaderRecord; + } + + /** + * Returns whether to add a trailing delimiter. + * + * @return whether to add a trailing delimiter. + * @since 1.3 + */ + public boolean getTrailingDelimiter() { + return trailingDelimiter; + } + + /** + * Returns whether to trim leading and trailing blanks. + * This is used by {@link #print(Object, Appendable, boolean)} + * Also by {@link CSVParser#addRecordValue(boolean)} + * + * @return whether to trim leading and trailing blanks. + */ + public boolean getTrim() { + return trim; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + + result = prime * result + delimiter; + result = prime * result + ((quoteMode == null) ? 0 : quoteMode.hashCode()); + result = prime * result + ((quoteCharacter == null) ? 0 : quoteCharacter.hashCode()); + result = prime * result + ((commentMarker == null) ? 0 : commentMarker.hashCode()); + result = prime * result + ((escapeCharacter == null) ? 
0 : escapeCharacter.hashCode()); + result = prime * result + ((nullString == null) ? 0 : nullString.hashCode()); + result = prime * result + (ignoreSurroundingSpaces ? 1231 : 1237); + result = prime * result + (ignoreHeaderCase ? 1231 : 1237); + result = prime * result + (ignoreEmptyLines ? 1231 : 1237); + result = prime * result + (skipHeaderRecord ? 1231 : 1237); + result = prime * result + (allowDuplicateHeaderNames ? 1231 : 1237); + result = prime * result + (trim ? 1231 : 1237); + result = prime * result + (autoFlush ? 1231 : 1237); + result = prime * result + (trailingDelimiter ? 1231 : 1237); + result = prime * result + (allowMissingColumnNames ? 1231 : 1237); + result = prime * result + ((recordSeparator == null) ? 0 : recordSeparator.hashCode()); + result = prime * result + Arrays.hashCode(header); + result = prime * result + Arrays.hashCode(headerComments); + return result; + } + + /** + * Specifies whether comments are supported by this format. + * + * Note that the comment introducer character is only recognized at the start of a line. + * + * @return {@code true} is comments are supported, {@code false} otherwise + */ + public boolean isCommentMarkerSet() { + return commentMarker != null; + } + + /** + * Returns whether escape are being processed. + * + * @return {@code true} if escapes are processed + */ + public boolean isEscapeCharacterSet() { + return escapeCharacter != null; + } + + /** + * Returns whether a nullString has been defined. + * + * @return {@code true} if a nullString is defined + */ + public boolean isNullStringSet() { + return nullString != null; + } + + /** + * Returns whether a quoteChar has been defined. + * + * @return {@code true} if a quoteChar is defined + */ + public boolean isQuoteCharacterSet() { + return quoteCharacter != null; + } + + /** + * Parses the specified content. + * + *

+ * See also the various static parse methods on {@link CSVParser}. + *

+ * + * @param in + * the input stream + * @return a parser over a stream of {@link CSVRecord}s. + * @throws IOException + * If an I/O error occurs + */ + public CSVParser parse(final Reader in) throws IOException { + return new CSVParser(in, this); + } + + /** + * Prints to the specified output. + * + *

+ * See also {@link CSVPrinter}. + *

+ * + * @param out + * the output. + * @return a printer to an output. + * @throws IOException + * thrown if the optional header cannot be printed. + */ + public CSVPrinter print(final Appendable out) throws IOException { + return new CSVPrinter(out, this); + } + + /** + * Prints to the specified output. + * + *

+ * See also {@link CSVPrinter}. + *

+ * + * @param out + * the output. + * @param charset + * A charset. + * @return a printer to an output. + * @throws IOException + * thrown if the optional header cannot be printed. + * @since 1.5 + */ + @SuppressWarnings("resource") + public CSVPrinter print(final File out, final Charset charset) throws IOException { + // The writer will be closed when close() is called. + return new CSVPrinter(new OutputStreamWriter(new FileOutputStream(out), charset), this); + } + + /** + * Prints the {@code value} as the next value on the line to {@code out}. The value will be escaped or encapsulated + * as needed. Useful when one wants to avoid creating CSVPrinters. + * Trims the value if {@link #getTrim()} is true + * @param value + * value to output. + * @param out + * where to print the value. + * @param newRecord + * if this a new record. + * @throws IOException + * If an I/O error occurs. + * @since 1.4 + */ + public void print(final Object value, final Appendable out, final boolean newRecord) throws IOException { + // null values are considered empty + // Only call CharSequence.toString() if you have to, helps GC-free use cases. + CharSequence charSequence; + if (value == null) { + // https://issues.apache.org/jira/browse/CSV-203 + if (null == nullString) { + charSequence = EMPTY; + } else { + if (QuoteMode.ALL == quoteMode) { + charSequence = quotedNullString; + } else { + charSequence = nullString; + } + } + } else { + if (value instanceof CharSequence) { + charSequence = (CharSequence) value; + } else if (value instanceof Reader) { + print((Reader) value, out, newRecord); + return; + } else { + charSequence = value.toString(); + } + } + charSequence = getTrim() ? 
trim(charSequence) : charSequence; + print(value, charSequence, out, newRecord); + } + + private void print(final Object object, final CharSequence value, final Appendable out, final boolean newRecord) + throws IOException { + final int offset = 0; + final int len = value.length(); + if (!newRecord) { + out.append(getDelimiter()); + } + if (object == null) { + out.append(value); + } else if (isQuoteCharacterSet()) { + // the original object is needed so can check for Number + printWithQuotes(object, value, out, newRecord); + } else if (isEscapeCharacterSet()) { + printWithEscapes(value, out); + } else { + out.append(value, offset, len); + } + } + + /** + * Prints to the specified output, returns a {@code CSVPrinter} which the caller MUST close. + * + *

+ * See also {@link CSVPrinter}. + *

+ * + * @param out the output. + * @param charset A charset. + * @return a printer to an output. + * @throws IOException thrown if the optional header cannot be printed. + * @since 1.5 + */ + @SuppressWarnings("resource") + public CSVPrinter print(final Path out, final Charset charset) throws IOException { + return print(Files.newBufferedWriter(out, charset)); + } + + private void print(final Reader reader, final Appendable out, final boolean newRecord) throws IOException { + // Reader is never null + if (!newRecord) { + out.append(getDelimiter()); + } + if (isQuoteCharacterSet()) { + printWithQuotes(reader, out); + } else if (isEscapeCharacterSet()) { + printWithEscapes(reader, out); + } else if (out instanceof Writer) { + IOUtils.copyLarge(reader, (Writer) out); + } else { + IOUtils.copy(reader, out); + } + + } + + /** + * Prints to the {@link System#out}. + * + *

+ * See also {@link CSVPrinter}. + *

+ * + * @return a printer to {@link System#out}. + * @throws IOException + * thrown if the optional header cannot be printed. + * @since 1.5 + */ + public CSVPrinter printer() throws IOException { + return new CSVPrinter(System.out, this); + } + + /** + * Outputs the trailing delimiter (if set) followed by the record separator (if set). + * + * @param out + * where to write + * @throws IOException + * If an I/O error occurs + * @since 1.4 + */ + public void println(final Appendable out) throws IOException { + if (getTrailingDelimiter()) { + out.append(getDelimiter()); + } + if (recordSeparator != null) { + out.append(recordSeparator); + } + } + + /** + * Prints the given {@code values} to {@code out} as a single record of delimiter separated values followed by the + * record separator. + * + *

+ * The values will be quoted if needed. Quotes and new-line characters will be escaped. This method adds the record + * separator to the output after printing the record, so there is no need to call {@link #println(Appendable)}. + *

+ * + * @param out + * where to write. + * @param values + * values to output. + * @throws IOException + * If an I/O error occurs. + * @since 1.4 + */ + public void printRecord(final Appendable out, final Object... values) throws IOException { + for (int i = 0; i < values.length; i++) { + print(values[i], out, i == 0); + } + println(out); + } + + /* + * Note: must only be called if escaping is enabled, otherwise will generate NPE + */ + private void printWithEscapes(final CharSequence value, final Appendable out) throws IOException { + int start = 0; + int pos = 0; + final int len = value.length(); + final int end = len; + + final char delim = getDelimiter(); + final char escape = getEscapeCharacter().charValue(); + + while (pos < end) { + char c = value.charAt(pos); + if (c == CR || c == LF || c == delim || c == escape) { + // write out segment up until this char + if (pos > start) { + out.append(value, start, pos); + } + if (c == LF) { + c = 'n'; + } else if (c == CR) { + c = 'r'; + } + + out.append(escape); + out.append(c); + + start = pos + 1; // start on the current char after this one + } + pos++; + } + + // write last segment + if (pos > start) { + out.append(value, start, pos); + } + } + + private void printWithEscapes(final Reader reader, final Appendable out) throws IOException { + int start = 0; + int pos = 0; + + final char delim = getDelimiter(); + final char escape = getEscapeCharacter().charValue(); + final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); + + int c; + while (-1 != (c = reader.read())) { + builder.append((char) c); + if (c == CR || c == LF || c == delim || c == escape) { + // write out segment up until this char + if (pos > start) { + out.append(builder.substring(start, pos)); + builder.setLength(0); + pos = -1; + } + if (c == LF) { + c = 'n'; + } else if (c == CR) { + c = 'r'; + } + + out.append(escape); + out.append((char) c); + + start = pos + 1; // start on the current char after this one + } + pos++; + } + 
+ // write last segment + if (pos > start) { + out.append(builder.substring(start, pos)); + } + } + + /* + * Note: must only be called if quoting is enabled, otherwise will generate NPE + */ + // the original object is needed so can check for Number + private void printWithQuotes(final Object object, final CharSequence value, final Appendable out, + final boolean newRecord) throws IOException { + boolean quote = false; + int start = 0; + int pos = 0; + final int len = value.length(); + final int end = len; + + final char delimChar = getDelimiter(); + final char quoteChar = getQuoteCharacter().charValue(); + // If escape char not specified, default to the quote char + // This avoids having to keep checking whether there is an escape character + // at the cost of checking against quote twice + final char escapeChar = isEscapeCharacterSet() ? getEscapeCharacter().charValue() : quoteChar; + + QuoteMode quoteModePolicy = getQuoteMode(); + if (quoteModePolicy == null) { + quoteModePolicy = QuoteMode.MINIMAL; + } + switch (quoteModePolicy) { + case ALL: + case ALL_NON_NULL: + quote = true; + break; + case NON_NUMERIC: + quote = !(object instanceof Number); + break; + case NONE: + // Use the existing escaping code + printWithEscapes(value, out); + return; + case MINIMAL: + if (len <= 0) { + // always quote an empty token that is the first + // on the line, as it may be the only thing on the + // line. If it were not quoted in that case, + // an empty line has no tokens. + if (newRecord) { + quote = true; + } + } else { + char c = value.charAt(pos); + + if (c <= COMMENT) { + // Some other chars at the start of a value caused the parser to fail, so for now + // encapsulate if we start in anything less than '#'. We are being conservative + // by including the default comment char too. 
+ quote = true; + } else { + while (pos < end) { + c = value.charAt(pos); + if (c == LF || c == CR || c == quoteChar || c == delimChar || c == escapeChar) { + quote = true; + break; + } + pos++; + } + + if (!quote) { + pos = end - 1; + c = value.charAt(pos); + // Some other chars at the end caused the parser to fail, so for now + // encapsulate if we end in anything less than ' ' + if (c <= SP) { + quote = true; + } + } + } + } + + if (!quote) { + // no encapsulation needed - write out the original value + out.append(value, start, end); + return; + } + break; + default: + throw new IllegalStateException("Unexpected Quote value: " + quoteModePolicy); + } + + if (!quote) { + // no encapsulation needed - write out the original value + out.append(value, start, end); + return; + } + + // we hit something that needed encapsulation + out.append(quoteChar); + + // Pick up where we left off: pos should be positioned on the first character that caused + // the need for encapsulation. + while (pos < end) { + final char c = value.charAt(pos); + if (c == quoteChar || c == escapeChar) { + // write out the chunk up until this point + out.append(value, start, pos); + out.append(escapeChar); // now output the escape + start = pos; // and restart with the matched char + } + pos++; + } + + // write the last segment + out.append(value, start, pos); + out.append(quoteChar); + } + + /** + * Always use quotes unless QuoteMode is NONE, so we not have to look ahead. 
+ * + * @throws IOException + */ + private void printWithQuotes(final Reader reader, final Appendable out) throws IOException { + + if (getQuoteMode() == QuoteMode.NONE) { + printWithEscapes(reader, out); + return; + } + + int pos = 0; + + final char quote = getQuoteCharacter().charValue(); + final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); + + out.append(quote); + + int c; + while (-1 != (c = reader.read())) { + builder.append((char) c); + if (c == quote) { + // write out segment up until this char + if (pos > 0) { + out.append(builder.substring(0, pos)); + builder.setLength(0); + pos = -1; + } + + out.append(quote); + out.append((char) c); + } + pos++; + } + + // write last segment + if (pos > 0) { + out.append(builder.substring(0, pos)); + } + + out.append(quote); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + sb.append("Delimiter=<").append(delimiter).append('>'); + if (isEscapeCharacterSet()) { + sb.append(' '); + sb.append("Escape=<").append(escapeCharacter).append('>'); + } + if (isQuoteCharacterSet()) { + sb.append(' '); + sb.append("QuoteChar=<").append(quoteCharacter).append('>'); + } + if (quoteMode != null) { + sb.append(' '); + sb.append("QuoteMode=<").append(quoteMode).append('>'); + } + if (isCommentMarkerSet()) { + sb.append(' '); + sb.append("CommentStart=<").append(commentMarker).append('>'); + } + if (isNullStringSet()) { + sb.append(' '); + sb.append("NullString=<").append(nullString).append('>'); + } + if (recordSeparator != null) { + sb.append(' '); + sb.append("RecordSeparator=<").append(recordSeparator).append('>'); + } + if (getIgnoreEmptyLines()) { + sb.append(" EmptyLines:ignored"); + } + if (getIgnoreSurroundingSpaces()) { + sb.append(" SurroundingSpaces:ignored"); + } + if (getIgnoreHeaderCase()) { + sb.append(" IgnoreHeaderCase:ignored"); + } + sb.append(" SkipHeaderRecord:").append(skipHeaderRecord); + if (headerComments != null) { + sb.append(' '); + 
sb.append("HeaderComments:").append(Arrays.toString(headerComments)); + } + if (header != null) { + sb.append(' '); + sb.append("Header:").append(Arrays.toString(header)); + } + return sb.toString(); + } + + private String[] toStringArray(final Object[] values) { + if (values == null) { + return null; + } + final String[] strings = new String[values.length]; + for (int i = 0; i < values.length; i++) { + final Object value = values[i]; + strings[i] = value == null ? null : value.toString(); + } + return strings; + } + + private CharSequence trim(final CharSequence charSequence) { + if (charSequence instanceof String) { + return ((String) charSequence).trim(); + } + final int count = charSequence.length(); + int len = count; + int pos = 0; + + while (pos < len && charSequence.charAt(pos) <= SP) { + pos++; + } + while (pos < len && charSequence.charAt(len - 1) <= SP) { + len--; + } + return pos > 0 || len < count ? charSequence.subSequence(pos, len) : charSequence; + } + + /** + * Verifies the consistency of the parameters and throws an IllegalArgumentException if necessary. 
+ * + * @throws IllegalArgumentException + */ + private void validate() throws IllegalArgumentException { + if (isLineBreak(delimiter)) { + throw new IllegalArgumentException("The delimiter cannot be a line break"); + } + + if (quoteCharacter != null && delimiter == quoteCharacter.charValue()) { + throw new IllegalArgumentException( + "The quoteChar character and the delimiter cannot be the same ('" + quoteCharacter + "')"); + } + + if (escapeCharacter != null && delimiter == escapeCharacter.charValue()) { + throw new IllegalArgumentException( + "The escape character and the delimiter cannot be the same ('" + escapeCharacter + "')"); + } + + if (commentMarker != null && delimiter == commentMarker.charValue()) { + throw new IllegalArgumentException( + "The comment start character and the delimiter cannot be the same ('" + commentMarker + "')"); + } + + if (quoteCharacter != null && quoteCharacter.equals(commentMarker)) { + throw new IllegalArgumentException( + "The comment start character and the quoteChar cannot be the same ('" + commentMarker + "')"); + } + + if (escapeCharacter != null && escapeCharacter.equals(commentMarker)) { + throw new IllegalArgumentException( + "The comment start and the escape character cannot be the same ('" + commentMarker + "')"); + } + + if (escapeCharacter == null && quoteMode == QuoteMode.NONE) { + throw new IllegalArgumentException("No quotes mode set but no escape character is set"); + } + + // validate header + if (header != null && !allowDuplicateHeaderNames) { + final Set dupCheck = new HashSet<>(); + for (final String hdr : header) { + if (!dupCheck.add(hdr)) { + throw new IllegalArgumentException( + "The header contains a duplicate entry: '" + hdr + "' in " + Arrays.toString(header)); + } + } + } + } + + /** + * Returns a new {@code CSVFormat} that allows duplicate header names. 
+ * + * @return a new {@code CSVFormat} that allows duplicate header names + * @since 1.7 + */ + public CSVFormat withAllowDuplicateHeaderNames() { + return withAllowDuplicateHeaderNames(true); + } + + /** + * Returns a new {@code CSVFormat} with duplicate header names behavior set to the given value. + * + * @param allowDuplicateHeaderNames the duplicate header names behavior, true to allow, false to disallow. + * @return a new {@code CSVFormat} with duplicate header names behavior set to the given value. + * @since 1.7 + */ + public CSVFormat withAllowDuplicateHeaderNames(final boolean allowDuplicateHeaderNames) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the missing column names behavior of the format set to {@code true} + * + * @return A new CSVFormat that is equal to this but with the specified missing column names behavior. + * @see #withAllowMissingColumnNames(boolean) + * @since 1.1 + */ + public CSVFormat withAllowMissingColumnNames() { + return this.withAllowMissingColumnNames(true); + } + + /** + * Returns a new {@code CSVFormat} with the missing column names behavior of the format set to the given value. + * + * @param allowMissingColumnNames + * the missing column names behavior, {@code true} to allow missing column names in the header line, + * {@code false} to cause an {@link IllegalArgumentException} to be thrown. + * @return A new CSVFormat that is equal to this but with the specified missing column names behavior. 
+ */ + public CSVFormat withAllowMissingColumnNames(final boolean allowMissingColumnNames) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with whether to flush on close. + * + * @param autoFlush + * whether to flush on close. + * + * @return A new CSVFormat that is equal to this but with the specified autoFlush setting. + * @since 1.6 + */ + public CSVFormat withAutoFlush(final boolean autoFlush) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the comment start marker of the format set to the specified character. + * + * Note that the comment start character is only recognized at the start of a line. + * + * @param commentMarker + * the comment start marker + * @return A new CSVFormat that is equal to this one but with the specified character as the comment start marker + * @throws IllegalArgumentException + * thrown if the specified character is a line break + */ + public CSVFormat withCommentMarker(final char commentMarker) { + return withCommentMarker(Character.valueOf(commentMarker)); + } + + /** + * Returns a new {@code CSVFormat} with the comment start marker of the format set to the specified character. + * + * Note that the comment start character is only recognized at the start of a line. 
+ * + * @param commentMarker + * the comment start marker, use {@code null} to disable + * @return A new CSVFormat that is equal to this one but with the specified character as the comment start marker + * @throws IllegalArgumentException + * thrown if the specified character is a line break + */ + public CSVFormat withCommentMarker(final Character commentMarker) { + if (isLineBreak(commentMarker)) { + throw new IllegalArgumentException("The comment start marker character cannot be a line break"); + } + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the delimiter of the format set to the specified character. + * + * @param delimiter + * the delimiter character + * @return A new CSVFormat that is equal to this with the specified character as delimiter + * @throws IllegalArgumentException + * thrown if the specified character is a line break + */ + public CSVFormat withDelimiter(final char delimiter) { + if (isLineBreak(delimiter)) { + throw new IllegalArgumentException("The delimiter cannot be a line break"); + } + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the escape character of the format set to the specified character. 
+ * + * @param escape + * the escape character + * @return A new CSVFormat that is equal to his but with the specified character as the escape character + * @throws IllegalArgumentException + * thrown if the specified character is a line break + */ + public CSVFormat withEscape(final char escape) { + return withEscape(Character.valueOf(escape)); + } + + /** + * Returns a new {@code CSVFormat} with the escape character of the format set to the specified character. + * + * @param escape + * the escape character, use {@code null} to disable + * @return A new CSVFormat that is equal to this but with the specified character as the escape character + * @throws IllegalArgumentException + * thrown if the specified character is a line break + */ + public CSVFormat withEscape(final Character escape) { + if (isLineBreak(escape)) { + throw new IllegalArgumentException("The escape character cannot be a line break"); + } + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escape, ignoreSurroundingSpaces, + ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord, + allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} using the first record as header. + * + *

+ * Calling this method is equivalent to calling: + *

+ * + *
+     * CSVFormat format = aFormat.withHeader().withSkipHeaderRecord();
+     * 
+ * + * @return A new CSVFormat that is equal to this but using the first record as header. + * @see #withSkipHeaderRecord(boolean) + * @see #withHeader(String...) + * @since 1.3 + */ + public CSVFormat withFirstRecordAsHeader() { + return withHeader().withSkipHeaderRecord(); + } + + /** + * Returns a new {@code CSVFormat} with the header of the format defined by the enum class. + * + *

+ * Example: + *

+ * + *
+     * public enum Header {
+     *     Name, Email, Phone
+     * }
+     *
+     * CSVFormat format = aformat.withHeader(Header.class);
+     * 
+ *

+ * The header is also used by the {@link CSVPrinter}. + *

+ * + * @param headerEnum + * the enum defining the header, {@code null} if disabled, empty if parsed automatically, user specified + * otherwise. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @see #withHeader(String...) + * @see #withSkipHeaderRecord(boolean) + * @since 1.3 + */ + public CSVFormat withHeader(final Class> headerEnum) { + String[] header = null; + if (headerEnum != null) { + final Enum[] enumValues = headerEnum.getEnumConstants(); + header = new String[enumValues.length]; + for (int i = 0; i < enumValues.length; i++) { + header[i] = enumValues[i].name(); + } + } + return withHeader(header); + } + + /** + * Returns a new {@code CSVFormat} with the header of the format set from the result set metadata. The header can + * either be parsed automatically from the input file with: + * + *
+     * CSVFormat format = aformat.withHeader();
+     * 
+ * + * or specified manually with: + * + *
+     * CSVFormat format = aformat.withHeader(resultSet);
+     * 
+ *

+ * The header is also used by the {@link CSVPrinter}. + *

+ * + * @param resultSet + * the resultSet for the header, {@code null} if disabled, empty if parsed automatically, user specified + * otherwise. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @throws SQLException + * SQLException if a database access error occurs or this method is called on a closed result set. + * @since 1.1 + */ + public CSVFormat withHeader(final ResultSet resultSet) throws SQLException { + return withHeader(resultSet != null ? resultSet.getMetaData() : null); + } + + /** + * Returns a new {@code CSVFormat} with the header of the format set from the result set metadata. The header can + * either be parsed automatically from the input file with: + * + *
+     * CSVFormat format = aformat.withHeader();
+     * 
+ * + * or specified manually with: + * + *
+     * CSVFormat format = aformat.withHeader(metaData);
+     * 
+ *

+ * The header is also used by the {@link CSVPrinter}. + *

+ * + * @param metaData + * the metaData for the header, {@code null} if disabled, empty if parsed automatically, user specified + * otherwise. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @throws SQLException + * SQLException if a database access error occurs or this method is called on a closed result set. + * @since 1.1 + */ + public CSVFormat withHeader(final ResultSetMetaData metaData) throws SQLException { + String[] labels = null; + if (metaData != null) { + final int columnCount = metaData.getColumnCount(); + labels = new String[columnCount]; + for (int i = 0; i < columnCount; i++) { + labels[i] = metaData.getColumnLabel(i + 1); + } + } + return withHeader(labels); + } + + /** + * Returns a new {@code CSVFormat} with the header of the format set to the given values. The header can either be + * parsed automatically from the input file with: + * + *
+     * CSVFormat format = aformat.withHeader();
+     * 
+ * + * or specified manually with: + * + *
+     * CSVFormat format = aformat.withHeader("name", "email", "phone");
+     * 
+ *

+ * The header is also used by the {@link CSVPrinter}. + *

+ * + * @param header + * the header, {@code null} if disabled, empty if parsed automatically, user specified otherwise. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @see #withSkipHeaderRecord(boolean) + */ + public CSVFormat withHeader(final String... header) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the header comments of the format set to the given values. The comments will + * be printed first, before the headers. This setting is ignored by the parser. + * + *
+     * CSVFormat format = aformat.withHeaderComments("Generated by Apache Commons CSV 1.1.", new Date());
+     * 
+ * + * @param headerComments + * the headerComments which will be printed by the Printer before the actual CSV data. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @see #withSkipHeaderRecord(boolean) + * @since 1.1 + */ + public CSVFormat withHeaderComments(final Object... headerComments) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the empty line skipping behavior of the format set to {@code true}. + * + * @return A new CSVFormat that is equal to this but with the specified empty line skipping behavior. + * @since {@link #withIgnoreEmptyLines(boolean)} + * @since 1.1 + */ + public CSVFormat withIgnoreEmptyLines() { + return this.withIgnoreEmptyLines(true); + } + + /** + * Returns a new {@code CSVFormat} with the empty line skipping behavior of the format set to the given value. + * + * @param ignoreEmptyLines + * the empty line skipping behavior, {@code true} to ignore the empty lines between the records, + * {@code false} to translate empty lines to empty records. + * @return A new CSVFormat that is equal to this but with the specified empty line skipping behavior. + */ + public CSVFormat withIgnoreEmptyLines(final boolean ignoreEmptyLines) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the header ignore case behavior set to {@code true}. 
+ * + * @return A new CSVFormat that will ignore case header name. + * @see #withIgnoreHeaderCase(boolean) + * @since 1.3 + */ + public CSVFormat withIgnoreHeaderCase() { + return this.withIgnoreHeaderCase(true); + } + + /** + * Returns a new {@code CSVFormat} with whether header names should be accessed ignoring case. + * + * @param ignoreHeaderCase + * the case mapping behavior, {@code true} to access name/values, {@code false} to leave the mapping as + * is. + * @return A new CSVFormat that will ignore case header name if specified as {@code true} + * @since 1.3 + */ + public CSVFormat withIgnoreHeaderCase(final boolean ignoreHeaderCase) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the parser trimming behavior of the format set to {@code true}. + * + * @return A new CSVFormat that is equal to this but with the specified parser trimming behavior. + * @see #withIgnoreSurroundingSpaces(boolean) + * @since 1.1 + */ + public CSVFormat withIgnoreSurroundingSpaces() { + return this.withIgnoreSurroundingSpaces(true); + } + + /** + * Returns a new {@code CSVFormat} with the parser trimming behavior of the format set to the given value. + * + * @param ignoreSurroundingSpaces the parser trimming behavior, {@code true} to remove the surrounding spaces, + * {@code false} to leave the spaces as is. + * @return A new CSVFormat that is equal to this but with the specified trimming behavior. 
+ */ + public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with conversions to and from null for strings on input and output. + * + * + * @param nullString + * the String to convert to and from {@code null}. No substitution occurs if {@code null} + * + * @return A new CSVFormat that is equal to this but with the specified null conversion string. + */ + public CSVFormat withNullString(final String nullString) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the quoteChar of the format set to the specified character. + * + * @param quoteChar + * the quoteChar character + * @return A new CSVFormat that is equal to this but with the specified character as quoteChar + * @throws IllegalArgumentException + * thrown if the specified character is a line break + */ + public CSVFormat withQuote(final char quoteChar) { + return withQuote(Character.valueOf(quoteChar)); + } + + /** + * Returns a new {@code CSVFormat} with the quoteChar of the format set to the specified character. 
+ * + * @param quoteChar + * the quoteChar character, use {@code null} to disable + * @return A new CSVFormat that is equal to this but with the specified character as quoteChar + * @throws IllegalArgumentException + * thrown if the specified character is a line break + */ + public CSVFormat withQuote(final Character quoteChar) { + if (isLineBreak(quoteChar)) { + throw new IllegalArgumentException("The quoteChar cannot be a line break"); + } + return new CSVFormat(delimiter, quoteChar, quoteMode, commentMarker, escapeCharacter, ignoreSurroundingSpaces, + ignoreEmptyLines, recordSeparator, nullString, headerComments, header, skipHeaderRecord, + allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the output quote policy of the format set to the specified value. + * + * @param quoteModePolicy + * the quote policy to use for output. + * + * @return A new CSVFormat that is equal to this but with the specified quote policy + */ + public CSVFormat withQuoteMode(final QuoteMode quoteModePolicy) { + return new CSVFormat(delimiter, quoteCharacter, quoteModePolicy, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the record separator of the format set to the specified character. + * + *

+ * Note: This setting is only used during printing and does not affect parsing. Parsing currently + * only works for inputs with '\n', '\r' and "\r\n" + *

+ * + * @param recordSeparator + * the record separator to use for output. + * + * @return A new CSVFormat that is equal to this but with the specified output record separator + */ + public CSVFormat withRecordSeparator(final char recordSeparator) { + return withRecordSeparator(String.valueOf(recordSeparator)); + } + + /** + * Returns a new {@code CSVFormat} with the record separator of the format set to the specified String. + * + *

+ * Note: This setting is only used during printing and does not affect parsing. Parsing currently + * only works for inputs with '\n', '\r' and "\r\n" + *

+ * + * @param recordSeparator + * the record separator to use for output. + * + * @return A new CSVFormat that is equal to this but with the specified output record separator + * @throws IllegalArgumentException + * if recordSeparator is none of CR, LF or CRLF + */ + public CSVFormat withRecordSeparator(final String recordSeparator) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with skipping the header record set to {@code true}. + * + * @return A new CSVFormat that is equal to this but with the specified skipHeaderRecord setting. + * @see #withSkipHeaderRecord(boolean) + * @see #withHeader(String...) + * @since 1.1 + */ + public CSVFormat withSkipHeaderRecord() { + return this.withSkipHeaderRecord(true); + } + + /** + * Returns a new {@code CSVFormat} with whether to skip the header record. + * + * @param skipHeaderRecord + * whether to skip the header record. + * + * @return A new CSVFormat that is equal to this but with the specified skipHeaderRecord setting. + * @see #withHeader(String...) + */ + public CSVFormat withSkipHeaderRecord(final boolean skipHeaderRecord) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the record separator of the format set to the operating system's line + * separator string, typically CR+LF on Windows and LF on Linux. + * + *

+ * Note: This setting is only used during printing and does not affect parsing. Parsing currently + * only works for inputs with '\n', '\r' and "\r\n" + *

+ * + * @return A new CSVFormat that is equal to this but with the operating system's line separator string. + * @since 1.6 + */ + public CSVFormat withSystemRecordSeparator() { + return withRecordSeparator(System.getProperty("line.separator")); + } + + /** + * Returns a new {@code CSVFormat} to add a trailing delimiter. + * + * @return A new CSVFormat that is equal to this but with the trailing delimiter setting. + * @since 1.3 + */ + public CSVFormat withTrailingDelimiter() { + return withTrailingDelimiter(true); + } + + /** + * Returns a new {@code CSVFormat} with whether to add a trailing delimiter. + * + * @param trailingDelimiter + * whether to add a trailing delimiter. + * + * @return A new CSVFormat that is equal to this but with the specified trailing delimiter setting. + * @since 1.3 + */ + public CSVFormat withTrailingDelimiter(final boolean trailingDelimiter) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} to trim leading and trailing blanks. + * See {@link #getTrim()} for details of where this is used. + * + * @return A new CSVFormat that is equal to this but with the trim setting on. + * @since 1.3 + */ + public CSVFormat withTrim() { + return withTrim(true); + } + + /** + * Returns a new {@code CSVFormat} with whether to trim leading and trailing blanks. + * See {@link #getTrim()} for details of where this is used. + * + * @param trim + * whether to trim leading and trailing blanks. + * + * @return A new CSVFormat that is equal to this but with the specified trim setting. 
+ * @since 1.3 + */ + public CSVFormat withTrim(final boolean trim) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } +} diff --git a/src/test/resources/org/apache/commons/csv/CSVParser.java b/src/test/resources/org/apache/commons/csv/CSVParser.java new file mode 100644 index 00000000..bf6eb6d6 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSVParser.java @@ -0,0 +1,715 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.commons.csv; + +import static org.apache.commons.csv.Token.Type.TOKEN; + +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.net.URL; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Objects; +import java.util.TreeMap; + +/** + * Parses CSV files according to the specified format. + * + * Because CSV appears in many different dialects, the parser supports many formats by allowing the + * specification of a {@link CSVFormat}. + * + * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream. + * + *

Creating instances

+ *

+ * There are several static factory methods that can be used to create instances for various types of resources: + *

+ * + *

+ * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor. + * + * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut: + *

+ *
+ * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) {
+ *     ...
+ * }
+ * 
+ * + *

Parsing record wise

+ *

+ * To parse a CSV input from a file, you write: + *

+ * + *
+ * File csvData = new File("/path/to/csv");
+ * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180);
+ * for (CSVRecord csvRecord : parser) {
+ *     ...
+ * }
+ * 
+ * + *

+ * This will read and parse the contents of the file using the + * RFC 4180 format. + *

+ * + *

+ * To parse CSV input in a format like Excel, you write: + *

+ * + *
+ * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL);
+ * for (CSVRecord csvRecord : parser) {
+ *     ...
+ * }
+ * 
+ * + *

+ * If the predefined formats don't match the format at hand, custom formats can be defined. More information about + * customising CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}. + *

+ * + *

Parsing into memory

+ *

+ * If parsing record wise is not desired, the contents of the input can be read completely into memory. + *

+ * + *
+ * Reader in = new StringReader("a;b\nc;d");
+ * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL);
+ * List<CSVRecord> list = parser.getRecords();
+ * 
+ * + *

+ * There are two constraints that have to be kept in mind: + *

+ * + *
    + *
  1. Parsing into memory starts at the current position of the parser. If you have already parsed records from + * the input, those records will not end up in the in memory representation of your CSV data.
  2. + *
  3. Parsing into memory may consume a lot of system resources depending on the input. For example if you're + * parsing a 150MB file of CSV data the contents will be read completely into memory.
  4. + *
+ * + *

Notes

+ *

+ * Internal parser state is completely covered by the format and the reader-state. + *

+ * + * @see package documentation for more details + */ +public final class CSVParser implements Iterable, Closeable { + + class CSVRecordIterator implements Iterator { + private CSVRecord current; + + private CSVRecord getNextRecord() { + try { + return CSVParser.this.nextRecord(); + } catch (final IOException e) { + throw new IllegalStateException( + e.getClass().getSimpleName() + " reading next record: " + e.toString(), e); + } + } + + @Override + public boolean hasNext() { + if (CSVParser.this.isClosed()) { + return false; + } + if (this.current == null) { + this.current = this.getNextRecord(); + } + + return this.current != null; + } + + @Override + public CSVRecord next() { + if (CSVParser.this.isClosed()) { + throw new NoSuchElementException("CSVParser has been closed"); + } + CSVRecord next = this.current; + this.current = null; + + if (next == null) { + // hasNext() wasn't called before + next = this.getNextRecord(); + if (next == null) { + throw new NoSuchElementException("No more CSV records available"); + } + } + + return next; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + } + + /** + * Header information based on name and position. + */ + private static final class Headers { + /** + * Header column positions (0-based) + */ + final Map headerMap; + + /** + * Header names in column order + */ + final List headerNames; + + Headers(final Map headerMap, final List headerNames) { + this.headerMap = headerMap; + this.headerNames = headerNames; + } + } + + /** + * Creates a parser for the given {@link File}. + * + * @param file + * a CSV file. Must not be null. + * @param charset + * The Charset to decode the given file. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new parser + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either file or format are null. 
+ * @throws IOException + * If an I/O error occurs + */ + @SuppressWarnings("resource") + public static CSVParser parse(final File file, final Charset charset, final CSVFormat format) throws IOException { + Objects.requireNonNull(file, "file"); + Objects.requireNonNull(format, "format"); + return new CSVParser(new InputStreamReader(new FileInputStream(file), charset), format); + } + + /** + * Creates a CSV parser using the given {@link CSVFormat}. + * + *

+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *

+ * + * @param inputStream + * an InputStream containing CSV-formatted input. Must not be null. + * @param charset + * The Charset to decode the given file. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new CSVParser configured with the given reader and format. + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either reader or format are null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @since 1.5 + */ + @SuppressWarnings("resource") + public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format) + throws IOException { + Objects.requireNonNull(inputStream, "inputStream"); + Objects.requireNonNull(format, "format"); + return parse(new InputStreamReader(inputStream, charset), format); + } + + /** + * Creates and returns a parser for the given {@link Path}, which the caller MUST close. + * + * @param path + * a CSV file. Must not be null. + * @param charset + * The Charset to decode the given file. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new parser + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either file or format are null. + * @throws IOException + * If an I/O error occurs + * @since 1.5 + */ + @SuppressWarnings("resource") + public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException { + Objects.requireNonNull(path, "path"); + Objects.requireNonNull(format, "format"); + return parse(Files.newInputStream(path), charset, format); + } + + /** + * Creates a CSV parser using the given {@link CSVFormat} + * + *

+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *

+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new CSVParser configured with the given reader and format. + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either reader or format are null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @since 1.5 + */ + public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException { + return new CSVParser(reader, format); + } + + // the following objects are shared to reduce garbage + + /** + * Creates a parser for the given {@link String}. + * + * @param string + * a CSV string. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new parser + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either string or format are null. + * @throws IOException + * If an I/O error occurs + */ + public static CSVParser parse(final String string, final CSVFormat format) throws IOException { + Objects.requireNonNull(string, "string"); + Objects.requireNonNull(format, "format"); + + return new CSVParser(new StringReader(string), format); + } + + /** + * Creates and returns a parser for the given URL, which the caller MUST close. + * + *

+ * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless + * you close the {@code url}. + *

+ * + * @param url + * a URL. Must not be null. + * @param charset + * the charset for the resource. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new parser + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either url, charset or format are null. + * @throws IOException + * If an I/O error occurs + */ + @SuppressWarnings("resource") + public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException { + Objects.requireNonNull(url, "url"); + Objects.requireNonNull(charset, "charset"); + Objects.requireNonNull(format, "format"); + + return new CSVParser(new InputStreamReader(url.openStream(), charset), format); + } + + private final CSVFormat format; + + /** A mapping of column names to column indices */ + private final Map headerMap; + + /** The column order to avoid re-computing it. */ + private final List headerNames; + + private final Lexer lexer; + + private final CSVRecordIterator csvRecordIterator; + + /** A record buffer for getRecord(). Grows as necessary and is reused. */ + private final List recordList = new ArrayList<>(); + + /** + * The next record number to assign. + */ + private long recordNumber; + + /** + * Lexer offset when the parser does not start parsing at the beginning of the source. Usually used in combination + * with {@link #recordNumber}. + */ + private final long characterOffset; + + private final Token reusableToken = new Token(); + + /** + * Customized CSV parser using the given {@link CSVFormat} + * + *

+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *

+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either reader or format are null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + */ + public CSVParser(final Reader reader, final CSVFormat format) throws IOException { + this(reader, format, 0, 1); + } + + /** + * Customized CSV parser using the given {@link CSVFormat} + * + *

+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *

+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @param characterOffset + * Lexer offset when the parser does not start parsing at the beginning of the source. + * @param recordNumber + * The next record number to assign + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either reader or format are null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @since 1.1 + */ + @SuppressWarnings("resource") + public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) + throws IOException { + Objects.requireNonNull(reader, "reader"); + Objects.requireNonNull(format, "format"); + + this.format = format; + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); + this.csvRecordIterator = new CSVRecordIterator(); + final Headers headers = createHeaders(); + this.headerMap = headers.headerMap; + this.headerNames = headers.headerNames; + this.characterOffset = characterOffset; + this.recordNumber = recordNumber - 1; + } + + private void addRecordValue(final boolean lastRecord) { + final String input = this.reusableToken.content.toString(); + final String inputClean = this.format.getTrim() ? input.trim() : input; + if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) { + return; + } + final String nullString = this.format.getNullString(); + this.recordList.add(inputClean.equals(nullString) ? null : inputClean); + } + + /** + * Closes resources. + * + * @throws IOException + * If an I/O error occurs + */ + @Override + public void close() throws IOException { + if (this.lexer != null) { + this.lexer.close(); + } + } + + private Map createEmptyHeaderMap() { + return this.format.getIgnoreHeaderCase() ? 
+ new TreeMap<>(String.CASE_INSENSITIVE_ORDER) : + new LinkedHashMap<>(); + } + + /** + * Creates the name to index mapping if the format defines a header. + * + * @return null if the format has no header. + * @throws IOException if there is a problem reading the header or skipping the first record + */ + private Headers createHeaders() throws IOException { + Map hdrMap = null; + List headerNames = null; + final String[] formatHeader = this.format.getHeader(); + if (formatHeader != null) { + hdrMap = createEmptyHeaderMap(); + String[] headerRecord = null; + if (formatHeader.length == 0) { + // read the header from the first line of the file + final CSVRecord nextRecord = this.nextRecord(); + if (nextRecord != null) { + headerRecord = nextRecord.values(); + } + } else { + if (this.format.getSkipHeaderRecord()) { + this.nextRecord(); + } + headerRecord = formatHeader; + } + + // build the name to index mappings + if (headerRecord != null) { + for (int i = 0; i < headerRecord.length; i++) { + final String header = headerRecord[i]; + final boolean emptyHeader = header == null || header.trim().isEmpty(); + if (emptyHeader && !this.format.getAllowMissingColumnNames()) { + throw new IllegalArgumentException( + "A header name is missing in " + Arrays.toString(headerRecord)); + } + // Note: This will always allow a duplicate header if the header is empty + final boolean containsHeader = header != null && hdrMap.containsKey(header); + if (containsHeader && !emptyHeader && !this.format.getAllowDuplicateHeaderNames()) { + throw new IllegalArgumentException( + String.format( + "The header contains a duplicate name: \"%s\" in %s. 
If this is valid then use CSVFormat.withAllowDuplicateHeaderNames().", + header, Arrays.toString(headerRecord))); + } + if (header != null) { + hdrMap.put(header, Integer.valueOf(i)); + if (headerNames == null) { + headerNames = new ArrayList<>(headerRecord.length); + } + headerNames.add(header); + } + } + } + } + if (headerNames == null) { + headerNames = Collections.emptyList(); //immutable + } else { + headerNames = Collections.unmodifiableList(headerNames); + } + return new Headers(hdrMap, headerNames); + } + + /** + * Returns the current line number in the input stream. + * + *

+ * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to + * the record number. + *

+ * + * @return current line number + */ + public long getCurrentLineNumber() { + return this.lexer.getCurrentLineNumber(); + } + + /** + * Gets the first end-of-line string encountered. + * + * @return the first end-of-line string + * @since 1.5 + */ + public String getFirstEndOfLine() { + return lexer.getFirstEol(); + } + + /** + * Returns a copy of the header map. + *

+ * The map keys are column names. The map values are 0-based indices. + *

+ *

+ * Note: The map can only provide a one-to-one mapping when the format did not + * contain null or duplicate column names. + *

+ * + * @return a copy of the header map. + */ + public Map getHeaderMap() { + if (this.headerMap == null) { + return null; + } + final Map map = createEmptyHeaderMap(); + map.putAll(this.headerMap); + return map; + } + + /** + * Returns the header map. + * + * @return the header map. + */ + Map getHeaderMapRaw() { + return this.headerMap; + } + + /** + * Returns a read-only list of header names that iterates in column order. + *

+ * Note: The list provides strings that can be used as keys in the header map. + * The list will not contain null column names if they were present in the input + * format. + *

+ * + * @return read-only list of header names that iterates in column order. + * @see #getHeaderMap() + * @since 1.7 + */ + public List getHeaderNames() { + return headerNames; + } + + /** + * Returns the current record number in the input stream. + * + *

+ * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to + * the line number. + *

+ * + * @return current record number + */ + public long getRecordNumber() { + return this.recordNumber; + } + + /** + * Parses the CSV input according to the given format and returns the content as a list of + * {@link CSVRecord CSVRecords}. + * + *

+ * The returned content starts at the current parse-position in the stream. + *

+ * + * @return list of {@link CSVRecord CSVRecords}, may be empty + * @throws IOException + * on parse error or input read-failure + */ + public List getRecords() throws IOException { + CSVRecord rec; + final List records = new ArrayList<>(); + while ((rec = this.nextRecord()) != null) { + records.add(rec); + } + return records; + } + + /** + * Gets whether this parser is closed. + * + * @return whether this parser is closed. + */ + public boolean isClosed() { + return this.lexer.isClosed(); + } + + /** + * Returns an iterator on the records. + * + *

+ * Any {@link IOException} caught during the iteration is re-thrown as an + * {@link IllegalStateException}. + *

+ *

+ * If the parser is closed a call to {@link Iterator#next()} will throw a + * {@link NoSuchElementException}. + *

+ */ + @Override + public Iterator iterator() { + return csvRecordIterator; + } + + /** + * Parses the next record from the current point in the stream. + * + * @return the record as an array of values, or {@code null} if the end of the stream has been reached + * @throws IOException + * on parse error or input read-failure + */ + CSVRecord nextRecord() throws IOException { + CSVRecord result = null; + this.recordList.clear(); + StringBuilder sb = null; + final long startCharPosition = lexer.getCharacterPosition() + this.characterOffset; + do { + this.reusableToken.reset(); + this.lexer.nextToken(this.reusableToken); + switch (this.reusableToken.type) { + case TOKEN: + this.addRecordValue(false); + break; + case EORECORD: + this.addRecordValue(true); + break; + case EOF: + if (this.reusableToken.isReady) { + this.addRecordValue(true); + } + break; + case INVALID: + throw new IOException("(line " + this.getCurrentLineNumber() + ") invalid parse sequence"); + case COMMENT: // Ignored currently + if (sb == null) { // first comment for this record + sb = new StringBuilder(); + } else { + sb.append(Constants.LF); + } + sb.append(this.reusableToken.content); + this.reusableToken.type = TOKEN; // Read another token + break; + default: + throw new IllegalStateException("Unexpected Token type: " + this.reusableToken.type); + } + } while (this.reusableToken.type == TOKEN); + + if (!this.recordList.isEmpty()) { + this.recordNumber++; + final String comment = sb == null ? 
null : sb.toString(); + result = new CSVRecord(this, this.recordList.toArray(new String[this.recordList.size()]), + comment, this.recordNumber, startCharPosition); + } + return result; + } + +} diff --git a/src/test/resources/org/apache/commons/csv/CSVPrinter.java b/src/test/resources/org/apache/commons/csv/CSVPrinter.java new file mode 100644 index 00000000..a0cc6126 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSVPrinter.java @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.csv; + +import static org.apache.commons.csv.Constants.CR; +import static org.apache.commons.csv.Constants.LF; +import static org.apache.commons.csv.Constants.SP; + +import java.io.Closeable; +import java.io.Flushable; +import java.io.IOException; +import java.sql.Clob; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.Objects; + +/** + * Prints values in a {@link CSVFormat CSV format}. + * + *

Values can be appended to the output by calling the {@link #print(Object)} method. + * Values are printed according to {@link String#valueOf(Object)}. + * To complete a record the {@link #println()} method has to be called. + * Comments can be appended by calling {@link #printComment(String)}. + * However a comment will only be written to the output if the {@link CSVFormat} supports comments. + *

+ * + *

The printer also supports appending a complete record at once by calling {@link #printRecord(Object...)} + * or {@link #printRecord(Iterable)}. + * Furthermore {@link #printRecords(Object...)}, {@link #printRecords(Iterable)} and {@link #printRecords(ResultSet)} + * methods can be used to print several records at once. + *

+ * + *

Example:

+ * + *
+ * try (CSVPrinter printer = new CSVPrinter(new FileWriter("csv.txt"), CSVFormat.EXCEL)) {
+ *     printer.printRecord("id", "userName", "firstName", "lastName", "birthday");
+ *     printer.printRecord(1, "john73", "John", "Doe", LocalDate.of(1973, 9, 15));
+ *     printer.println();
+ *     printer.printRecord(2, "mary", "Mary", "Meyer", LocalDate.of(1985, 3, 29));
+ * } catch (IOException ex) {
+ *     ex.printStackTrace();
+ * }
+ * 
+ * + *

This code will write the following to csv.txt:

+ *
+ * id,userName,firstName,lastName,birthday
+ * 1,john73,John,Doe,1973-09-15
+ *
+ * 2,mary,Mary,Meyer,1985-03-29
+ * 
+ */ +public final class CSVPrinter implements Flushable, Closeable { + + /** The place that the values get written. */ + private final Appendable out; + private final CSVFormat format; + + /** True if we just began a new record. */ + private boolean newRecord = true; + + /** + * Creates a printer that will print values to the given stream following the CSVFormat. + *

+ * Currently, only a pure encapsulation format or a pure escaping format is supported. Hybrid formats (encapsulation + * and escaping with a different character) are not supported. + *

+ * + * @param out + * stream to which to print. Must not be null. + * @param format + * the CSV format. Must not be null. + * @throws IOException + * thrown if the optional header cannot be printed. + * @throws IllegalArgumentException + * thrown if the parameters of the format are inconsistent or if either out or format are null. + */ + public CSVPrinter(final Appendable out, final CSVFormat format) throws IOException { + Objects.requireNonNull(out, "out"); + Objects.requireNonNull(format, "format"); + + this.out = out; + this.format = format; + // TODO: Is it a good idea to do this here instead of on the first call to a print method? + // It seems a pain to have to track whether the header has already been printed or not. + if (format.getHeaderComments() != null) { + for (final String line : format.getHeaderComments()) { + if (line != null) { + this.printComment(line); + } + } + } + if (format.getHeader() != null && !format.getSkipHeaderRecord()) { + this.printRecord((Object[]) format.getHeader()); + } + } + + // ====================================================== + // printing implementation + // ====================================================== + + @Override + public void close() throws IOException { + close(false); + } + + /** + * Closes the underlying stream with an optional flush first. + * @param flush whether to flush before the actual close. + * + * @throws IOException + * If an I/O error occurs + * @since 1.6 + */ + public void close(final boolean flush) throws IOException { + if (flush || format.getAutoFlush()) { + flush(); + } + if (out instanceof Closeable) { + ((Closeable) out).close(); + } + } + + /** + * Flushes the underlying stream. + * + * @throws IOException + * If an I/O error occurs + */ + @Override + public void flush() throws IOException { + if (out instanceof Flushable) { + ((Flushable) out).flush(); + } + } + + /** + * Gets the target Appendable. + * + * @return the target Appendable. 
+ */ + public Appendable getOut() { + return this.out; + } + + /** + * Prints the string as the next value on the line. The value will be escaped or encapsulated as needed. + * + * @param value + * value to be output. + * @throws IOException + * If an I/O error occurs + */ + public void print(final Object value) throws IOException { + format.print(value, out, newRecord); + newRecord = false; + } + + /** + * Prints a comment on a new line among the delimiter separated values. + * + *

+ * Comments will always begin on a new line and occupy at least one full line. The character specified to start + * comments and a space will be inserted at the beginning of each new line in the comment. + *

+ * + *

+ * If comments are disabled in the current CSV format this method does nothing. + *

+ * + *

This method detects line breaks inside the comment string and inserts {@link CSVFormat#getRecordSeparator()} + * to start a new line of the comment. Note that this might produce unexpected results for formats that do not use + * line breaks as record separator.

+ * + * @param comment + * the comment to output + * @throws IOException + * If an I/O error occurs + */ + public void printComment(final String comment) throws IOException { + if (!format.isCommentMarkerSet()) { + return; + } + if (!newRecord) { + println(); + } + out.append(format.getCommentMarker().charValue()); + out.append(SP); + for (int i = 0; i < comment.length(); i++) { + final char c = comment.charAt(i); + switch (c) { + case CR: + if (i + 1 < comment.length() && comment.charAt(i + 1) == LF) { + i++; + } + //$FALL-THROUGH$ break intentionally excluded. + case LF: + println(); + out.append(format.getCommentMarker().charValue()); + out.append(SP); + break; + default: + out.append(c); + break; + } + } + println(); + } + + /** + * Outputs the record separator. + * + * @throws IOException + * If an I/O error occurs + */ + public void println() throws IOException { + format.println(out); + newRecord = true; + } + + /** + * Prints the given values a single record of delimiter separated values followed by the record separator. + * + *

+ * The values will be quoted if needed. Quotes and newLine characters will be escaped. This method adds the record + * separator to the output after printing the record, so there is no need to call {@link #println()}. + *

+ * + * @param values + * values to output. + * @throws IOException + * If an I/O error occurs + */ + public void printRecord(final Iterable values) throws IOException { + for (final Object value : values) { + print(value); + } + println(); + } + + /** + * Prints the given values a single record of delimiter separated values followed by the record separator. + * + *

+ * The values will be quoted if needed. Quotes and newLine characters will be escaped. This method adds the record + * separator to the output after printing the record, so there is no need to call {@link #println()}. + *

+ * + * @param values + * values to output. + * @throws IOException + * If an I/O error occurs + */ + public void printRecord(final Object... values) throws IOException { + format.printRecord(out, values); + newRecord = true; + } + + /** + * Prints all the objects in the given collection handling nested collections/arrays as records. + * + *

+ * If the given collection only contains simple objects, this method will print a single record like + * {@link #printRecord(Iterable)}. If the given collection contains nested collections/arrays, those nested elements + * will each be printed as records using {@link #printRecord(Object...)}. + *

+ * + *

+ * Given the following data structure: + *

+ * + *
+     * 
+     * List<String[]> data = ...
+     * data.add(new String[]{ "A", "B", "C" });
+     * data.add(new String[]{ "1", "2", "3" });
+     * data.add(new String[]{ "A1", "B2", "C3" });
+     * 
+     * 
+ * + *

+ * Calling this method will print: + *

+ * + *
+     * 
+     * A, B, C
+     * 1, 2, 3
+     * A1, B2, C3
+     * 
+     * 
+ * + * @param values + * the values to print. + * @throws IOException + * If an I/O error occurs + */ + public void printRecords(final Iterable values) throws IOException { + for (final Object value : values) { + if (value instanceof Object[]) { + this.printRecord((Object[]) value); + } else if (value instanceof Iterable) { + this.printRecord((Iterable) value); + } else { + this.printRecord(value); + } + } + } + + /** + * Prints all the objects in the given array handling nested collections/arrays as records. + * + *

+ * If the given array only contains simple objects, this method will print a single record like + * {@link #printRecord(Object...)}. If the given array contains nested collections/arrays, those nested + * elements will each be printed as records using {@link #printRecord(Object...)}. + *

+ * + *

+ * Given the following data structure: + *

+ * + *
+     * 
+     * String[][] data = new String[3][];
+     * data[0] = new String[]{ "A", "B", "C" };
+     * data[1] = new String[]{ "1", "2", "3" };
+     * data[2] = new String[]{ "A1", "B2", "C3" };
+     * 
+     * 
+ * + *

+ * Calling this method will print: + *

+ * + *
+     * 
+     * A, B, C
+     * 1, 2, 3
+     * A1, B2, C3
+     * 
+     * 
+ * + * @param values + * the values to print. + * @throws IOException + * If an I/O error occurs + */ + public void printRecords(final Object... values) throws IOException { + printRecords(Arrays.asList(values)); + } + + /** + * Prints all the objects in the given JDBC result set. + * + * @param resultSet + * result set the values to print. + * @throws IOException + * If an I/O error occurs + * @throws SQLException + * if a database access error occurs + */ + public void printRecords(final ResultSet resultSet) throws SQLException, IOException { + final int columnCount = resultSet.getMetaData().getColumnCount(); + while (resultSet.next()) { + for (int i = 1; i <= columnCount; i++) { + final Object object = resultSet.getObject(i); + // TODO Who manages the Clob? The JDBC driver or must we close it? Is it driver-dependent? + print(object instanceof Clob ? ((Clob) object).getCharacterStream() : object); + } + println(); + } + } +} diff --git a/src/test/resources/org/apache/commons/csv/CSVRecord.java b/src/test/resources/org/apache/commons/csv/CSVRecord.java new file mode 100644 index 00000000..5181bc9a --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSVRecord.java @@ -0,0 +1,329 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.csv; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; + +/** + * A CSV record parsed from a CSV file. + * + *

+ * Note: Support for {@link Serializable} is scheduled to be removed in version 2.0. + * In version 1.8 the mapping between the column header and the column index was + * removed from the serialised state. The class maintains serialization compatibility + * with versions pre-1.8 for the record values; these must be accessed by index + * following deserialization. There will be a loss of any functionality linked to the header + * mapping when transferring serialised forms pre-1.8 to 1.8 and vice versa. + *

+ */ +public final class CSVRecord implements Serializable, Iterable { + + private static final String[] EMPTY_STRING_ARRAY = new String[0]; + + private static final long serialVersionUID = 1L; + + private final long characterPosition; + + /** The accumulated comments (if any) */ + private final String comment; + + /** The record number. */ + private final long recordNumber; + + /** The values of the record */ + private final String[] values; + + /** The parser that originates this record. This is not serialized. */ + private final transient CSVParser parser; + + CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber, + final long characterPosition) { + this.recordNumber = recordNumber; + this.values = values != null ? values : EMPTY_STRING_ARRAY; + this.parser = parser; + this.comment = comment; + this.characterPosition = characterPosition; + } + + /** + * Returns a value by {@link Enum}. + * + * @param e + * an enum + * @return the String at the given enum String + */ + public String get(final Enum e) { + return get(Objects.toString(e, null)); + } + + /** + * Returns a value by index. + * + * @param i + * a column index (0-based) + * @return the String at the given index + */ + public String get(final int i) { + return values[i]; + } + + /** + * Returns a value by name. + * + *

+ * Note: This requires a field mapping obtained from the original parser. + * A check using {@link #isMapped(String)} should be used to determine if a + * mapping exists from the provided {@code name} to a field index. In this case an + * exception will only be thrown if the record does not contain a field corresponding + * to the mapping, that is the record length is not consistent with the mapping size. + *

+ * + * @param name + * the name of the column to be retrieved. + * @return the column value, maybe null depending on {@link CSVFormat#getNullString()}. + * @throws IllegalStateException + * if no header mapping was provided + * @throws IllegalArgumentException + * if {@code name} is not mapped or if the record is inconsistent + * @see #isMapped(String) + * @see #isConsistent() + * @see #getParser() + * @see CSVFormat#withNullString(String) + */ + public String get(final String name) { + final Map headerMap = getHeaderMapRaw(); + if (headerMap == null) { + throw new IllegalStateException( + "No header mapping was specified, the record values can't be accessed by name"); + } + final Integer index = headerMap.get(name); + if (index == null) { + throw new IllegalArgumentException(String.format("Mapping for %s not found, expected one of %s", name, + headerMap.keySet())); + } + try { + return values[index.intValue()]; + } catch (final ArrayIndexOutOfBoundsException e) { + throw new IllegalArgumentException(String.format( + "Index for header '%s' is %d but CSVRecord only has %d values!", name, index, + Integer.valueOf(values.length))); + } + } + + /** + * Returns the start position of this record as a character position in the source stream. This may or may not + * correspond to the byte position depending on the character set. + * + * @return the position of this record in the source stream. + */ + public long getCharacterPosition() { + return characterPosition; + } + + /** + * Returns the comment for this record, if any. + * Note that comments are attached to the following record. + * If there is no following record (i.e. the comment is at EOF) + * the comment will be ignored. + * + * @return the comment for this record, or null if no comment for this record is available. + */ + public String getComment() { + return comment; + } + + private Map getHeaderMapRaw() { + return parser == null ? null : parser.getHeaderMapRaw(); + } + + /** + * Returns the parser. + * + *

+ * Note: The parser is not part of the serialized state of the record. A null check + * should be used when the record may have originated from a serialized form. + *

+ * + * @return the parser. + * @since 1.7 + */ + public CSVParser getParser() { + return parser; + } + + /** + * Returns the number of this record in the parsed CSV file. + * + *

+ * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to + * the current line number of the parser that created this record. + *

+ * + * @return the number of this record. + * @see CSVParser#getCurrentLineNumber() + */ + public long getRecordNumber() { + return recordNumber; + } + + /** + * Checks whether this record has a comment, false otherwise. + * Note that comments are attached to the following record. + * If there is no following record (i.e. the comment is at EOF) + * the comment will be ignored. + * + * @return true if this record has a comment, false otherwise + * @since 1.3 + */ + public boolean hasComment() { + return comment != null; + } + + /** + * Tells whether the record size matches the header size. + * + *

+ * Returns true if the sizes for this record match and false if not. Some programs can export files that fail this + * test but still produce parsable files. + *

+ * + * @return true of this record is valid, false if not + */ + public boolean isConsistent() { + final Map headerMap = getHeaderMapRaw(); + return headerMap == null || headerMap.size() == values.length; + } + + /** + * Checks whether a given column is mapped, i.e. its name has been defined to the parser. + * + * @param name + * the name of the column to be retrieved. + * @return whether a given column is mapped. + */ + public boolean isMapped(final String name) { + final Map headerMap = getHeaderMapRaw(); + return headerMap != null && headerMap.containsKey(name); + } + + /** + * Checks whether a column with given index has a value. + * + * @param index + * a column index (0-based) + * @return whether a column with given index has a value + */ + public boolean isSet(final int index) { + return 0 <= index && index < values.length; + } + + /** + * Checks whether a given columns is mapped and has a value. + * + * @param name + * the name of the column to be retrieved. + * @return whether a given columns is mapped and has a value + */ + public boolean isSet(final String name) { + return isMapped(name) && getHeaderMapRaw().get(name).intValue() < values.length; + } + + /** + * Returns an iterator over the values of this record. + * + * @return an iterator over the values of this record. + */ + @Override + public Iterator iterator() { + return toList().iterator(); + } + + /** + * Puts all values of this record into the given Map. + * + * @param map + * The Map to populate. + * @return the given map. + * @since 1.9 + */ + public > M putIn(final M map) { + if (getHeaderMapRaw() == null) { + return map; + } + for (final Entry entry : getHeaderMapRaw().entrySet()) { + final int col = entry.getValue().intValue(); + if (col < values.length) { + map.put(entry.getKey(), values[col]); + } + } + return map; + } + + /** + * Returns the number of values in this record. + * + * @return the number of values. 
+ */ + public int size() { + return values.length; + } + + /** + * Converts the values to a List. + * + * TODO: Maybe make this public? + * + * @return a new List + */ + private List toList() { + return Arrays.asList(values); + } + + /** + * Copies this record into a new Map of header name to record value. + * + * @return A new Map. The map is empty if the record has no headers. + */ + public Map toMap() { + return putIn(new LinkedHashMap(values.length)); + } + + /** + * Returns a string representation of the contents of this record. The result is constructed by comment, mapping, + * recordNumber and by passing the internal values array to {@link Arrays#toString(Object[])}. + * + * @return a String representation of this record. + */ + @Override + public String toString() { + return "CSVRecord [comment='" + comment + "', recordNumber=" + recordNumber + ", values=" + + Arrays.toString(values) + "]"; + } + + String[] values() { + return values; + } + +} diff --git a/src/test/resources/org/apache/commons/csv/Constants.java b/src/test/resources/org/apache/commons/csv/Constants.java new file mode 100644 index 00000000..b7dc770a --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/Constants.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.csv; + +/** + * Constants for this package. + */ +final class Constants { + + static final char BACKSLASH = '\\'; + + static final char BACKSPACE = '\b'; + + static final char COMMA = ','; + + /** + * Starts a comment, the remainder of the line is the comment. + */ + static final char COMMENT = '#'; + + static final char CR = '\r'; + + /** RFC 4180 defines line breaks as CRLF */ + static final String CRLF = "\r\n"; + + static final Character DOUBLE_QUOTE_CHAR = Character.valueOf('"'); + + static final String EMPTY = ""; + + /** The end of stream symbol */ + static final int END_OF_STREAM = -1; + + static final char FF = '\f'; + + static final char LF = '\n'; + + /** + * Unicode line separator. + */ + static final String LINE_SEPARATOR = "\u2028"; + + /** + * Unicode next line. + */ + static final String NEXT_LINE = "\u0085"; + + /** + * Unicode paragraph separator. + */ + static final String PARAGRAPH_SEPARATOR = "\u2029"; + + static final char PIPE = '|'; + + /** ASCII record separator */ + static final char RS = 30; + + static final char SP = ' '; + + static final char TAB = '\t'; + + /** Undefined state for the lookahead char */ + static final int UNDEFINED = -2; + + /** ASCII unit separator */ + static final char US = 31; + +} diff --git a/src/test/resources/org/apache/commons/csv/ExtendedBufferedReader.java b/src/test/resources/org/apache/commons/csv/ExtendedBufferedReader.java new file mode 100644 index 00000000..b9ca79df --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/ExtendedBufferedReader.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.csv; + +import static org.apache.commons.csv.Constants.CR; +import static org.apache.commons.csv.Constants.END_OF_STREAM; +import static org.apache.commons.csv.Constants.LF; +import static org.apache.commons.csv.Constants.UNDEFINED; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; + +/** + * A special buffered reader which supports sophisticated read access. + *

+ * In particular the reader supports a look-ahead option, which allows you to see the next char returned by + * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}. + *

+ */ +final class ExtendedBufferedReader extends BufferedReader { + + /** The last char returned */ + private int lastChar = UNDEFINED; + + /** The count of EOLs (CR/LF/CRLF) seen so far */ + private long eolCounter; + + /** The position, which is number of characters read so far */ + private long position; + + private boolean closed; + + /** + * Created extended buffered reader using default buffer-size + */ + ExtendedBufferedReader(final Reader reader) { + super(reader); + } + + /** + * Closes the stream. + * + * @throws IOException + * If an I/O error occurs + */ + @Override + public void close() throws IOException { + // Set ivars before calling super close() in case close() throws an IOException. + closed = true; + lastChar = END_OF_STREAM; + super.close(); + } + + /** + * Returns the current line number + * + * @return the current line number + */ + long getCurrentLineNumber() { + // Check if we are at EOL or EOF or just starting + if (lastChar == CR || lastChar == LF || lastChar == UNDEFINED || lastChar == END_OF_STREAM) { + return eolCounter; // counter is accurate + } + return eolCounter + 1; // Allow for counter being incremented only at EOL + } + + /** + * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by + * any of the read methods. This will not include a character read using the {@link #lookAhead()} method. If no + * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached + * on the last read then this will return {@link Constants#END_OF_STREAM}. + * + * @return the last character that was read + */ + int getLastChar() { + return lastChar; + } + + /** + * Gets the character position in the reader. 
+ * + * @return the current position in the reader (counting characters, not bytes since this is a Reader) + */ + long getPosition() { + return this.position; + } + + public boolean isClosed() { + return closed; + } + + /** + * Returns the next character in the current reader without consuming it. So the next call to {@link #read()} will + * still return this value. Does not affect line number or last character. + * + * @return the next character + * + * @throws IOException + * if there is an error in reading + */ + int lookAhead() throws IOException { + super.mark(1); + final int c = super.read(); + super.reset(); + + return c; + } + + @Override + public int read() throws IOException { + final int current = super.read(); + if (current == CR || current == LF && lastChar != CR) { + eolCounter++; + } + lastChar = current; + this.position++; + return lastChar; + } + + @Override + public int read(final char[] buf, final int offset, final int length) throws IOException { + if (length == 0) { + return 0; + } + + final int len = super.read(buf, offset, length); + + if (len > 0) { + + for (int i = offset; i < offset + len; i++) { + final char ch = buf[i]; + if (ch == LF) { + if (CR != (i > 0 ? buf[i - 1] : lastChar)) { + eolCounter++; + } + } else if (ch == CR) { + eolCounter++; + } + } + + lastChar = buf[offset + len - 1]; + + } else if (len == -1) { + lastChar = END_OF_STREAM; + } + + position += len; + return len; + } + + /** + * Calls {@link BufferedReader#readLine()} which drops the line terminator(s). This method should only be called + * when processing a comment, otherwise information can be lost. + *

+ * Increments {@link #eolCounter} + *

+ * Sets {@link #lastChar} to {@link Constants#END_OF_STREAM} at EOF, otherwise to LF + * + * @return the line that was read, or null if reached EOF. + */ + @Override + public String readLine() throws IOException { + final String line = super.readLine(); + + /* Note: this method itself does not advance the position counter for the characters of the line. */ + if (line != null) { + lastChar = LF; // needed for detecting start of line + eolCounter++; + } else { + lastChar = END_OF_STREAM; + } + + return line; + } + +} diff --git a/src/test/resources/org/apache/commons/csv/IOUtils.java b/src/test/resources/org/apache/commons/csv/IOUtils.java new file mode 100644 index 00000000..1771d4dc --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/IOUtils.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.csv; + +import java.io.IOException; +import java.io.Reader; +import java.io.Writer; +import java.nio.CharBuffer; + +/** Copied from Apache Commons IO. */ +class IOUtils { + + /** + *

+ * Copied from Apache Commons IO. + *

+ * The default buffer size ({@value}). + */ + static final int DEFAULT_BUFFER_SIZE = 1024 * 4; + + /** + *

+ * Copied from Apache Commons IO. + *

+ * Represents the end-of-file (or stream) value, as returned by the {@code Reader} read methods used in this class. + * @since 2.5 (made public in Commons IO; declared private in this copy) + */ + private static final int EOF = -1; + + /** + * Copies chars from a large (over 2GB) {@code Reader} to an {@code Appendable}. + *

+ * This method buffers the input internally, so there is no need to use a + * {@code BufferedReader}. + *

+ * The buffer size is given by {@link #DEFAULT_BUFFER_SIZE}. + * + * @param input the {@code Reader} to read from + * @param output the {@code Appendable} to append to + * @return the number of characters copied + * @throws NullPointerException if the input or output is null + * @throws IOException if an I/O error occurs + * @since 2.7 + */ + static long copy(final Reader input, final Appendable output) throws IOException { + return copy(input, output, CharBuffer.allocate(DEFAULT_BUFFER_SIZE)); + } + + /** + * Copies chars from a large (over 2GB) {@code Reader} to an {@code Appendable}. + *

+ * This method uses the provided buffer, so there is no need to use a + * {@code BufferedReader}. + *

+ * + * @param input the {@code Reader} to read from + * @param output the {@code Appendable} to write to + * @param buffer the buffer to be used for the copy; it is cleared after each pass so every read can use its full capacity + * @return the number of characters copied + * @throws NullPointerException if the input or output is null + * @throws IOException if an I/O error occurs + * @since 2.7 + */ + static long copy(final Reader input, final Appendable output, final CharBuffer buffer) throws IOException { + long count = 0; + int n; + while (EOF != (n = input.read(buffer))) { + buffer.flip(); + output.append(buffer, 0, n); + count += n; + /* Make the whole buffer writable again; otherwise the limit left by flip() caps every later read and can only shrink. */ + buffer.clear(); + } + return count; + } + + /** + *

+ * Copied from Apache Commons IO. + *

+ * Copies chars from a large (over 2GB) {@code Reader} to a {@code Writer}. + *

+ * This method buffers the input internally, so there is no need to use a + * {@code BufferedReader}. + *

+ * The buffer size is given by {@link #DEFAULT_BUFFER_SIZE}. + * + * @param input the {@code Reader} to read from + * @param output the {@code Writer} to write to + * @return the number of characters copied + * @throws NullPointerException if the input or output is null + * @throws IOException if an I/O error occurs + * @since 1.3 + */ + static long copyLarge(final Reader input, final Writer output) throws IOException { + return copyLarge(input, output, new char[DEFAULT_BUFFER_SIZE]); + } + + /** + *

+ * Copied from Apache Commons IO. + *

+ * Copies chars from a large (over 2GB) {@code Reader} to a {@code Writer}. + *

+ * This method uses the provided buffer, so there is no need to use a + * {@code BufferedReader}. + *

+ * + * @param input the {@code Reader} to read from + * @param output the {@code Writer} to write to + * @param buffer the buffer to be used for the copy; each pass reads into it and writes out only the chars actually read + * @return the number of characters copied + * @throws NullPointerException if the input or output is null + * @throws IOException if an I/O error occurs + * @since 2.2 + */ + static long copyLarge(final Reader input, final Writer output, final char[] buffer) throws IOException { + long count = 0; + int n; + while (EOF != (n = input.read(buffer))) { + /* Write only the n chars filled by this read, not the whole buffer. */ + output.write(buffer, 0, n); + count += n; + } + return count; + } + +} diff --git a/src/test/resources/org/apache/commons/csv/Lexer.java b/src/test/resources/org/apache/commons/csv/Lexer.java new file mode 100644 index 00000000..2795ca29 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/Lexer.java @@ -0,0 +1,461 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.commons.csv; + +import static org.apache.commons.csv.Constants.BACKSPACE; +import static org.apache.commons.csv.Constants.CR; +import static org.apache.commons.csv.Constants.END_OF_STREAM; +import static org.apache.commons.csv.Constants.FF; +import static org.apache.commons.csv.Constants.LF; +import static org.apache.commons.csv.Constants.TAB; +import static org.apache.commons.csv.Constants.UNDEFINED; +import static org.apache.commons.csv.Token.Type.COMMENT; +import static org.apache.commons.csv.Token.Type.EOF; +import static org.apache.commons.csv.Token.Type.EORECORD; +import static org.apache.commons.csv.Token.Type.INVALID; +import static org.apache.commons.csv.Token.Type.TOKEN; + +import java.io.Closeable; +import java.io.IOException; + +/** + * Lexical analyzer. + */ +final class Lexer implements Closeable { + + private static final String CR_STRING = Character.toString(CR); + private static final String LF_STRING = Character.toString(LF); + + /** + * Constant char to use for disabling comments, escapes and encapsulation. The value -2 (which is {@code U+FFFE} + * when stored in a char) is used because it won't be confused with an EOF signal (-1), and because {@code U+FFFE} + * is a Unicode noncharacter, so there should never be a collision with a real text char. 
+ */ + private static final char DISABLED = '\ufffe'; + + private final char delimiter; + private final char escape; + private final char quoteChar; + private final char commentStart; + + private final boolean ignoreSurroundingSpaces; + private final boolean ignoreEmptyLines; + + /** The input stream */ + private final ExtendedBufferedReader reader; + /** The first line terminator recorded by {@link #readEndOfLine(int)}, or null if none has been seen yet */ + private String firstEol; + + Lexer(final CSVFormat format, final ExtendedBufferedReader reader) { + this.reader = reader; + this.delimiter = format.getDelimiter(); + this.escape = mapNullToDisabled(format.getEscapeCharacter()); + this.quoteChar = mapNullToDisabled(format.getQuoteCharacter()); + this.commentStart = mapNullToDisabled(format.getCommentMarker()); + this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces(); + this.ignoreEmptyLines = format.getIgnoreEmptyLines(); + } + + /** + * Closes resources. + * + * @throws IOException + * If an I/O error occurs + */ + @Override + public void close() throws IOException { + reader.close(); + } + + /** + * Returns the current character position + * + * @return the current character position + */ + long getCharacterPosition() { + return reader.getPosition(); + } + + /** + * Returns the current line number + * + * @return the current line number + */ + long getCurrentLineNumber() { + return reader.getCurrentLineNumber(); + } + + /** + * @return the first line terminator recorded so far, or null if none has been seen + */ + String getFirstEol(){ + return firstEol; + } + + boolean isClosed() { + return reader.isClosed(); + } + + boolean isCommentStart(final int ch) { + return ch == commentStart; + } + + boolean isDelimiter(final int ch) { + return ch == delimiter; + } + + /** + * @return true if the given character indicates end of file + */ + boolean isEndOfFile(final int ch) { + return ch == END_OF_STREAM; + } + + boolean isEscape(final int ch) { + return ch == escape; + } + + private boolean isMetaChar(final int ch) { + return ch == delimiter || + ch == escape || + ch == quoteChar || + ch == commentStart; + } + + boolean isQuoteChar(final int ch) { + return 
ch == quoteChar; + } + + /** + * Checks if the current character represents the start of a line: a CR, LF or is at the start of the file. + * + * @param ch the character to check + * @return true if the character is at the start of a line. + */ + boolean isStartOfLine(final int ch) { + return ch == LF || ch == CR || ch == UNDEFINED; + } + + /** + * @return true if the given char is a whitespace character and is not the delimiter + */ + boolean isWhitespace(final int ch) { + return !isDelimiter(ch) && Character.isWhitespace((char) ch); + } + + /** + * @return {@link #DISABLED} if the given character is null, otherwise its char value + */ + private char mapNullToDisabled(final Character c) { + return c == null ? DISABLED : c.charValue(); + } + + /** + * Returns the next token. + *

+ * A token corresponds to a term, a record change or an end-of-file indicator. + *

+ * + * @param token + * an existing Token object to reuse. The caller is responsible to initialize the Token. + * @return the next token found + * @throws java.io.IOException + * on stream access error + */ + Token nextToken(final Token token) throws IOException { + + // get the last read char (required for empty line detection) + int lastChar = reader.getLastChar(); + + // read the next char and set eol + int c = reader.read(); + /* + * Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF + * - they are equivalent here. + */ + boolean eol = readEndOfLine(c); + + // empty line detection: eol AND (last char was EOL or beginning) + if (ignoreEmptyLines) { + while (eol && isStartOfLine(lastChar)) { + // go one char ahead ... + lastChar = c; + c = reader.read(); + eol = readEndOfLine(c); + // reached end of file without any content (empty line at the end) + if (isEndOfFile(c)) { + token.type = EOF; + // don't set token.isReady here because no content + return token; + } + } + } + + // did we reach eof during the last iteration already ? 
EOF + if (isEndOfFile(lastChar) || !isDelimiter(lastChar) && isEndOfFile(c)) { + token.type = EOF; + // don't set token.isReady here because no content + return token; + } + + /* A comment marker at the start of a line turns the rest of that line (trimmed) into a COMMENT token. */ + if (isStartOfLine(lastChar) && isCommentStart(c)) { + final String line = reader.readLine(); + if (line == null) { + token.type = EOF; + // don't set token.isReady here because no content + return token; + } + final String comment = line.trim(); + token.content.append(comment); + token.type = COMMENT; + return token; + } + + // important: make sure a new char gets consumed in each iteration + while (token.type == INVALID) { + // ignore whitespaces at beginning of a token + if (ignoreSurroundingSpaces) { + while (isWhitespace(c) && !eol) { + c = reader.read(); + eol = readEndOfLine(c); + } + } + + // ok, start of token reached: encapsulated, or token + if (isDelimiter(c)) { + // empty token return TOKEN("") + token.type = TOKEN; + } else if (eol) { + // empty token return EORECORD("") + // noop: token.content.append(""); + token.type = EORECORD; + } else if (isQuoteChar(c)) { + // consume encapsulated token + parseEncapsulatedToken(token); + } else if (isEndOfFile(c)) { + // end of file return EOF() + // noop: token.content.append(""); + token.type = EOF; + token.isReady = true; // there is data at EOF + } else { + // next token must be a simple token + // add removed blanks when not ignoring whitespace chars... + parseSimpleToken(token, c); + } + } + return token; + } + + /** + * Parses an encapsulated token. + *

+ * Encapsulated tokens are surrounded by the given encapsulating-string. The encapsulator itself might be included + * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespace before and after + * an encapsulated token is ignored. The token is finished when one of the following conditions becomes true: + *

    + *
  • an unescaped encapsulator has been reached, and is followed by optional whitespace then:
  • + *
      + *
    • delimiter (TOKEN)
    • + *
    • end of line (EORECORD)
    • + *
    + *
  • end of stream has been reached (EOF)
+ * + * @param token + * the current token + * @return a valid token object + * @throws IOException + * on invalid state: EOF before closing encapsulator or invalid character before delimiter or EOL + */ + private Token parseEncapsulatedToken(final Token token) throws IOException { + // save the current line number in case it is needed for the IOException message + final long startLineNumber = getCurrentLineNumber(); + int c; + while (true) { + c = reader.read(); + + if (isEscape(c)) { + final int unescaped = readEscape(); + if (unescaped == END_OF_STREAM) { // unexpected char after escape + token.content.append((char) c).append((char) reader.getLastChar()); + } else { + token.content.append((char) unescaped); + } + } else if (isQuoteChar(c)) { + if (isQuoteChar(reader.lookAhead())) { + // double or escaped encapsulator -> add single encapsulator to token + c = reader.read(); + token.content.append((char) c); + } else { + // token finish mark (encapsulator) reached: ignore whitespace till delimiter + while (true) { + c = reader.read(); + if (isDelimiter(c)) { + token.type = TOKEN; + return token; + } else if (isEndOfFile(c)) { + token.type = EOF; + token.isReady = true; // There is data at EOF + return token; + } else if (readEndOfLine(c)) { + token.type = EORECORD; + return token; + } else if (!isWhitespace(c)) { + // error invalid char between token and next delimiter + throw new IOException("(line " + getCurrentLineNumber() + + ") invalid char between encapsulated token and delimiter"); + } + } + } + } else if (isEndOfFile(c)) { + // error condition (end of file before end of token) + throw new IOException("(startline " + startLineNumber + + ") EOF reached before encapsulated token finished"); + } else { + // consume character + token.content.append((char) c); + } + } + } + + /** + * Parses a simple token. + *

+ * Simple tokens are tokens which are not surrounded by encapsulators. A simple token might contain escaped + * delimiters (as \, or \;). The token is finished when one of the following conditions becomes true: + *

    + *
  • end of line has been reached (EORECORD)
  • + *
  • end of stream has been reached (EOF)
  • + *
  • an unescaped delimiter has been reached (TOKEN)
  • + *
+ * + * @param token + * the current token + * @param ch + * the current character + * @return the filled token + * @throws IOException + * on stream access error + */ + private Token parseSimpleToken(final Token token, int ch) throws IOException { + // Faster to use while(true)+break than while(token.type == INVALID) + while (true) { + if (readEndOfLine(ch)) { + token.type = EORECORD; + break; + } else if (isEndOfFile(ch)) { + token.type = EOF; + token.isReady = true; // There is data at EOF + break; + } else if (isDelimiter(ch)) { + token.type = TOKEN; + break; + } else if (isEscape(ch)) { + final int unescaped = readEscape(); + if (unescaped == END_OF_STREAM) { // unexpected char after escape + token.content.append((char) ch).append((char) reader.getLastChar()); + } else { + token.content.append((char) unescaped); + } + ch = reader.read(); // continue + } else { + token.content.append((char) ch); + ch = reader.read(); // continue + } + } + + if (ignoreSurroundingSpaces) { + trimTrailingSpaces(token.content); + } + + return token; + } + + /** + * Greedily accepts \n, \r and \r\n. For a \r\n pair this checker silently consumes the second control-character. + * The first line terminator encountered is remembered in {@code firstEol}. + * + * @param ch the character that was just read + * @return true if the given or next character is a line-terminator + * @throws IOException on stream access error + */ + boolean readEndOfLine(int ch) throws IOException { + // check if we have \r\n... + if (ch == CR && reader.lookAhead() == LF) { + // note: does not change ch outside of this method! + ch = reader.read(); + // Save the EOL state + if (firstEol == null) { + this.firstEol = Constants.CRLF; + } + } + // save EOL state here. + if (firstEol == null) { + if (ch == LF) { + this.firstEol = LF_STRING; + } else if (ch == CR) { + this.firstEol = CR_STRING; + } + } + + return ch == LF || ch == CR; + } + + // TODO escape handling needs more work + /** + * Handle an escape sequence. + * The current character must be the escape character. + * On return, the next character is available by calling {@link ExtendedBufferedReader#getLastChar()} + * on the input stream. 
+ * + * @return the unescaped character (as an int) or {@link Constants#END_OF_STREAM} if char following the escape is + * invalid. + * @throws IOException if there is a problem reading the stream or the end of stream is detected: + * the escape character is not allowed at end of stream + */ + int readEscape() throws IOException { + // the escape char has just been read (normally a backslash) + final int ch = reader.read(); + switch (ch) { + case 'r': + return CR; + case 'n': + return LF; + case 't': + return TAB; + case 'b': + return BACKSPACE; + case 'f': + return FF; + case CR: + case LF: + case FF: // TODO is this correct? + case TAB: // TODO is this correct? Do tabs need to be escaped? + case BACKSPACE: // TODO is this correct? + return ch; + case END_OF_STREAM: + throw new IOException("EOF whilst processing escape sequence"); + default: + // Now check for meta-characters + if (isMetaChar(ch)) { + return ch; + } + // indicate unexpected char - available from reader.getLastChar() + return END_OF_STREAM; + } + } + + /** + * Removes trailing whitespace (as defined by {@link Character#isWhitespace(char)}) from the given buffer in place. + * + * @param buffer the buffer to trim + */ + void trimTrailingSpaces(final StringBuilder buffer) { + int length = buffer.length(); + while (length > 0 && Character.isWhitespace(buffer.charAt(length - 1))) { + length = length - 1; + } + if (length != buffer.length()) { + buffer.setLength(length); + } + } +} diff --git a/src/test/resources/org/apache/commons/csv/QuoteMode.java b/src/test/resources/org/apache/commons/csv/QuoteMode.java new file mode 100644 index 00000000..272deb73 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/QuoteMode.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.csv; + +/** + * Defines quoting behavior when printing. + */ +public enum QuoteMode { + + /** + * Quotes all fields. + */ + ALL, + + /** + * Quotes all non-null fields. + */ + ALL_NON_NULL, + + /** + * Quotes fields which contain special characters such as the field delimiter, the quote character or any of the + * characters in the line separator string. + */ + MINIMAL, + + /** + * Quotes all non-numeric fields. + */ + NON_NUMERIC, + + /** + * Never quotes fields. When the delimiter occurs in data, the printer prefixes it with the escape character. If the + * escape character is not set, format validation throws an exception. + */ + NONE +} diff --git a/src/test/resources/org/apache/commons/csv/Token.java b/src/test/resources/org/apache/commons/csv/Token.java new file mode 100644 index 00000000..dff7d018 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/Token.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.commons.csv; + +import static org.apache.commons.csv.Token.Type.INVALID; + +/** + * Internal token representation. + *

+ * It is used as the contract between the lexer and the parser. + */ +final class Token { + + enum Type { + /** Token has no valid content, i.e. is in its initialized state. */ + INVALID, + + /** Token with content, at beginning or in the middle of a line. */ + TOKEN, + + /** Token (which can have content) when the end of file is reached. */ + EOF, + + /** Token with content when the end of a line is reached. */ + EORECORD, + + /** Token is a comment line. */ + COMMENT + } + + /** Length of the initial token (content-)buffer */ + private static final int INITIAL_TOKEN_LENGTH = 50; + + /** Token type */ + Token.Type type = INVALID; + + /** The content buffer. */ + final StringBuilder content = new StringBuilder(INITIAL_TOKEN_LENGTH); + + /** Token ready flag: indicates a valid token with content (ready for the parser). */ + boolean isReady; + + /** + * Returns this token to its initial state so that it can be reused: empties the content buffer, sets the type + * to INVALID and clears the ready flag. + */ + void reset() { + content.setLength(0); + type = INVALID; + isReady = false; + } + + /** + * Eases IDE debugging. + * + * @return a string helpful for debugging. + */ + @Override + public String toString() { + return type.name() + " [" + content.toString() + "]"; + } +} diff --git a/src/test/resources/csv-167/sample1.csv b/src/test/resources/org/apache/commons/csv/csv-167/sample1.csv similarity index 100% rename from src/test/resources/csv-167/sample1.csv rename to src/test/resources/org/apache/commons/csv/csv-167/sample1.csv diff --git a/src/test/resources/org/apache/commons/csv/package-info.java b/src/test/resources/org/apache/commons/csv/package-info.java new file mode 100644 index 00000000..29e7fef6 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/package-info.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Apache Commons CSV Format Support. + * + *

CSV files are widely used as interfaces to legacy systems or manual data-imports. + * CSV stands for "Comma Separated Values" (or sometimes "Character Separated + * Values"). The CSV data format is defined in + * RFC 4180 + * but many dialects exist.

+ * + *

Common to all file dialects is their basic structure: The CSV data-format + * is record oriented, with each record starting on a new textual line. A + * record is built from a list of values. Keep in mind that not all records + * must have an equal number of values:

+ *
+ *       csv    := records*
+ *       record := values*
+ * 
+ * + *

The following list contains the CSV aspects the Commons CSV parser supports:

+ *
+ *
Separators (for lines)
+ *
The record separators are hardcoded and cannot be changed. They must be '\r', '\n' or '\r\n'.
+ * + *
Delimiter (for values)
+ *
The delimiter for values is freely configurable (default ',').
+ * + *
Comments
+ *
Some CSV-dialects support a simple comment syntax. A comment is a record + * which must start with a designated character (the commentStarter). A record + * of this kind is treated as a comment and gets removed from the input (default none)
+ * + *
Encapsulator
+ *
Two encapsulator characters (default '"') are used to enclose -> complex values.
+ * + *
Simple values
+ *
A simple value consists of all characters (except the delimiter) until + * (but not including) the next delimiter or a record-terminator. Optionally + * all surrounding whitespace of a simple value can be ignored (default: true).
+ * + *
Complex values
+ *
Complex values are encapsulated within a pair of the defined encapsulator characters. + * The encapsulator itself must be escaped or doubled when used inside complex values. + * Complex values preserve all kinds of formatting (including newlines -> multiline-values)
+ * + *
Empty line skipping
+ *
Optionally empty lines in CSV files can be skipped. + * Otherwise, empty lines will return a record with a single empty value.
+ *
+ * + *

In addition to individually defined dialects, two predefined dialects (strict-csv, and excel-csv) + * can be set directly.

+ * + *

Example usage:

+ *
+ * Reader in = new StringReader("a,b,c");
+ * for (CSVRecord record : CSVFormat.DEFAULT.parse(in)) {
+ *     for (String field : record) {
+ *         System.out.print("\"" + field + "\", ");
+ *     }
+ *     System.out.println();
+ * }
+ * 
+ */ + +package org.apache.commons.csv; diff --git a/src/test/resources/perf/worldcitiespop.txt.gz b/src/test/resources/org/apache/commons/csv/perf/worldcitiespop.txt.gz similarity index 100% rename from src/test/resources/perf/worldcitiespop.txt.gz rename to src/test/resources/org/apache/commons/csv/perf/worldcitiespop.txt.gz