diff --git a/pom.xml b/pom.xml
index f04501c1..bd5ec200 100644
--- a/pom.xml
+++ b/pom.xml
@@ -240,25 +240,25 @@
+ * You can use one of the predefined formats: + *
+ * + *+ * For example: + *
+ * + *+ * CSVParser parser = CSVFormat.EXCEL.parse(reader); + *+ * + *
+ * The {@link CSVParser} provides static methods to parse other input types, for example: + *
+ * + *+ * CSVParser parser = CSVParser.parse(file, StandardCharsets.US_ASCII, CSVFormat.EXCEL); + *+ * + *
+ * You can extend a format by calling the {@code with} methods. For example: + *
+ * + *+ * CSVFormat.EXCEL.withNullString("N/A").withIgnoreSurroundingSpaces(true); + *+ * + *
+ * To define the column names you want to use to access records, write: + *
+ * + *+ * CSVFormat.EXCEL.withHeader("Col1", "Col2", "Col3"); + *+ * + *
+ * Calling {@link #withHeader(String...)} lets you use the given names to address values in a {@link CSVRecord}, and + * assumes that your CSV source does not contain a first record that also defines column names. + * + * If it does, then you are overriding this metadata with your names and you should skip the first record by calling + * {@link #withSkipHeaderRecord(boolean)} with {@code true}. + *
+ * + *+ * You can use a format directly to parse a reader. For example, to parse an Excel file with columns header, write: + *
+ * + *+ * Reader in = ...; + * CSVFormat.EXCEL.withHeader("Col1", "Col2", "Col3").parse(in); + *+ * + *
+ * For other input types, like resources, files, and URLs, use the static methods on {@link CSVParser}. + *
+ * + *+ * If your source contains a header record, you can simplify your code and safely reference columns, by using + * {@link #withHeader(String...)} with no arguments: + *
+ * + *+ * CSVFormat.EXCEL.withHeader(); + *+ * + *
+ * This causes the parser to read the first record and use its values as column names. + * + * Then, call the {@link CSVRecord} get method that takes a String column name argument: + *
+ * + *+ * String value = record.get("Col1"); + *+ * + *
+ * This makes your code impervious to changes in column order in the CSV file. + *
+ * + *+ * This class is immutable. + *
+ */ +public final class CSVFormat implements Serializable { + + /** + * Predefined formats. + * + * @since 1.2 + */ + public enum Predefined { + + /** + * @see CSVFormat#DEFAULT + */ + Default(CSVFormat.DEFAULT), + + /** + * @see CSVFormat#EXCEL + */ + Excel(CSVFormat.EXCEL), + + /** + * @see CSVFormat#INFORMIX_UNLOAD + * @since 1.3 + */ + InformixUnload(CSVFormat.INFORMIX_UNLOAD), + + /** + * @see CSVFormat#INFORMIX_UNLOAD_CSV + * @since 1.3 + */ + InformixUnloadCsv(CSVFormat.INFORMIX_UNLOAD_CSV), + + /** + * @see CSVFormat#MONGODB_CSV + * @since 1.7 + */ + MongoDBCsv(CSVFormat.MONGODB_CSV), + + /** + * @see CSVFormat#MONGODB_TSV + * @since 1.7 + */ + MongoDBTsv(CSVFormat.MONGODB_TSV), + + /** + * @see CSVFormat#MYSQL + */ + MySQL(CSVFormat.MYSQL), + + /** + * @see CSVFormat#ORACLE + */ + Oracle(CSVFormat.ORACLE), + + /** + * @see CSVFormat#POSTGRESQL_CSV + * @since 1.5 + */ + PostgreSQLCsv(CSVFormat.POSTGRESQL_CSV), + + /** + * @see CSVFormat#POSTGRESQL_TEXT + */ + PostgreSQLText(CSVFormat.POSTGRESQL_TEXT), + + /** + * @see CSVFormat#RFC4180 + */ + RFC4180(CSVFormat.RFC4180), + + /** + * @see CSVFormat#TDF + */ + TDF(CSVFormat.TDF); + + private final CSVFormat format; + + Predefined(final CSVFormat format) { + this.format = format; + } + + /** + * Gets the format wrapped by this constant. + * + * @return the format. + */ + public CSVFormat getFormat() { + return format; + } + } + + /** + * Standard Comma Separated Value format, as for {@link #RFC4180} but allowing empty lines. + * + *+ * Settings are: + *
+ *+ * For example for parsing or generating a CSV file on a French system the following format will be used: + *
+ * + *+ * CSVFormat fmt = CSVFormat.EXCEL.withDelimiter(';'); + *+ * + *
+ * Settings are: + *
+ *+ * Note: This is currently like {@link #RFC4180} plus {@link #withAllowMissingColumnNames(boolean) + * withAllowMissingColumnNames(true)} and {@link #withIgnoreEmptyLines(boolean) withIgnoreEmptyLines(false)}. + *
+ * + * @see Predefined#Excel + * @see #DEFAULT + */ + // @formatter:off + public static final CSVFormat EXCEL = DEFAULT + .withIgnoreEmptyLines(false) + .withAllowMissingColumnNames(); + // @formatter:on + + /** + * Default Informix CSV UNLOAD format used by the {@code UNLOAD TO file_name} operation. + * + *+ * This is a comma-delimited format with a LF character as the line separator. Values are not quoted and special + * characters are escaped with {@code '\'}. The default NULL string is {@code "\\N"}. + *
+ * + *+ * Settings are: + *
+ *+ * This is a comma-delimited format with a LF character as the line separator. Values are not quoted and special + * characters are escaped with {@code '\'}. The default NULL string is {@code "\\N"}. + *
+ * + *+ * Settings are: + *
+ *+ * Parsing is not supported yet. + *
+ * + *+ * This is a comma-delimited format. Values are double quoted only if needed and special characters are escaped with + * {@code '"'}. A header line with field names is expected. + *
+ * + *+ * Settings are: + *
+ *+ * Parsing is not supported yet. + *
+ * + *+ * This is a tab-delimited format. Values are double quoted only if needed and special + * characters are escaped with {@code '"'}. A header line with field names is expected. + *
+ * + *+ * Settings are: + *
+ *+ * This is a tab-delimited format with a LF character as the line separator. Values are not quoted and special + * characters are escaped with {@code '\'}. The default NULL string is {@code "\\N"}. + *
+ * + *+ * Settings are: + *
+ *+ * This is a comma-delimited format with the system line separator character as the record separator. Values are + * double quoted when needed and special characters are escaped with {@code '"'}. The default NULL string is + * {@code ""}. Values are trimmed. + *
+ * + *+ * Settings are: + *
+ *+ * This is a comma-delimited format with a LF character as the line separator. Values are double quoted and special + * characters are escaped with {@code '"'}. The default NULL string is {@code ""}. + *
+ * + *+ * Settings are: + *
+ *+ * This is a tab-delimited format with a LF character as the line separator. Values are double quoted and special + * characters are escaped with {@code '"'}. The default NULL string is {@code "\\N"}. + *
+ * + *+ * Settings are: + *
+ *+ * Settings are: + *
+ *+ * Settings are: + *
+ *+ * Use this method if you want to create a CSVFormat from scratch. All fields but the delimiter will be initialized + * with null/false, except that duplicate header names are allowed. + *
+ * + * @param delimiter + * the char used for value separation, must not be a line break character + * @return a new CSV format. + * @throws IllegalArgumentException + * if the delimiter is a line break character + * + * @see #DEFAULT + * @see #RFC4180 + * @see #MYSQL + * @see #EXCEL + * @see #TDF + */ + public static CSVFormat newFormat(final char delimiter) { + return new CSVFormat(delimiter, null, null, null, null, false, false, null, null, null, null, false, false, + false, false, false, false, true); + } + + /** + * Gets one of the predefined formats from {@link CSVFormat.Predefined}. + * + * @param format + * the name of a {@link CSVFormat.Predefined} constant + * @return one of the predefined formats + * @throws IllegalArgumentException + * if no {@link CSVFormat.Predefined} constant has the given name + * @since 1.2 + */ + public static CSVFormat valueOf(final String format) { + return CSVFormat.Predefined.valueOf(format).getFormat(); + } + + private final boolean allowDuplicateHeaderNames; + + private final boolean allowMissingColumnNames; + + private final boolean autoFlush; + + private final Character commentMarker; // null if commenting is disabled + + private final char delimiter; + + private final Character escapeCharacter; // null if escaping is disabled + + private final String[] header; // array of header column names + + private final String[] headerComments; // array of header comment lines + + private final boolean ignoreEmptyLines; + + private final boolean ignoreHeaderCase; // should ignore header names case + + private final boolean ignoreSurroundingSpaces; // Should leading/trailing spaces be ignored around values? + + private final String nullString; // the string to be used for null values + + private final Character quoteCharacter; // null if quoting is disabled + + private final String quotedNullString; // quoteCharacter + nullString + quoteCharacter, printed for null values when the quote mode is ALL + + private final QuoteMode quoteMode; + + private final String recordSeparator; // for outputs + + private final boolean skipHeaderRecord; + + private final boolean trailingDelimiter; + + private final boolean trim; + + /** + * Creates a customized CSV format. 
+ * + * @param delimiter + * the char used for value separation, must not be a line break character + * @param quoteChar + * the Character used as value encapsulation marker, may be {@code null} to disable + * @param quoteMode + * the quote mode + * @param commentStart + * the Character used for comment identification, may be {@code null} to disable + * @param escape + * the Character used to escape special characters in values, may be {@code null} to disable + * @param ignoreSurroundingSpaces + * {@code true} when whitespaces enclosing values should be ignored + * @param ignoreEmptyLines + * {@code true} when the parser should skip empty lines + * @param recordSeparator + * the line separator to use for output + * @param nullString + * the line separator to use for output + * @param headerComments + * the comments to be printed by the Printer before the actual CSV data + * @param header + * the header + * @param skipHeaderRecord + * TODO + * @param allowMissingColumnNames + * TODO + * @param ignoreHeaderCase + * TODO + * @param trim + * TODO + * @param trailingDelimiter + * TODO + * @param autoFlush + * @throws IllegalArgumentException + * if the delimiter is a line break character + */ + private CSVFormat(final char delimiter, final Character quoteChar, final QuoteMode quoteMode, + final Character commentStart, final Character escape, final boolean ignoreSurroundingSpaces, + final boolean ignoreEmptyLines, final String recordSeparator, final String nullString, + final Object[] headerComments, final String[] header, final boolean skipHeaderRecord, + final boolean allowMissingColumnNames, final boolean ignoreHeaderCase, final boolean trim, + final boolean trailingDelimiter, final boolean autoFlush, final boolean allowDuplicateHeaderNames) { + this.delimiter = delimiter; + this.quoteCharacter = quoteChar; + this.quoteMode = quoteMode; + this.commentMarker = commentStart; + this.escapeCharacter = escape; + this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; + 
this.allowMissingColumnNames = allowMissingColumnNames; + this.ignoreEmptyLines = ignoreEmptyLines; + this.recordSeparator = recordSeparator; + this.nullString = nullString; + this.headerComments = toStringArray(headerComments); + this.header = header == null ? null : header.clone(); + this.skipHeaderRecord = skipHeaderRecord; + this.ignoreHeaderCase = ignoreHeaderCase; + this.trailingDelimiter = trailingDelimiter; + this.trim = trim; + this.autoFlush = autoFlush; + this.quotedNullString = quoteCharacter + nullString + quoteCharacter; + this.allowDuplicateHeaderNames = allowDuplicateHeaderNames; + validate(); + } + + @Override + public boolean equals(final Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + + final CSVFormat other = (CSVFormat) obj; + if (delimiter != other.delimiter) { + return false; + } + if (trailingDelimiter != other.trailingDelimiter) { + return false; + } + if (autoFlush != other.autoFlush) { + return false; + } + if (trim != other.trim) { + return false; + } + if (allowMissingColumnNames != other.allowMissingColumnNames) { + return false; + } + if (allowDuplicateHeaderNames != other.allowDuplicateHeaderNames) { + return false; + } + if (ignoreHeaderCase != other.ignoreHeaderCase) { + return false; + } + if (quoteMode != other.quoteMode) { + return false; + } + if (quoteCharacter == null) { + if (other.quoteCharacter != null) { + return false; + } + } else if (!quoteCharacter.equals(other.quoteCharacter)) { + return false; + } + if (commentMarker == null) { + if (other.commentMarker != null) { + return false; + } + } else if (!commentMarker.equals(other.commentMarker)) { + return false; + } + if (escapeCharacter == null) { + if (other.escapeCharacter != null) { + return false; + } + } else if (!escapeCharacter.equals(other.escapeCharacter)) { + return false; + } + if (nullString == null) { + if (other.nullString != null) { + return false; + 
} + } else if (!nullString.equals(other.nullString)) { + return false; + } + if (!Arrays.equals(header, other.header)) { + return false; + } + if (ignoreSurroundingSpaces != other.ignoreSurroundingSpaces) { + return false; + } + if (ignoreEmptyLines != other.ignoreEmptyLines) { + return false; + } + if (skipHeaderRecord != other.skipHeaderRecord) { + return false; + } + if (recordSeparator == null) { + if (other.recordSeparator != null) { + return false; + } + } else if (!recordSeparator.equals(other.recordSeparator)) { + return false; + } + if (!Arrays.equals(headerComments, other.headerComments)) { + return false; + } + return true; + } + + /** + * Formats the specified values. + * + * @param values + * the values to format + * @return the formatted values + */ + public String format(final Object... values) { + final StringWriter out = new StringWriter(); + try (CSVPrinter csvPrinter = new CSVPrinter(out, this)) { + csvPrinter.printRecord(values); + String res = out.toString(); + int len = recordSeparator != null ? res.length() - recordSeparator.length() : res.length(); + return res.substring(0, len); + } catch (final IOException e) { + // should not happen because a StringWriter does not do IO. + throw new IllegalStateException(e); + } + } + + /** + * Returns true if and only if duplicate names are allowed in the headers. + * + * @return whether duplicate header names are allowed + * @since 1.7 + */ + public boolean getAllowDuplicateHeaderNames() { + return allowDuplicateHeaderNames; + } + + /** + * Specifies whether missing column names are allowed when parsing the header line. + * + * @return {@code true} if missing column names are allowed when parsing the header line, {@code false} to throw an + * {@link IllegalArgumentException}. + */ + public boolean getAllowMissingColumnNames() { + return allowMissingColumnNames; + } + + /** + * Returns whether to flush on close. + * + * @return whether to flush on close. 
+ * @since 1.6 + */ + public boolean getAutoFlush() { + return autoFlush; + } + + /** + * Returns the character marking the start of a line comment. + * + * @return the comment start marker, may be {@code null} + */ + public Character getCommentMarker() { + return commentMarker; + } + + /** + * Returns the character delimiting the values (typically ';', ',' or '\t'). + * + * @return the delimiter character + */ + public char getDelimiter() { + return delimiter; + } + + /** + * Returns the escape character. + * + * @return the escape character, may be {@code null} + */ + public Character getEscapeCharacter() { + return escapeCharacter; + } + + /** + * Returns a copy of the header array. + * + * @return a copy of the header array; {@code null} if disabled, the empty array if to be read from the file + */ + public String[] getHeader() { + return header != null ? header.clone() : null; + } + + /** + * Returns a copy of the header comment array. + * + * @return a copy of the header comment array; {@code null} if disabled. + */ + public String[] getHeaderComments() { + return headerComments != null ? headerComments.clone() : null; + } + + /** + * Specifies whether empty lines between records are ignored when parsing input. + * + * @return {@code true} if empty lines between records are ignored, {@code false} if they are turned into empty + * records. + */ + public boolean getIgnoreEmptyLines() { + return ignoreEmptyLines; + } + + /** + * Specifies whether header names will be accessed ignoring case. + * + * @return {@code true} if header names cases are ignored, {@code false} if they are case sensitive. + * @since 1.3 + */ + public boolean getIgnoreHeaderCase() { + return ignoreHeaderCase; + } + + /** + * Specifies whether spaces around values are ignored when parsing input. + * + * @return {@code true} if spaces around values are ignored, {@code false} if they are treated as part of the value. 
+ */ + public boolean getIgnoreSurroundingSpaces() { + return ignoreSurroundingSpaces; + } + + /** + * Gets the String to convert to and from {@code null}. + *+ * See also the various static parse methods on {@link CSVParser}. + *
+ * + * @param in + * the reader supplying the input to parse + * @return a parser over a stream of {@link CSVRecord}s. + * @throws IOException + * If an I/O error occurs + */ + public CSVParser parse(final Reader in) throws IOException { + return new CSVParser(in, this); + } + + /** + * Prints to the specified output. + * + *+ * See also {@link CSVPrinter}. + *
+ * + * @param out + * the output. + * @return a new {@link CSVPrinter} created for {@code out} and this format. + * @throws IOException + * thrown if the optional header cannot be printed. + */ + public CSVPrinter print(final Appendable out) throws IOException { + return new CSVPrinter(out, this); + } + + /** + * Prints to the specified output. + * + *+ * See also {@link CSVPrinter}. + *
+ * + * @param out + * the output. + * @param charset + * A charset. + * @return a printer to an output. + * @throws IOException + * thrown if the optional header cannot be printed. + * @since 1.5 + */ + @SuppressWarnings("resource") + public CSVPrinter print(final File out, final Charset charset) throws IOException { + // The writer will be closed when close() is called. + return new CSVPrinter(new OutputStreamWriter(new FileOutputStream(out), charset), this); + } + + /** + * Prints the {@code value} as the next value on the line to {@code out}. The value will be escaped or encapsulated + * as needed. Useful when one wants to avoid creating CSVPrinters. + * Trims the value if {@link #getTrim()} is true + * @param value + * value to output. + * @param out + * where to print the value. + * @param newRecord + * if this a new record. + * @throws IOException + * If an I/O error occurs. + * @since 1.4 + */ + public void print(final Object value, final Appendable out, final boolean newRecord) throws IOException { + // null values are considered empty + // Only call CharSequence.toString() if you have to, helps GC-free use cases. + CharSequence charSequence; + if (value == null) { + // https://issues.apache.org/jira/browse/CSV-203 + if (null == nullString) { + charSequence = EMPTY; + } else { + if (QuoteMode.ALL == quoteMode) { + charSequence = quotedNullString; + } else { + charSequence = nullString; + } + } + } else { + if (value instanceof CharSequence) { + charSequence = (CharSequence) value; + } else if (value instanceof Reader) { + print((Reader) value, out, newRecord); + return; + } else { + charSequence = value.toString(); + } + } + charSequence = getTrim() ? 
trim(charSequence) : charSequence; + print(value, charSequence, out, newRecord); + } + + private void print(final Object object, final CharSequence value, final Appendable out, final boolean newRecord) + throws IOException { + final int offset = 0; + final int len = value.length(); + if (!newRecord) { + out.append(getDelimiter()); + } + if (object == null) { + out.append(value); + } else if (isQuoteCharacterSet()) { + // the original object is needed so can check for Number + printWithQuotes(object, value, out, newRecord); + } else if (isEscapeCharacterSet()) { + printWithEscapes(value, out); + } else { + out.append(value, offset, len); + } + } + + /** + * Prints to the specified output, returns a {@code CSVPrinter} which the caller MUST close. + * + *+ * See also {@link CSVPrinter}. + *
+ * + * @param out the output. + * @param charset A charset. + * @return a printer to an output. + * @throws IOException thrown if the optional header cannot be printed. + * @since 1.5 + */ + @SuppressWarnings("resource") + public CSVPrinter print(final Path out, final Charset charset) throws IOException { + return print(Files.newBufferedWriter(out, charset)); + } + + private void print(final Reader reader, final Appendable out, final boolean newRecord) throws IOException { + // Reader is never null + if (!newRecord) { + out.append(getDelimiter()); + } + if (isQuoteCharacterSet()) { + printWithQuotes(reader, out); + } else if (isEscapeCharacterSet()) { + printWithEscapes(reader, out); + } else if (out instanceof Writer) { + IOUtils.copyLarge(reader, (Writer) out); + } else { + IOUtils.copy(reader, out); + } + + } + + /** + * Prints to the {@link System#out}. + * + *+ * See also {@link CSVPrinter}. + *
+ * + * @return a printer to {@link System#out}. + * @throws IOException + * thrown if the optional header cannot be printed. + * @since 1.5 + */ + public CSVPrinter printer() throws IOException { + return new CSVPrinter(System.out, this); + } + + /** + * Outputs the trailing delimiter (if set) followed by the record separator (if set). + * + * @param out + * where to write + * @throws IOException + * If an I/O error occurs + * @since 1.4 + */ + public void println(final Appendable out) throws IOException { + if (getTrailingDelimiter()) { + out.append(getDelimiter()); + } + if (recordSeparator != null) { + out.append(recordSeparator); + } + } + + /** + * Prints the given {@code values} to {@code out} as a single record of delimiter separated values followed by the + * record separator. + * + *+ * The values will be quoted if needed. Quotes and new-line characters will be escaped. This method adds the record + * separator to the output after printing the record, so there is no need to call {@link #println(Appendable)}. + *
+ * + * @param out + * where to write. + * @param values + * values to output. + * @throws IOException + * If an I/O error occurs. + * @since 1.4 + */ + public void printRecord(final Appendable out, final Object... values) throws IOException { + for (int i = 0; i < values.length; i++) { + print(values[i], out, i == 0); + } + println(out); + } + + /* + * Note: must only be called if escaping is enabled, otherwise will generate NPE + */ + private void printWithEscapes(final CharSequence value, final Appendable out) throws IOException { + int start = 0; + int pos = 0; + final int len = value.length(); + final int end = len; + + final char delim = getDelimiter(); + final char escape = getEscapeCharacter().charValue(); + + while (pos < end) { + char c = value.charAt(pos); + if (c == CR || c == LF || c == delim || c == escape) { + // write out segment up until this char + if (pos > start) { + out.append(value, start, pos); + } + if (c == LF) { + c = 'n'; + } else if (c == CR) { + c = 'r'; + } + + out.append(escape); + out.append(c); + + start = pos + 1; // start on the current char after this one + } + pos++; + } + + // write last segment + if (pos > start) { + out.append(value, start, pos); + } + } + + private void printWithEscapes(final Reader reader, final Appendable out) throws IOException { + int start = 0; + int pos = 0; + + final char delim = getDelimiter(); + final char escape = getEscapeCharacter().charValue(); + final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); + + int c; + while (-1 != (c = reader.read())) { + builder.append((char) c); + if (c == CR || c == LF || c == delim || c == escape) { + // write out segment up until this char + if (pos > start) { + out.append(builder.substring(start, pos)); + builder.setLength(0); + pos = -1; + } + if (c == LF) { + c = 'n'; + } else if (c == CR) { + c = 'r'; + } + + out.append(escape); + out.append((char) c); + + start = pos + 1; // start on the current char after this one + } + pos++; + } + 
+ // write last segment + if (pos > start) { + out.append(builder.substring(start, pos)); + } + } + + /* + * Note: must only be called if quoting is enabled, otherwise will generate NPE + */ + // the original object is needed so can check for Number + private void printWithQuotes(final Object object, final CharSequence value, final Appendable out, + final boolean newRecord) throws IOException { + boolean quote = false; + int start = 0; + int pos = 0; + final int len = value.length(); + final int end = len; + + final char delimChar = getDelimiter(); + final char quoteChar = getQuoteCharacter().charValue(); + // If escape char not specified, default to the quote char + // This avoids having to keep checking whether there is an escape character + // at the cost of checking against quote twice + final char escapeChar = isEscapeCharacterSet() ? getEscapeCharacter().charValue() : quoteChar; + + QuoteMode quoteModePolicy = getQuoteMode(); + if (quoteModePolicy == null) { + quoteModePolicy = QuoteMode.MINIMAL; + } + switch (quoteModePolicy) { + case ALL: + case ALL_NON_NULL: + quote = true; + break; + case NON_NUMERIC: + quote = !(object instanceof Number); + break; + case NONE: + // Use the existing escaping code + printWithEscapes(value, out); + return; + case MINIMAL: + if (len <= 0) { + // always quote an empty token that is the first + // on the line, as it may be the only thing on the + // line. If it were not quoted in that case, + // an empty line has no tokens. + if (newRecord) { + quote = true; + } + } else { + char c = value.charAt(pos); + + if (c <= COMMENT) { + // Some other chars at the start of a value caused the parser to fail, so for now + // encapsulate if we start in anything less than '#'. We are being conservative + // by including the default comment char too. 
+ quote = true; + } else { + while (pos < end) { + c = value.charAt(pos); + if (c == LF || c == CR || c == quoteChar || c == delimChar || c == escapeChar) { + quote = true; + break; + } + pos++; + } + + if (!quote) { + pos = end - 1; + c = value.charAt(pos); + // Some other chars at the end caused the parser to fail, so for now + // encapsulate if we end in anything less than ' ' + if (c <= SP) { + quote = true; + } + } + } + } + + if (!quote) { + // no encapsulation needed - write out the original value + out.append(value, start, end); + return; + } + break; + default: + throw new IllegalStateException("Unexpected Quote value: " + quoteModePolicy); + } + + if (!quote) { + // no encapsulation needed - write out the original value + out.append(value, start, end); + return; + } + + // we hit something that needed encapsulation + out.append(quoteChar); + + // Pick up where we left off: pos should be positioned on the first character that caused + // the need for encapsulation. + while (pos < end) { + final char c = value.charAt(pos); + if (c == quoteChar || c == escapeChar) { + // write out the chunk up until this point + out.append(value, start, pos); + out.append(escapeChar); // now output the escape + start = pos; // and restart with the matched char + } + pos++; + } + + // write the last segment + out.append(value, start, pos); + out.append(quoteChar); + } + + /** + * Always use quotes unless QuoteMode is NONE, so we not have to look ahead. 
+ * + * @throws IOException + */ + private void printWithQuotes(final Reader reader, final Appendable out) throws IOException { + + if (getQuoteMode() == QuoteMode.NONE) { + printWithEscapes(reader, out); + return; + } + + int pos = 0; + + final char quote = getQuoteCharacter().charValue(); + final StringBuilder builder = new StringBuilder(IOUtils.DEFAULT_BUFFER_SIZE); + + out.append(quote); + + int c; + while (-1 != (c = reader.read())) { + builder.append((char) c); + if (c == quote) { + // write out segment up until this char + if (pos > 0) { + out.append(builder.substring(0, pos)); + builder.setLength(0); + pos = -1; + } + + out.append(quote); + out.append((char) c); + } + pos++; + } + + // write last segment + if (pos > 0) { + out.append(builder.substring(0, pos)); + } + + out.append(quote); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + sb.append("Delimiter=<").append(delimiter).append('>'); + if (isEscapeCharacterSet()) { + sb.append(' '); + sb.append("Escape=<").append(escapeCharacter).append('>'); + } + if (isQuoteCharacterSet()) { + sb.append(' '); + sb.append("QuoteChar=<").append(quoteCharacter).append('>'); + } + if (quoteMode != null) { + sb.append(' '); + sb.append("QuoteMode=<").append(quoteMode).append('>'); + } + if (isCommentMarkerSet()) { + sb.append(' '); + sb.append("CommentStart=<").append(commentMarker).append('>'); + } + if (isNullStringSet()) { + sb.append(' '); + sb.append("NullString=<").append(nullString).append('>'); + } + if (recordSeparator != null) { + sb.append(' '); + sb.append("RecordSeparator=<").append(recordSeparator).append('>'); + } + if (getIgnoreEmptyLines()) { + sb.append(" EmptyLines:ignored"); + } + if (getIgnoreSurroundingSpaces()) { + sb.append(" SurroundingSpaces:ignored"); + } + if (getIgnoreHeaderCase()) { + sb.append(" IgnoreHeaderCase:ignored"); + } + sb.append(" SkipHeaderRecord:").append(skipHeaderRecord); + if (headerComments != null) { + sb.append(' '); + 
sb.append("HeaderComments:").append(Arrays.toString(headerComments)); + } + if (header != null) { + sb.append(' '); + sb.append("Header:").append(Arrays.toString(header)); + } + return sb.toString(); + } + + private String[] toStringArray(final Object[] values) { + if (values == null) { + return null; + } + final String[] strings = new String[values.length]; + for (int i = 0; i < values.length; i++) { + final Object value = values[i]; + strings[i] = value == null ? null : value.toString(); + } + return strings; + } + + private CharSequence trim(final CharSequence charSequence) { + if (charSequence instanceof String) { + return ((String) charSequence).trim(); + } + final int count = charSequence.length(); + int len = count; + int pos = 0; + + while (pos < len && charSequence.charAt(pos) <= SP) { + pos++; + } + while (pos < len && charSequence.charAt(len - 1) <= SP) { + len--; + } + return pos > 0 || len < count ? charSequence.subSequence(pos, len) : charSequence; + } + + /** + * Verifies the consistency of the parameters and throws an IllegalArgumentException if necessary. 
+ * + * @throws IllegalArgumentException + */ + private void validate() throws IllegalArgumentException { + if (isLineBreak(delimiter)) { + throw new IllegalArgumentException("The delimiter cannot be a line break"); + } + + if (quoteCharacter != null && delimiter == quoteCharacter.charValue()) { + throw new IllegalArgumentException( + "The quoteChar character and the delimiter cannot be the same ('" + quoteCharacter + "')"); + } + + if (escapeCharacter != null && delimiter == escapeCharacter.charValue()) { + throw new IllegalArgumentException( + "The escape character and the delimiter cannot be the same ('" + escapeCharacter + "')"); + } + + if (commentMarker != null && delimiter == commentMarker.charValue()) { + throw new IllegalArgumentException( + "The comment start character and the delimiter cannot be the same ('" + commentMarker + "')"); + } + + if (quoteCharacter != null && quoteCharacter.equals(commentMarker)) { + throw new IllegalArgumentException( + "The comment start character and the quoteChar cannot be the same ('" + commentMarker + "')"); + } + + if (escapeCharacter != null && escapeCharacter.equals(commentMarker)) { + throw new IllegalArgumentException( + "The comment start and the escape character cannot be the same ('" + commentMarker + "')"); + } + + if (escapeCharacter == null && quoteMode == QuoteMode.NONE) { + throw new IllegalArgumentException("No quotes mode set but no escape character is set"); + } + + // validate header + if (header != null && !allowDuplicateHeaderNames) { + final Set+ * Calling this method is equivalent to calling: + *
+ * + *+ * CSVFormat format = aFormat.withHeader().withSkipHeaderRecord(); + *+ * + * @return A new CSVFormat that is equal to this but using the first record as header. + * @see #withSkipHeaderRecord(boolean) + * @see #withHeader(String...) + * @since 1.3 + */ + public CSVFormat withFirstRecordAsHeader() { + return withHeader().withSkipHeaderRecord(); + } + + /** + * Returns a new {@code CSVFormat} with the header of the format defined by the enum class. + * + *
+ * Example: + *
+ * + *+ * public enum Header { + * Name, Email, Phone + * } + * + * CSVFormat format = aformat.withHeader(Header.class); + *+ *
+ * The header is also used by the {@link CSVPrinter}. + *
+ * + * @param headerEnum + * the enum defining the header, {@code null} if disabled, empty if parsed automatically, user specified + * otherwise. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @see #withHeader(String...) + * @see #withSkipHeaderRecord(boolean) + * @since 1.3 + */ + public CSVFormat withHeader(final Class extends Enum>> headerEnum) { + String[] header = null; + if (headerEnum != null) { + final Enum>[] enumValues = headerEnum.getEnumConstants(); + header = new String[enumValues.length]; + for (int i = 0; i < enumValues.length; i++) { + header[i] = enumValues[i].name(); + } + } + return withHeader(header); + } + + /** + * Returns a new {@code CSVFormat} with the header of the format set from the result set metadata. The header can + * either be parsed automatically from the input file with: + * + *+ * CSVFormat format = aformat.withHeader(); + *+ * + * or specified manually with: + * + *
+ * CSVFormat format = aformat.withHeader(resultSet); + *+ *
+ * The header is also used by the {@link CSVPrinter}. + *
+ * + * @param resultSet + * the resultSet for the header, {@code null} if disabled, empty if parsed automatically, user specified + * otherwise. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @throws SQLException + * SQLException if a database access error occurs or this method is called on a closed result set. + * @since 1.1 + */ + public CSVFormat withHeader(final ResultSet resultSet) throws SQLException { + return withHeader(resultSet != null ? resultSet.getMetaData() : null); + } + + /** + * Returns a new {@code CSVFormat} with the header of the format set from the result set metadata. The header can + * either be parsed automatically from the input file with: + * + *+ * CSVFormat format = aformat.withHeader(); + *+ * + * or specified manually with: + * + *
+ * CSVFormat format = aformat.withHeader(metaData); + *+ *
+ * The header is also used by the {@link CSVPrinter}. + *
+ * + * @param metaData + * the metaData for the header, {@code null} if disabled, empty if parsed automatically, user specified + * otherwise. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @throws SQLException + * SQLException if a database access error occurs or this method is called on a closed result set. + * @since 1.1 + */ + public CSVFormat withHeader(final ResultSetMetaData metaData) throws SQLException { + String[] labels = null; + if (metaData != null) { + final int columnCount = metaData.getColumnCount(); + labels = new String[columnCount]; + for (int i = 0; i < columnCount; i++) { + labels[i] = metaData.getColumnLabel(i + 1); + } + } + return withHeader(labels); + } + + /** + * Returns a new {@code CSVFormat} with the header of the format set to the given values. The header can either be + * parsed automatically from the input file with: + * + *+ * CSVFormat format = aformat.withHeader(); + *+ * + * or specified manually with: + * + *
+ * CSVFormat format = aformat.withHeader("name", "email", "phone"); + *+ *
+ * The header is also used by the {@link CSVPrinter}. + *
+ * + * @param header + * the header, {@code null} if disabled, empty if parsed automatically, user specified otherwise. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @see #withSkipHeaderRecord(boolean) + */ + public CSVFormat withHeader(final String... header) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the header comments of the format set to the given values. The comments will + * be printed first, before the headers. This setting is ignored by the parser. + * + *+ * CSVFormat format = aformat.withHeaderComments("Generated by Apache Commons CSV 1.1.", new Date()); + *+ * + * @param headerComments + * the headerComments which will be printed by the Printer before the actual CSV data. + * + * @return A new CSVFormat that is equal to this but with the specified header + * @see #withSkipHeaderRecord(boolean) + * @since 1.1 + */ + public CSVFormat withHeaderComments(final Object... headerComments) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the empty line skipping behavior of the format set to {@code true}. + * + * @return A new CSVFormat that is equal to this but with the specified empty line skipping behavior. 
+ * @since {@link #withIgnoreEmptyLines(boolean)} + * @since 1.1 + */ + public CSVFormat withIgnoreEmptyLines() { + return this.withIgnoreEmptyLines(true); + } + + /** + * Returns a new {@code CSVFormat} with the empty line skipping behavior of the format set to the given value. + * + * @param ignoreEmptyLines + * the empty line skipping behavior, {@code true} to ignore the empty lines between the records, + * {@code false} to translate empty lines to empty records. + * @return A new CSVFormat that is equal to this but with the specified empty line skipping behavior. + */ + public CSVFormat withIgnoreEmptyLines(final boolean ignoreEmptyLines) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the header ignore case behavior set to {@code true}. + * + * @return A new CSVFormat that will ignore case header name. + * @see #withIgnoreHeaderCase(boolean) + * @since 1.3 + */ + public CSVFormat withIgnoreHeaderCase() { + return this.withIgnoreHeaderCase(true); + } + + /** + * Returns a new {@code CSVFormat} with whether header names should be accessed ignoring case. + * + * @param ignoreHeaderCase + * the case mapping behavior, {@code true} to access name/values, {@code false} to leave the mapping as + * is. 
+ * @return A new CSVFormat that will ignore case header name if specified as {@code true} + * @since 1.3 + */ + public CSVFormat withIgnoreHeaderCase(final boolean ignoreHeaderCase) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the parser trimming behavior of the format set to {@code true}. + * + * @return A new CSVFormat that is equal to this but with the specified parser trimming behavior. + * @see #withIgnoreSurroundingSpaces(boolean) + * @since 1.1 + */ + public CSVFormat withIgnoreSurroundingSpaces() { + return this.withIgnoreSurroundingSpaces(true); + } + + /** + * Returns a new {@code CSVFormat} with the parser trimming behavior of the format set to the given value. + * + * @param ignoreSurroundingSpaces the parser trimming behavior, {@code true} to remove the surrounding spaces, + * {@code false} to leave the spaces as is. + * @return A new CSVFormat that is equal to this but with the specified trimming behavior. + */ + public CSVFormat withIgnoreSurroundingSpaces(final boolean ignoreSurroundingSpaces) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with conversions to and from null for strings on input and output. + *
+ * Note: This setting is only used during printing and does not affect parsing. Parsing currently + * only works for inputs with '\n', '\r' and "\r\n" + *
+ * + * @param recordSeparator + * the record separator to use for output. + * + * @return A new CSVFormat that is equal to this but with the specified output record separator + */ + public CSVFormat withRecordSeparator(final char recordSeparator) { + return withRecordSeparator(String.valueOf(recordSeparator)); + } + + /** + * Returns a new {@code CSVFormat} with the record separator of the format set to the specified String. + * + *+ * Note: This setting is only used during printing and does not affect parsing. Parsing currently + * only works for inputs with '\n', '\r' and "\r\n" + *
+ * + * @param recordSeparator + * the record separator to use for output. + * + * @return A new CSVFormat that is equal to this but with the specified output record separator + * @throws IllegalArgumentException + * if recordSeparator is none of CR, LF or CRLF + */ + public CSVFormat withRecordSeparator(final String recordSeparator) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with skipping the header record set to {@code true}. + * + * @return A new CSVFormat that is equal to this but with the specified skipHeaderRecord setting. + * @see #withSkipHeaderRecord(boolean) + * @see #withHeader(String...) + * @since 1.1 + */ + public CSVFormat withSkipHeaderRecord() { + return this.withSkipHeaderRecord(true); + } + + /** + * Returns a new {@code CSVFormat} with whether to skip the header record. + * + * @param skipHeaderRecord + * whether to skip the header record. + * + * @return A new CSVFormat that is equal to this but with the specified skipHeaderRecord setting. + * @see #withHeader(String...) + */ + public CSVFormat withSkipHeaderRecord(final boolean skipHeaderRecord) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} with the record separator of the format set to the operating system's line + * separator string, typically CR+LF on Windows and LF on Linux. 
+ * + *+ * Note: This setting is only used during printing and does not affect parsing. Parsing currently + * only works for inputs with '\n', '\r' and "\r\n" + *
+ * + * @return A new CSVFormat that is equal to this but with the operating system's line separator string. + * @since 1.6 + */ + public CSVFormat withSystemRecordSeparator() { + return withRecordSeparator(System.getProperty("line.separator")); + } + + /** + * Returns a new {@code CSVFormat} to add a trailing delimiter. + * + * @return A new CSVFormat that is equal to this but with the trailing delimiter setting. + * @since 1.3 + */ + public CSVFormat withTrailingDelimiter() { + return withTrailingDelimiter(true); + } + + /** + * Returns a new {@code CSVFormat} with whether to add a trailing delimiter. + * + * @param trailingDelimiter + * whether to add a trailing delimiter. + * + * @return A new CSVFormat that is equal to this but with the specified trailing delimiter setting. + * @since 1.3 + */ + public CSVFormat withTrailingDelimiter(final boolean trailingDelimiter) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } + + /** + * Returns a new {@code CSVFormat} to trim leading and trailing blanks. + * See {@link #getTrim()} for details of where this is used. + * + * @return A new CSVFormat that is equal to this but with the trim setting on. + * @since 1.3 + */ + public CSVFormat withTrim() { + return withTrim(true); + } + + /** + * Returns a new {@code CSVFormat} with whether to trim leading and trailing blanks. + * See {@link #getTrim()} for details of where this is used. + * + * @param trim + * whether to trim leading and trailing blanks. + * + * @return A new CSVFormat that is equal to this but with the specified trim setting. 
+ * @since 1.3 + */ + public CSVFormat withTrim(final boolean trim) { + return new CSVFormat(delimiter, quoteCharacter, quoteMode, commentMarker, escapeCharacter, + ignoreSurroundingSpaces, ignoreEmptyLines, recordSeparator, nullString, headerComments, header, + skipHeaderRecord, allowMissingColumnNames, ignoreHeaderCase, trim, trailingDelimiter, autoFlush, + allowDuplicateHeaderNames); + } +} diff --git a/src/test/resources/org/apache/commons/csv/CSVParser.java b/src/test/resources/org/apache/commons/csv/CSVParser.java new file mode 100644 index 00000000..bf6eb6d6 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/CSVParser.java @@ -0,0 +1,715 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.commons.csv; + +import static org.apache.commons.csv.Token.Type.TOKEN; + +import java.io.Closeable; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringReader; +import java.net.URL; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Objects; +import java.util.TreeMap; + +/** + * Parses CSV files according to the specified format. + * + * Because CSV appears in many different dialects, the parser supports many formats by allowing the + * specification of a {@link CSVFormat}. + * + * The parser works record wise. It is not possible to go back, once a record has been parsed from the input stream. + * + *+ * There are several static factory methods that can be used to create instances for various types of resources: + *
+ *+ * Alternatively parsers can also be created by passing a {@link Reader} directly to the sole constructor. + * + * For those who like fluent APIs, parsers can be created using {@link CSVFormat#parse(java.io.Reader)} as a shortcut: + *
+ *+ * for(CSVRecord record : CSVFormat.EXCEL.parse(in)) { + * ... + * } + *+ * + *
+ * To parse a CSV input from a file, you write: + *
+ * + *+ * File csvData = new File("/path/to/csv"); + * CSVParser parser = CSVParser.parse(csvData, CSVFormat.RFC4180); + * for (CSVRecord csvRecord : parser) { + * ... + * } + *+ * + *
+ * This will read and parse the contents of the file using the + * RFC 4180 format. + *
+ * + *+ * To parse CSV input in a format like Excel, you write: + *
+ * + *+ * CSVParser parser = CSVParser.parse(csvData, CSVFormat.EXCEL); + * for (CSVRecord csvRecord : parser) { + * ... + * } + *+ * + *
+ * If the predefined formats don't match the format at hand, custom formats can be defined. More information about + * customising CSVFormats is available in {@link CSVFormat CSVFormat Javadoc}. + *
+ * + *+ * If parsing record wise is not desired, the contents of the input can be read completely into memory. + *
+ * + *+ * Reader in = new StringReader("a;b\nc;d"); + * CSVParser parser = new CSVParser(in, CSVFormat.EXCEL); + * List<CSVRecord> list = parser.getRecords(); + *+ * + *
+ * There are two constraints that have to be kept in mind: + *
+ * + *+ * Internal parser state is completely covered by the format and the reader-state. + *
+ * + * @see package documentation for more details + */ +public final class CSVParser implements Iterable+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *
+ * + * @param inputStream + * an InputStream containing CSV-formatted input. Must not be null. + * @param charset + * The Charset to decode the given file. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new CSVParser configured with the given reader and format. + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either reader or format are null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @since 1.5 + */ + @SuppressWarnings("resource") + public static CSVParser parse(final InputStream inputStream, final Charset charset, final CSVFormat format) + throws IOException { + Objects.requireNonNull(inputStream, "inputStream"); + Objects.requireNonNull(format, "format"); + return parse(new InputStreamReader(inputStream, charset), format); + } + + /** + * Creates and returns a parser for the given {@link Path}, which the caller MUST close. + * + * @param path + * a CSV file. Must not be null. + * @param charset + * The Charset to decode the given file. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new parser + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either file or format are null. + * @throws IOException + * If an I/O error occurs + * @since 1.5 + */ + @SuppressWarnings("resource") + public static CSVParser parse(final Path path, final Charset charset, final CSVFormat format) throws IOException { + Objects.requireNonNull(path, "path"); + Objects.requireNonNull(format, "format"); + return parse(Files.newInputStream(path), charset, format); + } + + /** + * Creates a CSV parser using the given {@link CSVFormat} + * + *+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *
+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new CSVParser configured with the given reader and format. + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either reader or format are null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @since 1.5 + */ + public static CSVParser parse(final Reader reader, final CSVFormat format) throws IOException { + return new CSVParser(reader, format); + } + + // the following objects are shared to reduce garbage + + /** + * Creates a parser for the given {@link String}. + * + * @param string + * a CSV string. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new parser + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either string or format are null. + * @throws IOException + * If an I/O error occurs + */ + public static CSVParser parse(final String string, final CSVFormat format) throws IOException { + Objects.requireNonNull(string, "string"); + Objects.requireNonNull(format, "format"); + + return new CSVParser(new StringReader(string), format); + } + + /** + * Creates and returns a parser for the given URL, which the caller MUST close. + * + *+ * If you do not read all records from the given {@code url}, you should call {@link #close()} on the parser, unless + * you close the {@code url}. + *
+ * + * @param url + * a URL. Must not be null. + * @param charset + * the charset for the resource. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @return a new parser + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either url, charset or format are null. + * @throws IOException + * If an I/O error occurs + */ + @SuppressWarnings("resource") + public static CSVParser parse(final URL url, final Charset charset, final CSVFormat format) throws IOException { + Objects.requireNonNull(url, "url"); + Objects.requireNonNull(charset, "charset"); + Objects.requireNonNull(format, "format"); + + return new CSVParser(new InputStreamReader(url.openStream(), charset), format); + } + + private final CSVFormat format; + + /** A mapping of column names to column indices */ + private final Map+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *
+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either reader or format are null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + */ + public CSVParser(final Reader reader, final CSVFormat format) throws IOException { + this(reader, format, 0, 1); + } + + /** + * Customized CSV parser using the given {@link CSVFormat} + * + *+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser, + * unless you close the {@code reader}. + *
+ * + * @param reader + * a Reader containing CSV-formatted input. Must not be null. + * @param format + * the CSVFormat used for CSV parsing. Must not be null. + * @param characterOffset + * Lexer offset when the parser does not start parsing at the beginning of the source. + * @param recordNumber + * The next record number to assign + * @throws IllegalArgumentException + * If the parameters of the format are inconsistent or if either reader or format are null. + * @throws IOException + * If there is a problem reading the header or skipping the first record + * @since 1.1 + */ + @SuppressWarnings("resource") + public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber) + throws IOException { + Objects.requireNonNull(reader, "reader"); + Objects.requireNonNull(format, "format"); + + this.format = format; + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader)); + this.csvRecordIterator = new CSVRecordIterator(); + final Headers headers = createHeaders(); + this.headerMap = headers.headerMap; + this.headerNames = headers.headerNames; + this.characterOffset = characterOffset; + this.recordNumber = recordNumber - 1; + } + + private void addRecordValue(final boolean lastRecord) { + final String input = this.reusableToken.content.toString(); + final String inputClean = this.format.getTrim() ? input.trim() : input; + if (lastRecord && inputClean.isEmpty() && this.format.getTrailingDelimiter()) { + return; + } + final String nullString = this.format.getNullString(); + this.recordList.add(inputClean.equals(nullString) ? null : inputClean); + } + + /** + * Closes resources. + * + * @throws IOException + * If an I/O error occurs + */ + @Override + public void close() throws IOException { + if (this.lexer != null) { + this.lexer.close(); + } + } + + private Map+ * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to + * the record number. + *
+ * + * @return current line number + */ + public long getCurrentLineNumber() { + return this.lexer.getCurrentLineNumber(); + } + + /** + * Gets the first end-of-line string encountered. + * + * @return the first end-of-line string + * @since 1.5 + */ + public String getFirstEndOfLine() { + return lexer.getFirstEol(); + } + + /** + * Returns a copy of the header map. + *+ * The map keys are column names. The map values are 0-based indices. + *
+ *+ * Note: The map can only provide a one-to-one mapping when the format did not + * contain null or duplicate column names. + *
+ * + * @return a copy of the header map. + */ + public Map+ * Note: The list provides strings that can be used as keys in the header map. + * The list will not contain null column names if they were present in the input + * format. + *
+ * + * @return read-only list of header names that iterates in column order. + * @see #getHeaderMap() + * @since 1.7 + */ + public List+ * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to + * the line number. + *
+ * + * @return current record number + */ + public long getRecordNumber() { + return this.recordNumber; + } + + /** + * Parses the CSV input according to the given format and returns the content as a list of + * {@link CSVRecord CSVRecords}. + * + *+ * The returned content starts at the current parse-position in the stream. + *
+ * + * @return list of {@link CSVRecord CSVRecords}, may be empty + * @throws IOException + * on parse error or input read-failure + */ + public List+ * An {@link IOException} caught during the iteration are re-thrown as an + * {@link IllegalStateException}. + *
+ *+ * If the parser is closed a call to {@link Iterator#next()} will throw a + * {@link NoSuchElementException}. + *
+ */ + @Override + public IteratorValues can be appended to the output by calling the {@link #print(Object)} method. + * Values are printed according to {@link String#valueOf(Object)}. + * To complete a record the {@link #println()} method has to be called. + * Comments can be appended by calling {@link #printComment(String)}. + * However a comment will only be written to the output if the {@link CSVFormat} supports comments. + *
+ * + *The printer also supports appending a complete record at once by calling {@link #printRecord(Object...)} + * or {@link #printRecord(Iterable)}. + * Furthermore {@link #printRecords(Object...)}, {@link #printRecords(Iterable)} and {@link #printRecords(ResultSet)} + * methods can be used to print several records at once. + *
+ * + *Example:
+ * + *+ * try (CSVPrinter printer = new CSVPrinter(new FileWriter("csv.txt"), CSVFormat.EXCEL)) { + * printer.printRecord("id", "userName", "firstName", "lastName", "birthday"); + * printer.printRecord(1, "john73", "John", "Doe", LocalDate.of(1973, 9, 15)); + * printer.println(); + * printer.printRecord(2, "mary", "Mary", "Meyer", LocalDate.of(1985, 3, 29)); + * } catch (IOException ex) { + * ex.printStackTrace(); + * } + *+ * + *
This code will write the following to csv.txt:
+ *+ * id,userName,firstName,lastName,birthday + * 1,john73,John,Doe,1973-09-15 + * + * 2,mary,Mary,Meyer,1985-03-29 + *+ */ +public final class CSVPrinter implements Flushable, Closeable { + + /** The place that the values get written. */ + private final Appendable out; + private final CSVFormat format; + + /** True if we just began a new record. */ + private boolean newRecord = true; + + /** + * Creates a printer that will print values to the given stream following the CSVFormat. + *
+ * Currently, only a pure encapsulation format or a pure escaping format is supported. Hybrid formats (encapsulation + * and escaping with a different character) are not supported. + *
+ * + * @param out + * stream to which to print. Must not be null. + * @param format + * the CSV format. Must not be null. + * @throws IOException + * thrown if the optional header cannot be printed. + * @throws IllegalArgumentException + * thrown if the parameters of the format are inconsistent or if either out or format are null. + */ + public CSVPrinter(final Appendable out, final CSVFormat format) throws IOException { + Objects.requireNonNull(out, "out"); + Objects.requireNonNull(format, "format"); + + this.out = out; + this.format = format; + // TODO: Is it a good idea to do this here instead of on the first call to a print method? + // It seems a pain to have to track whether the header has already been printed or not. + if (format.getHeaderComments() != null) { + for (final String line : format.getHeaderComments()) { + if (line != null) { + this.printComment(line); + } + } + } + if (format.getHeader() != null && !format.getSkipHeaderRecord()) { + this.printRecord((Object[]) format.getHeader()); + } + } + + // ====================================================== + // printing implementation + // ====================================================== + + @Override + public void close() throws IOException { + close(false); + } + + /** + * Closes the underlying stream with an optional flush first. + * @param flush whether to flush before the actual close. + * + * @throws IOException + * If an I/O error occurs + * @since 1.6 + */ + public void close(final boolean flush) throws IOException { + if (flush || format.getAutoFlush()) { + flush(); + } + if (out instanceof Closeable) { + ((Closeable) out).close(); + } + } + + /** + * Flushes the underlying stream. + * + * @throws IOException + * If an I/O error occurs + */ + @Override + public void flush() throws IOException { + if (out instanceof Flushable) { + ((Flushable) out).flush(); + } + } + + /** + * Gets the target Appendable. + * + * @return the target Appendable. 
+ */ + public Appendable getOut() { + return this.out; + } + + /** + * Prints the string as the next value on the line. The value will be escaped or encapsulated as needed. + * + * @param value + * value to be output. + * @throws IOException + * If an I/O error occurs + */ + public void print(final Object value) throws IOException { + format.print(value, out, newRecord); + newRecord = false; + } + + /** + * Prints a comment on a new line among the delimiter separated values. + * + *+ * Comments will always begin on a new line and occupy at least one full line. The character specified to start + * comments and a space will be inserted at the beginning of each new line in the comment. + *
+ * + *+ * If comments are disabled in the current CSV format this method does nothing. + *
+ * + *This method detects line breaks inside the comment string and inserts {@link CSVFormat#getRecordSeparator()} + * to start a new line of the comment. Note that this might produce unexpected results for formats that do not use + * line breaks as record separator.
+ * + * @param comment + * the comment to output + * @throws IOException + * If an I/O error occurs + */ + public void printComment(final String comment) throws IOException { + if (!format.isCommentMarkerSet()) { + return; + } + if (!newRecord) { + println(); + } + out.append(format.getCommentMarker().charValue()); + out.append(SP); + for (int i = 0; i < comment.length(); i++) { + final char c = comment.charAt(i); + switch (c) { + case CR: + if (i + 1 < comment.length() && comment.charAt(i + 1) == LF) { + i++; + } + //$FALL-THROUGH$ break intentionally excluded. + case LF: + println(); + out.append(format.getCommentMarker().charValue()); + out.append(SP); + break; + default: + out.append(c); + break; + } + } + println(); + } + + /** + * Outputs the record separator. + * + * @throws IOException + * If an I/O error occurs + */ + public void println() throws IOException { + format.println(out); + newRecord = true; + } + + /** + * Prints the given values a single record of delimiter separated values followed by the record separator. + * + *+ * The values will be quoted if needed. Quotes and newLine characters will be escaped. This method adds the record + * separator to the output after printing the record, so there is no need to call {@link #println()}. + *
+ * + * @param values + * values to output. + * @throws IOException + * If an I/O error occurs + */ + public void printRecord(final Iterable> values) throws IOException { + for (final Object value : values) { + print(value); + } + println(); + } + + /** + * Prints the given values a single record of delimiter separated values followed by the record separator. + * + *+ * The values will be quoted if needed. Quotes and newLine characters will be escaped. This method adds the record + * separator to the output after printing the record, so there is no need to call {@link #println()}. + *
+ * + * @param values + * values to output. + * @throws IOException + * If an I/O error occurs + */ + public void printRecord(final Object... values) throws IOException { + format.printRecord(out, values); + newRecord = true; + } + + /** + * Prints all the objects in the given collection handling nested collections/arrays as records. + * + *+ * If the given collection only contains simple objects, this method will print a single record like + * {@link #printRecord(Iterable)}. If the given collections contains nested collections/arrays those nested elements + * will each be printed as records using {@link #printRecord(Object...)}. + *
+ * + *+ * Given the following data structure: + *
+ * + *
+ *
+ * List<String[]> data = ...
+ * data.add(new String[]{ "A", "B", "C" });
+ * data.add(new String[]{ "1", "2", "3" });
+ * data.add(new String[]{ "A1", "B2", "C3" });
+ *
+ *
+ *
+ * + * Calling this method will print: + *
+ * + *
+ *
+ * A, B, C
+ * 1, 2, 3
+ * A1, B2, C3
+ *
+ *
+ *
+ * @param values
+ * the values to print.
+ * @throws IOException
+ * If an I/O error occurs
+ */
+    public void printRecords(final Iterable<?> values) throws IOException {
+        for (final Object value : values) {
+            if (value instanceof Object[]) {
+                // A nested array is one record: hand it over as the varargs array itself.
+                this.printRecord((Object[]) value);
+            } else if (value instanceof Iterable) {
+                // A nested iterable is one record.
+                this.printRecord((Iterable<?>) value);
+            } else {
+                // Static type is Object, so this resolves to printRecord(Object...) with a single value.
+                this.printRecord(value);
+            }
+        }
+    }
+
+ /**
+ * Prints all the objects in the given array handling nested collections/arrays as records.
+ *
+ * <p>
+ * If the given array only contains simple objects, this method will print a single record like
+ * {@link #printRecord(Object...)}. If the given array contains nested collections/arrays, those nested
+ * elements will each be printed as records using {@link #printRecord(Object...)}.
+ * </p>
+ * + *+ * Given the following data structure: + *
+ * + *
+ *
+ * String[][] data = new String[3][];
+ * data[0] = new String[]{ "A", "B", "C" };
+ * data[1] = new String[]{ "1", "2", "3" };
+ * data[2] = new String[]{ "A1", "B2", "C3" };
+ *
+ *
+ *
+ * + * Calling this method will print: + *
+ * + *
+ *
+ * A, B, C
+ * 1, 2, 3
+ * A1, B2, C3
+ *
+ *
+ *
+ * @param values
+ * the values to print.
+ * @throws IOException
+ * If an I/O error occurs
+ */
+    public void printRecords(final Object... values) throws IOException {
+        // Expose the varargs array as an Iterable so the Iterable-based overload does the per-element dispatch.
+        final Iterable<Object> records = Arrays.asList(values);
+        printRecords(records);
+    }
+
+ /**
+ * Prints all the objects in the given JDBC result set.
+ *
+ * @param resultSet
+ * the result set containing the values to print.
+ * @throws IOException
+ * If an I/O error occurs
+ * @throws SQLException
+ * if a database access error occurs
+ */
+    public void printRecords(final ResultSet resultSet) throws SQLException, IOException {
+        final int columnCount = resultSet.getMetaData().getColumnCount();
+        while (resultSet.next()) {
+            // JDBC column indexes are 1-based.
+            for (int i = 1; i <= columnCount; i++) {
+                final Object object = resultSet.getObject(i);
+                if (object instanceof Clob) {
+                    // This method opened the character stream, so it closes it once the value is printed.
+                    // The Clob itself is left to the driver/caller.
+                    // NOTE(review): assumes print(Object) consumes the stream before returning - confirm.
+                    try (java.io.Reader reader = ((Clob) object).getCharacterStream()) {
+                        // The cast keeps the call bound to print(Object), as the original expression was.
+                        print((Object) reader);
+                    }
+                } else {
+                    print(object);
+                }
+            }
+            // One result-set row is one CSV record.
+            println();
+        }
+    }
+}
diff --git a/src/test/resources/org/apache/commons/csv/CSVRecord.java b/src/test/resources/org/apache/commons/csv/CSVRecord.java
new file mode 100644
index 00000000..5181bc9a
--- /dev/null
+++ b/src/test/resources/org/apache/commons/csv/CSVRecord.java
@@ -0,0 +1,329 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.commons.csv;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Objects;
+
+/**
+ * A CSV record parsed from a CSV file.
+ *
+ * <p>
+ * Note: Support for {@link Serializable} is scheduled to be removed in version 2.0.
+ * In version 1.8 the mapping between the column header and the column index was
+ * removed from the serialised state. The class maintains serialization compatibility
+ * with versions pre-1.8 for the record values; these must be accessed by index
+ * following deserialization. There will be loss of any functionality linked to the header
+ * mapping when transferring serialised forms pre-1.8 to 1.8 and vice versa.
+ * </p>
+ */ +public final class CSVRecord implements Serializable, Iterable+ * Note: This requires a field mapping obtained from the original parser. + * A check using {@link #isMapped(String)} should be used to determine if a + * mapping exists from the provided {@code name} to a field index. In this case an + * exception will only be thrown if the record does not contain a field corresponding + * to the mapping, that is the record length is not consistent with the mapping size. + *
+ * + * @param name + * the name of the column to be retrieved. + * @return the column value, maybe null depending on {@link CSVFormat#getNullString()}. + * @throws IllegalStateException + * if no header mapping was provided + * @throws IllegalArgumentException + * if {@code name} is not mapped or if the record is inconsistent + * @see #isMapped(String) + * @see #isConsistent() + * @see #getParser() + * @see CSVFormat#withNullString(String) + */ + public String get(final String name) { + final Map+ * Note: The parser is not part of the serialized state of the record. A null check + * should be used when the record may have originated from a serialized form. + *
+ * + * @return the parser. + * @since 1.7 + */ + public CSVParser getParser() { + return parser; + } + + /** + * Returns the number of this record in the parsed CSV file. + * + *+ * ATTENTION: If your CSV input has multi-line values, the returned number does not correspond to + * the current line number of the parser that created this record. + *
+ * + * @return the number of this record. + * @see CSVParser#getCurrentLineNumber() + */ + public long getRecordNumber() { + return recordNumber; + } + + /** + * Checks whether this record has a comment, false otherwise. + * Note that comments are attached to the following record. + * If there is no following record (i.e. the comment is at EOF) + * the comment will be ignored. + * + * @return true if this record has a comment, false otherwise + * @since 1.3 + */ + public boolean hasComment() { + return comment != null; + } + + /** + * Tells whether the record size matches the header size. + * + *+ * Returns true if the sizes for this record match and false if not. Some programs can export files that fail this + * test but still produce parsable files. + *
+ * + * @return true of this record is valid, false if not + */ + public boolean isConsistent() { + final Map+ * In particular the reader supports a look-ahead option, which allows you to see the next char returned by + * {@link #read()}. This reader also tracks how many characters have been read with {@link #getPosition()}. + *
+ */ +final class ExtendedBufferedReader extends BufferedReader { + + /** The last char returned */ + private int lastChar = UNDEFINED; + + /** The count of EOLs (CR/LF/CRLF) seen so far */ + private long eolCounter; + + /** The position, which is number of characters read so far */ + private long position; + + private boolean closed; + + /** + * Created extended buffered reader using default buffer-size + */ + ExtendedBufferedReader(final Reader reader) { + super(reader); + } + + /** + * Closes the stream. + * + * @throws IOException + * If an I/O error occurs + */ + @Override + public void close() throws IOException { + // Set ivars before calling super close() in case close() throws an IOException. + closed = true; + lastChar = END_OF_STREAM; + super.close(); + } + + /** + * Returns the current line number + * + * @return the current line number + */ + long getCurrentLineNumber() { + // Check if we are at EOL or EOF or just starting + if (lastChar == CR || lastChar == LF || lastChar == UNDEFINED || lastChar == END_OF_STREAM) { + return eolCounter; // counter is accurate + } + return eolCounter + 1; // Allow for counter being incremented only at EOL + } + + /** + * Returns the last character that was read as an integer (0 to 65535). This will be the last character returned by + * any of the read methods. This will not include a character read using the {@link #lookAhead()} method. If no + * character has been read then this will return {@link Constants#UNDEFINED}. If the end of the stream was reached + * on the last read then this will return {@link Constants#END_OF_STREAM}. + * + * @return the last character that was read + */ + int getLastChar() { + return lastChar; + } + + /** + * Gets the character position in the reader. 
+ * + * @return the current position in the reader (counting characters, not bytes since this is a Reader) + */ + long getPosition() { + return this.position; + } + + public boolean isClosed() { + return closed; + } + + /** + * Returns the next character in the current reader without consuming it. So the next call to {@link #read()} will + * still return this value. Does not affect line number or last character. + * + * @return the next character + * + * @throws IOException + * if there is an error in reading + */ + int lookAhead() throws IOException { + super.mark(1); + final int c = super.read(); + super.reset(); + + return c; + } + + @Override + public int read() throws IOException { + final int current = super.read(); + if (current == CR || current == LF && lastChar != CR) { + eolCounter++; + } + lastChar = current; + this.position++; + return lastChar; + } + + @Override + public int read(final char[] buf, final int offset, final int length) throws IOException { + if (length == 0) { + return 0; + } + + final int len = super.read(buf, offset, length); + + if (len > 0) { + + for (int i = offset; i < offset + len; i++) { + final char ch = buf[i]; + if (ch == LF) { + if (CR != (i > 0 ? buf[i - 1] : lastChar)) { + eolCounter++; + } + } else if (ch == CR) { + eolCounter++; + } + } + + lastChar = buf[offset + len - 1]; + + } else if (len == -1) { + lastChar = END_OF_STREAM; + } + + position += len; + return len; + } + + /** + * Calls {@link BufferedReader#readLine()} which drops the line terminator(s). This method should only be called + * when processing a comment, otherwise information can be lost. + *+ * Increments {@link #eolCounter} + *
+ * Sets {@link #lastChar} to {@link Constants#END_OF_STREAM} at EOF, otherwise to LF + * + * @return the line that was read, or null if reached EOF. + */ + @Override + public String readLine() throws IOException { + final String line = super.readLine(); + + if (line != null) { + lastChar = LF; // needed for detecting start of line + eolCounter++; + } else { + lastChar = END_OF_STREAM; + } + + return line; + } + +} diff --git a/src/test/resources/org/apache/commons/csv/IOUtils.java b/src/test/resources/org/apache/commons/csv/IOUtils.java new file mode 100644 index 00000000..1771d4dc --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/IOUtils.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.csv; + +import java.io.IOException; +import java.io.Reader; +import java.io.Writer; +import java.nio.CharBuffer; + +/** Copied from Apache Commons IO. */ +class IOUtils { + + /** + *
+ * Copied from Apache Commons IO. + *
+ * The default buffer size ({@value}). + */ + static final int DEFAULT_BUFFER_SIZE = 1024 * 4; + + /** + *+ * Copied from Apache Commons IO. + *
+ * Represents the end-of-file (or stream). + * @since 2.5 (made public) + */ + private static final int EOF = -1; + + /** + * Copies chars from a large (over 2GB) {@code Reader} to an {@code Appendable}. + *+ * This method buffers the input internally, so there is no need to use a + * {@code BufferedReader}. + *
+ * The buffer size is given by {@link #DEFAULT_BUFFER_SIZE}. + * + * @param input the {@code Reader} to read from + * @param output the {@code Appendable} to append to + * @return the number of characters copied + * @throws NullPointerException if the input or output is null + * @throws IOException if an I/O error occurs + * @since 2.7 + */ + static long copy(final Reader input, final Appendable output) throws IOException { + return copy(input, output, CharBuffer.allocate(DEFAULT_BUFFER_SIZE)); + } + + /** + * Copies chars from a large (over 2GB) {@code Reader} to an {@code Appendable}. + *+ * This method uses the provided buffer, so there is no need to use a + * {@code BufferedReader}. + *
+ * + * @param input the {@code Reader} to read from + * @param output the {@code Appendable} to write to + * @param buffer the buffer to be used for the copy + * @return the number of characters copied + * @throws NullPointerException if the input or output is null + * @throws IOException if an I/O error occurs + * @since 2.7 + */ + static long copy(final Reader input, final Appendable output, final CharBuffer buffer) throws IOException { + long count = 0; + int n; + while (EOF != (n = input.read(buffer))) { + buffer.flip(); + output.append(buffer, 0, n); + count += n; + } + return count; + } + + /** + *+ * Copied from Apache Commons IO. + *
+ * Copies chars from a large (over 2GB) {@code Reader} to a {@code Writer}. + *+ * This method buffers the input internally, so there is no need to use a + * {@code BufferedReader}. + *
+ * The buffer size is given by {@link #DEFAULT_BUFFER_SIZE}. + * + * @param input the {@code Reader} to read from + * @param output the {@code Writer} to write to + * @return the number of characters copied + * @throws NullPointerException if the input or output is null + * @throws IOException if an I/O error occurs + * @since 1.3 + */ + static long copyLarge(final Reader input, final Writer output) throws IOException { + return copyLarge(input, output, new char[DEFAULT_BUFFER_SIZE]); + } + + /** + *
+ * Copied from Apache Commons IO. + *
+ * Copies chars from a large (over 2GB) {@code Reader} to a {@code Writer}. + *+ * This method uses the provided buffer, so there is no need to use a + * {@code BufferedReader}. + *
+ * + * @param input the {@code Reader} to read from + * @param output the {@code Writer} to write to + * @param buffer the buffer to be used for the copy + * @return the number of characters copied + * @throws NullPointerException if the input or output is null + * @throws IOException if an I/O error occurs + * @since 2.2 + */ + static long copyLarge(final Reader input, final Writer output, final char[] buffer) throws IOException { + long count = 0; + int n; + while (EOF != (n = input.read(buffer))) { + output.write(buffer, 0, n); + count += n; + } + return count; + } + +} diff --git a/src/test/resources/org/apache/commons/csv/Lexer.java b/src/test/resources/org/apache/commons/csv/Lexer.java new file mode 100644 index 00000000..2795ca29 --- /dev/null +++ b/src/test/resources/org/apache/commons/csv/Lexer.java @@ -0,0 +1,461 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.commons.csv; + +import static org.apache.commons.csv.Constants.BACKSPACE; +import static org.apache.commons.csv.Constants.CR; +import static org.apache.commons.csv.Constants.END_OF_STREAM; +import static org.apache.commons.csv.Constants.FF; +import static org.apache.commons.csv.Constants.LF; +import static org.apache.commons.csv.Constants.TAB; +import static org.apache.commons.csv.Constants.UNDEFINED; +import static org.apache.commons.csv.Token.Type.COMMENT; +import static org.apache.commons.csv.Token.Type.EOF; +import static org.apache.commons.csv.Token.Type.EORECORD; +import static org.apache.commons.csv.Token.Type.INVALID; +import static org.apache.commons.csv.Token.Type.TOKEN; + +import java.io.Closeable; +import java.io.IOException; + +/** + * Lexical analyzer. + */ +final class Lexer implements Closeable { + + private static final String CR_STRING = Character.toString(CR); + private static final String LF_STRING = Character.toString(LF); + + /** + * Constant char to use for disabling comments, escapes and encapsulation. The value -2 is used because it + * won't be confused with an EOF signal (-1), and because the Unicode value {@code FFFE} would be encoded as two + * chars (using surrogates) and thus there should never be a collision with a real text char. 
+ */ + private static final char DISABLED = '\ufffe'; + + private final char delimiter; + private final char escape; + private final char quoteChar; + private final char commentStart; + + private final boolean ignoreSurroundingSpaces; + private final boolean ignoreEmptyLines; + + /** The input stream */ + private final ExtendedBufferedReader reader; + private String firstEol; + + Lexer(final CSVFormat format, final ExtendedBufferedReader reader) { + this.reader = reader; + this.delimiter = format.getDelimiter(); + this.escape = mapNullToDisabled(format.getEscapeCharacter()); + this.quoteChar = mapNullToDisabled(format.getQuoteCharacter()); + this.commentStart = mapNullToDisabled(format.getCommentMarker()); + this.ignoreSurroundingSpaces = format.getIgnoreSurroundingSpaces(); + this.ignoreEmptyLines = format.getIgnoreEmptyLines(); + } + + /** + * Closes resources. + * + * @throws IOException + * If an I/O error occurs + */ + @Override + public void close() throws IOException { + reader.close(); + } + + /** + * Returns the current character position + * + * @return the current character position + */ + long getCharacterPosition() { + return reader.getPosition(); + } + + /** + * Returns the current line number + * + * @return the current line number + */ + long getCurrentLineNumber() { + return reader.getCurrentLineNumber(); + } + + String getFirstEol(){ + return firstEol; + } + + boolean isClosed() { + return reader.isClosed(); + } + + boolean isCommentStart(final int ch) { + return ch == commentStart; + } + + boolean isDelimiter(final int ch) { + return ch == delimiter; + } + + /** + * @return true if the given character indicates end of file + */ + boolean isEndOfFile(final int ch) { + return ch == END_OF_STREAM; + } + + boolean isEscape(final int ch) { + return ch == escape; + } + + private boolean isMetaChar(final int ch) { + return ch == delimiter || + ch == escape || + ch == quoteChar || + ch == commentStart; + } + + boolean isQuoteChar(final int ch) { + return 
ch == quoteChar; + } + + /** + * Checks if the current character represents the start of a line: a CR, LF or is at the start of the file. + * + * @param ch the character to check + * @return true if the character is at the start of a line. + */ + boolean isStartOfLine(final int ch) { + return ch == LF || ch == CR || ch == UNDEFINED; + } + + /** + * @return true if the given char is a whitespace character + */ + boolean isWhitespace(final int ch) { + return !isDelimiter(ch) && Character.isWhitespace((char) ch); + } + + private char mapNullToDisabled(final Character c) { + return c == null ? DISABLED : c.charValue(); + } + + /** + * Returns the next token. + *
+ * A token corresponds to a term, a record change or an end-of-file indicator. + *
+ * + * @param token + * an existing Token object to reuse. The caller is responsible to initialize the Token. + * @return the next token found + * @throws java.io.IOException + * on stream access error + */ + Token nextToken(final Token token) throws IOException { + + // get the last read char (required for empty line detection) + int lastChar = reader.getLastChar(); + + // read the next char and set eol + int c = reader.read(); + /* + * Note: The following call will swallow LF if c == CR. But we don't need to know if the last char was CR or LF + * - they are equivalent here. + */ + boolean eol = readEndOfLine(c); + + // empty line detection: eol AND (last char was EOL or beginning) + if (ignoreEmptyLines) { + while (eol && isStartOfLine(lastChar)) { + // go on char ahead ... + lastChar = c; + c = reader.read(); + eol = readEndOfLine(c); + // reached end of file without any content (empty line at the end) + if (isEndOfFile(c)) { + token.type = EOF; + // don't set token.isReady here because no content + return token; + } + } + } + + // did we reach eof during the last iteration already ? 
EOF + if (isEndOfFile(lastChar) || !isDelimiter(lastChar) && isEndOfFile(c)) { + token.type = EOF; + // don't set token.isReady here because no content + return token; + } + + if (isStartOfLine(lastChar) && isCommentStart(c)) { + final String line = reader.readLine(); + if (line == null) { + token.type = EOF; + // don't set token.isReady here because no content + return token; + } + final String comment = line.trim(); + token.content.append(comment); + token.type = COMMENT; + return token; + } + + // important: make sure a new char gets consumed in each iteration + while (token.type == INVALID) { + // ignore whitespaces at beginning of a token + if (ignoreSurroundingSpaces) { + while (isWhitespace(c) && !eol) { + c = reader.read(); + eol = readEndOfLine(c); + } + } + + // ok, start of token reached: encapsulated, or token + if (isDelimiter(c)) { + // empty token return TOKEN("") + token.type = TOKEN; + } else if (eol) { + // empty token return EORECORD("") + // noop: token.content.append(""); + token.type = EORECORD; + } else if (isQuoteChar(c)) { + // consume encapsulated token + parseEncapsulatedToken(token); + } else if (isEndOfFile(c)) { + // end of file return EOF() + // noop: token.content.append(""); + token.type = EOF; + token.isReady = true; // there is data at EOF + } else { + // next token must be a simple token + // add removed blanks when not ignoring whitespace chars... + parseSimpleToken(token, c); + } + } + return token; + } + + /** + * Parses an encapsulated token. + * + * Encapsulated tokens are surrounded by the given encapsulating-string. The encapsulator itself might be included + * in the token using a doubling syntax (as "", '') or using escaping (as in \", \'). Whitespaces before and after + * an encapsulated token are ignored. The token is finished when one of the following conditions become true: + *CSV are widely used as interfaces to legacy systems or manual data-imports. 
+ * CSV stands for "Comma Separated Values" (or sometimes "Character Separated + * Values"). The CSV data format is defined in + * RFC 4180 + * but many dialects exist.
+ * + *Common to all file dialects is its basic structure: The CSV data-format + * is record oriented, whereas each record starts on a new textual line. A + * record is build of a list of values. Keep in mind that not all records + * must have an equal number of values:
+ *+ * csv := records* + * record := values* + *+ * + *
The following list contains the CSV aspects the Commons CSV parser supports:
+ *In addition to individually defined dialects, two predefined dialects (strict-csv, and excel-csv) + * can be set directly.
+ * + *Example usage:
+ *+ */ + +package org.apache.commons.csv; diff --git a/src/test/resources/perf/worldcitiespop.txt.gz b/src/test/resources/org/apache/commons/csv/perf/worldcitiespop.txt.gz similarity index 100% rename from src/test/resources/perf/worldcitiespop.txt.gz rename to src/test/resources/org/apache/commons/csv/perf/worldcitiespop.txt.gz+ * Reader in = new StringReader("a,b,c"); + * for (CSVRecord record : CSVFormat.DEFAULT.parse(in)) { + * for (String field : record) { + * System.out.print("\"" + field + "\", "); + * } + * System.out.println(); + * } + *