diff --git a/nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/pom.xml b/nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/pom.xml
index a6ed07ee96..6721c98eb9 100644
--- a/nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/pom.xml
+++ b/nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/pom.xml
@@ -49,5 +49,10 @@
             <groupId>org.apache.nifi</groupId>
             <artifactId>nifi-record</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-csv</artifactId>
+            <version>1.4</version>
+        </dependency>
     </dependencies>
 </project>
diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVUtils.java b/nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/src/main/java/org/apache/nifi/csv/CSVUtils.java
similarity index 76%
rename from nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVUtils.java
rename to nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/src/main/java/org/apache/nifi/csv/CSVUtils.java
index 17152aa5fc..bc074b329b 100644
--- a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVUtils.java
+++ b/nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/src/main/java/org/apache/nifi/csv/CSVUtils.java
@@ -23,22 +23,22 @@ import org.apache.commons.lang3.StringEscapeUtils;
 import org.apache.nifi.components.AllowableValue;
 import org.apache.nifi.components.PropertyDescriptor;
 import org.apache.nifi.components.PropertyValue;
-import org.apache.nifi.controller.ConfigurationContext;
+import org.apache.nifi.context.PropertyContext;
 import org.apache.nifi.processor.util.StandardValidators;
 
 public class CSVUtils {
-    static final AllowableValue CUSTOM = new AllowableValue("custom", "Custom Format",
+    public static final AllowableValue CUSTOM = new AllowableValue("custom", "Custom Format",
         "The format of the CSV is configured by using the properties of this Controller Service, such as Value Separator");
-    static final AllowableValue RFC_4180 = new AllowableValue("rfc-4180", "RFC 4180", "CSV data follows the RFC 4180 Specification defined at https://tools.ietf.org/html/rfc4180");
-    static final AllowableValue EXCEL = new AllowableValue("excel", "Microsoft Excel", "CSV data follows the format used by Microsoft Excel");
-    static final AllowableValue TDF = new AllowableValue("tdf", "Tab-Delimited", "CSV data is Tab-Delimited instead of Comma Delimited");
-    static final AllowableValue INFORMIX_UNLOAD = new AllowableValue("informix-unload", "Informix Unload", "The format used by Informix when issuing the UNLOAD TO file_name command");
-    static final AllowableValue INFORMIX_UNLOAD_CSV = new AllowableValue("informix-unload-csv", "Informix Unload Escape Disabled",
+    public static final AllowableValue RFC_4180 = new AllowableValue("rfc-4180", "RFC 4180", "CSV data follows the RFC 4180 Specification defined at https://tools.ietf.org/html/rfc4180");
+    public static final AllowableValue EXCEL = new AllowableValue("excel", "Microsoft Excel", "CSV data follows the format used by Microsoft Excel");
+    public static final AllowableValue TDF = new AllowableValue("tdf", "Tab-Delimited", "CSV data is Tab-Delimited instead of Comma Delimited");
+    public static final AllowableValue INFORMIX_UNLOAD = new AllowableValue("informix-unload",
"Informix Unload", "The format used by Informix when issuing the UNLOAD TO file_name command"); + public static final AllowableValue INFORMIX_UNLOAD_CSV = new AllowableValue("informix-unload-csv", "Informix Unload Escape Disabled", "The format used by Informix when issuing the UNLOAD TO file_name command with escaping disabled"); - static final AllowableValue MYSQL = new AllowableValue("mysql", "MySQL Format", "CSV data follows the format used by MySQL"); + public static final AllowableValue MYSQL = new AllowableValue("mysql", "MySQL Format", "CSV data follows the format used by MySQL"); - static final PropertyDescriptor CSV_FORMAT = new PropertyDescriptor.Builder() + public static final PropertyDescriptor CSV_FORMAT = new PropertyDescriptor.Builder() .name("CSV Format") .description("Specifies which \"format\" the CSV data is in, or specifies if custom formatting should be used.") .expressionLanguageSupported(false) @@ -46,7 +46,7 @@ public class CSVUtils { .defaultValue(CUSTOM.getValue()) .required(true) .build(); - static final PropertyDescriptor VALUE_SEPARATOR = new PropertyDescriptor.Builder() + public static final PropertyDescriptor VALUE_SEPARATOR = new PropertyDescriptor.Builder() .name("Value Separator") .description("The character that is used to separate values/fields in a CSV Record") .addValidator(CSVValidators.UNESCAPED_SINGLE_CHAR_VALIDATOR) @@ -54,7 +54,7 @@ public class CSVUtils { .defaultValue(",") .required(true) .build(); - static final PropertyDescriptor QUOTE_CHAR = new PropertyDescriptor.Builder() + public static final PropertyDescriptor QUOTE_CHAR = new PropertyDescriptor.Builder() .name("Quote Character") .description("The character that is used to quote values so that escape characters do not have to be used") .addValidator(new CSVValidators.SingleCharacterValidator()) @@ -62,7 +62,7 @@ public class CSVUtils { .defaultValue("\"") .required(true) .build(); - static final PropertyDescriptor FIRST_LINE_IS_HEADER = new PropertyDescriptor.Builder() + public static final PropertyDescriptor FIRST_LINE_IS_HEADER = new PropertyDescriptor.Builder() .name("Skip Header Line") .displayName("Treat First Line as Header") .description("Specifies whether or not the first line of CSV should be considered a Header or should be considered a record. If the Schema Access Strategy " @@ -75,7 +75,7 @@ public class CSVUtils { .defaultValue("false") .required(true) .build(); - static final PropertyDescriptor IGNORE_CSV_HEADER = new PropertyDescriptor.Builder() + public static final PropertyDescriptor IGNORE_CSV_HEADER = new PropertyDescriptor.Builder() .name("ignore-csv-header") .displayName("Ignore CSV Header Column Names") .description("If the first line of a CSV is a header, and the configured schema does not match the fields named in the header line, this controls how " @@ -87,14 +87,14 @@ public class CSVUtils { .defaultValue("false") .required(false) .build(); - static final PropertyDescriptor COMMENT_MARKER = new PropertyDescriptor.Builder() + public static final PropertyDescriptor COMMENT_MARKER = new PropertyDescriptor.Builder() .name("Comment Marker") .description("The character that is used to denote the start of a comment. 
Any line that begins with this comment will be ignored.") .addValidator(new CSVValidators.SingleCharacterValidator()) .expressionLanguageSupported(false) .required(false) .build(); - static final PropertyDescriptor ESCAPE_CHAR = new PropertyDescriptor.Builder() + public static final PropertyDescriptor ESCAPE_CHAR = new PropertyDescriptor.Builder() .name("Escape Character") .description("The character that is used to escape characters that would otherwise have a specific meaning to the CSV Parser.") .addValidator(new CSVValidators.SingleCharacterValidator()) @@ -102,14 +102,14 @@ public class CSVUtils { .defaultValue("\\") .required(true) .build(); - static final PropertyDescriptor NULL_STRING = new PropertyDescriptor.Builder() + public static final PropertyDescriptor NULL_STRING = new PropertyDescriptor.Builder() .name("Null String") .description("Specifies a String that, if present as a value in the CSV, should be considered a null field instead of using the literal value.") .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .expressionLanguageSupported(false) .required(false) .build(); - static final PropertyDescriptor TRIM_FIELDS = new PropertyDescriptor.Builder() + public static final PropertyDescriptor TRIM_FIELDS = new PropertyDescriptor.Builder() .name("Trim Fields") .description("Whether or not white space should be removed from the beginning and end of fields") .expressionLanguageSupported(false) @@ -119,14 +119,14 @@ public class CSVUtils { .build(); // CSV Format fields for writers only - static final AllowableValue QUOTE_ALL = new AllowableValue("ALL", "Quote All Values", "All values will be quoted using the configured quote character."); - static final AllowableValue QUOTE_MINIMAL = new AllowableValue("MINIMAL", "Quote Minimal", + public static final AllowableValue QUOTE_ALL = new AllowableValue("ALL", "Quote All Values", "All values will be quoted using the configured quote character."); + public static final AllowableValue QUOTE_MINIMAL = new AllowableValue("MINIMAL", "Quote Minimal", "Values will be quoted only if they are contain special characters such as newline characters or field separators."); - static final AllowableValue QUOTE_NON_NUMERIC = new AllowableValue("NON_NUMERIC", "Quote Non-Numeric Values", "Values will be quoted unless the value is a number."); - static final AllowableValue QUOTE_NONE = new AllowableValue("NONE", "Do Not Quote Values", + public static final AllowableValue QUOTE_NON_NUMERIC = new AllowableValue("NON_NUMERIC", "Quote Non-Numeric Values", "Values will be quoted unless the value is a number."); + public static final AllowableValue QUOTE_NONE = new AllowableValue("NONE", "Do Not Quote Values", "Values will not be quoted. Instead, all special characters will be escaped using the configured escape character."); - static final PropertyDescriptor QUOTE_MODE = new PropertyDescriptor.Builder() + public static final PropertyDescriptor QUOTE_MODE = new PropertyDescriptor.Builder() .name("Quote Mode") .description("Specifies how fields should be quoted when they are written") .expressionLanguageSupported(false) @@ -134,7 +134,7 @@ public class CSVUtils { .defaultValue(QUOTE_MINIMAL.getValue()) .required(true) .build(); - static final PropertyDescriptor TRAILING_DELIMITER = new PropertyDescriptor.Builder() + public static final PropertyDescriptor TRAILING_DELIMITER = new PropertyDescriptor.Builder() .name("Include Trailing Delimiter") .description("If true, a trailing delimiter will be added to each CSV Record that is written. 
If false, the trailing delimiter will be omitted.")
             .expressionLanguageSupported(false)
@@ -142,7 +142,7 @@ public class CSVUtils {
             .defaultValue("false")
             .required(true)
             .build();
-    static final PropertyDescriptor RECORD_SEPARATOR = new PropertyDescriptor.Builder()
+    public static final PropertyDescriptor RECORD_SEPARATOR = new PropertyDescriptor.Builder()
             .name("Record Separator")
             .description("Specifies the characters to use in order to separate CSV Records")
             .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
@@ -150,7 +150,7 @@ public class CSVUtils {
             .defaultValue("\\n")
             .required(true)
             .build();
-    static final PropertyDescriptor INCLUDE_HEADER_LINE = new PropertyDescriptor.Builder()
+    public static final PropertyDescriptor INCLUDE_HEADER_LINE = new PropertyDescriptor.Builder()
             .name("Include Header Line")
             .description("Specifies whether or not the CSV column names should be written out as the first line.")
             .allowableValues("true", "false")
@@ -158,7 +158,7 @@ public class CSVUtils {
             .required(true)
             .build();
 
-    static CSVFormat createCSVFormat(final ConfigurationContext context) {
+    public static CSVFormat createCSVFormat(final PropertyContext context) {
         final String formatName = context.getProperty(CSV_FORMAT).getValue();
         if (formatName.equalsIgnoreCase(CUSTOM.getValue())) {
             return buildCustomFormat(context);
@@ -180,15 +180,15 @@ public class CSVUtils {
         }
     }
 
-    private static char getUnescapedChar(final ConfigurationContext context, final PropertyDescriptor property) {
+    private static char getUnescapedChar(final PropertyContext context, final PropertyDescriptor property) {
         return StringEscapeUtils.unescapeJava(context.getProperty(property).getValue()).charAt(0);
     }
 
-    private static char getChar(final ConfigurationContext context, final PropertyDescriptor property) {
+    private static char getChar(final PropertyContext context, final PropertyDescriptor property) {
         return CSVUtils.unescape(context.getProperty(property).getValue()).charAt(0);
     }
 
-    private static CSVFormat buildCustomFormat(final ConfigurationContext context) {
+    private static CSVFormat buildCustomFormat(final PropertyContext context) {
         final char valueSeparator = getUnescapedChar(context, VALUE_SEPARATOR);
         CSVFormat format = CSVFormat.newFormat(valueSeparator)
                 .withAllowMissingColumnNames()
diff --git a/nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVValidators.java b/nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/src/main/java/org/apache/nifi/csv/CSVValidators.java
similarity index 100%
rename from nifi-nar-bundles/nifi-standard-services/nifi-record-serialization-services-bundle/nifi-record-serialization-services/src/main/java/org/apache/nifi/csv/CSVValidators.java
rename to nifi-nar-bundles/nifi-extension-utils/nifi-record-utils/nifi-standard-record-utils/src/main/java/org/apache/nifi/csv/CSVValidators.java
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
index ce0a9b258a..432967b1d7 100644
--- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
+++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/pom.xml
@@ -17,7 +17,7 @@
     <modelVersion>4.0.0</modelVersion>
 
     <properties>
-        <poi.version>3.14</poi.version>
+        <poi.version>3.17</poi.version>
     </properties>
 
     <dependencies>
@@ -66,7 +66,6 @@
             <artifactId>poi-ooxml</artifactId>
             <version>${poi.version}</version>
         </dependency>
-
         <dependency>
             <groupId>org.apache.nifi</groupId>
             <artifactId>nifi-api</artifactId>
@@ -75,6 +74,10 @@
             <groupId>org.apache.nifi</groupId>
             <artifactId>nifi-processor-utils</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.apache.nifi</groupId>
+            <artifactId>nifi-standard-record-utils</artifactId>
+        </dependency>
         <dependency>
             <groupId>org.apache.nifi</groupId>
             <artifactId>nifi-mock</artifactId>
diff --git
a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java index 6d8274bd43..1e0df88452 100644 --- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java +++ b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/main/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessor.java @@ -19,14 +19,16 @@ package org.apache.nifi.processors.poi; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; import org.apache.nifi.annotation.behavior.WritesAttribute; @@ -34,6 +36,7 @@ import org.apache.nifi.annotation.behavior.WritesAttributes; import org.apache.nifi.annotation.documentation.CapabilityDescription; import org.apache.nifi.annotation.documentation.Tags; import org.apache.nifi.components.PropertyDescriptor; +import org.apache.nifi.csv.CSVUtils; import org.apache.nifi.flowfile.FlowFile; import org.apache.nifi.flowfile.attributes.CoreAttributes; import org.apache.nifi.processor.AbstractProcessor; @@ -48,15 +51,20 @@ import org.apache.nifi.processor.util.StandardValidators; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.util.CellAddress; +import org.apache.poi.ss.util.CellReference; +import org.apache.poi.util.SAXHelper; +import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable; import org.apache.poi.xssf.eventusermodel.XSSFReader; -import org.apache.poi.xssf.model.SharedStringsTable; -import org.apache.poi.xssf.usermodel.XSSFRichTextString; -import org.xml.sax.Attributes; +import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler; +import org.apache.poi.xssf.model.StylesTable; +import org.apache.poi.xssf.usermodel.XSSFComment; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; -import org.xml.sax.helpers.XMLReaderFactory; + +import javax.xml.parsers.ParserConfigurationException; @Tags({"excel", "csv", "poi"}) @@ -78,17 +86,8 @@ public class ConvertExcelToCSVProcessor public static final String SHEET_NAME = "sheetname"; public static final String ROW_NUM = "numrows"; public static final String SOURCE_FILE_NAME = "sourcefilename"; - private static final String SAX_CELL_REF = "c"; - private static final String SAX_CELL_TYPE = "t"; - private static final String SAX_CELL_ADDRESS = "r"; - private static final String SAX_CELL_STRING = "s"; - private static final String SAX_CELL_CONTENT_REF = "v"; - private static final String SAX_ROW_REF = "row"; - private static final String SAX_SHEET_NAME_REF = "sheetPr"; private static final String DESIRED_SHEETS_DELIMITER = ","; private static final String UNKNOWN_SHEET_NAME = "UNKNOWN"; - private static final String 
SAX_PARSER = "org.apache.xerces.parsers.SAXParser"; - private static final Pattern CELL_ADDRESS_REGEX = Pattern.compile("^([a-zA-Z]+)([\\d]+)$"); public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor .Builder().name("extract-sheets") @@ -101,6 +100,35 @@ public class ConvertExcelToCSVProcessor .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) .build(); + public static final PropertyDescriptor ROWS_TO_SKIP = new PropertyDescriptor + .Builder().name("excel-extract-first-row") + .displayName("Number of Rows to Skip") + .description("The row number of the first row to start processing." + + "Use this to skip over rows of data at the top of your worksheet that are not part of the dataset." + + "Empty rows of data anywhere in the spreadsheet will always be skipped, no matter what this value is set to.") + .required(true) + .defaultValue("0") + .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR) + .build(); + + public static final PropertyDescriptor COLUMNS_TO_SKIP = new PropertyDescriptor + .Builder().name("excel-extract-column-to-skip") + .displayName("Columns To Skip") + .description("Comma delimited list of column numbers to skip. Use the columns number and not the letter designation. " + + "Use this to skip over columns anywhere in your worksheet that you don't want extracted as part of the record.") + .required(false) + .addValidator(StandardValidators.NON_EMPTY_VALIDATOR) + .build(); + + public static final PropertyDescriptor FORMAT_VALUES = new PropertyDescriptor.Builder() + .name("excel-format-values") + .displayName("Format Cell Values") + .description("Should the cell values be written to CSV using the formatting applied in Excel, or should they be printed as raw values.") + .allowableValues("true", "false") + .defaultValue("false") + .required(true) + .build(); + public static final Relationship ORIGINAL = new Relationship.Builder() .name("original") .description("Original Excel document received by this processor") @@ -124,6 +152,24 @@ public class ConvertExcelToCSVProcessor protected void init(final ProcessorInitializationContext context) { final List descriptors = new ArrayList<>(); descriptors.add(DESIRED_SHEETS); + descriptors.add(ROWS_TO_SKIP); + descriptors.add(COLUMNS_TO_SKIP); + descriptors.add(FORMAT_VALUES); + + descriptors.add(CSVUtils.CSV_FORMAT); + descriptors.add(CSVUtils.VALUE_SEPARATOR); + descriptors.add(CSVUtils.INCLUDE_HEADER_LINE); + descriptors.add(CSVUtils.QUOTE_CHAR); + descriptors.add(CSVUtils.ESCAPE_CHAR); + descriptors.add(CSVUtils.COMMENT_MARKER); + descriptors.add(CSVUtils.NULL_STRING); + descriptors.add(CSVUtils.TRIM_FIELDS); + descriptors.add(new PropertyDescriptor.Builder() + .fromPropertyDescriptor(CSVUtils.QUOTE_MODE) + .defaultValue(CSVUtils.QUOTE_NONE.getValue()) + .build()); + descriptors.add(CSVUtils.RECORD_SEPARATOR); + descriptors.add(CSVUtils.TRAILING_DELIMITER); this.descriptors = Collections.unmodifiableList(descriptors); final Set relationships = new HashSet<>(); @@ -150,28 +196,46 @@ public class ConvertExcelToCSVProcessor return; } - try { + final String desiredSheetsDelimited = context.getProperty(DESIRED_SHEETS).evaluateAttributeExpressions().getValue(); + final boolean formatValues = context.getProperty(FORMAT_VALUES).asBoolean(); + final CSVFormat csvFormat = CSVUtils.createCSVFormat(context); + + //Switch to 0 based index + final int firstRow = context.getProperty(ROWS_TO_SKIP).asInteger() - 1; + final String[] sColumnsToSkip = StringUtils + 
.split(context.getProperty(COLUMNS_TO_SKIP).getValue(), ","); + + final List columnsToSkip = new ArrayList<>(); + + if(sColumnsToSkip != null && sColumnsToSkip.length > 0) { + for (String c : sColumnsToSkip) { + try { + //Switch to 0 based index + columnsToSkip.add(Integer.parseInt(c) - 1); + } catch (NumberFormatException e) { + throw new ProcessException("Invalid column in Columns to Skip list.", e); + } + } + } + + try { session.read(flowFile, new InputStreamCallback() { @Override public void process(InputStream inputStream) throws IOException { try { - String desiredSheetsDelimited = context.getProperty(DESIRED_SHEETS) - .evaluateAttributeExpressions().getValue(); - OPCPackage pkg = OPCPackage.open(inputStream); XSSFReader r = new XSSFReader(pkg); - SharedStringsTable sst = r.getSharedStringsTable(); + ReadOnlySharedStringsTable sst = new ReadOnlySharedStringsTable(pkg); + StylesTable styles = r.getStylesTable(); XSSFReader.SheetIterator iter = (XSSFReader.SheetIterator) r.getSheetsData(); if (desiredSheetsDelimited != null) { - String[] desiredSheets = StringUtils .split(desiredSheetsDelimited, DESIRED_SHEETS_DELIMITER); if (desiredSheets != null) { - while (iter.hasNext()) { InputStream sheet = iter.next(); String sheetName = iter.getSheetName(); @@ -179,7 +243,8 @@ public class ConvertExcelToCSVProcessor for (int i = 0; i < desiredSheets.length; i++) { //If the sheetName is a desired one parse it if (sheetName.equalsIgnoreCase(desiredSheets[i])) { - handleExcelSheet(session, flowFile, sst, sheet, sheetName); + ExcelSheetReadConfig readConfig = new ExcelSheetReadConfig(columnsToSkip, firstRow, sheetName, formatValues, sst, styles); + handleExcelSheet(session, flowFile, sheet, readConfig, csvFormat); break; } } @@ -191,13 +256,17 @@ public class ConvertExcelToCSVProcessor } else { //Get all of the sheets in the document. while (iter.hasNext()) { - handleExcelSheet(session, flowFile, sst, iter.next(), iter.getSheetName()); + InputStream sheet = iter.next(); + String sheetName = iter.getSheetName(); + + ExcelSheetReadConfig readConfig = new ExcelSheetReadConfig(columnsToSkip, firstRow, sheetName, formatValues, sst, styles); + handleExcelSheet(session, flowFile, sheet, readConfig, csvFormat); } } } catch (InvalidFormatException ife) { getLogger().error("Only .xlsx Excel 2007 OOXML files are supported", ife); throw new UnsupportedOperationException("Only .xlsx Excel 2007 OOXML files are supported", ife); - } catch (OpenXML4JException e) { + } catch (OpenXML4JException | SAXException e) { getLogger().error("Error occurred while processing Excel document metadata", e); } } @@ -206,7 +275,7 @@ public class ConvertExcelToCSVProcessor session.transfer(flowFile, ORIGINAL); } catch (RuntimeException ex) { - getLogger().error("Failed to process incoming Excel document", ex); + getLogger().error("Failed to process incoming Excel document. " + ex.getMessage(), ex); FlowFile failedFlowFile = session.putAttribute(flowFile, ConvertExcelToCSVProcessor.class.getName() + ".error", ex.getMessage()); session.transfer(failedFlowFile, FAILURE); @@ -220,45 +289,48 @@ public class ConvertExcelToCSVProcessor * @param session * The NiFi ProcessSession instance for the current invocation. 
*/ - private void handleExcelSheet(ProcessSession session, FlowFile originalParentFF, - SharedStringsTable sst, final InputStream sheetInputStream, String sName) throws IOException { + private void handleExcelSheet(ProcessSession session, FlowFile originalParentFF, final InputStream sheetInputStream, ExcelSheetReadConfig readConfig, + CSVFormat csvFormat) throws IOException { FlowFile ff = session.create(); try { + final DataFormatter formatter = new DataFormatter(); + final InputSource sheetSource = new InputSource(sheetInputStream); + + final SheetToCSV sheetHandler = new SheetToCSV(readConfig, csvFormat); + + final XMLReader parser = SAXHelper.newXMLReader(); + + //If Value Formatting is set to false then don't pass in the styles table. + // This will cause the XSSF Handler to return the raw value instead of the formatted one. + final StylesTable sst = readConfig.getFormatValues()?readConfig.getStyles():null; + + final XSSFSheetXMLHandler handler = new XSSFSheetXMLHandler( + sst, null, readConfig.getSharedStringsTable(), sheetHandler, formatter, false); - XMLReader parser = - XMLReaderFactory.createXMLReader( - SAX_PARSER - ); - ExcelSheetRowHandler handler = new ExcelSheetRowHandler(sst); parser.setContentHandler(handler); ff = session.write(ff, new OutputStreamCallback() { @Override public void process(OutputStream out) throws IOException { - InputSource sheetSource = new InputSource(sheetInputStream); - ExcelSheetRowHandler eh = null; + PrintStream outPrint = new PrintStream(out); + sheetHandler.setOutput(outPrint); + try { - eh = (ExcelSheetRowHandler) parser.getContentHandler(); - eh.setFlowFileOutputStream(out); - parser.setContentHandler(eh); parser.parse(sheetSource); + sheetInputStream.close(); + + sheetHandler.close(); + outPrint.close(); } catch (SAXException se) { - getLogger().error("Error occurred while processing Excel sheet {}", new Object[]{eh.getSheetName()}, se); + getLogger().error("Error occurred while processing Excel sheet {}", new Object[]{readConfig.getSheetName()}, se); } } }); - if (handler.getSheetName().equals(UNKNOWN_SHEET_NAME)) { - //Used the named parsed from the handler. This logic is only here because IF the handler does find a value that should take precedence. - ff = session.putAttribute(ff, SHEET_NAME, sName); - } else { - ff = session.putAttribute(ff, SHEET_NAME, handler.getSheetName()); - sName = handler.getSheetName(); - } - - ff = session.putAttribute(ff, ROW_NUM, new Long(handler.getRowCount()).toString()); + ff = session.putAttribute(ff, SHEET_NAME, readConfig.getSheetName()); + ff = session.putAttribute(ff, ROW_NUM, new Long(sheetHandler.getRowCount()).toString()); if (StringUtils.isNotEmpty(originalParentFF.getAttribute(CoreAttributes.FILENAME.key()))) { ff = session.putAttribute(ff, SOURCE_FILE_NAME, originalParentFF.getAttribute(CoreAttributes.FILENAME.key())); @@ -268,13 +340,13 @@ public class ConvertExcelToCSVProcessor //Update the CoreAttributes.FILENAME to have the .csv extension now. 
Also update MIME.TYPE ff = session.putAttribute(ff, CoreAttributes.FILENAME.key(), updateFilenameToCSVExtension(ff.getAttribute(CoreAttributes.UUID.key()), - ff.getAttribute(CoreAttributes.FILENAME.key()), sName)); + ff.getAttribute(CoreAttributes.FILENAME.key()), readConfig.getSheetName())); ff = session.putAttribute(ff, CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE); session.transfer(ff, SUCCESS); - } catch (SAXException saxE) { - getLogger().error("Failed to create instance of SAXParser {}", new Object[]{SAX_PARSER}, saxE); + } catch (SAXException | ParserConfigurationException saxE) { + getLogger().error("Failed to create instance of Parser.", saxE); ff = session.putAttribute(ff, ConvertExcelToCSVProcessor.class.getName() + ".error", saxE.getMessage()); session.transfer(ff, FAILURE); @@ -283,162 +355,161 @@ public class ConvertExcelToCSVProcessor } } - static Integer columnToIndex(String col) { - int length = col.length(); - int accumulator = 0; - for (int i = length; i > 0; i--) { - char c = col.charAt(i - 1); - int x = ((int) c) - 64; - accumulator += x * Math.pow(26, length - i); - } - // Make it to start with 0. - return accumulator - 1; - } - - private static class CellAddress { - final int row; - final int col; - - private CellAddress(int row, int col) { - this.row = row; - this.col = col; - } - } - /** - * Extracts every row from an Excel Sheet and generates a corresponding JSONObject whose key is the Excel CellAddress and value - * is the content of that CellAddress converted to a String + * Uses the XSSF Event SAX helpers to do most of the work + * of parsing the Sheet XML, and outputs the contents + * as a (basic) CSV. */ - private class ExcelSheetRowHandler - extends DefaultHandler { + private class SheetToCSV implements XSSFSheetXMLHandler.SheetContentsHandler { + private ExcelSheetReadConfig readConfig; + CSVFormat csvFormat; - private SharedStringsTable sst; - private String currentContent; - private boolean nextIsString; - private CellAddress firstCellAddress; - private CellAddress firstRowLastCellAddress; - private CellAddress previousCellAddress; - private CellAddress nextCellAddress; - private OutputStream outputStream; - private boolean firstColInRow; - long rowCount; - String sheetName; + private boolean firstCellOfRow; + private boolean skipRow; + private int currentRow = -1; + private int currentCol = -1; + private int rowCount = 0; + private boolean rowHasValues=false; + private int skippedColumns=0; - private ExcelSheetRowHandler(SharedStringsTable sst) { - this.sst = sst; - this.firstColInRow = true; - this.rowCount = 0l; - this.sheetName = UNKNOWN_SHEET_NAME; - } + private CSVPrinter printer; - public void setFlowFileOutputStream(OutputStream outputStream) { - this.outputStream = outputStream; - } + private boolean firstRow=false; + private ArrayList fieldValues; - public void startElement(String uri, String localName, String name, - Attributes attributes) throws SAXException { - - if (name.equals(SAX_CELL_REF)) { - String cellType = attributes.getValue(SAX_CELL_TYPE); - // Analyze cell address. 
- Matcher cellAddressMatcher = CELL_ADDRESS_REGEX.matcher(attributes.getValue(SAX_CELL_ADDRESS)); - if (cellAddressMatcher.matches()) { - String col = cellAddressMatcher.group(1); - String row = cellAddressMatcher.group(2); - nextCellAddress = new CellAddress(Integer.parseInt(row), columnToIndex(col)); - - if (firstCellAddress == null) { - firstCellAddress = nextCellAddress; - } - } - if (cellType != null && cellType.equals(SAX_CELL_STRING)) { - nextIsString = true; - } else { - nextIsString = false; - } - } else if (name.equals(SAX_ROW_REF)) { - if (firstRowLastCellAddress == null) { - firstRowLastCellAddress = previousCellAddress; - } - firstColInRow = true; - previousCellAddress = null; - nextCellAddress = null; - } else if (name.equals(SAX_SHEET_NAME_REF)) { - sheetName = attributes.getValue(0); - } - - currentContent = ""; - } - - private void fillEmptyColumns(int nextColumn) throws IOException { - final CellAddress previousCell = previousCellAddress != null ? previousCellAddress : firstCellAddress; - if (previousCell != null) { - for (int i = 0; i < (nextColumn - previousCell.col); i++) { - // Fill columns. - outputStream.write(",".getBytes()); - } - } - } - - public void endElement(String uri, String localName, String name) - throws SAXException { - - if (nextIsString) { - int idx = Integer.parseInt(currentContent); - currentContent = new XSSFRichTextString(sst.getEntryAt(idx)).toString(); - nextIsString = false; - } - - if (name.equals(SAX_CELL_CONTENT_REF) - // Limit scanning from the first column, and up to the last column. - && (firstCellAddress == null || firstCellAddress.col <= nextCellAddress.col) - && (firstRowLastCellAddress == null || nextCellAddress.col <= firstRowLastCellAddress.col)) { - try { - // A cell is found. - fillEmptyColumns(nextCellAddress.col); - firstColInRow = false; - outputStream.write(currentContent.getBytes()); - // Keep previously found cell address. - previousCellAddress = nextCellAddress; - } catch (IOException e) { - getLogger().error("IO error encountered while writing content of parsed cell " + - "value from sheet {}", new Object[]{getSheetName()}, e); - } - } - - if (name.equals(SAX_ROW_REF)) { - //If this is the first row and the end of the row element has been encountered then that means no columns were present. 
- if (!firstColInRow) { - try { - if (firstRowLastCellAddress != null) { - fillEmptyColumns(firstRowLastCellAddress.col); - } - rowCount++; - outputStream.write("\n".getBytes()); - } catch (IOException e) { - getLogger().error("IO error encountered while writing new line indicator", e); - } - } - } - - } - - public void characters(char[] ch, int start, int length) - throws SAXException { - currentContent += new String(ch, start, length); - } - - public long getRowCount() { + public int getRowCount(){ return rowCount; } - public String getSheetName() { - return sheetName; + public void setOutput(PrintStream output){ + final OutputStreamWriter streamWriter = new OutputStreamWriter(output); + + try { + printer = new CSVPrinter(streamWriter, csvFormat); + } catch (IOException e) { + throw new ProcessException("Failed to create CSV Printer.", e); + } + } + + public SheetToCSV(ExcelSheetReadConfig readConfig, CSVFormat csvFormat){ + this.readConfig = readConfig; + this.csvFormat = csvFormat; + } + + @Override + public void startRow(int rowNum) { + if(rowNum <= readConfig.getOverrideFirstRow()) { + skipRow = true; + return; + } + + // Prepare for this row + skipRow = false; + firstCellOfRow = true; + firstRow = currentRow==-1; + currentRow = rowNum; + currentCol = -1; + rowHasValues = false; + + fieldValues = new ArrayList<>(); + } + + @Override + public void endRow(int rowNum) { + if(skipRow) { + return; + } + + if(firstRow){ + readConfig.setLastColumn(currentCol); + } + + //if there was no data in this row, don't write it + if(!rowHasValues) { + return; + } + + // Ensure the correct number of columns + int columnsToAdd = (readConfig.getLastColumn() - currentCol) - readConfig.getColumnsToSkip().size(); + for (int i=0; i readConfig.getLastColumn())){ + return; + } + + if(readConfig.getColumnsToSkip().contains(thisCol)){ + skippedColumns++; + return; + } + + int missedCols = (thisCol - readConfig.getFirstColumn()) - (currentCol - readConfig.getFirstColumn()) - 1; + if(firstCellOfRow){ + missedCols = (thisCol - readConfig.getFirstColumn()); + } + + missedCols -= skippedColumns; + + if (firstCellOfRow) { + firstCellOfRow = false; + } + + for (int i=0; i getColumnsToSkip(){ + return columnsToSkip; + } + + public ReadOnlySharedStringsTable getSharedStringsTable(){ + return sst; + } + + public StylesTable getStyles(){ + return styles; + } + + private int firstColumn; + private int lastColumn; + + private int firstRow; + private int lastRow; + private int overrideFirstRow; + private String sheetName; + private boolean formatValues; + + private ReadOnlySharedStringsTable sst; + private StylesTable styles; + + private List columnsToSkip; + + public ExcelSheetReadConfig(List columnsToSkip, int overrideFirstRow, String sheetName, boolean formatValues, + ReadOnlySharedStringsTable sst, StylesTable styles){ + + this.sheetName = sheetName; + this.columnsToSkip = columnsToSkip; + this.overrideFirstRow = overrideFirstRow; + this.formatValues = formatValues; + + this.sst = sst; + this.styles = styles; + } + } } \ No newline at end of file diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java index 1df2568dd5..9e9131fe74 100644 --- a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java +++ 
b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/java/org/apache/nifi/processors/poi/ConvertExcelToCSVProcessorTest.java @@ -20,9 +20,9 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.File; -import java.nio.charset.StandardCharsets; import java.util.List; +import org.apache.nifi.csv.CSVUtils; import org.apache.nifi.flowfile.attributes.CoreAttributes; import org.apache.nifi.util.LogMessage; import org.apache.nifi.util.MockFlowFile; @@ -41,16 +41,6 @@ public class ConvertExcelToCSVProcessorTest { testRunner = TestRunners.newTestRunner(ConvertExcelToCSVProcessor.class); } - @Test - public void testColToIndex() { - assertEquals(Integer.valueOf(0), ConvertExcelToCSVProcessor.columnToIndex("A")); - assertEquals(Integer.valueOf(1), ConvertExcelToCSVProcessor.columnToIndex("B")); - assertEquals(Integer.valueOf(25), ConvertExcelToCSVProcessor.columnToIndex("Z")); - assertEquals(Integer.valueOf(29), ConvertExcelToCSVProcessor.columnToIndex("AD")); - assertEquals(Integer.valueOf(239), ConvertExcelToCSVProcessor.columnToIndex("IF")); - assertEquals(Integer.valueOf(16383), ConvertExcelToCSVProcessor.columnToIndex("XFD")); - } - @Test public void testMultipleSheetsGeneratesMultipleFlowFiles() throws Exception { @@ -81,6 +71,144 @@ public class ConvertExcelToCSVProcessorTest { } + @Test + public void testDataFormatting() throws Exception { + testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath()); + + testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "false"); + + testRunner.run(); + + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0); + + MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0); + Long rowsSheet = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM)); + assertTrue(rowsSheet == 9); + + ff.assertContentEquals("Numbers,Timestamps,Money\n" + + "1234.4559999999999,42736.5,123.45\n" + + "1234.4559999999999,42736.5,123.45\n" + + "1234.4559999999999,42736.5,123.45\n" + + "1234.4559999999999,42736.5,1023.45\n" + + "1234.4559999999999,42736.5,1023.45\n" + + "987654321,42736.5,1023.45\n" + + "987654321,,\n" + + "987654321,,\n"); + } + + @Test + public void testQuoting() throws Exception { + testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath()); + + testRunner.setProperty(CSVUtils.QUOTE_MODE, CSVUtils.QUOTE_MINIMAL); + testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true"); + + testRunner.run(); + + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0); + + MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0); + Long rowsSheet = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM)); + assertTrue(rowsSheet == 9); + + ff.assertContentEquals("Numbers,Timestamps,Money\n" + + "1234.456,1/1/17,$ 123.45\n" + + "1234.46,12:00:00 PM,£ 123.45\n" + + "1234.5,\"Sunday, January 01, 2017\",¥ 123.45\n" + + "\"1,234.46\",1/1/17 12:00,\"$ 1,023.45\"\n" + + "\"1,234.4560\",12:00 PM,\"£ 1,023.45\"\n" + + "9.88E+08,2017/01/01/ 12:00,\"¥ 1,023.45\"\n" + + "9.877E+08,,\n" + + "9.8765E+08,,\n"); + } + + @Test + public void testSkipRows() 
throws Exception { + testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath()); + + testRunner.setProperty(ConvertExcelToCSVProcessor.ROWS_TO_SKIP, "2"); + testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true"); + + testRunner.run(); + + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0); + + MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0); + Long rowsSheet = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM)); + assertEquals("Row count does match expected value.", "7", rowsSheet.toString()); + + ff.assertContentEquals("1234.46,12:00:00 PM,£ 123.45\n" + + "1234.5,Sunday\\, January 01\\, 2017,¥ 123.45\n" + + "1\\,234.46,1/1/17 12:00,$ 1\\,023.45\n" + + "1\\,234.4560,12:00 PM,£ 1\\,023.45\n" + + "9.88E+08,2017/01/01/ 12:00,¥ 1\\,023.45\n" + + "9.877E+08,,\n" + + "9.8765E+08,,\n"); + } + + @Test + public void testSkipColumns() throws Exception { + testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath()); + + testRunner.setProperty(ConvertExcelToCSVProcessor.COLUMNS_TO_SKIP, "2"); + testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true"); + + testRunner.run(); + + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0); + + MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0); + Long rowsSheet = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM)); + assertTrue(rowsSheet == 9); + + ff.assertContentEquals("Numbers,Money\n" + + "1234.456,$ 123.45\n" + + "1234.46,£ 123.45\n" + + "1234.5,¥ 123.45\n" + + "1\\,234.46,$ 1\\,023.45\n" + + "1\\,234.4560,£ 1\\,023.45\n" + + "9.88E+08,¥ 1\\,023.45\n" + + "9.877E+08,\n" + + "9.8765E+08,\n"); + } + + @Test + public void testCustomDelimiters() throws Exception { + testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath()); + + testRunner.setProperty(CSVUtils.VALUE_SEPARATOR, "|"); + testRunner.setProperty(CSVUtils.RECORD_SEPARATOR, "\\r\\n"); + testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true"); + + testRunner.run(); + + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1); + testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0); + + MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0); + Long rowsSheet = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM)); + assertTrue(rowsSheet == 9); + + ff.assertContentEquals("Numbers|Timestamps|Money\r\n" + + "1234.456|1/1/17|$ 123.45\r\n" + + "1234.46|12:00:00 PM|£ 123.45\r\n" + + "1234.5|Sunday, January 01, 2017|¥ 123.45\r\n" + + "1,234.46|1/1/17 12:00|$ 1,023.45\r\n" + + "1,234.4560|12:00 PM|£ 1,023.45\r\n" + + "9.88E+08|2017/01/01/ 12:00|¥ 1,023.45\r\n" + + "9.877E+08||\r\n" + + "9.8765E+08||\r\n"); + } + /** * Validates that all sheets in the Excel document are exported. 
 *
@@ -181,7 +309,7 @@ public class ConvertExcelToCSVProcessorTest {
         MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
         Long l = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
         assertTrue(l == 8l);
-        ff.isContentEqual("test", StandardCharsets.UTF_8);
+        ff.assertContentEquals(new File("src/test/resources/with-blank-cells.csv"));
     }
 
@@ -199,8 +327,8 @@ public class ConvertExcelToCSVProcessorTest {
         testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 1);
 
         List<LogMessage> errorMessages = testRunner.getLogger().getErrorMessages();
-        Assert.assertEquals(2, errorMessages.size());
+        Assert.assertEquals(1, errorMessages.size());
         String messageText = errorMessages.get(0).getMsg();
-        Assert.assertTrue(messageText.contains("Excel") && messageText.contains("supported"));
+        Assert.assertTrue(messageText.contains("Excel") && messageText.contains("OLE2"));
     }
 }
\ No newline at end of file
diff --git a/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/dataformatting.xlsx b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/dataformatting.xlsx
new file mode 100644
index 0000000000..a9428e2bda
Binary files /dev/null and b/nifi-nar-bundles/nifi-poi-bundle/nifi-poi-processors/src/test/resources/dataformatting.xlsx differ
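For context on the streaming approach the new processor code adopts, the XSSF event parse can be exercised outside NiFi with a short standalone sketch like the one below. It assumes only the POI 3.17 and commons-csv 1.4 APIs already referenced in this patch (XSSFReader, ReadOnlySharedStringsTable, XSSFSheetXMLHandler, CSVPrinter); the XlsxToCsvSketch and RowPrinter names are illustrative and are not part of the patch. Unlike the processor's SheetToCSV handler, it does not skip rows or columns, pad missed cells, or honor the configurable CSVUtils format properties; it simply prints one CSV record per spreadsheet row.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.util.SAXHelper;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;

public class XlsxToCsvSketch {

    // Minimal SheetContentsHandler: buffers the formatted values of one row and
    // writes them out as a single CSV record when the row ends.
    static class RowPrinter implements XSSFSheetXMLHandler.SheetContentsHandler {
        private final CSVPrinter printer;
        private final List<String> row = new ArrayList<>();

        RowPrinter(CSVPrinter printer) {
            this.printer = printer;
        }

        @Override
        public void startRow(int rowNum) {
            row.clear();
        }

        @Override
        public void endRow(int rowNum) {
            try {
                printer.printRecord(row);
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }

        @Override
        public void cell(String cellReference, String formattedValue, XSSFComment comment) {
            // Empty cells are simply absent here; the processor's SheetToCSV pads them instead.
            row.add(formattedValue);
        }

        @Override
        public void headerFooter(String text, boolean isHeader, String tagName) {
            // Sheet headers/footers are ignored.
        }
    }

    public static void main(String[] args) throws Exception {
        try (InputStream in = new FileInputStream(args[0]);   // path to an .xlsx file
             CSVPrinter printer = new CSVPrinter(System.out, CSVFormat.RFC4180)) {

            OPCPackage pkg = OPCPackage.open(in);              // same entry point the processor uses
            XSSFReader reader = new XSSFReader(pkg);
            ReadOnlySharedStringsTable strings = new ReadOnlySharedStringsTable(pkg);
            StylesTable styles = reader.getStylesTable();

            XSSFReader.SheetIterator sheets = (XSSFReader.SheetIterator) reader.getSheetsData();
            while (sheets.hasNext()) {
                try (InputStream sheet = sheets.next()) {
                    XMLReader parser = SAXHelper.newXMLReader();
                    // Passing the StylesTable makes the handler emit formatted values,
                    // which is what the processor's "Format Cell Values" property toggles.
                    parser.setContentHandler(new XSSFSheetXMLHandler(
                            styles, null, strings, new RowPrinter(printer), new DataFormatter(), false));
                    parser.parse(new InputSource(sheet));
                }
            }
        }
    }
}

The output will not match the tests above byte for byte (quote mode and record separator differ from the processor defaults), but it walks the same row-by-row flow that handleExcelSheet() now drives through XSSFSheetXMLHandler.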