NIFI-12100 Removed the ConvertExcelToCSVProcessor

This closes #7802

Signed-off-by: Mike Thomsen <mthomsen@apache.org>
This commit is contained in:
dan-s1 2023-09-27 18:13:46 +00:00 committed by Mike Thomsen
parent a74c411079
commit e9b532bd32
14 changed files with 0 additions and 1346 deletions

View File

@ -30,11 +30,6 @@
</properties>
<dependencies>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-poi-processors</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-poi-services</artifactId>

View File

@ -1,76 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Maven module definition for the nifi-poi-processors jar; it inherits from the nifi-poi-bundle parent. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-poi-bundle</artifactId>
<version>2.0.0-SNAPSHOT</version>
</parent>
<artifactId>nifi-poi-processors</artifactId>
<packaging>jar</packaging>
<build>
<plugins>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
<configuration>
<!-- Appends to (rather than replaces) any RAT excludes configured elsewhere. -->
<excludes combine.children="append">
<!-- Test fixture excluded from the RAT check; presumably because a CSV fixture cannot carry a license header (confirm). -->
<exclude>src/test/resources/with-blank-cells.csv</exclude>
</excludes>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<!-- Dependencies without a version element here presumably get their version from inherited dependency management (confirm in the parent POMs). -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
</dependency>
<dependency>
<groupId>com.github.pjfanning</groupId>
<artifactId>excel-streaming-reader</artifactId>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-to-slf4j</artifactId>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-api</artifactId>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-standard-record-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<!-- NOTE(review): no scope is declared for nifi-mock in this file; verify that a test scope is inherited. -->
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-mock</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -1,534 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.poi;
import com.github.pjfanning.xlsx.StreamingReader;
import com.github.pjfanning.xlsx.exceptions.OpenException;
import com.github.pjfanning.xlsx.exceptions.ParseException;
import com.github.pjfanning.xlsx.exceptions.ReadException;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.csv.CSVUtils;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
@Tags({"excel", "csv", "poi"})
@CapabilityDescription("Consumes a Microsoft Excel document and converts each worksheet to csv. Each sheet from the incoming Excel " +
"document will generate a new Flowfile that will be output from this processor. Each output Flowfile's contents will be formatted as a csv file " +
"where the each row from the excel sheet is output as a newline in the csv file. This processor is currently only capable of processing .xlsx " +
"(XSSF 2007 OOXML file format) Excel documents and not older .xls (HSSF '97(-2007) file format) documents. This processor also expects well formatted " +
"CSV content and will not escape cell's containing invalid content such as newlines or additional commas.")
@WritesAttributes({@WritesAttribute(attribute = "sheetname", description = "The name of the Excel sheet that this particular row of data came from in the Excel document"),
@WritesAttribute(attribute = "numrows", description = "The number of rows in this Excel Sheet"),
@WritesAttribute(attribute = "sourcefilename", description = "The name of the Excel document file that this data originated from"),
@WritesAttribute(attribute = "convertexceltocsvprocessor.error", description = "Error message that was encountered on a per Excel sheet basis. This attribute is" +
" only populated if an error was occured while processing the particular sheet. Having the error present at the sheet level will allow for the end" +
" user to better understand what syntax errors in their excel doc on a larger scale caused the error.")})
public class ConvertExcelToCSVProcessor extends AbstractProcessor {
// Value written to the mime.type attribute of every generated CSV flow file.
private static final String CSV_MIME_TYPE = "text/csv";
// Attribute holding the name of the sheet a generated flow file was produced from.
public static final String SHEET_NAME = "sheetname";
// Attribute holding the number of rows written to the generated CSV.
public static final String ROW_NUM = "numrows";
// Attribute holding the filename of the incoming Excel flow file.
public static final String SOURCE_FILE_NAME = "sourcefilename";
// Separator used to split the "Sheets to Extract" property value into sheet names.
private static final String DESIRED_SHEETS_DELIMITER = ",";
// Despite its name, this is the fallback value for the sourcefilename attribute
// when the incoming flow file has no filename.
private static final String UNKNOWN_SHEET_NAME = "UNKNOWN";
/** Optional comma separated list of sheet names to convert; when unset every sheet is converted. */
public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor.Builder()
        .name("extract-sheets")
        .displayName("Sheets to Extract")
        .description("Comma separated list of Excel document sheet names that should be extracted from the excel document. If this property" +
                " is left blank then all of the sheets will be extracted from the Excel document. The list of names is case in-sensitive. Any sheets not " +
                "specified in this value will be ignored. A bulletin will be generated if a specified sheet(s) are not found.")
        .required(false)
        .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .build();

/**
 * Number of leading rows to skip on every sheet. The three description sentences are
 * concatenated, so each one ends with a space to keep them from running together.
 */
public static final PropertyDescriptor ROWS_TO_SKIP = new PropertyDescriptor.Builder()
        .name("excel-extract-first-row")
        .displayName("Number of Rows to Skip")
        .description("The row number of the first row to start processing. "
                + "Use this to skip over rows of data at the top of your worksheet that are not part of the dataset. "
                + "Empty rows of data anywhere in the spreadsheet will always be skipped, no matter what this value is set to.")
        .required(true)
        .defaultValue("0")
        .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
        .addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
        .build();

/** Optional comma separated list of 1-based column numbers that must not be written to the CSV. */
public static final PropertyDescriptor COLUMNS_TO_SKIP = new PropertyDescriptor.Builder()
        .name("excel-extract-column-to-skip")
        .displayName("Columns To Skip")
        .description("Comma delimited list of column numbers to skip. Use the columns number and not the letter designation. "
                + "Use this to skip over columns anywhere in your worksheet that you don't want extracted as part of the record.")
        .required(false)
        .expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .build();

/** Whether the streaming reader loads cell styles so that values are emitted as formatted in Excel. */
public static final PropertyDescriptor FORMAT_VALUES = new PropertyDescriptor.Builder()
        .name("excel-format-values")
        .displayName("Format Cell Values")
        .description("Should the cell values be written to CSV using the formatting applied in Excel, or should they be printed as raw values.")
        .allowableValues("true", "false")
        .defaultValue("false")
        .required(true)
        .build();
// Receives the incoming flow file once the workbook has been read without a RuntimeException.
public static final Relationship ORIGINAL = new Relationship.Builder()
.name("original")
.description("Original Excel document received by this processor")
.build();
// Receives one CSV flow file per converted sheet.
public static final Relationship SUCCESS = new Relationship.Builder()
.name("success")
.description("Excel data converted to csv")
.build();
// Receives the incoming flow file, or an individual sheet's flow file, when a RuntimeException occurs.
public static final Relationship FAILURE = new Relationship.Builder()
.name("failure")
.description("Failed to parse the Excel document")
.build();
// Both collections are assigned once in init() as unmodifiable views.
private List<PropertyDescriptor> descriptors;
private Set<Relationship> relationships;
/**
 * Populates the unmodifiable property list and relationship set exposed by this processor.
 * The quote-mode property is a copy of {@code CSVUtils.QUOTE_MODE} whose default is
 * replaced with {@code CSVUtils.QUOTE_NONE}.
 *
 * @param context initialization context (not used here)
 */
@Override
protected void init(final ProcessorInitializationContext context) {
    final PropertyDescriptor quoteModeDefaultingToNone = new PropertyDescriptor.Builder()
            .fromPropertyDescriptor(CSVUtils.QUOTE_MODE)
            .defaultValue(CSVUtils.QUOTE_NONE.getValue())
            .build();

    this.descriptors = Collections.unmodifiableList(Arrays.asList(
            DESIRED_SHEETS,
            ROWS_TO_SKIP,
            COLUMNS_TO_SKIP,
            FORMAT_VALUES,
            CSVUtils.CSV_FORMAT,
            CSVUtils.VALUE_SEPARATOR,
            CSVUtils.INCLUDE_HEADER_LINE,
            CSVUtils.QUOTE_CHAR,
            CSVUtils.ESCAPE_CHAR,
            CSVUtils.COMMENT_MARKER,
            CSVUtils.NULL_STRING,
            CSVUtils.TRIM_FIELDS,
            quoteModeDefaultingToNone,
            CSVUtils.RECORD_SEPARATOR,
            CSVUtils.TRAILING_DELIMITER));

    // LinkedHashSet keeps the declaration order: original, success, failure.
    this.relationships = Collections.unmodifiableSet(
            new LinkedHashSet<>(Arrays.asList(ORIGINAL, SUCCESS, FAILURE)));
}
/**
 * @return the unmodifiable relationship set assembled in {@code init}
 */
@Override
public Set<Relationship> getRelationships() {
    return relationships;
}
/**
 * @return the unmodifiable property descriptor list assembled in {@code init}
 */
@Override
public final List<PropertyDescriptor> getSupportedPropertyDescriptors() {
    return this.descriptors;
}
/**
 * Reads one incoming flow file as an .xlsx workbook and hands every selected sheet to
 * {@code handleExcelSheet}, which creates one child flow file per sheet.
 * The incoming flow file is routed to ORIGINAL when reading completes without a
 * RuntimeException, and to FAILURE (with an error attribute) otherwise.
 *
 * @param context supplies the property values, evaluated against the incoming flow file
 * @param session session used to read the flow file and to create/transfer the results
 * @throws ProcessException declared by the processor contract
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
// Requested sheet name -> "was it found" flag; an empty map means "convert every sheet".
final Map<String, Boolean> desiredSheets = getDesiredSheets(context, flowFile);
final boolean formatValues = context.getProperty(FORMAT_VALUES).asBoolean();
final CSVFormat csvFormat = CSVUtils.createCSVFormat(context, flowFile.getAttributes());
// Convert the "rows to skip" count into the 0-based index of the last skipped row
// (-1 when nothing is skipped); SheetToCSV.startRow skips rows whose index is <= this value.
final int firstRow = context.getProperty(ROWS_TO_SKIP).evaluateAttributeExpressions(flowFile).asInteger() - 1;
final List<Integer> columnsToSkip = getColumnsToSkip(context, flowFile);
try {
session.read(flowFile, inputStream -> {
try (Workbook workbook = StreamingReader.builder()
.rowCacheSize(100)
.bufferSize(4096)
.setReadStyles(formatValues)
.open(inputStream)) {
if (!desiredSheets.isEmpty()) {
// For each requested name, scan all sheets and convert those matching case-insensitively.
desiredSheets.keySet().forEach(desiredSheet -> workbook.forEach(sheet -> {
if (sheet.getSheetName().equalsIgnoreCase(desiredSheet)) {
ExcelSheetReadConfig readConfig = new ExcelSheetReadConfig(columnsToSkip, firstRow, sheet.getSheetName());
handleExcelSheet(session, flowFile, sheet, readConfig, csvFormat);
// Overwrites the value of an existing key, marking the requested name as found.
desiredSheets.put(desiredSheet, Boolean.TRUE);
}
}));
String sheetsNotFound = getSheetsNotFound(desiredSheets);
if (!sheetsNotFound.isEmpty()) {
getLogger().warn("Excel sheet(s) not found: {}", sheetsNotFound);
}
} else {
workbook.forEach(sheet -> {
ExcelSheetReadConfig readConfig = new ExcelSheetReadConfig(columnsToSkip, firstRow, sheet.getSheetName());
handleExcelSheet(session, flowFile, sheet, readConfig, csvFormat);
});
}
} catch (ParseException | OpenException | ReadException e) {
// An InvalidFormatException cause is rethrown as unchecked so the outer catch routes to FAILURE.
if (e.getCause() instanceof InvalidFormatException) {
String msg = "Only .xlsx Excel 2007 OOXML files are supported";
getLogger().error(msg, e);
throw new UnsupportedOperationException(msg, e);
}
// NOTE(review): any other reader exception is only logged here, so the flow file
// still goes to ORIGINAL below; confirm this is intended.
getLogger().error("Error occurred while processing Excel document metadata", e);
}
});
session.transfer(flowFile, ORIGINAL);
} catch (RuntimeException ex) {
getLogger().error("Failed to process incoming Excel document. " + ex.getMessage(), ex);
// The attribute key is the fully qualified class name followed by ".error".
FlowFile failedFlowFile = session.putAttribute(flowFile,
ConvertExcelToCSVProcessor.class.getName() + ".error", ex.getMessage());
session.transfer(failedFlowFile, FAILURE);
}
}
/**
 * Parses the "Columns To Skip" property into 0-based column indexes.
 * Tokens are trimmed before parsing so that values such as {@code "1, 3"} are accepted,
 * and tokens that are blank after trimming are ignored.
 *
 * @param context  supplies the property value
 * @param flowFile flow file whose attributes are used to evaluate expression language
 * @return a new list of 0-based indexes (configured number minus one); empty when the property is unset
 * @throws ProcessException if a token is not a valid integer
 */
private List<Integer> getColumnsToSkip(final ProcessContext context, FlowFile flowFile) {
    final String configuredColumns = context.getProperty(COLUMNS_TO_SKIP)
            .evaluateAttributeExpressions(flowFile).getValue();
    // StringUtils.split returns null for null input.
    final String[] columnsToSkip = StringUtils.split(configuredColumns, ",");
    if (columnsToSkip == null) {
        return new ArrayList<>();
    }
    try {
        return Arrays.stream(columnsToSkip)
                .map(String::trim)
                .filter(columnToSkip -> !columnToSkip.isEmpty())
                // The property is 1-based, the cell column index is 0-based.
                .map(columnToSkip -> Integer.parseInt(columnToSkip) - 1)
                .collect(Collectors.toList());
    } catch (NumberFormatException e) {
        throw new ProcessException("Invalid column in Columns to Skip list.", e);
    }
}
/**
 * Builds the map of requested sheet names from the "Sheets to Extract" property.
 * Every name maps to {@code FALSE}; the caller flips the value to {@code TRUE} once a
 * matching sheet has been processed, so the returned map is always a mutable {@link HashMap}.
 * A name listed more than once is stored once instead of failing on a duplicate key.
 *
 * @param context  supplies the property value
 * @param flowFile flow file whose attributes are used to evaluate expression language
 * @return a mutable map of requested sheet names; empty when the property is unset
 */
private Map<String, Boolean> getDesiredSheets(final ProcessContext context, FlowFile flowFile) {
    final String desiredSheetsDelimited = context.getProperty(DESIRED_SHEETS).evaluateAttributeExpressions(flowFile).getValue();
    final Map<String, Boolean> desiredSheets = new HashMap<>();
    if (desiredSheetsDelimited != null) {
        // StringUtils.split only returns null for null input, which is excluded above.
        for (final String sheetName : StringUtils.split(desiredSheetsDelimited, DESIRED_SHEETS_DELIMITER)) {
            desiredSheets.put(sheetName, Boolean.FALSE);
        }
    }
    return desiredSheets;
}
/**
 * Handles an individual Excel sheet from the entire Excel document. Each sheet will result in an individual flowfile
 * that is a child of the original flowfile. On success the child carries the sheet name, row count,
 * source filename, a ".csv" filename and the CSV MIME type, and is routed to SUCCESS. If a
 * RuntimeException occurs the error is logged, its message is stored in an error attribute and the child is
 * routed to FAILURE.
 *
 * @param session The NiFi ProcessSession instance for the current invocation.
 * @param originalParentFF The incoming flowfile containing the Excel document.
 * @param sheet The sheet to convert.
 * @param readConfig Rows/columns to skip and the sheet name for this sheet.
 * @param csvFormat The CSV format used when writing the records.
 */
private void handleExcelSheet(ProcessSession session, FlowFile originalParentFF, final Sheet sheet, ExcelSheetReadConfig readConfig,
        CSVFormat csvFormat) {
    FlowFile ff = session.create(originalParentFF);
    final SheetToCSV sheetHandler = new SheetToCSV(readConfig, csvFormat);
    try {
        ff = session.write(ff, out -> {
            PrintStream outPrint = new PrintStream(out, false, StandardCharsets.UTF_8);
            sheetHandler.setOutput(outPrint);
            sheet.forEach(row -> {
                sheetHandler.startRow(row.getRowNum());
                row.forEach(sheetHandler::cell);
                sheetHandler.endRow();
            });
            sheetHandler.close();
        });
        ff = session.putAttribute(ff, SHEET_NAME, readConfig.getSheetName());
        ff = session.putAttribute(ff, ROW_NUM, Long.toString(sheetHandler.getRowCount()));

        final String parentFilename = originalParentFF.getAttribute(CoreAttributes.FILENAME.key());
        if (StringUtils.isNotEmpty(parentFilename)) {
            ff = session.putAttribute(ff, SOURCE_FILE_NAME, parentFilename);
        } else {
            ff = session.putAttribute(ff, SOURCE_FILE_NAME, UNKNOWN_SHEET_NAME);
        }

        //Update the CoreAttributes.FILENAME to have the .csv extension now. Also update MIME.TYPE
        ff = session.putAttribute(ff, CoreAttributes.FILENAME.key(), updateFilenameToCSVExtension(ff.getAttribute(CoreAttributes.UUID.key()),
                ff.getAttribute(CoreAttributes.FILENAME.key()), readConfig.getSheetName()));
        ff = session.putAttribute(ff, CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE);
        session.transfer(ff, SUCCESS);
    } catch (RuntimeException e) {
        // Log with the stack trace; the attribute below only keeps the message.
        getLogger().error("Failed to convert Excel sheet {} to CSV", readConfig.getSheetName(), e);
        ff = session.putAttribute(ff, ConvertExcelToCSVProcessor.class.getName() + ".error", e.getMessage());
        session.transfer(ff, FAILURE);
    }
}
/**
 * Collects the requested sheet names whose "found" flag is still false.
 *
 * @param desiredSheets requested sheet names mapped to whether a matching sheet was processed
 * @return the unmatched names joined with commas, or an empty string when every name was matched
 */
private String getSheetsNotFound(Map<String, Boolean> desiredSheets) {
    final List<String> missingSheetNames = new ArrayList<>();
    for (final Map.Entry<String, Boolean> requested : desiredSheets.entrySet()) {
        if (!requested.getValue()) {
            missingSheetNames.add(requested.getKey());
        }
    }
    return String.join(",", missingSheetNames);
}
/**
 * Uses the com.github.pjfanning streaming cell implementation to
 * do most of the work of parsing the contents of the Excel sheet
 * and outputs the contents as a (basic) CSV.
 *
 * Expected call order per sheet: setOutput, then startRow / cell... / endRow for each row,
 * then close. The first row that is not skipped defines the first and last column of the
 * exported area (stored in the shared ExcelSheetReadConfig).
 */
private class SheetToCSV {
private final ExcelSheetReadConfig readConfig;
CSVFormat csvFormat;
// True until the first non-skipped cell of the current row has been handled.
private boolean firstCellOfRow;
// True while the current row lies within the configured rows to skip.
private boolean skipRow;
// Index of the row being processed; -1 until the first non-skipped row starts.
private int currentRow = -1;
// Column index of the last cell written for the current row; -1 at row start.
private int currentCol = -1;
// Number of rows for which a record was printed (or at least attempted).
private int rowCount = 0;
// Explicitly skipped columns seen since the last written cell.
private int skippedColumns = 0;
private CSVPrinter printer;
// True while processing the first non-skipped row of the sheet.
private boolean firstRow = false;
// Values collected for the current row; null entries stand for empty cells.
private ArrayList<String> fieldValues;
public int getRowCount() {
return rowCount;
}
// Wraps the stream in a UTF-8 writer and creates the CSV printer with the configured format.
public void setOutput(PrintStream output) {
final OutputStreamWriter streamWriter = new OutputStreamWriter(output, StandardCharsets.UTF_8);
try {
printer = new CSVPrinter(streamWriter, csvFormat);
} catch (IOException e) {
throw new ProcessException("Failed to create CSV Printer.", e);
}
}
public SheetToCSV(ExcelSheetReadConfig readConfig, CSVFormat csvFormat) {
this.readConfig = readConfig;
this.csvFormat = csvFormat;
}
// Resets the per-row state, or marks the row as skipped when its index is within the rows to skip.
public void startRow(int rowNum) {
if (rowNum <= readConfig.getOverrideFirstRow()) {
skipRow = true;
return;
}
// Prepare for this row
skipRow = false;
firstCellOfRow = true;
// currentRow is still -1 only for the first row that is not skipped.
firstRow = currentRow == -1;
currentRow = rowNum;
currentCol = -1;
fieldValues = new ArrayList<>();
}
// Finishes the current row: pads it to the width of the first row and prints it as one record.
public void endRow() {
if(skipRow) {
return;
}
// The last cell written in the first row fixes the right edge of the exported area.
if(firstRow) {
readConfig.setLastColumn(currentCol);
}
//if there was no data in this row, don't write it
if(fieldValues.stream()
.noneMatch(string -> string != null && !string.isEmpty())) {
return;
}
// Ensure the correct number of columns
int columnsToAdd = (readConfig.getLastColumn() - currentCol) - readConfig.getColumnsToSkip().size();
for (int i = 0; i < columnsToAdd; i++) {
fieldValues.add(null);
}
try {
printer.printRecord(fieldValues);
} catch (IOException e) {
// NOTE(review): a failed write is only logged and the row is still counted below.
getLogger().warn("Print Record failed", e);
}
rowCount++;
}
// Adds one cell's value to the current row, inserting nulls for any columns missing before it.
public void cell(Cell cell) {
if (skipRow) {
return;
}
// Did we miss any cells?
int thisCol = cell.getColumnIndex();
//Use the first row of the file to decide on the area of data to export
if (firstRow && firstCellOfRow) {
readConfig.setFirstColumn(thisCol);
}
//if this cell falls outside our area, or has been explicitly marked as a skipped column, return and don't write it out.
if (!firstRow && (thisCol < readConfig.getFirstColumn() || thisCol > readConfig.getLastColumn())) {
return;
}
if (readConfig.getColumnsToSkip().contains(thisCol)) {
skippedColumns++;
return;
}
// Gap between the previously written cell and this one ...
int missedCols = (thisCol - readConfig.getFirstColumn()) - (currentCol - readConfig.getFirstColumn()) - 1;
// ... or, for the first cell of a row, the gap from the left edge of the exported area.
if (firstCellOfRow) {
missedCols = (thisCol - readConfig.getFirstColumn());
}
// Explicitly skipped columns inside the gap must not be padded.
missedCols -= skippedColumns;
if (firstCellOfRow) {
firstCellOfRow = false;
}
for (int i = 0; i < missedCols; i++) {
fieldValues.add(null);
}
currentCol = thisCol;
String stringCellValue = cell.getStringCellValue();
// Empty strings are stored as null, the same marker used for missing cells.
fieldValues.add(stringCellValue != null && !stringCellValue.isEmpty() ? stringCellValue : null);
skippedColumns = 0;
}
// Closes the CSV printer.
public void close() throws IOException {
printer.close();
}
}
/**
 * Builds the output filename {@code <base>_<sheetName>.csv}. The base is the original filename
 * with only its trailing extension removed, so earlier occurrences of the same ".ext" text
 * inside the name are preserved. The flowfile UUID is used as the base when no original
 * filename is available.
 *
 * @param nifiUUID UUID of the output flowfile, used when origFileName is null or empty.
 * @param origFileName Original filename from the input file.
 * @param sheetName Name of the sheet the output flowfile was generated from.
 * @return The new filename with the .csv extension that should be place in the output flowfile's attributes
 */
private String updateFilenameToCSVExtension(String nifiUUID, String origFileName, String sheetName) {
    final String baseName;
    if (StringUtils.isEmpty(origFileName)) {
        baseName = nifiUUID;
    } else if (StringUtils.isNotEmpty(FilenameUtils.getExtension(origFileName))) {
        // Strip only the final extension; a plain text replace would also remove
        // matching ".ext" sequences earlier in the name.
        baseName = FilenameUtils.removeExtension(origFileName);
    } else {
        baseName = origFileName;
    }
    return baseName + "_" + sheetName + "." + "csv";
}
/**
 * Read settings for a single sheet. The columns to skip, the last row index to skip and the
 * sheet name are fixed at construction; the first and last column are filled in later by
 * the sheet handler once the first exported row has been seen.
 */
private static class ExcelSheetReadConfig {
    private final List<Integer> columnsToSkip;
    private final int overrideFirstRow;
    private final String sheetName;
    private int firstColumn;
    private int lastColumn;

    public ExcelSheetReadConfig(List<Integer> columnsToSkip, int overrideFirstRow, String sheetName) {
        this.columnsToSkip = columnsToSkip;
        this.overrideFirstRow = overrideFirstRow;
        this.sheetName = sheetName;
    }

    public List<Integer> getColumnsToSkip() {
        return this.columnsToSkip;
    }

    public int getOverrideFirstRow() {
        return this.overrideFirstRow;
    }

    public String getSheetName() {
        return this.sheetName;
    }

    public int getFirstColumn() {
        return this.firstColumn;
    }

    public void setFirstColumn(int value) {
        this.firstColumn = value;
    }

    public int getLastColumn() {
        return this.lastColumn;
    }

    public void setLastColumn(int lastColumn) {
        this.lastColumn = lastColumn;
    }
}
}

View File

@ -1,15 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.nifi.processors.poi.ConvertExcelToCSVProcessor

View File

@ -1,97 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<head>
<meta charset="utf-8" />
<title>ConvertExcelToCSVProcessor</title>
<style>
table {
border-collapse: collapse;
}
table, th, td {
border: 1px solid #ccc;
}
td.r {
text-align: right;
}
td {
width: 50px;
padding: 5px;
}
</style>
<link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css" />
</head>
<body>
<h2>How it extracts CSV data from a sheet</h2>
<p>
ConvertExcelToCSVProcessor extracts CSV data using the following rules:
</p>
<ul>
<li>Find the first cell which has a value in it (the FirstCell).</li>
<li>Scan cells in the first row, starting from the FirstCell,
until it reaches the cell after which no cell with a value can be found in the row (the FirstRowLastCell).</li>
<li>Process the 2nd row and later, from the column of FirstCell to the column of FirstRowLastCell.</li>
<li>If a row does not have any cell that has a value, then the row is ignored.</li>
</ul>
<p>
As an example, the sheet shown below will be:
</p>
<table>
<tbody>
<tr><th>row </th><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th></tr>
<tr><td class="r"> 1</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
<tr><td class="r"> 2</td><td> </td><td> </td><td>x</td><td>y</td><td>z</td><td> </td><td> </td></tr>
<tr><td class="r"> 3</td><td> </td><td> </td><td>1</td><td> </td><td> </td><td> </td><td> </td></tr>
<tr><td class="r"> 4</td><td>2</td><td> </td><td> </td><td>3</td><td> </td><td> </td><td> </td></tr>
<tr><td class="r"> 5</td><td> </td><td> </td><td> </td><td> </td><td>4</td><td> </td><td> </td></tr>
<tr><td class="r"> 6</td><td> </td><td> </td><td>5</td><td>6</td><td>7</td><td> </td><td> </td></tr>
<tr><td class="r"> 7</td><td> </td><td> </td><td> </td><td> </td><td> </td><td>8</td><td> </td></tr>
<tr><td class="r"> 8</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
<tr><td class="r"> 9</td><td> </td><td> </td><td> </td><td> </td><td>9</td><td> </td><td> </td></tr>
<tr><td class="r"> 10</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
<tr><td class="r"> 11</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
</tbody>
</table>
<p>
converted to the following CSV:
</p>
<pre>
x,y,z
1,,
,3,
,,4
5,6,7
,,9
</pre>
<ul>
<li>C2(x) is the FirstCell, and E2(z) is the FirstRowLastCell.</li>
<li>A4(2) is ignored because it is out of range. So is F7(8).</li>
<li>Rows 7 and 8 are ignored because they do not have a valid cell.</li>
<li>It is important to have a header row, as shown in the example, to define the data area,
especially when a sheet includes empty cells.</li>
</ul>
</body>
</html>

View File

@ -1,578 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.poi;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.text.DecimalFormatSymbols;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.nifi.csv.CSVUtils;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.util.LogMessage;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class ConvertExcelToCSVProcessorTest {
private TestRunner testRunner;
/**
 * Creates a fresh test runner for the processor before every test.
 */
@BeforeEach
public void init() {
    this.testRunner = TestRunners.newTestRunner(ConvertExcelToCSVProcessor.class);
}
/**
 * A workbook with two sheets must yield two CSV flow files (plus the original), each carrying
 * its sheet name, row count, source filename, a "_{sheet}.csv" filename suffix and the
 * attributes of the incoming flow file.
 */
@Test
public void testMultipleSheetsGeneratesMultipleFlowFiles() throws IOException {
    final URL workbookUrl = getClass().getResource("/TwoSheets.xlsx");
    assertNotNull(workbookUrl);

    final Map<String, String> incomingAttributes = new HashMap<>();
    incomingAttributes.put("test", "attribute");
    testRunner.enqueue(new File(workbookUrl.getPath()).toPath(), incomingAttributes);
    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 2);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final List<MockFlowFile> converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS);

    final MockFlowFile firstSheet = converted.get(0);
    assertEquals(4, Long.parseLong(firstSheet.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM)));
    assertTrue(firstSheet.getAttribute(ConvertExcelToCSVProcessor.SHEET_NAME).equalsIgnoreCase("TestSheetA"));
    assertEquals("TwoSheets.xlsx", firstSheet.getAttribute(ConvertExcelToCSVProcessor.SOURCE_FILE_NAME));
    // TestRunner.run() assigns a random filename even when one is enqueued, so only the suffix is checked.
    assertTrue(firstSheet.getAttribute(CoreAttributes.FILENAME.key()).endsWith("_TestSheetA.csv"));
    assertEquals("attribute", firstSheet.getAttribute("test"));

    final MockFlowFile secondSheet = converted.get(1);
    assertEquals(3, Long.parseLong(secondSheet.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM)));
    assertTrue(secondSheet.getAttribute(ConvertExcelToCSVProcessor.SHEET_NAME).equalsIgnoreCase("TestSheetB"));
    assertEquals("TwoSheets.xlsx", secondSheet.getAttribute(ConvertExcelToCSVProcessor.SOURCE_FILE_NAME));
    // Same reasoning as above: only the "_{SHEETNAME}.csv" suffix is stable.
    assertTrue(secondSheet.getAttribute(CoreAttributes.FILENAME.key()).endsWith("_TestSheetB.csv"));
    assertEquals("attribute", secondSheet.getAttribute("test"));
}
/**
 * With value formatting disabled the cells must be written as raw values.
 */
@Test
public void testDataFormatting() {
    testRunner.enqueue(getClass().getResourceAsStream("/dataformatting.xlsx"));
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "false");
    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile csvFlowFile = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long writtenRows = Long.parseLong(csvFlowFile.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(9, writtenRows);

    final String expectedCsv = "Numbers,Timestamps,Money\n" +
            "1234.4559999999999,42736.5,123.45\n" +
            "1234.4559999999999,42736.5,123.45\n" +
            "1234.4559999999999,42736.5,123.45\n" +
            "1234.4559999999999,42736.5,1023.45\n" +
            "1234.4559999999999,42736.5,1023.45\n" +
            "987654321,42736.5,1023.45\n" +
            "987654321,,\n" +
            "987654321,,\n";
    csvFlowFile.assertContentEquals(expectedCsv);
}
/**
 * With value formatting enabled and minimal quoting, cells must be written as formatted in Excel.
 * The expected text is assembled from the default locale's decimal/grouping separators and
 * from DateTimeFormatter output, so it follows the locale the test runs under.
 */
@Test
public void testQuoting() {
testRunner.enqueue(getClass().getResourceAsStream("/dataformatting.xlsx"));
testRunner.setProperty(CSVUtils.QUOTE_MODE, CSVUtils.QUOTE_MINIMAL);
testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");
testRunner.run();
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);
MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
long rowsSheet = Long.parseLong(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
assertEquals(9, rowsSheet);
// Timestamp used to render the expected date/time cells (2017-01-01 12:00:00).
LocalDateTime localDt = LocalDateTime.of(2017, 1, 1, 12, 0, 0);
DecimalFormatSymbols decimalFormatSymbols = DecimalFormatSymbols.getInstance();
char decimalSeparator = decimalFormatSymbols.getDecimalSeparator();
char groupingSeparator = decimalFormatSymbols.getGroupingSeparator();
// In the format strings %1$s is the decimal separator and %2$s the grouping separator.
// The "E+" exponent marker is replaced at the end with the value from getExponentSeparator.
ff.assertContentEquals(("Numbers,Timestamps,Money\n" +
addQuotingIfNeeded(String.format("1234%1$s456", decimalSeparator)) + "," + DateTimeFormatter.ofPattern("d/M/yy").format(localDt) + "," +
addQuotingIfNeeded(String.format("$ 123%1$s45", decimalSeparator)) + "\n" +
addQuotingIfNeeded(String.format("1234%1$s46", decimalSeparator)) + "," + DateTimeFormatter.ofPattern("hh:mm:ss a").format(localDt) + "," +
addQuotingIfNeeded(String.format("£ 123%1$s45", decimalSeparator)) + "\n" +
addQuotingIfNeeded(String.format("1234%1$s5", decimalSeparator)) + ",\"" + DateTimeFormatter.ofPattern("EEEE, MMMM dd, yyyy").format(localDt) + "\"," +
addQuotingIfNeeded(String.format("¥ 123%1$s45", decimalSeparator)) + "\n" +
addQuotingIfNeeded(String.format("1%2$s234%1$s46", decimalSeparator, groupingSeparator)) + "," + DateTimeFormatter.ofPattern("d/M/yy HH:mm").format(localDt) + "," +
addQuotingIfNeeded(String.format("$ 1%2$s023%1$s45", decimalSeparator, groupingSeparator)) + "\n" +
addQuotingIfNeeded(String.format("1%2$s234%1$s4560", decimalSeparator, groupingSeparator)) + "," + DateTimeFormatter.ofPattern("hh:mm a").format(localDt) + "," +
addQuotingIfNeeded(String.format("£ 1%2$s023%1$s45", decimalSeparator, groupingSeparator)) + "\n" +
addQuotingIfNeeded(String.format("9%1$s88E+08", decimalSeparator)) + "," + DateTimeFormatter.ofPattern("yyyy/MM/dd/ HH:mm").format(localDt) + "," +
addQuotingIfNeeded(String.format("¥ 1%2$s023%1$s45", decimalSeparator, groupingSeparator)) + "\n" +
addQuotingIfNeeded(String.format("9%1$s877E+08", decimalSeparator)) + ",,\n" +
addQuotingIfNeeded(String.format("9%1$s8765E+08", decimalSeparator)) + ",,\n").replace("E+", getExponentSeparator(decimalFormatSymbols)));
}
/**
 * Workaround for interaction between {@link DecimalFormatSymbols} and use of custom {@link java.util.Locale}.
 * Produces the text the tests substitute for the literal {@code "E+"} in their expected output.
 *
 * @param decimalFormatSymbols symbols whose exponent separator is inspected
 * @return {@code "e"} when the exponent separator is exactly {@code "e"}, otherwise the separator followed by {@code "+"}
 */
private static String getExponentSeparator(final DecimalFormatSymbols decimalFormatSymbols) {
    final String separator = decimalFormatSymbols.getExponentSeparator();
    if (separator.equals("e")) {
        return "e";
    }
    return separator + "+";
}
/**
 * Runs the conversion with {@code ROWS_TO_SKIP} set to the literal value {@code 2} and formatted values,
 * then checks that seven rows are reported and that the expected content starts at the second data row.
 * The expected value escapes a comma decimal or grouping separator with a backslash.
 */
@Test
public void testSkipRows() {
    testRunner.enqueue(getClass().getResourceAsStream("/dataformatting.xlsx"));
    testRunner.setProperty(ConvertExcelToCSVProcessor.ROWS_TO_SKIP, "2");
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    long rowsSheet = Long.parseLong(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    // The message is reported only when the assertion fails, so it has to describe the mismatch.
    assertEquals(7, rowsSheet, "Row count does not match expected value.");

    LocalDateTime localDt = LocalDateTime.of(2017, 1, 1, 12, 0, 0);
    DecimalFormatSymbols decimalFormatSymbols = DecimalFormatSymbols.getInstance();
    String decimalSeparator = decimalFormatSymbols.getDecimalSeparator() == ',' ? "\\," : String.valueOf(decimalFormatSymbols.getDecimalSeparator());
    String groupingSeparator = decimalFormatSymbols.getGroupingSeparator() == ',' ? "\\," : String.valueOf(decimalFormatSymbols.getGroupingSeparator());

    ff.assertContentEquals(String.format("1234%1$s46," + DateTimeFormatter.ofPattern("hh:mm:ss a").format(localDt) + ",£ 123%1$s45\n" +
            "1234%1$s5," + DateTimeFormatter.ofPattern("EEEE\\, MMMM dd\\, yyyy").format(localDt) + ",¥ 123%1$s45\n" +
            "1%2$s234%1$s46," + DateTimeFormatter.ofPattern("d/M/yy HH:mm").format(localDt) + ",$ 1%2$s023%1$s45\n" +
            "1%2$s234%1$s4560," + DateTimeFormatter.ofPattern("hh:mm a").format(localDt) + ",£ 1%2$s023%1$s45\n" +
            "9%1$s88E+08," + DateTimeFormatter.ofPattern("yyyy/MM/dd/ HH:mm").format(localDt) + ",¥ 1%2$s023%1$s45\n" +
            "9%1$s877E+08,,\n" +
            "9%1$s8765E+08,,\n", decimalSeparator, groupingSeparator).replace("E+", getExponentSeparator(decimalFormatSymbols)));
}
/**
 * Same scenario as skipping two rows with a literal value, except that {@code ROWS_TO_SKIP} is configured
 * as the expression {@code ${rowsToSkip}} and the value {@code 2} is supplied as a flow file attribute.
 * Seven rows are expected, and the expected value escapes a comma decimal or grouping separator with a backslash.
 */
@Test
public void testSkipRowsWithEL() {
    Map<String, String> attributes = new HashMap<>();
    attributes.put("rowsToSkip", "2");
    testRunner.enqueue(getClass().getResourceAsStream("/dataformatting.xlsx"), attributes);
    testRunner.setProperty(ConvertExcelToCSVProcessor.ROWS_TO_SKIP, "${rowsToSkip}");
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    long rowsSheet = Long.parseLong(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    // The message is reported only when the assertion fails, so it has to describe the mismatch.
    assertEquals(7, rowsSheet, "Row count does not match expected value.");

    LocalDateTime localDt = LocalDateTime.of(2017, 1, 1, 12, 0, 0);
    DecimalFormatSymbols decimalFormatSymbols = DecimalFormatSymbols.getInstance();
    String decimalSeparator = decimalFormatSymbols.getDecimalSeparator() == ',' ? "\\," : String.valueOf(decimalFormatSymbols.getDecimalSeparator());
    String groupingSeparator = decimalFormatSymbols.getGroupingSeparator() == ',' ? "\\," : String.valueOf(decimalFormatSymbols.getGroupingSeparator());

    ff.assertContentEquals(String.format("1234%1$s46," + DateTimeFormatter.ofPattern("hh:mm:ss a").format(localDt) + ",£ 123%1$s45\n" +
            "1234%1$s5," + DateTimeFormatter.ofPattern("EEEE\\, MMMM dd\\, yyyy").format(localDt) + ",¥ 123%1$s45\n" +
            "1%2$s234%1$s46," + DateTimeFormatter.ofPattern("d/M/yy HH:mm").format(localDt) + ",$ 1%2$s023%1$s45\n" +
            "1%2$s234%1$s4560," + DateTimeFormatter.ofPattern("hh:mm a").format(localDt) + ",£ 1%2$s023%1$s45\n" +
            "9%1$s88E+08," + DateTimeFormatter.ofPattern("yyyy/MM/dd/ HH:mm").format(localDt) + ",¥ 1%2$s023%1$s45\n" +
            "9%1$s877E+08,,\n" +
            "9%1$s8765E+08,,\n", decimalSeparator, groupingSeparator).replace("E+", getExponentSeparator(decimalFormatSymbols)));
}
/**
 * Runs the conversion with {@code COLUMNS_TO_SKIP} set to the literal value {@code 2} and formatted values.
 * The expected content holds only the Numbers and Money columns; a comma decimal or grouping separator is
 * expected to be escaped with a backslash.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testSkipColumns() throws Exception {
    testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath());
    testRunner.setProperty(ConvertExcelToCSVProcessor.COLUMNS_TO_SKIP, "2");
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long rowCount = Long.parseLong(converted.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(9, rowCount);

    final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance();
    final char decimalChar = symbols.getDecimalSeparator();
    final char groupingChar = symbols.getGroupingSeparator();
    final String decimal = decimalChar == ',' ? "\\," : String.valueOf(decimalChar);
    final String grouping = groupingChar == ',' ? "\\," : String.valueOf(groupingChar);

    // %1$s is the decimal separator and %2$s the grouping separator.
    final String template = String.join("\n",
            "Numbers,Money",
            "1234%1$s456,$ 123%1$s45",
            "1234%1$s46,£ 123%1$s45",
            "1234%1$s5,¥ 123%1$s45",
            "1%2$s234%1$s46,$ 1%2$s023%1$s45",
            "1%2$s234%1$s4560,£ 1%2$s023%1$s45",
            "9%1$s88E+08,¥ 1%2$s023%1$s45",
            "9%1$s877E+08,",
            "9%1$s8765E+08,") + "\n";
    final String expectedCsv = String.format(template, decimal, grouping)
            .replace("E+", getExponentSeparator(symbols));
    converted.assertContentEquals(expectedCsv);
}
/**
 * Runs the conversion with {@code COLUMNS_TO_SKIP} configured as the expression {@code ${columnsToSkip}},
 * with the value {@code 2} supplied as a flow file attribute. The expected content holds only the Numbers
 * and Money columns; a comma decimal or grouping separator is expected to be escaped with a backslash.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testSkipColumnsWithEL() throws Exception {
    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put("columnsToSkip", "2");
    testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath(), flowFileAttributes);
    testRunner.setProperty(ConvertExcelToCSVProcessor.COLUMNS_TO_SKIP, "${columnsToSkip}");
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long rowCount = Long.parseLong(converted.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(9, rowCount);

    final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance();
    final char decimalChar = symbols.getDecimalSeparator();
    final char groupingChar = symbols.getGroupingSeparator();
    final String decimal = decimalChar == ',' ? "\\," : String.valueOf(decimalChar);
    final String grouping = groupingChar == ',' ? "\\," : String.valueOf(groupingChar);

    // %1$s is the decimal separator and %2$s the grouping separator.
    final String template = String.join("\n",
            "Numbers,Money",
            "1234%1$s456,$ 123%1$s45",
            "1234%1$s46,£ 123%1$s45",
            "1234%1$s5,¥ 123%1$s45",
            "1%2$s234%1$s46,$ 1%2$s023%1$s45",
            "1%2$s234%1$s4560,£ 1%2$s023%1$s45",
            "9%1$s88E+08,¥ 1%2$s023%1$s45",
            "9%1$s877E+08,",
            "9%1$s8765E+08,") + "\n";
    final String expectedCsv = String.format(template, decimal, grouping)
            .replace("E+", getExponentSeparator(symbols));
    converted.assertContentEquals(expectedCsv);
}
/**
 * Runs the conversion with {@code |} as the value separator and the escaped text {@code \r\n} as the record
 * separator, then compares the output with pipe-delimited, CRLF-terminated expected content. A decimal or
 * grouping separator equal to the configured value separator is expected to be escaped with a backslash.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testCustomDelimiters() throws Exception {
    testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath());
    testRunner.setProperty(CSVUtils.VALUE_SEPARATOR, "|");
    testRunner.setProperty(CSVUtils.RECORD_SEPARATOR, "\\r\\n");
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long rowCount = Long.parseLong(converted.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(9, rowCount);

    final LocalDateTime noon = LocalDateTime.of(2017, 1, 1, 12, 0, 0);
    final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance();
    final String delimiter = testRunner.getProcessContext().getProperty(CSVUtils.VALUE_SEPARATOR).evaluateAttributeExpressions(converted).getValue();

    final char decimalChar = symbols.getDecimalSeparator();
    final char groupingChar = symbols.getGroupingSeparator();
    final String decimal = String.valueOf(decimalChar).equals(delimiter) ? "\\" + decimalChar : String.valueOf(decimalChar);
    final String grouping = String.valueOf(groupingChar).equals(delimiter) ? "\\" + groupingChar : String.valueOf(groupingChar);

    // %1$s is the decimal separator and %2$s the grouping separator; dates are embedded in the template.
    final String template = String.join("\r\n",
            "Numbers|Timestamps|Money",
            "1234%1$s456|" + DateTimeFormatter.ofPattern("d/M/yy").format(noon) + "|$ 123%1$s45",
            "1234%1$s46|" + DateTimeFormatter.ofPattern("hh:mm:ss a").format(noon) + "|£ 123%1$s45",
            "1234%1$s5|" + DateTimeFormatter.ofPattern("EEEE, MMMM dd, yyyy").format(noon) + "|¥ 123%1$s45",
            "1%2$s234%1$s46|" + DateTimeFormatter.ofPattern("d/M/yy HH:mm").format(noon) + "|$ 1%2$s023%1$s45",
            "1%2$s234%1$s4560|" + DateTimeFormatter.ofPattern("hh:mm a").format(noon) + "|£ 1%2$s023%1$s45",
            "9%1$s88E+08|" + DateTimeFormatter.ofPattern("yyyy/MM/dd/ HH:mm").format(noon) + "|¥ 1%2$s023%1$s45",
            "9%1$s877E+08||",
            "9%1$s8765E+08||") + "\r\n";
    final String expectedCsv = String.format(template, decimal, grouping)
            .replace("E+", getExponentSeparator(symbols));
    converted.assertContentEquals(expectedCsv);
}
/**
 * Runs the conversion with the value separator configured as the expression {@code ${csv.delimiter}},
 * with {@code |} supplied as a flow file attribute, and compares the output with pipe-delimited expected
 * content. A decimal or grouping separator equal to the evaluated value separator is expected to be
 * escaped with a backslash.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testCustomValueSeparatorWithEL() throws Exception {
    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put("csv.delimiter", "|");
    testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath(), flowFileAttributes);
    testRunner.setProperty(CSVUtils.VALUE_SEPARATOR, "${csv.delimiter}");
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long rowCount = Long.parseLong(converted.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(9, rowCount);

    final LocalDateTime noon = LocalDateTime.of(2017, 1, 1, 12, 0, 0);
    final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance();
    final String delimiter = testRunner.getProcessContext().getProperty(CSVUtils.VALUE_SEPARATOR).evaluateAttributeExpressions(converted).getValue();

    final char decimalChar = symbols.getDecimalSeparator();
    final char groupingChar = symbols.getGroupingSeparator();
    final String decimal = String.valueOf(decimalChar).equals(delimiter) ? "\\" + decimalChar : String.valueOf(decimalChar);
    final String grouping = String.valueOf(groupingChar).equals(delimiter) ? "\\" + groupingChar : String.valueOf(groupingChar);

    // %1$s is the decimal separator and %2$s the grouping separator; dates are embedded in the template.
    final String template = String.join("\n",
            "Numbers|Timestamps|Money",
            "1234%1$s456|" + DateTimeFormatter.ofPattern("d/M/yy").format(noon) + "|$ 123%1$s45",
            "1234%1$s46|" + DateTimeFormatter.ofPattern("hh:mm:ss a").format(noon) + "|£ 123%1$s45",
            "1234%1$s5|" + DateTimeFormatter.ofPattern("EEEE, MMMM dd, yyyy").format(noon) + "|¥ 123%1$s45",
            "1%2$s234%1$s46|" + DateTimeFormatter.ofPattern("d/M/yy HH:mm").format(noon) + "|$ 1%2$s023%1$s45",
            "1%2$s234%1$s4560|" + DateTimeFormatter.ofPattern("hh:mm a").format(noon) + "|£ 1%2$s023%1$s45",
            "9%1$s88E+08|" + DateTimeFormatter.ofPattern("yyyy/MM/dd/ HH:mm").format(noon) + "|¥ 1%2$s023%1$s45",
            "9%1$s877E+08||",
            "9%1$s8765E+08||") + "\n";
    final String expectedCsv = String.format(template, decimal, grouping)
            .replace("E+", getExponentSeparator(symbols));
    converted.assertContentEquals(expectedCsv);
}
/**
 * Runs the conversion with the quote character configured as the expression {@code ${csv.quote}}, with a
 * single quote supplied as a flow file attribute, and with the quote mode set to quote all values. Every
 * populated field of the expected content is wrapped in the evaluated quote character; empty trailing
 * fields are expected unquoted.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testCustomQuoteCharWithEL() throws Exception {
    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put("csv.quote", "'");
    testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath(), flowFileAttributes);
    testRunner.setProperty(CSVUtils.QUOTE_CHAR, "${csv.quote}");
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");
    testRunner.setProperty(CSVUtils.QUOTE_MODE, CSVUtils.QUOTE_ALL);

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long rowCount = Long.parseLong(converted.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(9, rowCount);

    final LocalDateTime noon = LocalDateTime.of(2017, 1, 1, 12, 0, 0);
    final String quote = testRunner.getProcessContext().getProperty(CSVUtils.QUOTE_CHAR).evaluateAttributeExpressions(converted).getValue();
    final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance();
    final char decimal = symbols.getDecimalSeparator();
    final char grouping = symbols.getGroupingSeparator();

    final String row1 = addQuotingIfNeeded(String.format("1234%1$s456", decimal), quote, true)
            + "," + quote + DateTimeFormatter.ofPattern("d/M/yy").format(noon) + quote
            + "," + addQuotingIfNeeded(String.format("$ 123%1$s45", decimal), quote, true);
    final String row2 = addQuotingIfNeeded(String.format("1234%1$s46", decimal), quote, true)
            + "," + quote + DateTimeFormatter.ofPattern("hh:mm:ss a").format(noon) + quote
            + "," + addQuotingIfNeeded(String.format("£ 123%1$s45", decimal), quote, true);
    final String row3 = addQuotingIfNeeded(String.format("1234%1$s5", decimal), quote, true)
            + "," + quote + DateTimeFormatter.ofPattern("EEEE, MMMM dd, yyyy").format(noon) + quote
            + "," + addQuotingIfNeeded(String.format("¥ 123%1$s45", decimal), quote, true);
    final String row4 = addQuotingIfNeeded(String.format("1%2$s234%1$s46", decimal, grouping), quote, true)
            + "," + quote + DateTimeFormatter.ofPattern("d/M/yy HH:mm").format(noon) + quote
            + "," + addQuotingIfNeeded(String.format("$ 1%2$s023%1$s45", decimal, grouping), quote, true);
    final String row5 = addQuotingIfNeeded(String.format("1%2$s234%1$s4560", decimal, grouping), quote, true)
            + "," + quote + DateTimeFormatter.ofPattern("hh:mm a").format(noon) + quote
            + "," + addQuotingIfNeeded(String.format("£ 1%2$s023%1$s45", decimal, grouping), quote, true);
    final String row6 = addQuotingIfNeeded(String.format("9%1$s88E+08", decimal), quote, true)
            + "," + quote + DateTimeFormatter.ofPattern("yyyy/MM/dd/ HH:mm").format(noon) + quote
            + "," + addQuotingIfNeeded(String.format("¥ 1%2$s023%1$s45", decimal, grouping), quote, true);
    final String row7 = addQuotingIfNeeded(String.format("9%1$s877E+08", decimal), quote, true) + ",,";
    final String row8 = addQuotingIfNeeded(String.format("9%1$s8765E+08", decimal), quote, true) + ",,";

    final String template = String.join("\n",
            "'Numbers','Timestamps','Money'", row1, row2, row3, row4, row5, row6, row7, row8) + "\n";
    converted.assertContentEquals(template.replace("E+", getExponentSeparator(symbols)));
}
/**
 * Runs the conversion with the escape character configured as the expression {@code ${csv.escape}}, with
 * {@code ^} supplied as a flow file attribute. In the expected content every comma inside a field (the
 * long date, and a comma decimal or grouping separator) is preceded by the evaluated escape character.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testCustomEscapeCharWithEL() throws Exception {
    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put("csv.escape", "^");
    testRunner.enqueue(new File("src/test/resources/dataformatting.xlsx").toPath(), flowFileAttributes);
    testRunner.setProperty(CSVUtils.ESCAPE_CHAR, "${csv.escape}");
    testRunner.setProperty(ConvertExcelToCSVProcessor.FORMAT_VALUES, "true");

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long rowCount = Long.parseLong(converted.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(9, rowCount);

    final LocalDateTime noon = LocalDateTime.of(2017, 1, 1, 12, 0, 0);
    final DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance();
    final String escape = testRunner.getProcessContext().getProperty(CSVUtils.ESCAPE_CHAR).evaluateAttributeExpressions(converted).getValue();

    final char decimalChar = symbols.getDecimalSeparator();
    final char groupingChar = symbols.getGroupingSeparator();
    final String decimal = String.valueOf(decimalChar).equals(",") ? escape + decimalChar : String.valueOf(decimalChar);
    final String grouping = String.valueOf(groupingChar).equals(",") ? escape + groupingChar : String.valueOf(groupingChar);

    // The escape character is written into the date pattern so that it appears before each comma of the long date.
    final String escapedLongDatePattern = String.format("EEEE%1$s, MMMM dd%1$s, yyyy", escape);

    // %1$s is the decimal separator and %2$s the grouping separator; dates are embedded in the template.
    final String template = String.join("\n",
            "Numbers,Timestamps,Money",
            "1234%1$s456," + DateTimeFormatter.ofPattern("d/M/yy").format(noon) + ",$ 123%1$s45",
            "1234%1$s46," + DateTimeFormatter.ofPattern("hh:mm:ss a").format(noon) + ",£ 123%1$s45",
            "1234%1$s5," + DateTimeFormatter.ofPattern(escapedLongDatePattern).format(noon) + ",¥ 123%1$s45",
            "1%2$s234%1$s46," + DateTimeFormatter.ofPattern("d/M/yy HH:mm").format(noon) + ",$ 1%2$s023%1$s45",
            "1%2$s234%1$s4560," + DateTimeFormatter.ofPattern("hh:mm a").format(noon) + ",£ 1%2$s023%1$s45",
            "9%1$s88E+08," + DateTimeFormatter.ofPattern("yyyy/MM/dd/ HH:mm").format(noon) + ",¥ 1%2$s023%1$s45",
            "9%1$s877E+08,,",
            "9%1$s8765E+08,,") + "\n";
    final String expectedCsv = String.format(template, decimal, grouping)
            .replace("E+", getExponentSeparator(symbols));
    converted.assertContentEquals(expectedCsv);
}
/**
 * Validates that all sheets in the Excel document are exported. No sheet names are set in this test:
 * one converted flow file is expected for {@code CollegeScorecard.xlsx} and two for {@code TwoSheets.xlsx}.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testProcessAllSheets() throws Exception {
    // First workbook: a single converted flow file with ten rows.
    testRunner.enqueue(new File("src/test/resources/CollegeScorecard.xlsx").toPath());
    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile scorecard = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long scorecardRows = Long.parseLong(scorecard.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(10, scorecardRows);

    // Reset the runner state before converting the second workbook.
    testRunner.clearProvenanceEvents();
    testRunner.clearTransferState();

    // Second workbook: one converted flow file per sheet.
    testRunner.enqueue(new File("src/test/resources/TwoSheets.xlsx").toPath());
    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 2);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile firstSheet = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long firstSheetRows = Long.parseLong(firstSheet.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(4, firstSheetRows);

    final MockFlowFile secondSheet = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(1);
    final long secondSheetRows = Long.parseLong(secondSheet.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(3, secondSheetRows);
}
/**
 * Validates that the manually specified sheet is exported from the Excel document. The sheet name
 * {@code Scorecard} is configured and a single converted flow file with ten rows is expected.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testProcessASpecificSheetThatDoesExist() throws Exception {
    testRunner.setProperty(ConvertExcelToCSVProcessor.DESIRED_SHEETS, "Scorecard");
    testRunner.enqueue(new File("src/test/resources/CollegeScorecard.xlsx").toPath());

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long rowCount = Long.parseLong(converted.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(10, rowCount);
}
/**
 * Tests a syntactically valid Excel XSSF document together with a manually specified sheet name that does
 * not exist in it. Only the Original relationship should receive a flow file, and at least one warning
 * must have been logged.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testNonExistantSpecifiedSheetName() throws Exception {
    testRunner.setProperty(ConvertExcelToCSVProcessor.DESIRED_SHEETS, "NopeIDoNotExist");
    testRunner.enqueue(new File("src/test/resources/CollegeScorecard.xlsx").toPath());

    testRunner.run();

    // Nothing is expected on success because the requested sheet does not exist.
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 0);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final List<LogMessage> warnings = testRunner.getLogger().getWarnMessages();
    assertFalse(warnings.isEmpty());
}
/**
 * Validates that a sheet containing blank cells can be converted to a CSV without missing columns.
 * The converted content is compared with the reference file {@code with-blank-cells.csv}.
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testProcessASheetWithBlankCells() throws Exception {
    testRunner.setProperty(ConvertExcelToCSVProcessor.DESIRED_SHEETS, "Sheet1");
    testRunner.enqueue(new File("src/test/resources/with-blank-cells.xlsx").toPath());

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);

    final MockFlowFile converted = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
    final long rowCount = Long.parseLong(converted.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
    assertEquals(8, rowCount);

    final File expectedCsv = new File("src/test/resources/with-blank-cells.csv");
    converted.assertContentEquals(expectedCsv);
}
/**
 * Tests for graceful handling and error messaging of unsupported .XLS files. The flow file must be routed
 * to failure only, and exactly one error must be logged whose text mentions both "Excel" and "OLE2".
 *
 * @throws Exception
 *  Any exception thrown during execution.
 */
@Test
public void testHandleUnsupportedXlsFile() throws Exception {
    testRunner.enqueue(new File("src/test/resources/Unsupported.xls").toPath());

    testRunner.run();

    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 0);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 0);
    testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 1);

    final List<LogMessage> loggedErrors = testRunner.getLogger().getErrorMessages();
    assertEquals(1, loggedErrors.size());

    final String errorText = loggedErrors.get(0).getMsg();
    final boolean mentionsExcel = errorText.contains("Excel");
    final boolean mentionsOle2 = errorText.contains("OLE2");
    assertTrue(mentionsExcel && mentionsOle2);
}
/**
 * Wraps the field in double quotes when it contains a comma; delegates to the three-argument overload
 * with a double quote as the quote string and quoting not forced.
 *
 * @param csvField the field text to inspect
 * @return the field, quoted with double quotes if it contains a comma, otherwise unchanged
 */
private String addQuotingIfNeeded(String csvField) {
    final String doubleQuote = "\"";
    return addQuotingIfNeeded(csvField, doubleQuote, false);
}
/**
 * Surrounds the field with the given quote string when the field contains a comma or when quoting is forced.
 *
 * @param csvField the field text to inspect
 * @param csvQuote the text placed before and after the field when it is quoted
 * @param force    when {@code true}, the field is quoted even if it contains no comma
 * @return the quoted field, or the field unchanged when no quoting applies
 */
private String addQuotingIfNeeded(String csvField, String csvQuote, boolean force) {
    final boolean containsComma = csvField.contains(",");
    if (containsComma || force) {
        return String.format("%2$s%1$s%2$s", csvField, csvQuote);
    }
    return csvField;
}
}

View File

@ -1,32 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration scan="true" scanPeriod="30 seconds">
<appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
<encoder class="ch.qos.logback.classic.encoder.PatternLayoutEncoder">
<pattern>%-4r [%t] %-5p %c - %m%n</pattern>
</encoder>
</appender>
<!-- valid logging levels: TRACE, DEBUG, INFO, WARN, ERROR -->
<logger name="org.apache.nifi" level="WARN"/>
<root level="INFO">
<appender-ref ref="CONSOLE"/>
</root>
</configuration>

View File

@ -1,8 +0,0 @@
A,B,C,D
A1,,,
,B2,C2,
,,C3,
,,C4,D4
A5,,C5,D5
A6,B6,,D6
A7,B7,C7,D7
1 A B C D
2 A1
3 B2 C2
4 C3
5 C4 D4
6 A5 C5 D5
7 A6 B6 D6
8 A7 B7 C7 D7

View File

@ -28,7 +28,6 @@
<poi.version>5.2.3</poi.version>
</properties>
<modules>
<module>nifi-poi-processors</module>
<module>nifi-poi-nar</module>
<module>nifi-poi-services</module>
</modules>