mirror of https://github.com/apache/nifi.git
NIFI-4112: Fix ConvertExcelToCSV to handle empty cells.
Signed-off-by: Matt Burgess <mattyb149@apache.org> This closes #1973
This commit is contained in:
parent
8acee02393
commit
5c48655e69
|
@ -29,6 +29,19 @@
|
||||||
<artifactId>nifi-poi-processors</artifactId>
|
<artifactId>nifi-poi-processors</artifactId>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<groupId>org.apache.rat</groupId>
|
||||||
|
<artifactId>apache-rat-plugin</artifactId>
|
||||||
|
<configuration>
|
||||||
|
<excludes combine.children="append">
|
||||||
|
<exclude>src/test/resources/with-blank-cells.csv</exclude>
|
||||||
|
</excludes>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<!-- https://mvnrepository.com/artifact/xerces/xerces -->
|
<!-- https://mvnrepository.com/artifact/xerces/xerces -->
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -24,6 +24,8 @@ import java.util.Collections;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.apache.commons.io.FilenameUtils;
|
import org.apache.commons.io.FilenameUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -78,6 +80,7 @@ public class ConvertExcelToCSVProcessor
|
||||||
public static final String SOURCE_FILE_NAME = "sourcefilename";
|
public static final String SOURCE_FILE_NAME = "sourcefilename";
|
||||||
private static final String SAX_CELL_REF = "c";
|
private static final String SAX_CELL_REF = "c";
|
||||||
private static final String SAX_CELL_TYPE = "t";
|
private static final String SAX_CELL_TYPE = "t";
|
||||||
|
private static final String SAX_CELL_ADDRESS = "r";
|
||||||
private static final String SAX_CELL_STRING = "s";
|
private static final String SAX_CELL_STRING = "s";
|
||||||
private static final String SAX_CELL_CONTENT_REF = "v";
|
private static final String SAX_CELL_CONTENT_REF = "v";
|
||||||
private static final String SAX_ROW_REF = "row";
|
private static final String SAX_ROW_REF = "row";
|
||||||
|
@ -85,6 +88,7 @@ public class ConvertExcelToCSVProcessor
|
||||||
private static final String DESIRED_SHEETS_DELIMITER = ",";
|
private static final String DESIRED_SHEETS_DELIMITER = ",";
|
||||||
private static final String UNKNOWN_SHEET_NAME = "UNKNOWN";
|
private static final String UNKNOWN_SHEET_NAME = "UNKNOWN";
|
||||||
private static final String SAX_PARSER = "org.apache.xerces.parsers.SAXParser";
|
private static final String SAX_PARSER = "org.apache.xerces.parsers.SAXParser";
|
||||||
|
private static final Pattern CELL_ADDRESS_REGEX = Pattern.compile("^([a-zA-Z]+)([\\d]+)$");
|
||||||
|
|
||||||
public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor
|
public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor
|
||||||
.Builder().name("extract-sheets")
|
.Builder().name("extract-sheets")
|
||||||
|
@ -279,6 +283,27 @@ public class ConvertExcelToCSVProcessor
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static Integer columnToIndex(String col) {
|
||||||
|
int length = col.length();
|
||||||
|
int accumulator = 0;
|
||||||
|
for (int i = length; i > 0; i--) {
|
||||||
|
char c = col.charAt(i - 1);
|
||||||
|
int x = ((int) c) - 64;
|
||||||
|
accumulator += x * Math.pow(26, length - i);
|
||||||
|
}
|
||||||
|
// Make it to start with 0.
|
||||||
|
return accumulator - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class CellAddress {
|
||||||
|
final int row;
|
||||||
|
final int col;
|
||||||
|
|
||||||
|
private CellAddress(int row, int col) {
|
||||||
|
this.row = row;
|
||||||
|
this.col = col;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extracts every row from an Excel Sheet and generates a corresponding JSONObject whose key is the Excel CellAddress and value
|
* Extracts every row from an Excel Sheet and generates a corresponding JSONObject whose key is the Excel CellAddress and value
|
||||||
|
@ -290,6 +315,10 @@ public class ConvertExcelToCSVProcessor
|
||||||
private SharedStringsTable sst;
|
private SharedStringsTable sst;
|
||||||
private String currentContent;
|
private String currentContent;
|
||||||
private boolean nextIsString;
|
private boolean nextIsString;
|
||||||
|
private CellAddress firstCellAddress;
|
||||||
|
private CellAddress firstRowLastCellAddress;
|
||||||
|
private CellAddress previousCellAddress;
|
||||||
|
private CellAddress nextCellAddress;
|
||||||
private OutputStream outputStream;
|
private OutputStream outputStream;
|
||||||
private boolean firstColInRow;
|
private boolean firstColInRow;
|
||||||
long rowCount;
|
long rowCount;
|
||||||
|
@ -306,18 +335,35 @@ public class ConvertExcelToCSVProcessor
|
||||||
this.outputStream = outputStream;
|
this.outputStream = outputStream;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void startElement(String uri, String localName, String name,
|
public void startElement(String uri, String localName, String name,
|
||||||
Attributes attributes) throws SAXException {
|
Attributes attributes) throws SAXException {
|
||||||
|
|
||||||
if (name.equals(SAX_CELL_REF)) {
|
if (name.equals(SAX_CELL_REF)) {
|
||||||
String cellType = attributes.getValue(SAX_CELL_TYPE);
|
String cellType = attributes.getValue(SAX_CELL_TYPE);
|
||||||
if(cellType != null && cellType.equals(SAX_CELL_STRING)) {
|
// Analyze cell address.
|
||||||
|
Matcher cellAddressMatcher = CELL_ADDRESS_REGEX.matcher(attributes.getValue(SAX_CELL_ADDRESS));
|
||||||
|
if (cellAddressMatcher.matches()) {
|
||||||
|
String col = cellAddressMatcher.group(1);
|
||||||
|
String row = cellAddressMatcher.group(2);
|
||||||
|
nextCellAddress = new CellAddress(Integer.parseInt(row), columnToIndex(col));
|
||||||
|
|
||||||
|
if (firstCellAddress == null) {
|
||||||
|
firstCellAddress = nextCellAddress;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cellType != null && cellType.equals(SAX_CELL_STRING)) {
|
||||||
nextIsString = true;
|
nextIsString = true;
|
||||||
} else {
|
} else {
|
||||||
nextIsString = false;
|
nextIsString = false;
|
||||||
}
|
}
|
||||||
} else if (name.equals(SAX_ROW_REF)) {
|
} else if (name.equals(SAX_ROW_REF)) {
|
||||||
|
if (firstRowLastCellAddress == null) {
|
||||||
|
firstRowLastCellAddress = previousCellAddress;
|
||||||
|
}
|
||||||
firstColInRow = true;
|
firstColInRow = true;
|
||||||
|
previousCellAddress = null;
|
||||||
|
nextCellAddress = null;
|
||||||
} else if (name.equals(SAX_SHEET_NAME_REF)) {
|
} else if (name.equals(SAX_SHEET_NAME_REF)) {
|
||||||
sheetName = attributes.getValue(0);
|
sheetName = attributes.getValue(0);
|
||||||
}
|
}
|
||||||
|
@ -325,6 +371,16 @@ public class ConvertExcelToCSVProcessor
|
||||||
currentContent = "";
|
currentContent = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void fillEmptyColumns(int nextColumn) throws IOException {
|
||||||
|
final CellAddress previousCell = previousCellAddress != null ? previousCellAddress : firstCellAddress;
|
||||||
|
if (previousCell != null) {
|
||||||
|
for (int i = 0; i < (nextColumn - previousCell.col); i++) {
|
||||||
|
// Fill columns.
|
||||||
|
outputStream.write(",".getBytes());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void endElement(String uri, String localName, String name)
|
public void endElement(String uri, String localName, String name)
|
||||||
throws SAXException {
|
throws SAXException {
|
||||||
|
|
||||||
|
@ -334,22 +390,20 @@ public class ConvertExcelToCSVProcessor
|
||||||
nextIsString = false;
|
nextIsString = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (name.equals(SAX_CELL_CONTENT_REF)) {
|
if (name.equals(SAX_CELL_CONTENT_REF)
|
||||||
if (firstColInRow) {
|
// Limit scanning from the first column, and up to the last column.
|
||||||
|
&& (firstCellAddress == null || firstCellAddress.col <= nextCellAddress.col)
|
||||||
|
&& (firstRowLastCellAddress == null || nextCellAddress.col <= firstRowLastCellAddress.col)) {
|
||||||
|
try {
|
||||||
|
// A cell is found.
|
||||||
|
fillEmptyColumns(nextCellAddress.col);
|
||||||
firstColInRow = false;
|
firstColInRow = false;
|
||||||
try {
|
outputStream.write(currentContent.getBytes());
|
||||||
outputStream.write(currentContent.getBytes());
|
// Keep previously found cell address.
|
||||||
} catch (IOException e) {
|
previousCellAddress = nextCellAddress;
|
||||||
getLogger().error("IO error encountered while writing content of parsed cell " +
|
} catch (IOException e) {
|
||||||
"value from sheet {}", new Object[]{getSheetName()}, e);
|
getLogger().error("IO error encountered while writing content of parsed cell " +
|
||||||
}
|
"value from sheet {}", new Object[]{getSheetName()}, e);
|
||||||
} else {
|
|
||||||
try {
|
|
||||||
outputStream.write(("," + currentContent).getBytes());
|
|
||||||
} catch (IOException e) {
|
|
||||||
getLogger().error("IO error encountered while writing content of parsed cell " +
|
|
||||||
"value from sheet {}", new Object[]{getSheetName()}, e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -357,6 +411,9 @@ public class ConvertExcelToCSVProcessor
|
||||||
//If this is the first row and the end of the row element has been encountered then that means no columns were present.
|
//If this is the first row and the end of the row element has been encountered then that means no columns were present.
|
||||||
if (!firstColInRow) {
|
if (!firstColInRow) {
|
||||||
try {
|
try {
|
||||||
|
if (firstRowLastCellAddress != null) {
|
||||||
|
fillEmptyColumns(firstRowLastCellAddress.col);
|
||||||
|
}
|
||||||
rowCount++;
|
rowCount++;
|
||||||
outputStream.write("\n".getBytes());
|
outputStream.write("\n".getBytes());
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
|
|
@ -0,0 +1,97 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<!--
|
||||||
|
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
contributor license agreements. See the NOTICE file distributed with
|
||||||
|
this work for additional information regarding copyright ownership.
|
||||||
|
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
(the "License"); you may not use this file except in compliance with
|
||||||
|
the License. You may obtain a copy of the License at
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
-->
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<title>ConvertExcelToCSVProcessor</title>
|
||||||
|
<style>
|
||||||
|
table {
|
||||||
|
border-collapse: collapse;
|
||||||
|
}
|
||||||
|
|
||||||
|
table, th, td {
|
||||||
|
border: 1px solid #ccc;
|
||||||
|
}
|
||||||
|
|
||||||
|
td.r {
|
||||||
|
text-align: right;
|
||||||
|
}
|
||||||
|
|
||||||
|
td {
|
||||||
|
width: 50px;
|
||||||
|
padding: 5px;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
<link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css" />
|
||||||
|
</head>
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<h2>How it extracts CSV data from a sheet</h2>
|
||||||
|
<p>
|
||||||
|
ConvertExcelToCSVProcessor extracts CSV data with the following rules:
|
||||||
|
</p>
|
||||||
|
<ul>
|
||||||
|
<li>Find the first cell which has a value in it (the FirstCell).</li>
|
||||||
|
<li>Scan cells in the first row, starting from the FirstCell,
|
||||||
|
until it reaches a cell after which no further cell with a value can be found in the row (the FirstRowLastCell).</li>
|
||||||
|
<li>Process the 2nd row and later, from the column of FirstCell to the column of FirstRowLastCell.</li>
|
||||||
|
<li>If a row does not have any cell that has a value, then the row is ignored.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
As an example, the sheet shown below will be:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tbody>
|
||||||
|
<tr><th>row </th><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th></tr>
|
||||||
|
<tr><td class="r"> 1</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 2</td><td> </td><td> </td><td>x</td><td>y</td><td>z</td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 3</td><td> </td><td> </td><td>1</td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 4</td><td>2</td><td> </td><td> </td><td>3</td><td> </td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 5</td><td> </td><td> </td><td> </td><td> </td><td>4</td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 6</td><td> </td><td> </td><td>5</td><td>6</td><td>7</td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 7</td><td> </td><td> </td><td> </td><td> </td><td> </td><td>8</td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 8</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 9</td><td> </td><td> </td><td> </td><td> </td><td>9</td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 10</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||||
|
<tr><td class="r"> 11</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
converted to the following CSV:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>
|
||||||
|
x,y,z
|
||||||
|
1,,
|
||||||
|
,3,
|
||||||
|
,,4
|
||||||
|
5,6,7
|
||||||
|
,,9
|
||||||
|
</pre>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>C2(x) is the FirstCell, and E2(z) is the FirstRowLastCell.</li>
|
||||||
|
<li>A4(2) is ignored because it is out of range. So is F7(8).</li>
|
||||||
|
<li>Rows 7 and 8 are ignored because they do not have a valid cell.</li>
|
||||||
|
<li>It is important to have a header row as shown in the example to define the data area,
|
||||||
|
especially when a sheet includes empty cells.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -16,9 +16,11 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.nifi.processors.poi;
|
package org.apache.nifi.processors.poi;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||||
|
@ -39,6 +41,16 @@ public class ConvertExcelToCSVProcessorTest {
|
||||||
testRunner = TestRunners.newTestRunner(ConvertExcelToCSVProcessor.class);
|
testRunner = TestRunners.newTestRunner(ConvertExcelToCSVProcessor.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testColToIndex() {
|
||||||
|
assertEquals(Integer.valueOf(0), ConvertExcelToCSVProcessor.columnToIndex("A"));
|
||||||
|
assertEquals(Integer.valueOf(1), ConvertExcelToCSVProcessor.columnToIndex("B"));
|
||||||
|
assertEquals(Integer.valueOf(25), ConvertExcelToCSVProcessor.columnToIndex("Z"));
|
||||||
|
assertEquals(Integer.valueOf(29), ConvertExcelToCSVProcessor.columnToIndex("AD"));
|
||||||
|
assertEquals(Integer.valueOf(239), ConvertExcelToCSVProcessor.columnToIndex("IF"));
|
||||||
|
assertEquals(Integer.valueOf(16383), ConvertExcelToCSVProcessor.columnToIndex("XFD"));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMultipleSheetsGeneratesMultipleFlowFiles() throws Exception {
|
public void testMultipleSheetsGeneratesMultipleFlowFiles() throws Exception {
|
||||||
|
|
||||||
|
@ -149,6 +161,30 @@ public class ConvertExcelToCSVProcessorTest {
|
||||||
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);
|
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates that a sheet contains blank cells can be converted to a CSV without missing columns.
|
||||||
|
*
|
||||||
|
* @throws Exception
|
||||||
|
* Any exception thrown during execution.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testProcessASheetWithBlankCells() throws Exception {
|
||||||
|
|
||||||
|
testRunner.setProperty(ConvertExcelToCSVProcessor.DESIRED_SHEETS, "Sheet1");
|
||||||
|
testRunner.enqueue(new File("src/test/resources/with-blank-cells.xlsx").toPath());
|
||||||
|
testRunner.run();
|
||||||
|
|
||||||
|
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
|
||||||
|
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
|
||||||
|
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);
|
||||||
|
|
||||||
|
MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
|
||||||
|
Long l = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
|
||||||
|
assertTrue(l == 8l);
|
||||||
|
ff.isContentEqual("test", StandardCharsets.UTF_8);
|
||||||
|
ff.assertContentEquals(new File("src/test/resources/with-blank-cells.csv"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tests for graceful handling and error messaging of unsupported .XLS files.
|
* Tests for graceful handling and error messaging of unsupported .XLS files.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
A,B,C,D
|
||||||
|
A1,,,
|
||||||
|
,B2,C2,
|
||||||
|
,,C3,
|
||||||
|
,,C4,D4
|
||||||
|
A5,,C5,D5
|
||||||
|
A6,B6,,D6
|
||||||
|
A7,B7,C7,D7
|
|
Binary file not shown.
Loading…
Reference in New Issue