mirror of https://github.com/apache/nifi.git
NIFI-4112: Fix ConvertExcelToCSV to handle empty cells.
Signed-off-by: Matt Burgess <mattyb149@apache.org> This closes #1973
This commit is contained in:
parent
8acee02393
commit
5c48655e69
|
@ -29,6 +29,19 @@
|
|||
<artifactId>nifi-poi-processors</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.rat</groupId>
|
||||
<artifactId>apache-rat-plugin</artifactId>
|
||||
<configuration>
|
||||
<excludes combine.children="append">
|
||||
<exclude>src/test/resources/with-blank-cells.csv</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
<dependencies>
|
||||
<!-- https://mvnrepository.com/artifact/xerces/xerces -->
|
||||
<dependency>
|
||||
|
|
|
@ -24,6 +24,8 @@ import java.util.Collections;
|
|||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.io.FilenameUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -78,6 +80,7 @@ public class ConvertExcelToCSVProcessor
|
|||
public static final String SOURCE_FILE_NAME = "sourcefilename";
|
||||
private static final String SAX_CELL_REF = "c";
|
||||
private static final String SAX_CELL_TYPE = "t";
|
||||
private static final String SAX_CELL_ADDRESS = "r";
|
||||
private static final String SAX_CELL_STRING = "s";
|
||||
private static final String SAX_CELL_CONTENT_REF = "v";
|
||||
private static final String SAX_ROW_REF = "row";
|
||||
|
@ -85,6 +88,7 @@ public class ConvertExcelToCSVProcessor
|
|||
private static final String DESIRED_SHEETS_DELIMITER = ",";
|
||||
private static final String UNKNOWN_SHEET_NAME = "UNKNOWN";
|
||||
private static final String SAX_PARSER = "org.apache.xerces.parsers.SAXParser";
|
||||
private static final Pattern CELL_ADDRESS_REGEX = Pattern.compile("^([a-zA-Z]+)([\\d]+)$");
|
||||
|
||||
public static final PropertyDescriptor DESIRED_SHEETS = new PropertyDescriptor
|
||||
.Builder().name("extract-sheets")
|
||||
|
@ -279,6 +283,27 @@ public class ConvertExcelToCSVProcessor
|
|||
}
|
||||
}
|
||||
|
||||
static Integer columnToIndex(String col) {
|
||||
int length = col.length();
|
||||
int accumulator = 0;
|
||||
for (int i = length; i > 0; i--) {
|
||||
char c = col.charAt(i - 1);
|
||||
int x = ((int) c) - 64;
|
||||
accumulator += x * Math.pow(26, length - i);
|
||||
}
|
||||
// Make it to start with 0.
|
||||
return accumulator - 1;
|
||||
}
|
||||
|
||||
private static class CellAddress {
|
||||
final int row;
|
||||
final int col;
|
||||
|
||||
private CellAddress(int row, int col) {
|
||||
this.row = row;
|
||||
this.col = col;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts every row from an Excel Sheet and generates a corresponding JSONObject whose key is the Excel CellAddress and value
|
||||
|
@ -290,6 +315,10 @@ public class ConvertExcelToCSVProcessor
|
|||
private SharedStringsTable sst;
|
||||
private String currentContent;
|
||||
private boolean nextIsString;
|
||||
private CellAddress firstCellAddress;
|
||||
private CellAddress firstRowLastCellAddress;
|
||||
private CellAddress previousCellAddress;
|
||||
private CellAddress nextCellAddress;
|
||||
private OutputStream outputStream;
|
||||
private boolean firstColInRow;
|
||||
long rowCount;
|
||||
|
@ -306,18 +335,35 @@ public class ConvertExcelToCSVProcessor
|
|||
this.outputStream = outputStream;
|
||||
}
|
||||
|
||||
|
||||
public void startElement(String uri, String localName, String name,
|
||||
Attributes attributes) throws SAXException {
|
||||
|
||||
if (name.equals(SAX_CELL_REF)) {
|
||||
String cellType = attributes.getValue(SAX_CELL_TYPE);
|
||||
// Analyze cell address.
|
||||
Matcher cellAddressMatcher = CELL_ADDRESS_REGEX.matcher(attributes.getValue(SAX_CELL_ADDRESS));
|
||||
if (cellAddressMatcher.matches()) {
|
||||
String col = cellAddressMatcher.group(1);
|
||||
String row = cellAddressMatcher.group(2);
|
||||
nextCellAddress = new CellAddress(Integer.parseInt(row), columnToIndex(col));
|
||||
|
||||
if (firstCellAddress == null) {
|
||||
firstCellAddress = nextCellAddress;
|
||||
}
|
||||
}
|
||||
if (cellType != null && cellType.equals(SAX_CELL_STRING)) {
|
||||
nextIsString = true;
|
||||
} else {
|
||||
nextIsString = false;
|
||||
}
|
||||
} else if (name.equals(SAX_ROW_REF)) {
|
||||
if (firstRowLastCellAddress == null) {
|
||||
firstRowLastCellAddress = previousCellAddress;
|
||||
}
|
||||
firstColInRow = true;
|
||||
previousCellAddress = null;
|
||||
nextCellAddress = null;
|
||||
} else if (name.equals(SAX_SHEET_NAME_REF)) {
|
||||
sheetName = attributes.getValue(0);
|
||||
}
|
||||
|
@ -325,6 +371,16 @@ public class ConvertExcelToCSVProcessor
|
|||
currentContent = "";
|
||||
}
|
||||
|
||||
private void fillEmptyColumns(int nextColumn) throws IOException {
|
||||
final CellAddress previousCell = previousCellAddress != null ? previousCellAddress : firstCellAddress;
|
||||
if (previousCell != null) {
|
||||
for (int i = 0; i < (nextColumn - previousCell.col); i++) {
|
||||
// Fill columns.
|
||||
outputStream.write(",".getBytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void endElement(String uri, String localName, String name)
|
||||
throws SAXException {
|
||||
|
||||
|
@ -334,29 +390,30 @@ public class ConvertExcelToCSVProcessor
|
|||
nextIsString = false;
|
||||
}
|
||||
|
||||
if (name.equals(SAX_CELL_CONTENT_REF)) {
|
||||
if (firstColInRow) {
|
||||
if (name.equals(SAX_CELL_CONTENT_REF)
|
||||
// Limit scanning from the first column, and up to the last column.
|
||||
&& (firstCellAddress == null || firstCellAddress.col <= nextCellAddress.col)
|
||||
&& (firstRowLastCellAddress == null || nextCellAddress.col <= firstRowLastCellAddress.col)) {
|
||||
try {
|
||||
// A cell is found.
|
||||
fillEmptyColumns(nextCellAddress.col);
|
||||
firstColInRow = false;
|
||||
try {
|
||||
outputStream.write(currentContent.getBytes());
|
||||
// Keep previously found cell address.
|
||||
previousCellAddress = nextCellAddress;
|
||||
} catch (IOException e) {
|
||||
getLogger().error("IO error encountered while writing content of parsed cell " +
|
||||
"value from sheet {}", new Object[]{getSheetName()}, e);
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
outputStream.write(("," + currentContent).getBytes());
|
||||
} catch (IOException e) {
|
||||
getLogger().error("IO error encountered while writing content of parsed cell " +
|
||||
"value from sheet {}", new Object[]{getSheetName()}, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (name.equals(SAX_ROW_REF)) {
|
||||
//If this is the first row and the end of the row element has been encountered then that means no columns were present.
|
||||
if (!firstColInRow) {
|
||||
try {
|
||||
if (firstRowLastCellAddress != null) {
|
||||
fillEmptyColumns(firstRowLastCellAddress.col);
|
||||
}
|
||||
rowCount++;
|
||||
outputStream.write("\n".getBytes());
|
||||
} catch (IOException e) {
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
<title>ConvertExcelToCSVProcessor</title>
|
||||
<style>
|
||||
table {
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
table, th, td {
|
||||
border: 1px solid #ccc;
|
||||
}
|
||||
|
||||
td.r {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
td {
|
||||
width: 50px;
|
||||
padding: 5px;
|
||||
}
|
||||
</style>
|
||||
<link rel="stylesheet" href="../../../../../css/component-usage.css" type="text/css" />
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<h2>How it extracts CSV data from a sheet</h2>
|
||||
<p>
|
||||
ConvertExcelToCSVProcessor extracts CSV data with following rules:
|
||||
</p>
|
||||
<ul>
|
||||
<li>Find the fist cell which has a value in it (the FirstCell).</li>
|
||||
<li>Scan cells in the first row, starting from the FirstCell,
|
||||
until it reaches to a cell after which no cell with a value can not be found in the row (the FirstRowLastCell).</li>
|
||||
<li>Process the 2nd row and later, from the column of FirstCell to the column of FirstRowLastCell.</li>
|
||||
<li>If a row does not have any cell that has a value, then the row is ignored.</li>
|
||||
</ul>
|
||||
|
||||
<p>
|
||||
As an example, the sheet shown below will be:
|
||||
</p>
|
||||
|
||||
<table>
|
||||
<tbody>
|
||||
<tr><th>row </th><th>A</th><th>B</th><th>C</th><th>D</th><th>E</th><th>F</th><th>G</th></tr>
|
||||
<tr><td class="r"> 1</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 2</td><td> </td><td> </td><td>x</td><td>y</td><td>z</td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 3</td><td> </td><td> </td><td>1</td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 4</td><td>2</td><td> </td><td> </td><td>3</td><td> </td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 5</td><td> </td><td> </td><td> </td><td> </td><td>4</td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 6</td><td> </td><td> </td><td>5</td><td>6</td><td>7</td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 7</td><td> </td><td> </td><td> </td><td> </td><td> </td><td>8</td><td> </td></tr>
|
||||
<tr><td class="r"> 8</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 9</td><td> </td><td> </td><td> </td><td> </td><td>9</td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 10</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||
<tr><td class="r"> 11</td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td><td> </td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
converted to following CSV:
|
||||
</p>
|
||||
|
||||
<pre>
|
||||
x,y,z
|
||||
1,,
|
||||
,3,
|
||||
,,4
|
||||
5,6,7
|
||||
,,9
|
||||
</pre>
|
||||
|
||||
<ul>
|
||||
<li>C2(x) is the FirstCell, and E2(z) is the FirstRowLastCell.</li>
|
||||
<li>A4(2) is ignored because it is out of range. So is F7(8).</li>
|
||||
<li>Row 7 and 8 are ignored because those do not have a valid cell.</li>
|
||||
<li>It is important to have a header row as shown in the example to define data area,
|
||||
especially when a sheet includes empty cells.</li>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -16,9 +16,11 @@
|
|||
*/
|
||||
package org.apache.nifi.processors.poi;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.nifi.flowfile.attributes.CoreAttributes;
|
||||
|
@ -39,6 +41,16 @@ public class ConvertExcelToCSVProcessorTest {
|
|||
testRunner = TestRunners.newTestRunner(ConvertExcelToCSVProcessor.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testColToIndex() {
|
||||
assertEquals(Integer.valueOf(0), ConvertExcelToCSVProcessor.columnToIndex("A"));
|
||||
assertEquals(Integer.valueOf(1), ConvertExcelToCSVProcessor.columnToIndex("B"));
|
||||
assertEquals(Integer.valueOf(25), ConvertExcelToCSVProcessor.columnToIndex("Z"));
|
||||
assertEquals(Integer.valueOf(29), ConvertExcelToCSVProcessor.columnToIndex("AD"));
|
||||
assertEquals(Integer.valueOf(239), ConvertExcelToCSVProcessor.columnToIndex("IF"));
|
||||
assertEquals(Integer.valueOf(16383), ConvertExcelToCSVProcessor.columnToIndex("XFD"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultipleSheetsGeneratesMultipleFlowFiles() throws Exception {
|
||||
|
||||
|
@ -149,6 +161,30 @@ public class ConvertExcelToCSVProcessorTest {
|
|||
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates that a sheet contains blank cells can be converted to a CSV without missing columns.
|
||||
*
|
||||
* @throws Exception
|
||||
* Any exception thrown during execution.
|
||||
*/
|
||||
@Test
|
||||
public void testProcessASheetWithBlankCells() throws Exception {
|
||||
|
||||
testRunner.setProperty(ConvertExcelToCSVProcessor.DESIRED_SHEETS, "Sheet1");
|
||||
testRunner.enqueue(new File("src/test/resources/with-blank-cells.xlsx").toPath());
|
||||
testRunner.run();
|
||||
|
||||
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.SUCCESS, 1);
|
||||
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.ORIGINAL, 1);
|
||||
testRunner.assertTransferCount(ConvertExcelToCSVProcessor.FAILURE, 0);
|
||||
|
||||
MockFlowFile ff = testRunner.getFlowFilesForRelationship(ConvertExcelToCSVProcessor.SUCCESS).get(0);
|
||||
Long l = new Long(ff.getAttribute(ConvertExcelToCSVProcessor.ROW_NUM));
|
||||
assertTrue(l == 8l);
|
||||
ff.isContentEqual("test", StandardCharsets.UTF_8);
|
||||
ff.assertContentEquals(new File("src/test/resources/with-blank-cells.csv"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests for graceful handling and error messaging of unsupported .XLS files.
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
A,B,C,D
|
||||
A1,,,
|
||||
,B2,C2,
|
||||
,,C3,
|
||||
,,C4,D4
|
||||
A5,,C5,D5
|
||||
A6,B6,,D6
|
||||
A7,B7,C7,D7
|
|
Binary file not shown.
Loading…
Reference in New Issue