NIFI-1518 InferAvroSchema now has an option to set CSV delimiter

Reviewed (and amended based on acknowledgement in PR review) by Tony Kurc (tkurc@apache.org). This closes #235.
This commit is contained in:
Michal Klempa 2016-02-18 09:42:58 +01:00 committed by Tony Kurc
parent d3367a7dc3
commit 784f2a2c20
3 changed files with 431 additions and 3 deletions

View File

@ -20,6 +20,7 @@ package org.apache.nifi.processors.kite;
import org.apache.avro.Schema;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
@ -28,6 +29,9 @@ import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.ProcessContext;
@ -42,7 +46,6 @@ import org.kitesdk.data.spi.JsonUtil;
import org.kitesdk.data.spi.filesystem.CSVProperties;
import org.kitesdk.data.spi.filesystem.CSVUtil;
import java.io.InputStream;
import java.io.IOException;
import java.io.InputStreamReader;
@ -81,7 +84,22 @@ import java.util.concurrent.atomic.AtomicReference;
public class InferAvroSchema
extends AbstractKiteProcessor {
public static final String CSV_DELIMITER = ",";
/**
 * Validator that accepts a property value only when it denotes exactly one non-null character.
 * The raw value is first passed through {@code unescapeString}, so an escaped form such as
 * {@code \t} or {@code \f} is judged by the single character it unescapes to.
 */
private static final Validator CHAR_VALIDATOR = new Validator() {
    @Override
    public ValidationResult validate(String subject, String input, ValidationContext context) {
        // Resolve any escape sequence before measuring the length of the value.
        final String unescaped = unescapeString(input);
        final boolean singleNonNullChar = unescaped.length() == 1 && unescaped.charAt(0) != 0;

        // The result reports the unescaped value, not the text that was originally entered.
        final ValidationResult.Builder result = new ValidationResult.Builder();
        result.subject(subject);
        result.input(unescaped);
        result.explanation("Only non-null single characters are supported");
        result.valid(singleNonNullChar);
        return result.build();
    }
};
public static final String USE_MIME_TYPE = "use mime.type value";
public static final String JSON_CONTENT = "json";
public static final String CSV_CONTENT = "csv";
@ -154,6 +172,13 @@ public class InferAvroSchema
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.build();
/**
 * Processor property "CSV delimiter": the delimiter character for CSV records.
 * Defaults to a comma. Values are checked by {@code CHAR_VALIDATOR}, which unescapes
 * the value before requiring it to be a single non-null character.
 */
public static final PropertyDescriptor DELIMITER = new PropertyDescriptor.Builder()
.name("CSV delimiter")
.description("Delimiter character for CSV records")
.addValidator(CHAR_VALIDATOR)
.defaultValue(",")
.build();
public static final PropertyDescriptor ESCAPE_STRING = new PropertyDescriptor.Builder()
.name("CSV Escape String")
.description("This property only applies to CSV content type. String that represents an escape sequence" +
@ -234,6 +259,7 @@ public class InferAvroSchema
properties.add(CSV_HEADER_DEFINITION);
properties.add(GET_CSV_HEADER_DEFINITION_FROM_INPUT);
properties.add(HEADER_LINE_SKIP_COUNT);
properties.add(DELIMITER);
properties.add(ESCAPE_STRING);
properties.add(QUOTE_STRING);
properties.add(PRETTY_AVRO_OUTPUT);
@ -366,7 +392,7 @@ public class InferAvroSchema
//Prepares the CSVProperties for kite
final CSVProperties props = new CSVProperties.Builder()
.delimiter(CSV_DELIMITER)
.delimiter(context.getProperty(DELIMITER).getValue())
.escape(context.getProperty(ESCAPE_STRING).evaluateAttributeExpressions().getValue())
.quote(context.getProperty(QUOTE_STRING).evaluateAttributeExpressions().getValue())
.header(header.get())
@ -457,4 +483,11 @@ public class InferAvroSchema
return avroSchema;
}
/**
 * Converts Java-style escape sequences in the given text into the characters they denote.
 * Text of length zero or one is returned unchanged, since only longer text is passed
 * to the unescaper.
 *
 * @param input the raw text; must not be {@code null}
 * @return the unescaped text, or {@code input} itself when it is at most one character long
 */
private static String unescapeString(String input) {
    // Short values are handed back untouched rather than run through the unescaper.
    if (input.length() <= 1) {
        return input;
    }
    return StringEscapeUtils.unescapeJava(input);
}
}

View File

@ -173,4 +173,47 @@ public class TestInferAvroSchema {
runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0);
runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0);
}
/**
 * Runs the processor over a tab-delimited CSV file with the delimiter property set to the
 * escaped tab sequence, and expects one flow file on each of the original and success
 * relationships, with the success content matching the reference Avro schema file.
 */
@Test
public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFile() throws Exception {
    // The delimiter is supplied in its escaped, two-character form (backslash followed by 't').
    runner.setProperty(InferAvroSchema.DELIMITER, "\\t");
    runner.assertValid();

    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
    final File tabDelimitedInput = new File("src/test/resources/Shapes_Header_TabDelimited.csv");
    runner.enqueue(tabDelimitedInput.toPath(), flowFileAttributes);

    runner.run();

    // Nothing may be routed to unsupported-content or failure.
    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 0);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 1);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 1);

    final MockFlowFile inferred = runner.getFlowFilesForRelationship(InferAvroSchema.REL_SUCCESS).get(0);
    final File expectedSchema = new File("src/test/resources/Shapes_header.csv.avro");
    inferred.assertContentEquals(expectedSchema.toPath());
    inferred.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "application/avro-binary");
}
/**
 * Negative counterpart of the tab-delimited test: the delimiter property is set to a
 * semicolon, which does not match the tab-delimited input, and the unchanged input flow
 * file is expected on the failure relationship.
 */
@Test
public void inferAvroSchemaFromHeaderDefinitionOfCSVTabDelimitedFileNegativeTest() throws Exception {
    // Improper InferAvroSchema.DELIMITER: the incoming flow file must be routed to InferAvroSchema.REL_FAILURE.
    runner.setProperty(InferAvroSchema.DELIMITER, ";");
    runner.assertValid();

    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put(CoreAttributes.MIME_TYPE.key(), "text/csv");
    final File tabDelimitedInput = new File("src/test/resources/Shapes_Header_TabDelimited.csv");
    runner.enqueue(tabDelimitedInput.toPath(), flowFileAttributes);

    runner.run();

    // Only the failure relationship may receive a flow file.
    runner.assertTransferCount(InferAvroSchema.REL_UNSUPPORTED_CONTENT, 0);
    runner.assertTransferCount(InferAvroSchema.REL_FAILURE, 1);
    runner.assertTransferCount(InferAvroSchema.REL_ORIGINAL, 0);
    runner.assertTransferCount(InferAvroSchema.REL_SUCCESS, 0);

    // The failed flow file keeps both its content and its MIME type.
    final MockFlowFile rejected = runner.getFlowFilesForRelationship(InferAvroSchema.REL_FAILURE).get(0);
    rejected.assertContentEquals(tabDelimitedInput.toPath());
    rejected.assertAttributeEquals(CoreAttributes.MIME_TYPE.key(), "text/csv");
}
}

View File

@ -0,0 +1,352 @@
shape color width height
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
circle red 100 100
square red 100 100
sphere red 100 100
triangle red 100 100
rectangle red 100 100
circle red 100 100
sphere red 100 100
circle red 100 100
circle red 100 100
triangle red 100 100
cone red 100 100
circle red 100 100
rectangle red 100 100
1 shape color width height
2 circle red 100 100
3 square red 100 100
4 sphere red 100 100
5 triangle red 100 100
6 rectangle red 100 100
7 circle red 100 100
8 sphere red 100 100
9 circle red 100 100
10 circle red 100 100
11 triangle red 100 100
12 cone red 100 100
13 circle red 100 100
14 rectangle red 100 100
15 circle red 100 100
16 square red 100 100
17 sphere red 100 100
18 triangle red 100 100
19 rectangle red 100 100
20 circle red 100 100
21 sphere red 100 100
22 circle red 100 100
23 circle red 100 100
24 triangle red 100 100
25 cone red 100 100
26 circle red 100 100
27 rectangle red 100 100
28 circle red 100 100
29 square red 100 100
30 sphere red 100 100
31 triangle red 100 100
32 rectangle red 100 100
33 circle red 100 100
34 sphere red 100 100
35 circle red 100 100
36 circle red 100 100
37 triangle red 100 100
38 cone red 100 100
39 circle red 100 100
40 rectangle red 100 100
41 circle red 100 100
42 square red 100 100
43 sphere red 100 100
44 triangle red 100 100
45 rectangle red 100 100
46 circle red 100 100
47 sphere red 100 100
48 circle red 100 100
49 circle red 100 100
50 triangle red 100 100
51 cone red 100 100
52 circle red 100 100
53 rectangle red 100 100
54 circle red 100 100
55 square red 100 100
56 sphere red 100 100
57 triangle red 100 100
58 rectangle red 100 100
59 circle red 100 100
60 sphere red 100 100
61 circle red 100 100
62 circle red 100 100
63 triangle red 100 100
64 cone red 100 100
65 circle red 100 100
66 rectangle red 100 100
67 circle red 100 100
68 square red 100 100
69 sphere red 100 100
70 triangle red 100 100
71 rectangle red 100 100
72 circle red 100 100
73 sphere red 100 100
74 circle red 100 100
75 circle red 100 100
76 triangle red 100 100
77 cone red 100 100
78 circle red 100 100
79 rectangle red 100 100
80 circle red 100 100
81 square red 100 100
82 sphere red 100 100
83 triangle red 100 100
84 rectangle red 100 100
85 circle red 100 100
86 sphere red 100 100
87 circle red 100 100
88 circle red 100 100
89 triangle red 100 100
90 cone red 100 100
91 circle red 100 100
92 rectangle red 100 100
93 circle red 100 100
94 square red 100 100
95 sphere red 100 100
96 triangle red 100 100
97 rectangle red 100 100
98 circle red 100 100
99 sphere red 100 100
100 circle red 100 100
101 circle red 100 100
102 triangle red 100 100
103 cone red 100 100
104 circle red 100 100
105 rectangle red 100 100
106 circle red 100 100
107 square red 100 100
108 sphere red 100 100
109 triangle red 100 100
110 rectangle red 100 100
111 circle red 100 100
112 sphere red 100 100
113 circle red 100 100
114 circle red 100 100
115 triangle red 100 100
116 cone red 100 100
117 circle red 100 100
118 rectangle red 100 100
119 circle red 100 100
120 square red 100 100
121 sphere red 100 100
122 triangle red 100 100
123 rectangle red 100 100
124 circle red 100 100
125 sphere red 100 100
126 circle red 100 100
127 circle red 100 100
128 triangle red 100 100
129 cone red 100 100
130 circle red 100 100
131 rectangle red 100 100
132 circle red 100 100
133 square red 100 100
134 sphere red 100 100
135 triangle red 100 100
136 rectangle red 100 100
137 circle red 100 100
138 sphere red 100 100
139 circle red 100 100
140 circle red 100 100
141 triangle red 100 100
142 cone red 100 100
143 circle red 100 100
144 rectangle red 100 100
145 circle red 100 100
146 square red 100 100
147 sphere red 100 100
148 triangle red 100 100
149 rectangle red 100 100
150 circle red 100 100
151 sphere red 100 100
152 circle red 100 100
153 circle red 100 100
154 triangle red 100 100
155 cone red 100 100
156 circle red 100 100
157 rectangle red 100 100
158 circle red 100 100
159 square red 100 100
160 sphere red 100 100
161 triangle red 100 100
162 rectangle red 100 100
163 circle red 100 100
164 sphere red 100 100
165 circle red 100 100
166 circle red 100 100
167 triangle red 100 100
168 cone red 100 100
169 circle red 100 100
170 rectangle red 100 100
171 circle red 100 100
172 square red 100 100
173 sphere red 100 100
174 triangle red 100 100
175 rectangle red 100 100
176 circle red 100 100
177 sphere red 100 100
178 circle red 100 100
179 circle red 100 100
180 triangle red 100 100
181 cone red 100 100
182 circle red 100 100
183 rectangle red 100 100
184 circle red 100 100
185 square red 100 100
186 sphere red 100 100
187 triangle red 100 100
188 rectangle red 100 100
189 circle red 100 100
190 sphere red 100 100
191 circle red 100 100
192 circle red 100 100
193 triangle red 100 100
194 cone red 100 100
195 circle red 100 100
196 rectangle red 100 100
197 circle red 100 100
198 square red 100 100
199 sphere red 100 100
200 triangle red 100 100
201 rectangle red 100 100
202 circle red 100 100
203 sphere red 100 100
204 circle red 100 100
205 circle red 100 100
206 triangle red 100 100
207 cone red 100 100
208 circle red 100 100
209 rectangle red 100 100
210 circle red 100 100
211 square red 100 100
212 sphere red 100 100
213 triangle red 100 100
214 rectangle red 100 100
215 circle red 100 100
216 sphere red 100 100
217 circle red 100 100
218 circle red 100 100
219 triangle red 100 100
220 cone red 100 100
221 circle red 100 100
222 rectangle red 100 100
223 circle red 100 100
224 square red 100 100
225 sphere red 100 100
226 triangle red 100 100
227 rectangle red 100 100
228 circle red 100 100
229 sphere red 100 100
230 circle red 100 100
231 circle red 100 100
232 triangle red 100 100
233 cone red 100 100
234 circle red 100 100
235 rectangle red 100 100
236 circle red 100 100
237 square red 100 100
238 sphere red 100 100
239 triangle red 100 100
240 rectangle red 100 100
241 circle red 100 100
242 sphere red 100 100
243 circle red 100 100
244 circle red 100 100
245 triangle red 100 100
246 cone red 100 100
247 circle red 100 100
248 rectangle red 100 100
249 circle red 100 100
250 square red 100 100
251 sphere red 100 100
252 triangle red 100 100
253 rectangle red 100 100
254 circle red 100 100
255 sphere red 100 100
256 circle red 100 100
257 circle red 100 100
258 triangle red 100 100
259 cone red 100 100
260 circle red 100 100
261 rectangle red 100 100
262 circle red 100 100
263 square red 100 100
264 sphere red 100 100
265 triangle red 100 100
266 rectangle red 100 100
267 circle red 100 100
268 sphere red 100 100
269 circle red 100 100
270 circle red 100 100
271 triangle red 100 100
272 cone red 100 100
273 circle red 100 100
274 rectangle red 100 100
275 circle red 100 100
276 square red 100 100
277 sphere red 100 100
278 triangle red 100 100
279 rectangle red 100 100
280 circle red 100 100
281 sphere red 100 100
282 circle red 100 100
283 circle red 100 100
284 triangle red 100 100
285 cone red 100 100
286 circle red 100 100
287 rectangle red 100 100
288 circle red 100 100
289 square red 100 100
290 sphere red 100 100
291 triangle red 100 100
292 rectangle red 100 100
293 circle red 100 100
294 sphere red 100 100
295 circle red 100 100
296 circle red 100 100
297 triangle red 100 100
298 cone red 100 100
299 circle red 100 100
300 rectangle red 100 100
301 circle red 100 100
302 square red 100 100
303 sphere red 100 100
304 triangle red 100 100
305 rectangle red 100 100
306 circle red 100 100
307 sphere red 100 100
308 circle red 100 100
309 circle red 100 100
310 triangle red 100 100
311 cone red 100 100
312 circle red 100 100
313 rectangle red 100 100
314 circle red 100 100
315 square red 100 100
316 sphere red 100 100
317 triangle red 100 100
318 rectangle red 100 100
319 circle red 100 100
320 sphere red 100 100
321 circle red 100 100
322 circle red 100 100
323 triangle red 100 100
324 cone red 100 100
325 circle red 100 100
326 rectangle red 100 100
327 circle red 100 100
328 square red 100 100
329 sphere red 100 100
330 triangle red 100 100
331 rectangle red 100 100
332 circle red 100 100
333 sphere red 100 100
334 circle red 100 100
335 circle red 100 100
336 triangle red 100 100
337 cone red 100 100
338 circle red 100 100
339 rectangle red 100 100
340 circle red 100 100
341 square red 100 100
342 sphere red 100 100
343 triangle red 100 100
344 rectangle red 100 100
345 circle red 100 100
346 sphere red 100 100
347 circle red 100 100
348 circle red 100 100
349 triangle red 100 100
350 cone red 100 100
351 circle red 100 100
352 rectangle red 100 100