NIFI-11183 Removed Hive 1.x components

This closes #6957

Signed-off-by: David Handermann <exceptionfactory@apache.org>
Authored by Pierre Villard on 2023-02-14 21:21:20 -05:00; committed by exceptionfactory
Parent: 46f89e3226
Commit: aae6bafc6c
GPG Key ID: 29B6A52D2AAE8DBA
72 changed files with 0 additions and 21448 deletions


@@ -1036,55 +1036,6 @@ language governing permissions and limitations under the License. -->
</dependency>
</dependencies>
</profile>
<profile>
<id>include-hive</id>
<activation>
<activeByDefault>false</activeByDefault>
<property>
<name>allProfiles</name>
</property>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-nar</artifactId>
<version>2.0.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-services-api-nar</artifactId>
<version>2.0.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
</dependencies>
</profile>
<profile>
<id>include-hive1_1</id>
<!-- This profile handles the inclusion of Hive 1.1.x artifacts. The NAR
is quite large and makes the resultant binary distribution significantly
larger (150+ MB). -->
<activation>
<activeByDefault>false</activeByDefault>
<property>
<name>allProfiles</name>
</property>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive_1_1-nar</artifactId>
<version>2.0.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-services-api-nar</artifactId>
<version>2.0.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
</dependencies>
</profile>
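<!-- Clarifying note (not part of the original pom): the two Hive 1.x profiles above were opt-in.
With <activeByDefault>false</activeByDefault> plus a property trigger, they were enabled either
explicitly (e.g. mvn clean install -P include-hive,include-hive1_1) or by defining the property
(mvn clean install -DallProfiles), which pulled the listed Hive NARs into the binary distribution. -->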
<profile>
<id>include-hive3</id>
<!-- This profile handles the inclusion of Hive 3 artifacts. The NAR


@@ -1,47 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-bundle</artifactId>
<version>2.0.0-SNAPSHOT</version>
</parent>
<artifactId>nifi-hive-nar</artifactId>
<packaging>nar</packaging>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
<source.skip>true</source.skip>
<!-- Need to override hadoop.version here, for Hive and hadoop-client transitive dependencies -->
<hadoop.version>${hive.hadoop.version}</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-services-api-nar</artifactId>
<version>2.0.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
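<!-- Clarifying note (not part of the original pom): a dependency of <type>nar</type> expresses NAR
inheritance in NiFi; nifi-hive-services-api-nar became the parent ClassLoader of this NAR, so the
Hive controller-service API classes were shared at runtime rather than bundled again here. -->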
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-processors</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
</dependencies>
</project>


@@ -1,329 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
APACHE NIFI SUBCOMPONENTS:
The Apache NiFi project contains subcomponents with separate copyright
notices and license terms. Your use of the source code for the these
subcomponents is subject to the terms and conditions of the following
licenses.
The binary distribution of this product bundles 'Bouncy Castle JDK 1.5'
under an MIT style license.
Copyright (c) 2000 - 2015 The Legion of the Bouncy Castle Inc. (http://www.bouncycastle.org)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
The binary distribution of this product includes modules from Groovy which bundles ANTLR
SOFTWARE RIGHTS
ANTLR 1989-2006 Developed by Terence Parr
Partially supported by University of San Francisco & jGuru.com
We reserve no legal rights to the ANTLR--it is fully in the
public domain. An individual or company may do whatever
they wish with source code distributed with ANTLR or the
code generated by ANTLR, including the incorporation of
ANTLR, or its output, into commerical software.
We encourage users to develop software with ANTLR. However,
we do ask that credit is given to us for developing
ANTLR. By "credit", we mean that if you use ANTLR or
incorporate any source code into one of your programs
(commercial product, research project, or otherwise) that
you acknowledge this fact somewhere in the documentation,
research report, etc... If you like ANTLR and have
developed a nice tool with the output, please mention that
you developed it using ANTLR. In addition, we ask that the
headers remain intact in our source code. As long as these
guidelines are kept, we expect to continue enhancing this
system and expect to make other tools available as they are
completed.
The primary ANTLR guy:
Terence Parr
parrt@cs.usfca.edu
parrt@antlr.org
The binary distribution of this product includes modules from Groovy which bundles ASM
/***
* http://asm.objectweb.org/
*
* ASM: a very small and fast Java bytecode manipulation framework
* Copyright (c) 2000-2005 INRIA, France Telecom
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
The binary distribution of this product includes modules from Groovy which bundles source from JSR-223
The following notice applies to the files:
src/main/org/codehaus/groovy/jsr223/GroovyCompiledScript.java
src/main/org/codehaus/groovy/jsr223/GroovyScriptEngineFactory.java
src/main/org/codehaus/groovy/jsr223/GroovyScriptEngineImpl.java
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Redistribution and use in source and binary forms, with or without modification, are
* permitted provided that the following conditions are met: Redistributions of source code
* must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the Sun Microsystems nor the names of
* is contributors may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/


@@ -1,348 +0,0 @@
nifi-hive-nar
Copyright 2014-2023 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
This includes derived works from the Apache Storm (ASLv2 licensed) project (https://github.com/apache/storm):
Copyright 2015 The Apache Software Foundation
The derived work is adapted from
org/apache/storm/hive/common/HiveWriter.java
org/apache/storm/hive/common/HiveOptions.java
and can be found in the org.apache.nifi.util.hive package
This includes derived works from the Apache Hive (ASLv2 licensed) project (https://github.com/apache/hive):
Copyright 2008-2016 The Apache Software Foundation
The derived work is adapted from
release-1.2.1/ql/src/java/org/apache/hadoop/hive/ql/io/orc/WriterImpl.java
and can be found in the org.apache.hadoop.hive.ql.io.orc package
===========================================
Apache Software License v2
===========================================
The following binary components are provided under the Apache Software License v2
(ASLv2) Apache Ant
The following NOTICE information applies:
Apache Ant
Copyright 1999-2016 The Apache Software Foundation
(ASLv2) Apache Commons Codec
The following NOTICE information applies:
Apache Commons Codec
Copyright 2002-2014 The Apache Software Foundation
src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java
contains test data from http://aspell.net/test/orig/batch0.tab.
Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org)
===============================================================================
The content of package org.apache.commons.codec.language.bm has been translated
from the original php source code available at http://stevemorse.org/phoneticinfo.htm
with permission from the original authors.
Original source copyright:
Copyright (c) 2008 Alexander Beider & Stephen P. Morse.
(ASLv2) Apache Commons DBCP
The following NOTICE information applies:
Apache Commons DBCP
Copyright 2001-2015 The Apache Software Foundation.
(ASLv2) Apache Commons EL
The following NOTICE information applies:
Apache Commons EL
Copyright 1999-2016 The Apache Software Foundation
EL-8 patch - Copyright 2004-2007 Jamie Taylor
http://issues.apache.org/jira/browse/EL-8
(ASLv2) Apache HttpComponents
The following NOTICE information applies:
Apache HttpComponents Client
Copyright 1999-2016 The Apache Software Foundation
Apache HttpComponents Core - HttpCore
Copyright 2006-2009 The Apache Software Foundation
(ASLv2) Apache Commons Pool
The following NOTICE information applies:
Apache Commons Pool
Copyright 1999-2009 The Apache Software Foundation.
(ASLv2) Apache Commons IO
The following NOTICE information applies:
Apache Commons IO
Copyright 2002-2016 The Apache Software Foundation
(ASLv2) Apache Hive
The following NOTICE information applies:
Apache Hive
Copyright 2008-2015 The Apache Software Foundation
This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).
This product includes Jersey (https://jersey.java.net/)
Copyright (c) 2010-2014 Oracle and/or its affiliates.
This project includes software copyrighted by Microsoft Corporation and
licensed under the Apache License, Version 2.0.
This project includes software copyrighted by Dell SecureWorks and
licensed under the Apache License, Version 2.0.
(ASLv2) Jackson JSON processor
The following NOTICE information applies:
# Jackson JSON processor
Jackson is a high-performance, Free/Open Source JSON processing library.
It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has
been in development since 2007.
It is currently developed by a community of developers, as well as supported
commercially by FasterXML.com.
## Licensing
Jackson core and extension components may licensed under different licenses.
To find the details that apply to this artifact see the accompanying LICENSE file.
For more information, including possible other licensing options, contact
FasterXML.com (http://fasterxml.com).
## Credits
A list of contributors may be found from CREDITS file, which is included
in some artifacts (usually source distributions); but is always available
from the source code management (SCM) system project uses.
(ASLv2) BoneCP
The following NOTICE information applies:
BoneCP
Copyright 2010 Wallace Wadge
(ASLv2) Apache Hadoop
The following NOTICE information applies:
The binary distribution of this product bundles binaries of
org.iq80.leveldb:leveldb-api (https://github.com/dain/leveldb), which has the
following notices:
* Copyright 2011 Dain Sundstrom <dain@iq80.com>
* Copyright 2011 FuseSource Corp. http://fusesource.com
The binary distribution of this product bundles binaries of
org.fusesource.hawtjni:hawtjni-runtime (https://github.com/fusesource/hawtjni),
which has the following notices:
* This product includes software developed by FuseSource Corp.
http://fusesource.com
* This product includes software developed at
Progress Software Corporation and/or its subsidiaries or affiliates.
* This product includes software developed by IBM Corporation and others.
(ASLv2) Apache HBase
The following NOTICE information applies:
Apache HBase
Copyright 2007-2015 The Apache Software Foundation
--
This product incorporates portions of the 'Hadoop' project
Copyright 2007-2009 The Apache Software Foundation
Licensed under the Apache License v2.0
--
Our Orca logo we got here: http://www.vectorfree.com/jumping-orca
It is licensed Creative Commons Attribution 3.0.
See https://creativecommons.org/licenses/by/3.0/us/
We changed the logo by stripping the colored background, inverting
it and then rotating it some.
Later we found that vectorfree.com image is not properly licensed.
The original is owned by vectorportal.com. The original was
relicensed so we could use it as Creative Commons Attribution 3.0.
The license is bundled with the download available here:
http://www.vectorportal.com/subcategory/205/KILLER-WHALE-FREE-VECTOR.eps/ifile/9136/detailtest.asp
--
This product includes portions of the Bootstrap project v3.0.0
Copyright 2013 Twitter, Inc.
Licensed under the Apache License v2.0
This product uses the Glyphicons Halflings icon set.
http://glyphicons.com/
Copyright Jan Kovařík
Licensed under the Apache License v2.0 as a part of the Bootstrap project.
--
This product includes portions of the Guava project v14, specifically
'hbase-common/src/main/java/org/apache/hadoop/hbase/io/LimitInputStream.java'
Copyright (C) 2007 The Guava Authors
Licensed under the Apache License, Version 2.0
(ASLv2) Apache Commons Lang
The following NOTICE information applies:
Apache Commons Lang
Copyright 2001-2015 The Apache Software Foundation
(ASLv2) Apache Curator
The following NOTICE information applies:
Apache Curator
Copyright 2013-2014 The Apache Software Foundation
(ASLv2) Apache Derby
The following NOTICE information applies:
Apache Derby
Copyright 2004-2014 Apache, Apache DB, Apache Derby, Apache Torque, Apache JDO, Apache DDLUtils,
the Derby hat logo, the Apache JDO logo, and the Apache feather logo are trademarks of The Apache Software Foundation.
(ASLv2) Apache DS
The following NOTICE information applies:
ApacheDS
Copyright 2003-2015 The Apache Software Foundation
(ASLv2) Apache Geronimo
The following NOTICE information applies:
Apache Geronimo
Copyright 2003-2008 The Apache Software Foundation
(ASLv2) HTrace Core
The following NOTICE information applies:
In addition, this product includes software dependencies. See
the accompanying LICENSE.txt for a listing of dependencies
that are NOT Apache licensed (with pointers to their licensing)
Apache HTrace includes an Apache Thrift connector to Zipkin. Zipkin
is a distributed tracing system that is Apache 2.0 Licensed.
Copyright 2012 Twitter, Inc.
(ASLv2) Jettison
The following NOTICE information applies:
Copyright 2006 Envoi Solutions LLC
(ASLv2) Jetty
The following NOTICE information applies:
Jetty Web Container
Copyright 1995-2019 Mort Bay Consulting Pty Ltd.
(ASLv2) Apache log4j
The following NOTICE information applies:
Apache log4j
Copyright 2007 The Apache Software Foundation
(ASLv2) Parquet MR
The following NOTICE information applies:
Parquet MR
Copyright 2012 Twitter, Inc.
This project includes code from https://github.com/lemire/JavaFastPFOR
parquet-column/src/main/java/parquet/column/values/bitpacking/LemireBitPacking.java
Apache License Version 2.0 http://www.apache.org/licenses/.
(c) Daniel Lemire, http://lemire.me/en/
(ASLv2) Apache Thrift
The following NOTICE information applies:
Apache Thrift
Copyright 2006-2010 The Apache Software Foundation.
(ASLv2) Apache Twill
The following NOTICE information applies:
Apache Twill
Copyright 2013-2016 The Apache Software Foundation
(ASLv2) Dropwizard Metrics
The following NOTICE information applies:
Metrics
Copyright 2010-2013 Coda Hale and Yammer, Inc.
This product includes code derived from the JSR-166 project (ThreadLocalRandom, Striped64,
LongAdder), which was released with the following comments:
Written by Doug Lea with assistance from members of JCP JSR-166
Expert Group and released to the public domain, as explained at
http://creativecommons.org/publicdomain/zero/1.0/
(ASLv2) Joda Time
The following NOTICE information applies:
This product includes software developed by
Joda.org (http://www.joda.org/).
(ASLv2) The Netty Project
The following NOTICE information applies:
The Netty Project
Copyright 2011 The Netty Project
(ASLv2) Apache Tomcat
The following NOTICE information applies:
Apache Tomcat
Copyright 2007 The Apache Software Foundation
Java Management Extensions (JMX) support is provided by
the MX4J package, which is open source software. The
original software and related information is available
at http://mx4j.sourceforge.net.
Java compilation software for JSP pages is provided by Eclipse,
which is open source software. The orginal software and
related infomation is available at
http://www.eclipse.org.
(ASLv2) Apache ZooKeeper
The following NOTICE information applies:
Apache ZooKeeper
Copyright 2009-2012 The Apache Software Foundation
(ASLv2) Google GSON
The following NOTICE information applies:
Copyright 2008 Google Inc.
(ASLv2) Groovy (org.codehaus.groovy:groovy-all:jar:2.1.6 - http://www.groovy-lang.org)
The following NOTICE information applies:
Groovy Language
Copyright 2003-2012 The respective authors and developers
Developers and Contributors are listed in the project POM file
and Gradle build file
This product includes software developed by
The Groovy community (http://groovy.codehaus.org/).
(ASLv2) JPam
The following NOTICE information applies:
Copyright 2003-2006 Greg Luck
************************
Common Development and Distribution License 1.1
************************
The following binary components are provided under the Common Development and Distribution License 1.1. See project link for details.
(CDDL 1.1) (GPL2 w/ CPE) jersey-client (com.sun.jersey:jersey-client:jar:1.9 - https://jersey.java.net)
(CDDL 1.1) (GPL2 w/ CPE) jersey-core (com.sun.jersey:jersey-core:jar:1.9 - https://jersey.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-json (com.sun.jersey:jersey-json:jar:1.9 - https://jersey.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-server (com.sun.jersey:jersey-server:jar:1.9 - https://jersey.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-guice (com.sun.jersey.contribs:jersey-guice:jar:1.9 - https://jersey.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) Java Architecture For XML Binding (javax.xml.bind:jaxb-api:jar:2.2.2 - https://jaxb.dev.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) JavaMail API (compat) (javax.mail:mail:jar:1.4.7 - http://kenai.com/projects/javamail/mail)
************************
Common Development and Distribution License 1.0
************************
The following binary components are provided under the Common Development and Distribution License 1.0. See project link for details.
(CDDL 1.0) JavaServlet(TM) Specification (javax.servlet:servlet-api:jar:2.5 - no url available)
(CDDL 1.0) (GPL3) Streaming API For XML (javax.xml.stream:stax-api:jar:1.0-2 - no url provided)
(CDDL 1.0) JavaBeans Activation Framework (JAF) (javax.activation:activation:jar:1.1 - http://java.sun.com/products/javabeans/jaf/index.jsp)
(CDDL 1.0) JavaServer Pages(TM) API (javax.servlet.jsp:jsp-api:jar:2.1 - http://jsp.java.net)
*****************
Public Domain
*****************
The following binary components are provided to the 'Public Domain'. See project link for details.
(Public Domain) AOP Alliance 1.0 (http://aopalliance.sourceforge.net/)


@@ -1,225 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-bundle</artifactId>
<version>2.0.0-SNAPSHOT</version>
</parent>
<artifactId>nifi-hive-processors</artifactId>
<packaging>jar</packaging>
<properties>
<!-- Need to override hadoop.version here, for Hive and hadoop-client transitive dependencies -->
<hadoop.version>${hive12.hadoop.version}</hadoop.version>
</properties>
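<!-- Clarifying note (not part of the original pom): hive12.version and hive12.hadoop.version are
assumed to be defined in the parent nifi-hive-bundle pom; overriding hadoop.version here pinned the
hadoop-client and Hive transitive dependencies below to the Hadoop release matching Hive 1.2.x. -->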
<dependencies>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-api</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-put-pattern</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-security-kerberos</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-dbcp-service-api</artifactId>
<version>2.0.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-services-api</artifactId>
<version>2.0.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-kerberos-credentials-service-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive12.version}</version>
<exclusions>
<exclusion>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>apache-log4j-extras</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hive.hcatalog</groupId>
<artifactId>hive-hcatalog-streaming</artifactId>
<version>${hive12.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>apache-log4j-extras</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Override groovy-all:2.1.6 from Hive -->
<dependency>
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
<version>2.4.21</version>
</dependency>
<dependency>
<groupId>org.apache.hive.hcatalog</groupId>
<artifactId>hive-hcatalog-core</artifactId>
<version>${hive12.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>com.google.code.findbugs</groupId>
<artifactId>jsr305</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hadoop-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hadoop-record-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-record-serialization-service-api</artifactId>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-record</artifactId>
</dependency>
<dependency>
<groupId>com.github.stephenc.findbugs</groupId>
<artifactId>findbugs-annotations</artifactId>
<version>1.3.9-1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-dbcp2</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>log4j-over-slf4j</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-mock</artifactId>
<version>2.0.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-mock-record-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
</dependency>
</dependencies>
</project>


@@ -1,612 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.util.Utf8;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.nifi.serialization.record.DataType;
import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordFieldType;
import org.apache.nifi.serialization.record.type.ArrayDataType;
import org.apache.nifi.serialization.record.type.ChoiceDataType;
import org.apache.nifi.serialization.record.type.MapDataType;
import org.apache.nifi.serialization.record.type.RecordDataType;
import java.io.IOException;
import java.io.OutputStream;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BLOCK_PADDING;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BLOCK_SIZE;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_WRITE_FORMAT;
/**
* Utility methods for ORC support (e.g. conversion from Avro, conversion to Hive types).
*/
public class NiFiOrcUtils {
public static Object convertToORCObject(TypeInfo typeInfo, Object o) {
if (o != null) {
if (typeInfo instanceof UnionTypeInfo) {
OrcUnion union = new OrcUnion();
// Avro uses Utf8 and GenericData.EnumSymbol objects instead of Strings. This is handled in other places in the method, but here
// we need to determine the union types from the objects, so choose String.class if the object is one of those Avro classes
Class clazzToCompareTo = o.getClass();
if (o instanceof org.apache.avro.util.Utf8 || o instanceof GenericData.EnumSymbol) {
clazzToCompareTo = String.class;
}
// Need to find which of the union types correspond to the primitive object
TypeInfo objectTypeInfo = TypeInfoUtils.getTypeInfoFromObjectInspector(
ObjectInspectorFactory.getReflectionObjectInspector(clazzToCompareTo, ObjectInspectorFactory.ObjectInspectorOptions.JAVA));
List<TypeInfo> unionTypeInfos = ((UnionTypeInfo) typeInfo).getAllUnionObjectTypeInfos();
int index = 0;
while (index < unionTypeInfos.size() && !unionTypeInfos.get(index).equals(objectTypeInfo)) {
index++;
}
if (index < unionTypeInfos.size()) {
union.set((byte) index, convertToORCObject(objectTypeInfo, o));
} else {
throw new IllegalArgumentException("Object Type for class " + o.getClass().getName() + " not in Union declaration");
}
return union;
}
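// Clarifying note (not part of the original source): the branches below wrap plain Java values in the
// Hadoop Writable types expected by the ORC writer (IntWritable, Text, HiveDecimalWritable, ...),
// recursing through convertToORCObject for arrays, maps and nested records.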
if (o instanceof Integer) {
return new IntWritable((int) o);
}
if (o instanceof Boolean) {
return new BooleanWritable((boolean) o);
}
if (o instanceof Long) {
return new LongWritable((long) o);
}
if (o instanceof Float) {
return new FloatWritable((float) o);
}
if (o instanceof Double) {
return new DoubleWritable((double) o);
}
if (o instanceof BigDecimal) {
return new HiveDecimalWritable(HiveDecimal.create((BigDecimal) o));
}
if (o instanceof String || o instanceof Utf8 || o instanceof GenericData.EnumSymbol) {
return new Text(o.toString());
}
if (o instanceof ByteBuffer && typeInfo instanceof DecimalTypeInfo) {
ByteBuffer buffer = (ByteBuffer) o;
return new HiveDecimalWritable(buffer.array(), ((DecimalTypeInfo) typeInfo).scale());
}
if (o instanceof ByteBuffer) {
return new BytesWritable(((ByteBuffer) o).array());
}
if (o instanceof int[]) {
int[] intArray = (int[]) o;
return Arrays.stream(intArray)
.mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("int"), element))
.collect(Collectors.toList());
}
if (o instanceof long[]) {
long[] longArray = (long[]) o;
return Arrays.stream(longArray)
.mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("bigint"), element))
.collect(Collectors.toList());
}
if (o instanceof float[]) {
float[] floatArray = (float[]) o;
return IntStream.range(0, floatArray.length)
.mapToDouble(i -> floatArray[i])
.mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("float"), (float) element))
.collect(Collectors.toList());
}
if (o instanceof double[]) {
double[] doubleArray = (double[]) o;
return Arrays.stream(doubleArray)
.mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("double"), element))
.collect(Collectors.toList());
}
if (o instanceof boolean[]) {
boolean[] booleanArray = (boolean[]) o;
return IntStream.range(0, booleanArray.length)
.map(i -> booleanArray[i] ? 1 : 0)
.mapToObj((element) -> convertToORCObject(TypeInfoFactory.getPrimitiveTypeInfo("boolean"), element == 1))
.collect(Collectors.toList());
}
if (o instanceof GenericData.Array) {
GenericData.Array array = ((GenericData.Array) o);
// The type information in this case is interpreted as a List
TypeInfo listTypeInfo = ((ListTypeInfo) typeInfo).getListElementTypeInfo();
return array.stream().map((element) -> convertToORCObject(listTypeInfo, element)).collect(Collectors.toList());
}
if (o instanceof List) {
return o;
}
if (o instanceof Map) {
Map map = new HashMap();
TypeInfo keyInfo = ((MapTypeInfo) typeInfo).getMapKeyTypeInfo();
TypeInfo valueInfo = ((MapTypeInfo) typeInfo).getMapValueTypeInfo();
// Unions are not allowed as key/value types, so if we convert the key and value objects,
// they should return Writable objects
((Map) o).forEach((key, value) -> {
Object keyObject = convertToORCObject(keyInfo, key);
Object valueObject = convertToORCObject(valueInfo, value);
if (keyObject == null) {
throw new IllegalArgumentException("Maps' key cannot be null");
}
map.put(keyObject, valueObject);
});
return map;
}
if (o instanceof GenericData.Record) {
GenericData.Record record = (GenericData.Record) o;
TypeInfo recordSchema = NiFiOrcUtils.getOrcField(record.getSchema());
List<Schema.Field> recordFields = record.getSchema().getFields();
if (recordFields != null) {
Object[] fieldObjects = new Object[recordFields.size()];
for (int i = 0; i < recordFields.size(); i++) {
Schema.Field field = recordFields.get(i);
Schema fieldSchema = field.schema();
Object fieldObject = record.get(field.name());
fieldObjects[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), fieldObject);
}
return NiFiOrcUtils.createOrcStruct(recordSchema, fieldObjects);
}
}
throw new IllegalArgumentException("Error converting object of type " + o.getClass().getName() + " to ORC type " + typeInfo.getTypeName());
} else {
return null;
}
}
/**
* Create an object of OrcStruct given a TypeInfo and a list of objects
*
* @param typeInfo The TypeInfo object representing the ORC record schema
* @param objs ORC objects/Writables
* @return an OrcStruct containing the specified objects for the specified schema
*/
public static OrcStruct createOrcStruct(TypeInfo typeInfo, Object... objs) {
SettableStructObjectInspector oi = (SettableStructObjectInspector) OrcStruct
.createObjectInspector(typeInfo);
List<StructField> fields = (List<StructField>) oi.getAllStructFieldRefs();
OrcStruct result = (OrcStruct) oi.create();
result.setNumFields(fields.size());
for (int i = 0; i < fields.size(); i++) {
oi.setStructFieldData(result, fields.get(i), objs[i]);
}
return result;
}
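// Illustrative usage (not part of the original source; names and values are hypothetical):
//   TypeInfo schema = TypeInfoUtils.getTypeInfoFromTypeString("struct<name:string,age:int>");
//   OrcStruct row = createOrcStruct(schema, new Text("alice"), new IntWritable(42));
// The varargs must already be ORC-compatible Writables, supplied in struct-field order.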
public static String normalizeHiveTableName(String name) {
return name.replaceAll("[\\. ]", "_");
}
public static String generateHiveDDL(Schema avroSchema, String tableName) {
Schema.Type schemaType = avroSchema.getType();
StringBuilder sb = new StringBuilder("CREATE EXTERNAL TABLE IF NOT EXISTS ");
sb.append(tableName);
sb.append(" (");
if (Schema.Type.RECORD.equals(schemaType)) {
List<String> hiveColumns = new ArrayList<>();
List<Schema.Field> fields = avroSchema.getFields();
if (fields != null) {
hiveColumns.addAll(
fields.stream().map(field -> field.name() + " " + getHiveTypeFromAvroType(field.schema())).collect(Collectors.toList()));
}
sb.append(StringUtils.join(hiveColumns, ", "));
sb.append(") STORED AS ORC");
return sb.toString();
} else {
throw new IllegalArgumentException("Avro schema is of type " + schemaType.getName() + ", not RECORD");
}
}
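// Illustrative example (not part of the original source): for an Avro record with fields name:string
// and age:int and the table name "users", generateHiveDDL returns
//   CREATE EXTERNAL TABLE IF NOT EXISTS users (name STRING, age INT) STORED AS ORC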
public static TypeInfo getOrcField(Schema fieldSchema) throws IllegalArgumentException {
Schema.Type fieldType = fieldSchema.getType();
switch (fieldType) {
case INT:
case LONG:
case BOOLEAN:
case DOUBLE:
case FLOAT:
case STRING:
case NULL:
return getPrimitiveOrcTypeFromPrimitiveAvroType(fieldType);
case BYTES:
if (isLogicalType(fieldSchema)){
return getLogicalTypeInfo(fieldSchema);
} else {
return getPrimitiveOrcTypeFromPrimitiveAvroType(fieldType);
}
case UNION:
List<Schema> unionFieldSchemas = fieldSchema.getTypes();
if (unionFieldSchemas != null) {
// Ignore null types in union
List<TypeInfo> orcFields = unionFieldSchemas.stream().filter(
unionFieldSchema -> !Schema.Type.NULL.equals(unionFieldSchema.getType()))
.map(NiFiOrcUtils::getOrcField)
.collect(Collectors.toList());
// Flatten the field if the union only has one non-null element
if (orcFields.size() == 1) {
return orcFields.get(0);
} else {
return TypeInfoFactory.getUnionTypeInfo(orcFields);
}
}
return null;
case ARRAY:
return TypeInfoFactory.getListTypeInfo(getOrcField(fieldSchema.getElementType()));
case MAP:
return TypeInfoFactory.getMapTypeInfo(
getPrimitiveOrcTypeFromPrimitiveAvroType(Schema.Type.STRING),
getOrcField(fieldSchema.getValueType()));
case RECORD:
List<Schema.Field> avroFields = fieldSchema.getFields();
if (avroFields != null) {
List<String> orcFieldNames = new ArrayList<>(avroFields.size());
List<TypeInfo> orcFields = new ArrayList<>(avroFields.size());
avroFields.forEach(avroField -> {
String fieldName = avroField.name();
orcFieldNames.add(fieldName);
orcFields.add(getOrcField(avroField.schema()));
});
return TypeInfoFactory.getStructTypeInfo(orcFieldNames, orcFields);
}
return null;
case ENUM:
// An enum value is just a String for ORC/Hive
return getPrimitiveOrcTypeFromPrimitiveAvroType(Schema.Type.STRING);
default:
throw new IllegalArgumentException("Did not recognize Avro type " + fieldType.getName());
}
}
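// Clarifying note (not part of the original source): for a nullable Avro field such as the union
// ["null", "string"], the null branch is filtered out and the single remaining type is flattened,
// so the resulting ORC field is a plain string rather than a uniontype.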
private static boolean isLogicalType(Schema schema){
return schema.getProp("logicalType") != null;
}
private static TypeInfo getLogicalTypeInfo(Schema schema){
String type = schema.getProp("logicalType");
switch (type){
case "decimal":
int precision = schema.getObjectProp("precision") != null
? Integer.valueOf(schema.getObjectProp("precision").toString())
: 10;
int scale = schema.getObjectProp("scale") != null
? Integer.valueOf(schema.getObjectProp("scale").toString())
: 2;
return new DecimalTypeInfo(precision, scale);
}
throw new IllegalArgumentException("Logical type " + type + " is not supported!");
}
public static Schema.Type getAvroSchemaTypeOfObject(Object o) {
if (o == null) {
return Schema.Type.NULL;
} else if (o instanceof Integer) {
return Schema.Type.INT;
} else if (o instanceof Long) {
return Schema.Type.LONG;
} else if (o instanceof Boolean) {
return Schema.Type.BOOLEAN;
} else if (o instanceof byte[]) {
return Schema.Type.BYTES;
} else if (o instanceof Float) {
return Schema.Type.FLOAT;
} else if (o instanceof Double) {
return Schema.Type.DOUBLE;
} else if (o instanceof Enum) {
return Schema.Type.ENUM;
} else if (o instanceof Object[]) {
return Schema.Type.ARRAY;
} else if (o instanceof List) {
return Schema.Type.ARRAY;
} else if (o instanceof Map) {
return Schema.Type.MAP;
} else {
throw new IllegalArgumentException("Object of class " + o.getClass() + " is not a supported Avro Type");
}
}
public static TypeInfo getPrimitiveOrcTypeFromPrimitiveAvroType(Schema.Type avroType) throws IllegalArgumentException {
if (avroType == null) {
throw new IllegalArgumentException("Avro type is null");
}
switch (avroType) {
case INT:
return TypeInfoFactory.getPrimitiveTypeInfo("int");
case LONG:
return TypeInfoFactory.getPrimitiveTypeInfo("bigint");
case BOOLEAN:
case NULL: // ORC has no null type, so just pick the smallest. All values are necessarily null.
return TypeInfoFactory.getPrimitiveTypeInfo("boolean");
case BYTES:
return TypeInfoFactory.getPrimitiveTypeInfo("binary");
case DOUBLE:
return TypeInfoFactory.getPrimitiveTypeInfo("double");
case FLOAT:
return TypeInfoFactory.getPrimitiveTypeInfo("float");
case STRING:
return TypeInfoFactory.getPrimitiveTypeInfo("string");
default:
throw new IllegalArgumentException("Avro type " + avroType.getName() + " is not a primitive type");
}
}
public static String getHiveTypeFromAvroType(Schema avroSchema) {
if (avroSchema == null) {
throw new IllegalArgumentException("Avro schema is null");
}
Schema.Type avroType = avroSchema.getType();
switch (avroType) {
case INT:
return "INT";
case LONG:
return "BIGINT";
case BOOLEAN:
case NULL: // Hive has no null type, we picked boolean as the ORC type so use it for Hive DDL too. All values are necessarily null.
return "BOOLEAN";
case BYTES:
if (isLogicalType(avroSchema)){
return getLogicalTypeInfo(avroSchema).toString().toUpperCase();
} else {
return "BINARY";
}
case DOUBLE:
return "DOUBLE";
case FLOAT:
return "FLOAT";
case STRING:
case ENUM:
return "STRING";
case UNION:
List<Schema> unionFieldSchemas = avroSchema.getTypes();
if (unionFieldSchemas != null) {
List<String> hiveFields = new ArrayList<>();
for (Schema unionFieldSchema : unionFieldSchemas) {
Schema.Type unionFieldSchemaType = unionFieldSchema.getType();
// Ignore null types in union
if (!Schema.Type.NULL.equals(unionFieldSchemaType)) {
hiveFields.add(getHiveTypeFromAvroType(unionFieldSchema));
}
}
// Flatten the field if the union only has one non-null element
return (hiveFields.size() == 1)
? hiveFields.get(0)
: "UNIONTYPE<" + StringUtils.join(hiveFields, ", ") + ">";
}
break;
case MAP:
return "MAP<STRING, " + getHiveTypeFromAvroType(avroSchema.getValueType()) + ">";
case ARRAY:
return "ARRAY<" + getHiveTypeFromAvroType(avroSchema.getElementType()) + ">";
case RECORD:
List<Schema.Field> recordFields = avroSchema.getFields();
if (recordFields != null) {
List<String> hiveFields = recordFields.stream().map(
recordField -> recordField.name() + ":" + getHiveTypeFromAvroType(recordField.schema())).collect(Collectors.toList());
return "STRUCT<" + StringUtils.join(hiveFields, ", ") + ">";
}
break;
default:
break;
}
throw new IllegalArgumentException("Error converting Avro type " + avroType.getName() + " to Hive type");
}
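// Illustrative example (not part of the original source): the Avro union ["null", "int", "string"]
// maps to the Hive type UNIONTYPE<INT, STRING>, while ["null", "int"] collapses to just INT.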
public static String getHiveTypeFromFieldType(DataType rawDataType, boolean hiveFieldNames) {
if (rawDataType == null) {
throw new IllegalArgumentException("Field type is null");
}
RecordFieldType dataType = rawDataType.getFieldType();
if (RecordFieldType.INT.equals(dataType)) {
return "INT";
}
if (RecordFieldType.LONG.equals(dataType)) {
return "BIGINT";
}
if (RecordFieldType.BOOLEAN.equals(dataType)) {
return "BOOLEAN";
}
if (RecordFieldType.DOUBLE.equals(dataType)) {
return "DOUBLE";
}
if (RecordFieldType.FLOAT.equals(dataType)) {
return "FLOAT";
}
if (RecordFieldType.DECIMAL.equals(dataType)) {
return "DECIMAL";
}
if (RecordFieldType.STRING.equals(dataType) || RecordFieldType.ENUM.equals(dataType)) {
return "STRING";
}
if (RecordFieldType.DATE.equals(dataType)) {
return "DATE";
}
if (RecordFieldType.TIME.equals(dataType)) {
return "INT";
}
if (RecordFieldType.TIMESTAMP.equals(dataType)) {
return "TIMESTAMP";
}
if (RecordFieldType.ARRAY.equals(dataType)) {
ArrayDataType arrayDataType = (ArrayDataType) rawDataType;
if (RecordFieldType.BYTE.getDataType().equals(arrayDataType.getElementType())) {
return "BINARY";
}
return "ARRAY<" + getHiveTypeFromFieldType(arrayDataType.getElementType(), hiveFieldNames) + ">";
}
if (RecordFieldType.MAP.equals(dataType)) {
MapDataType mapDataType = (MapDataType) rawDataType;
return "MAP<STRING, " + getHiveTypeFromFieldType(mapDataType.getValueType(), hiveFieldNames) + ">";
}
if (RecordFieldType.CHOICE.equals(dataType)) {
ChoiceDataType choiceDataType = (ChoiceDataType) rawDataType;
List<DataType> unionFieldSchemas = choiceDataType.getPossibleSubTypes();
if (unionFieldSchemas != null) {
// Ignore null types in union
List<String> hiveFields = unionFieldSchemas.stream()
.map((it) -> getHiveTypeFromFieldType(it, hiveFieldNames))
.collect(Collectors.toList());
// Flatten the field if the union only has one non-null element
return (hiveFields.size() == 1)
? hiveFields.get(0)
: "UNIONTYPE<" + StringUtils.join(hiveFields, ", ") + ">";
}
return null;
}
if (RecordFieldType.RECORD.equals(dataType)) {
RecordDataType recordDataType = (RecordDataType) rawDataType;
List<RecordField> recordFields = recordDataType.getChildSchema().getFields();
if (recordFields != null) {
List<String> hiveFields = recordFields.stream().map(
recordField -> ("`" + (hiveFieldNames ? recordField.getFieldName().toLowerCase() : recordField.getFieldName()) + "`:"
+ getHiveTypeFromFieldType(recordField.getDataType(), hiveFieldNames))).collect(Collectors.toList());
return "STRUCT<" + StringUtils.join(hiveFields, ", ") + ">";
}
return null;
}
throw new IllegalArgumentException("Error converting Avro type " + dataType.name() + " to Hive type");
}
public static OrcFlowFileWriter createWriter(OutputStream flowFileOutputStream,
Path path,
Configuration conf,
TypeInfo orcSchema,
long stripeSize,
CompressionKind compress,
int bufferSize) throws IOException {
int rowIndexStride = HiveConf.getIntVar(conf, HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE);
boolean addBlockPadding = HiveConf.getBoolVar(conf, HIVE_ORC_DEFAULT_BLOCK_PADDING);
String versionName = HiveConf.getVar(conf, HIVE_ORC_WRITE_FORMAT);
OrcFile.Version versionValue = (versionName == null)
? OrcFile.Version.CURRENT
: OrcFile.Version.byName(versionName);
OrcFile.EncodingStrategy encodingStrategy;
String enString = conf.get(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname);
if (enString == null) {
encodingStrategy = OrcFile.EncodingStrategy.SPEED;
} else {
encodingStrategy = OrcFile.EncodingStrategy.valueOf(enString);
}
OrcFile.CompressionStrategy compressionStrategy;
String compString = conf.get(HiveConf.ConfVars.HIVE_ORC_COMPRESSION_STRATEGY.varname);
if (compString == null) {
compressionStrategy = OrcFile.CompressionStrategy.SPEED;
} else {
compressionStrategy = OrcFile.CompressionStrategy.valueOf(compString);
}
float paddingTolerance;
paddingTolerance = conf.getFloat(HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.varname,
HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.defaultFloatVal);
long blockSizeValue = HiveConf.getLongVar(conf, HIVE_ORC_DEFAULT_BLOCK_SIZE);
double bloomFilterFpp = BloomFilterIO.DEFAULT_FPP;
ObjectInspector inspector = OrcStruct.createObjectInspector(orcSchema);
return new OrcFlowFileWriter(flowFileOutputStream,
path,
conf,
inspector,
stripeSize,
compress,
bufferSize,
rowIndexStride,
getMemoryManager(conf),
addBlockPadding,
versionValue,
null, // no callback
encodingStrategy,
compressionStrategy,
paddingTolerance,
blockSizeValue,
null, // no Bloom Filter column names
bloomFilterFpp);
}
private static MemoryManager memoryManager = null;
private static synchronized MemoryManager getMemoryManager(Configuration conf) {
if (memoryManager == null) {
memoryManager = new MemoryManager(conf);
}
return memoryManager;
}
}
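A minimal usage sketch of the Avro-to-Hive type mapping implemented above (editorial illustration, not part of the removed sources; the class name HiveTypeMappingSketch and the sample schema are invented for the example, and the utility is assumed to be the NiFiOrcUtils class referenced later in this diff):
import org.apache.avro.Schema;
import org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils;
public class HiveTypeMappingSketch {
    public static void main(String[] args) {
        // A simple Avro record with a string, an int, and an array of doubles.
        Schema avroSchema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"user\",\"fields\":["
                + "{\"name\":\"name\",\"type\":\"string\"},"
                + "{\"name\":\"age\",\"type\":\"int\"},"
                + "{\"name\":\"scores\",\"type\":{\"type\":\"array\",\"items\":\"double\"}}]}");
        // Per the RECORD/ARRAY cases above, this prints: STRUCT<name:STRING, age:INT, scores:ARRAY<DOUBLE>>
        System.out.println(NiFiOrcUtils.getHiveTypeFromAvroType(avroSchema));
    }
}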

View File

@ -1,459 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.dbcp.hive;
import org.apache.commons.dbcp2.BasicDataSource;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hive.jdbc.HiveDriver;
import org.apache.nifi.annotation.behavior.RequiresInstanceClassLoading;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.DeprecationNotice;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnDisabled;
import org.apache.nifi.annotation.lifecycle.OnEnabled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.PropertyValue;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.resource.ResourceCardinality;
import org.apache.nifi.components.resource.ResourceType;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.controller.ConfigurationContext;
import org.apache.nifi.controller.ControllerServiceInitializationContext;
import org.apache.nifi.dbcp.DBCPValidator;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.hadoop.KerberosProperties;
import org.apache.nifi.hadoop.SecurityUtil;
import org.apache.nifi.kerberos.KerberosCredentialsService;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.security.krb.KerberosKeytabUser;
import org.apache.nifi.security.krb.KerberosLoginException;
import org.apache.nifi.security.krb.KerberosPasswordUser;
import org.apache.nifi.security.krb.KerberosUser;
import org.apache.nifi.util.hive.AuthenticationFailedException;
import org.apache.nifi.util.hive.HiveConfigurator;
import org.apache.nifi.util.hive.ValidationResources;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.UndeclaredThrowableException;
import java.security.PrivilegedExceptionAction;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
/**
* Implementation for Database Connection Pooling Service used for Apache Hive
* connections. Apache DBCP is used for connection pooling functionality.
*/
@RequiresInstanceClassLoading
@Tags({"hive", "dbcp", "jdbc", "database", "connection", "pooling", "store"})
@CapabilityDescription("Provides Database Connection Pooling Service for Apache Hive. Connections can be asked from pool and returned after usage.")
@DeprecationNotice(classNames = "org.apache.nifi.dbcp.hive.Hive3ConnectionPool")
public class HiveConnectionPool extends AbstractControllerService implements HiveDBCPService {
private static final String ALLOW_EXPLICIT_KEYTAB = "NIFI_ALLOW_EXPLICIT_KEYTAB";
private static final String DEFAULT_MAX_CONN_LIFETIME = "-1";
public static final PropertyDescriptor DATABASE_URL = new PropertyDescriptor.Builder()
.name("hive-db-connect-url")
.displayName("Database Connection URL")
.description("A database connection URL used to connect to a database. May contain database system name, host, port, database name and some parameters."
+ " The exact syntax of a database connection URL is specified by the Hive documentation. For example, the server principal is often included "
+ "as a connection parameter when connecting to a secure Hive server.")
.defaultValue(null)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.required(true)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor HIVE_CONFIGURATION_RESOURCES = new PropertyDescriptor.Builder()
.name("hive-config-resources")
.displayName("Hive Configuration Resources")
.description("A file or comma separated list of files which contains the Hive configuration (hive-site.xml, e.g.). Without this, Hadoop "
+ "will search the classpath for a 'hive-site.xml' file or will revert to a default configuration. Note that to enable authentication "
+ "with Kerberos e.g., the appropriate properties must be set in the configuration files. Please see the Hive documentation for more details.")
.required(false)
.identifiesExternalResource(ResourceCardinality.MULTIPLE, ResourceType.FILE)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor DB_USER = new PropertyDescriptor.Builder()
.name("hive-db-user")
.displayName("Database User")
.description("Database user name")
.defaultValue(null)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor DB_PASSWORD = new PropertyDescriptor.Builder()
.name("hive-db-password")
.displayName("Password")
.description("The password for the database user")
.defaultValue(null)
.required(false)
.sensitive(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor MAX_WAIT_TIME = new PropertyDescriptor.Builder()
.name("hive-max-wait-time")
.displayName("Max Wait Time")
.description("The maximum amount of time that the pool will wait (when there are no available connections) "
+ " for a connection to be returned before failing, or -1 to wait indefinitely. ")
.defaultValue("500 millis")
.required(true)
.addValidator(StandardValidators.TIME_PERIOD_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor MAX_TOTAL_CONNECTIONS = new PropertyDescriptor.Builder()
.name("hive-max-total-connections")
.displayName("Max Total Connections")
.description("The maximum number of active connections that can be allocated from this pool at the same time, "
+ "or negative for no limit.")
.defaultValue("8")
.required(true)
.addValidator(StandardValidators.INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor MAX_CONN_LIFETIME = new PropertyDescriptor.Builder()
.displayName("Max Connection Lifetime")
.name("hive-max-conn-lifetime")
.description("The maximum lifetime in milliseconds of a connection. After this time is exceeded the " +
"connection pool will invalidate the connection. A value of zero or -1 " +
"means the connection has an infinite lifetime.")
.defaultValue(DEFAULT_MAX_CONN_LIFETIME)
.required(true)
.addValidator(DBCPValidator.CUSTOM_TIME_PERIOD_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor VALIDATION_QUERY = new PropertyDescriptor.Builder()
.name("Validation-query")
.displayName("Validation query")
.description("Validation query used to validate connections before returning them. "
+ "When a borrowed connection is invalid, it gets dropped and a new valid connection will be returned. "
+ "NOTE: Using validation may have a performance penalty.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
static final PropertyDescriptor KERBEROS_CREDENTIALS_SERVICE = new PropertyDescriptor.Builder()
.name("kerberos-credentials-service")
.displayName("Kerberos Credentials Service")
.description("Specifies the Kerberos Credentials Controller Service that should be used for authenticating with Kerberos")
.identifiesControllerService(KerberosCredentialsService.class)
.required(false)
.build();
private List<PropertyDescriptor> properties;
private String connectionUrl = "unknown";
// Holder of cached Configuration information so validation does not reload the same config over and over
private final AtomicReference<ValidationResources> validationResourceHolder = new AtomicReference<>();
private volatile BasicDataSource dataSource;
private volatile HiveConfigurator hiveConfigurator = new HiveConfigurator();
private volatile UserGroupInformation ugi;
private final AtomicReference<KerberosUser> kerberosUserReference = new AtomicReference<>();
private volatile File kerberosConfigFile = null;
private volatile KerberosProperties kerberosProperties;
@Override
protected void init(final ControllerServiceInitializationContext context) {
List<PropertyDescriptor> props = new ArrayList<>();
props.add(DATABASE_URL);
props.add(HIVE_CONFIGURATION_RESOURCES);
props.add(DB_USER);
props.add(DB_PASSWORD);
props.add(MAX_WAIT_TIME);
props.add(MAX_TOTAL_CONNECTIONS);
props.add(MAX_CONN_LIFETIME);
props.add(VALIDATION_QUERY);
props.add(KERBEROS_CREDENTIALS_SERVICE);
kerberosConfigFile = context.getKerberosConfigurationFile();
kerberosProperties = new KerberosProperties(kerberosConfigFile);
props.add(kerberosProperties.getKerberosPrincipal());
props.add(kerberosProperties.getKerberosKeytab());
props.add(kerberosProperties.getKerberosPassword());
properties = props;
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return properties;
}
@Override
protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
boolean confFileProvided = validationContext.getProperty(HIVE_CONFIGURATION_RESOURCES).isSet();
final List<ValidationResult> problems = new ArrayList<>();
if (confFileProvided) {
final String explicitPrincipal = validationContext.getProperty(kerberosProperties.getKerberosPrincipal()).evaluateAttributeExpressions().getValue();
final String explicitKeytab = validationContext.getProperty(kerberosProperties.getKerberosKeytab()).evaluateAttributeExpressions().getValue();
final String explicitPassword = validationContext.getProperty(kerberosProperties.getKerberosPassword()).getValue();
final KerberosCredentialsService credentialsService = validationContext.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
final String resolvedPrincipal;
final String resolvedKeytab;
if (credentialsService != null) {
resolvedPrincipal = credentialsService.getPrincipal();
resolvedKeytab = credentialsService.getKeytab();
} else {
resolvedPrincipal = explicitPrincipal;
resolvedKeytab = explicitKeytab;
}
final String configFiles = validationContext.getProperty(HIVE_CONFIGURATION_RESOURCES).evaluateAttributeExpressions().getValue();
problems.addAll(hiveConfigurator.validate(configFiles, resolvedPrincipal, resolvedKeytab, explicitPassword, validationResourceHolder, getLogger()));
if (credentialsService != null && (explicitPrincipal != null || explicitKeytab != null || explicitPassword != null)) {
problems.add(new ValidationResult.Builder()
.subject("Kerberos Credentials")
.valid(false)
.explanation("Cannot specify a Kerberos Credentials Service while also specifying a Kerberos Principal, Kerberos Keytab, or Kerberos Password")
.build());
}
if (!isAllowExplicitKeytab() && explicitKeytab != null) {
problems.add(new ValidationResult.Builder()
.subject("Kerberos Credentials")
.valid(false)
.explanation("The '" + ALLOW_EXPLICIT_KEYTAB + "' system environment variable is configured to forbid explicitly configuring Kerberos Keytab in processors. "
+ "The Kerberos Credentials Service should be used instead of setting the Kerberos Keytab or Kerberos Principal property.")
.build());
}
}
return problems;
}
/**
* Configures connection pool by creating an instance of the
* {@link BasicDataSource} based on configuration provided with
* {@link ConfigurationContext}.
* <p>
* This operation makes no guarantees that the actual connection could be
* made since the underlying system may still go off-line during normal
* operation of the connection pool.
* <p/>
* As of Apache NiFi 1.5.0, due to changes made to
* {@link SecurityUtil#loginKerberos(Configuration, String, String)}, which is used by this class invoking
* {@link HiveConfigurator#authenticate(Configuration, String, String)}
* to authenticate a principal with Kerberos, Hive controller services no longer use a separate thread to
* relogin, and instead call {@link UserGroupInformation#checkTGTAndReloginFromKeytab()} from
* {@link HiveConnectionPool#getConnection()}. The relogin request is performed in a synchronized block to prevent
* threads from requesting concurrent relogins. For more information, please read the documentation for
* {@link SecurityUtil#loginKerberos(Configuration, String, String)}.
* <p/>
* In previous versions of NiFi, a {@link org.apache.nifi.hadoop.KerberosTicketRenewer} was started by
* {@link HiveConfigurator#authenticate(Configuration, String, String, long)} when the Hive
* controller service was enabled. The use of a separate thread to explicitly relogin could cause race conditions
* with the implicit relogin attempts made by hadoop/Hive code on a thread that references the same
* {@link UserGroupInformation} instance. One of these threads could leave the
* {@link javax.security.auth.Subject} in {@link UserGroupInformation} to be cleared or in an unexpected state
* while the other thread is attempting to use the {@link javax.security.auth.Subject}, resulting in failed
* authentication attempts that would leave the Hive controller service in an unrecoverable state.
*
* @see SecurityUtil#loginKerberos(Configuration, String, String)
* @see HiveConfigurator#authenticate(Configuration, String, String)
* @see HiveConfigurator#authenticate(Configuration, String, String, long)
* @param context the configuration context
* @throws InitializationException if unable to create a database connection
*/
@OnEnabled
public void onConfigured(final ConfigurationContext context) throws InitializationException {
ComponentLog log = getLogger();
final String configFiles = context.getProperty(HIVE_CONFIGURATION_RESOURCES).evaluateAttributeExpressions().getValue();
final Configuration hiveConfig = hiveConfigurator.getConfigurationFromFiles(configFiles);
final String validationQuery = context.getProperty(VALIDATION_QUERY).evaluateAttributeExpressions().getValue();
// add any dynamic properties to the Hive configuration
for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
final PropertyDescriptor descriptor = entry.getKey();
if (descriptor.isDynamic()) {
hiveConfig.set(descriptor.getName(), context.getProperty(descriptor).evaluateAttributeExpressions().getValue());
}
}
final String drv = HiveDriver.class.getName();
if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
final String explicitPrincipal = context.getProperty(kerberosProperties.getKerberosPrincipal()).evaluateAttributeExpressions().getValue();
final String explicitKeytab = context.getProperty(kerberosProperties.getKerberosKeytab()).evaluateAttributeExpressions().getValue();
final String explicitPassword = context.getProperty(kerberosProperties.getKerberosPassword()).getValue();
final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
final String resolvedPrincipal;
final String resolvedKeytab;
if (credentialsService != null) {
resolvedPrincipal = credentialsService.getPrincipal();
resolvedKeytab = credentialsService.getKeytab();
} else {
resolvedPrincipal = explicitPrincipal;
resolvedKeytab = explicitKeytab;
}
if (resolvedKeytab != null) {
kerberosUserReference.set(new KerberosKeytabUser(resolvedPrincipal, resolvedKeytab));
log.info("Hive Security Enabled, logging in as principal {} with keytab {}", new Object[] {resolvedPrincipal, resolvedKeytab});
} else if (explicitPassword != null) {
kerberosUserReference.set(new KerberosPasswordUser(resolvedPrincipal, explicitPassword));
log.info("Hive Security Enabled, logging in as principal {} with password", new Object[] {resolvedPrincipal});
} else {
throw new InitializationException("Unable to authenticate with Kerberos, no keytab or password was provided");
}
try {
ugi = hiveConfigurator.authenticate(hiveConfig, kerberosUserReference.get());
} catch (AuthenticationFailedException ae) {
log.error(ae.getMessage(), ae);
throw new InitializationException(ae);
}
getLogger().info("Successfully logged in as principal " + resolvedPrincipal);
}
final String user = context.getProperty(DB_USER).evaluateAttributeExpressions().getValue();
final String passw = context.getProperty(DB_PASSWORD).evaluateAttributeExpressions().getValue();
final Long maxWaitMillis = context.getProperty(MAX_WAIT_TIME).evaluateAttributeExpressions().asTimePeriod(TimeUnit.MILLISECONDS);
final Integer maxTotal = context.getProperty(MAX_TOTAL_CONNECTIONS).evaluateAttributeExpressions().asInteger();
final long maxConnectionLifetimeMillis = extractMillisWithInfinite(context.getProperty(MAX_CONN_LIFETIME).evaluateAttributeExpressions());
dataSource = new BasicDataSource();
dataSource.setDriverClassName(drv);
connectionUrl = context.getProperty(DATABASE_URL).evaluateAttributeExpressions().getValue();
dataSource.setMaxWaitMillis(maxWaitMillis);
dataSource.setMaxTotal(maxTotal);
dataSource.setMaxConnLifetimeMillis(maxConnectionLifetimeMillis);
if (validationQuery != null && !validationQuery.isEmpty()) {
dataSource.setValidationQuery(validationQuery);
dataSource.setTestOnBorrow(true);
}
dataSource.setUrl(connectionUrl);
dataSource.setUsername(user);
dataSource.setPassword(passw);
}
/**
* Shutdown pool, close all open connections.
*/
@OnDisabled
public void shutdown() {
try {
dataSource.close();
} catch (final SQLException e) {
throw new ProcessException(e);
}
}
@Override
public Connection getConnection() throws ProcessException {
try {
if (ugi != null) {
/*
* Explicitly check the TGT and relogin if necessary with the KerberosUser instance. No synchronization
* is necessary in the client code, since AbstractKerberosUser's checkTGTAndRelogin method is synchronized.
*/
getLogger().trace("getting UGI instance");
if (kerberosUserReference.get() != null) {
// if there's a KerberosUser associated with this UGI, check the TGT and relogin if it is close to expiring
KerberosUser kerberosUser = kerberosUserReference.get();
getLogger().debug("kerberosUser is " + kerberosUser);
try {
getLogger().debug("checking TGT on kerberosUser [{}]", new Object[]{kerberosUser});
kerberosUser.checkTGTAndRelogin();
} catch (final KerberosLoginException e) {
throw new ProcessException("Unable to relogin with kerberos credentials for " + kerberosUser.getPrincipal(), e);
}
} else {
getLogger().debug("kerberosUser was null, will not refresh TGT with KerberosUser");
// no synchronization is needed for UserGroupInformation.checkTGTAndReloginFromKeytab; UGI handles the synchronization internally
ugi.checkTGTAndReloginFromKeytab();
}
try {
return ugi.doAs((PrivilegedExceptionAction<Connection>) () -> dataSource.getConnection());
} catch (UndeclaredThrowableException e) {
Throwable cause = e.getCause();
if (cause instanceof SQLException) {
throw (SQLException) cause;
} else {
throw e;
}
}
} else {
getLogger().info("Simple Authentication");
return dataSource.getConnection();
}
} catch (SQLException | IOException | InterruptedException e) {
getLogger().error("Error getting Hive connection", e);
throw new ProcessException(e);
}
}
@Override
public String toString() {
return "HiveConnectionPool[id=" + getIdentifier() + "]";
}
@Override
public String getConnectionURL() {
return connectionUrl;
}
/*
* Overridable by subclasses in the same package, mainly intended for testing purposes to allow verification without having to set environment variables.
*/
boolean isAllowExplicitKeytab() {
return Boolean.parseBoolean(System.getenv(ALLOW_EXPLICIT_KEYTAB));
}
private long extractMillisWithInfinite(PropertyValue prop) {
if (prop.getValue() == null || DEFAULT_MAX_CONN_LIFETIME.equals(prop.getValue())) {
return -1;
} else {
return prop.asTimePeriod(TimeUnit.MILLISECONDS);
}
}
}
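A borrow-and-execute sketch for the pool above (editorial illustration, not part of the removed sources; the table name and values are placeholders). As described in the onConfigured javadoc, getConnection() checks the Kerberos TGT and performs a relogin if needed before handing out a pooled JDBC connection, so callers only need plain JDBC code:
import java.sql.Connection;
import java.sql.PreparedStatement;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
class HiveConnectionPoolUsageSketch {
    // The service reference would normally be resolved from a processor property such as
    // HIVE_DBCP_SERVICE in AbstractHiveQLProcessor (next file in this diff).
    void insertRow(HiveDBCPService dbcpService) throws Exception {
        try (Connection conn = dbcpService.getConnection();
             PreparedStatement stmt = conn.prepareStatement("INSERT INTO users VALUES (?, ?)")) {
            stmt.setString(1, "alice");
            stmt.setInt(2, 42);
            stmt.execute();
        }
    }
}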

View File

@ -1,344 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.antlr.runtime.tree.CommonTree;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;
import org.apache.hadoop.hive.ql.parse.ParseException;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.AbstractSessionFactoryProcessor;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.stream.io.StreamUtils;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.charset.Charset;
import java.sql.Date;
import java.sql.PreparedStatement;
import java.sql.SQLDataException;
import java.sql.SQLException;
import java.sql.Time;
import java.sql.Timestamp;
import java.sql.Types;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* An abstract base class for HiveQL processors to share common data, methods, etc.
*/
public abstract class AbstractHiveQLProcessor extends AbstractSessionFactoryProcessor {
protected static final Pattern HIVEQL_TYPE_ATTRIBUTE_PATTERN = Pattern.compile("hiveql\\.args\\.(\\d+)\\.type");
protected static final Pattern NUMBER_PATTERN = Pattern.compile("-?\\d+");
static String ATTR_INPUT_TABLES = "query.input.tables";
static String ATTR_OUTPUT_TABLES = "query.output.tables";
public static final PropertyDescriptor HIVE_DBCP_SERVICE = new PropertyDescriptor.Builder()
.name("Hive Database Connection Pooling Service")
.description("The Hive Controller Service that is used to obtain connection(s) to the Hive database")
.required(true)
.identifiesControllerService(HiveDBCPService.class)
.build();
public static final PropertyDescriptor CHARSET = new PropertyDescriptor.Builder()
.name("hive-charset")
.displayName("Character Set")
.description("Specifies the character set of the record data.")
.required(true)
.defaultValue("UTF-8")
.addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
.build();
/**
* Determines the HiveQL statement that should be executed for the given FlowFile
*
* @param session the session that can be used to access the given FlowFile
* @param flowFile the FlowFile whose HiveQL statement should be executed
* @return the HiveQL that is associated with the given FlowFile
*/
protected String getHiveQL(final ProcessSession session, final FlowFile flowFile, final Charset charset) {
// Read the HiveQL from the FlowFile's content
final byte[] buffer = new byte[(int) flowFile.getSize()];
session.read(flowFile, new InputStreamCallback() {
@Override
public void process(final InputStream in) throws IOException {
StreamUtils.fillBuffer(in, buffer);
}
});
// Return the HiveQL statement as a string in the requested character set.
return new String(buffer, charset);
}
private class ParameterHolder {
String attributeName;
int jdbcType;
String value;
}
/**
* Sets all of the appropriate parameters on the given PreparedStatement, based on the given FlowFile attributes.
*
* @param stmt the statement to set the parameters on
* @param attributes the attributes from which to derive parameter indices, values, and types
* @throws SQLException if the PreparedStatement throws a SQLException when the appropriate setter is called
*/
protected int setParameters(int base, final PreparedStatement stmt, int paramCount, final Map<String, String> attributes) throws SQLException {
Map<Integer, ParameterHolder> parmMap = new TreeMap<Integer, ParameterHolder>();
for (final Map.Entry<String, String> entry : attributes.entrySet()) {
final String key = entry.getKey();
final Matcher matcher = HIVEQL_TYPE_ATTRIBUTE_PATTERN.matcher(key);
if (matcher.matches()) {
final int parameterIndex = Integer.parseInt(matcher.group(1));
if (parameterIndex >= base && parameterIndex < base + paramCount) {
final boolean isNumeric = NUMBER_PATTERN.matcher(entry.getValue()).matches();
if (!isNumeric) {
throw new SQLDataException("Value of the " + key + " attribute is '" + entry.getValue() + "', which is not a valid JDBC numeral jdbcType");
}
final String valueAttrName = "hiveql.args." + parameterIndex + ".value";
ParameterHolder ph = new ParameterHolder();
int realIndexLoc = parameterIndex - base + 1;
ph.jdbcType = Integer.parseInt(entry.getValue());
ph.value = attributes.get(valueAttrName);
ph.attributeName = valueAttrName;
parmMap.put(realIndexLoc, ph);
}
}
}
// Now that we've retrieved the correct number of parameters and they're sorted, let's set them.
for (final Map.Entry<Integer, ParameterHolder> entry : parmMap.entrySet()) {
final Integer index = entry.getKey();
final ParameterHolder ph = entry.getValue();
try {
setParameter(stmt, ph.attributeName, index, ph.value, ph.jdbcType);
} catch (final NumberFormatException nfe) {
throw new SQLDataException("The value of the " + ph.attributeName + " is '" + ph.value + "', which cannot be converted into the necessary data jdbcType", nfe);
}
}
return base + paramCount;
}
/**
* Determines how to map the given value to the appropriate JDBC data type and sets the parameter on the
* provided PreparedStatement
*
* @param stmt the PreparedStatement to set the parameter on
* @param attrName the name of the attribute that the parameter is coming from - for logging purposes
* @param parameterIndex the index of the HiveQL parameter to set
* @param parameterValue the value of the HiveQL parameter to set
* @param jdbcType the JDBC Type of the HiveQL parameter to set
* @throws SQLException if the PreparedStatement throws a SQLException when calling the appropriate setter
*/
protected void setParameter(final PreparedStatement stmt, final String attrName, final int parameterIndex, final String parameterValue, final int jdbcType) throws SQLException {
if (parameterValue == null) {
stmt.setNull(parameterIndex, jdbcType);
} else {
try {
switch (jdbcType) {
case Types.BIT:
case Types.BOOLEAN:
stmt.setBoolean(parameterIndex, Boolean.parseBoolean(parameterValue));
break;
case Types.TINYINT:
stmt.setByte(parameterIndex, Byte.parseByte(parameterValue));
break;
case Types.SMALLINT:
stmt.setShort(parameterIndex, Short.parseShort(parameterValue));
break;
case Types.INTEGER:
stmt.setInt(parameterIndex, Integer.parseInt(parameterValue));
break;
case Types.BIGINT:
stmt.setLong(parameterIndex, Long.parseLong(parameterValue));
break;
case Types.REAL:
stmt.setFloat(parameterIndex, Float.parseFloat(parameterValue));
break;
case Types.FLOAT:
case Types.DOUBLE:
stmt.setDouble(parameterIndex, Double.parseDouble(parameterValue));
break;
case Types.DECIMAL:
case Types.NUMERIC:
stmt.setBigDecimal(parameterIndex, new BigDecimal(parameterValue));
break;
case Types.DATE:
stmt.setDate(parameterIndex, new Date(Long.parseLong(parameterValue)));
break;
case Types.TIME:
stmt.setTime(parameterIndex, new Time(Long.parseLong(parameterValue)));
break;
case Types.TIMESTAMP:
stmt.setTimestamp(parameterIndex, new Timestamp(Long.parseLong(parameterValue)));
break;
case Types.CHAR:
case Types.VARCHAR:
case Types.LONGNVARCHAR:
case Types.LONGVARCHAR:
stmt.setString(parameterIndex, parameterValue);
break;
default:
stmt.setObject(parameterIndex, parameterValue, jdbcType);
break;
}
} catch (SQLException e) {
// Log which attribute/parameter had an error, then rethrow to be handled at the top level
getLogger().error("Error setting parameter {} to value from {} ({})", new Object[]{parameterIndex, attrName, parameterValue}, e);
throw e;
}
}
}
protected static class TableName {
private final String database;
private final String table;
private final boolean input;
TableName(String database, String table, boolean input) {
this.database = database;
this.table = table;
this.input = input;
}
public String getDatabase() {
return database;
}
public String getTable() {
return table;
}
public boolean isInput() {
return input;
}
@Override
public String toString() {
return database == null || database.isEmpty() ? table : database + '.' + table;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TableName tableName = (TableName) o;
if (input != tableName.input) return false;
if (database != null ? !database.equals(tableName.database) : tableName.database != null) return false;
return table.equals(tableName.table);
}
@Override
public int hashCode() {
int result = database != null ? database.hashCode() : 0;
result = 31 * result + table.hashCode();
result = 31 * result + (input ? 1 : 0);
return result;
}
}
protected Set<TableName> findTableNames(final String query) {
final ASTNode node;
try {
node = new ParseDriver().parse(normalize(query));
} catch (ParseException e) {
// If failed to parse the query, just log a message, but continue.
getLogger().debug("Failed to parse query: {} due to {}", new Object[]{query, e}, e);
return Collections.emptySet();
}
final HashSet<TableName> tableNames = new HashSet<>();
findTableNames(node, tableNames);
return tableNames;
}
/**
* Normalize query.
* Hive resolves prepared statement parameters before executing a query,
* see {@link org.apache.hive.jdbc.HivePreparedStatement#updateSql(String, HashMap)} for detail.
* HiveParser does not expect '?' to be in a query string, and throws an Exception if there is one.
* In this normalize method, '?' is replaced to 'x' to avoid that.
*/
private String normalize(String query) {
return query.replace('?', 'x');
}
private void findTableNames(final Object obj, final Set<TableName> tableNames) {
if (!(obj instanceof CommonTree)) {
return;
}
final CommonTree tree = (CommonTree) obj;
final int childCount = tree.getChildCount();
if ("TOK_TABNAME".equals(tree.getText())) {
final TableName tableName;
final boolean isInput = "TOK_TABREF".equals(tree.getParent().getText());
switch (childCount) {
case 1:
tableName = new TableName(null, tree.getChild(0).getText(), isInput);
break;
case 2:
tableName = new TableName(tree.getChild(0).getText(), tree.getChild(1).getText(), isInput);
break;
default:
throw new IllegalStateException("TOK_TABNAME does not have expected children, childCount=" + childCount);
}
// If parent is TOK_TABREF, then it is an input table.
tableNames.add(tableName);
return;
}
for (int i = 0; i < childCount; i++) {
findTableNames(tree.getChild(i), tableNames);
}
}
protected Map<String, String> toQueryTableAttributes(Set<TableName> tableNames) {
final Map<String, String> attributes = new HashMap<>();
for (TableName tableName : tableNames) {
final String attributeName = tableName.isInput() ? ATTR_INPUT_TABLES : ATTR_OUTPUT_TABLES;
if (attributes.containsKey(attributeName)) {
attributes.put(attributeName, attributes.get(attributeName) + "," + tableName);
} else {
attributes.put(attributeName, tableName.toString());
}
}
return attributes;
}
}
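A sketch of the FlowFile attribute convention that setParameters(...) above consumes (editorial illustration, not part of the removed sources; the attribute values are placeholders). Each positional parameter N carries a hiveql.args.N.type attribute holding the numeric JDBC type and a hiveql.args.N.value attribute holding the value:
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;
class HiveQlArgsSketch {
    // Attributes for a statement with two '?' placeholders, e.g. "INSERT INTO users VALUES (?, ?)".
    static Map<String, String> exampleAttributes() {
        Map<String, String> attrs = new HashMap<>();
        attrs.put("hiveql.args.1.type", String.valueOf(Types.VARCHAR)); // 12
        attrs.put("hiveql.args.1.value", "alice");
        attrs.put("hiveql.args.2.type", String.valueOf(Types.INTEGER)); // 4
        attrs.put("hiveql.args.2.value", "42");
        return attrs;
    }
}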

View File

@ -1,293 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.CompressionKind;
import org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils;
import org.apache.hadoop.hive.ql.io.orc.OrcFlowFileWriter;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.SideEffectFree;
import org.apache.nifi.annotation.behavior.SupportsBatching;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.resource.ResourceCardinality;
import org.apache.nifi.components.resource.ResourceType;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.DataUnit;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.util.hive.HiveJdbcCommon;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
/**
* The ConvertAvroToORC processor takes an Avro-formatted flow file as input and converts it into ORC format.
*/
@SideEffectFree
@SupportsBatching
@Tags({"avro", "orc", "hive", "convert"})
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@CapabilityDescription("Converts an Avro record into ORC file format. This processor provides a direct mapping of an Avro record to an ORC record, such "
+ "that the resulting ORC file will have the same hierarchical structure as the Avro document. If an incoming FlowFile contains a stream of "
+ "multiple Avro records, the resultant FlowFile will contain a ORC file containing all of the Avro records. If an incoming FlowFile does "
+ "not contain any records, an empty ORC file is the output. NOTE: Many Avro datatypes (collections, primitives, and unions of primitives, e.g.) can "
+ "be converted to ORC, but unions of collections and other complex datatypes may not be able to be converted to ORC.")
@WritesAttributes({
@WritesAttribute(attribute = "mime.type", description = "Sets the mime type to application/octet-stream"),
@WritesAttribute(attribute = "filename", description = "Sets the filename to the existing filename with the extension replaced by / added to by .orc"),
@WritesAttribute(attribute = "record.count", description = "Sets the number of records in the ORC file."),
@WritesAttribute(attribute = "hive.ddl", description = "Creates a partial Hive DDL statement for creating a table in Hive from this ORC file. "
+ "This can be used in ReplaceText for setting the content to the DDL. To make it valid DDL, add \"LOCATION '<path_to_orc_file_in_hdfs>'\", where "
+ "the path is the directory that contains this ORC file on HDFS. For example, ConvertAvroToORC can send flow files to a PutHDFS processor to send the file to "
+ "HDFS, then to a ReplaceText to set the content to this DDL (plus the LOCATION clause as described), then to PutHiveQL processor to create the table "
+ "if it doesn't exist.")
})
public class ConvertAvroToORC extends AbstractProcessor {
// Attributes
public static final String ORC_MIME_TYPE = "application/octet-stream";
public static final String HIVE_DDL_ATTRIBUTE = "hive.ddl";
public static final String RECORD_COUNT_ATTRIBUTE = "record.count";
// Properties
public static final PropertyDescriptor ORC_CONFIGURATION_RESOURCES = new PropertyDescriptor.Builder()
.name("orc-config-resources")
.displayName("ORC Configuration Resources")
.description("A file or comma separated list of files which contains the ORC configuration (hive-site.xml, e.g.). Without this, Hadoop "
+ "will search the classpath for a 'hive-site.xml' file or will revert to a default configuration. Please see the ORC documentation for more details.")
.required(false)
.identifiesExternalResource(ResourceCardinality.MULTIPLE, ResourceType.FILE)
.build();
public static final PropertyDescriptor STRIPE_SIZE = new PropertyDescriptor.Builder()
.name("orc-stripe-size")
.displayName("Stripe Size")
.description("The size of the memory buffer (in bytes) for writing stripes to an ORC file")
.required(true)
.addValidator(StandardValidators.DATA_SIZE_VALIDATOR)
.defaultValue("64 MB")
.build();
public static final PropertyDescriptor BUFFER_SIZE = new PropertyDescriptor.Builder()
.name("orc-buffer-size")
.displayName("Buffer Size")
.description("The maximum size of the memory buffers (in bytes) used for compressing and storing a stripe in memory. This is a hint to the ORC writer, "
+ "which may choose to use a smaller buffer size based on stripe size and number of columns for efficient stripe writing and memory utilization.")
.required(true)
.addValidator(StandardValidators.DATA_SIZE_VALIDATOR)
.defaultValue("10 KB")
.build();
public static final PropertyDescriptor COMPRESSION_TYPE = new PropertyDescriptor.Builder()
.name("orc-compression-type")
.displayName("Compression Type")
.required(true)
.allowableValues("NONE", "ZLIB", "SNAPPY", "LZO")
.defaultValue("NONE")
.build();
public static final PropertyDescriptor HIVE_TABLE_NAME = new PropertyDescriptor.Builder()
.name("orc-hive-table-name")
.displayName("Hive Table Name")
.description("An optional table name to insert into the hive.ddl attribute. The generated DDL can be used by "
+ "a PutHiveQL processor (presumably after a PutHDFS processor) to create a table backed by the converted ORC file. "
+ "If this property is not provided, the full name (including namespace) of the incoming Avro record will be normalized "
+ "and used as the table name.")
.required(false)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.addValidator(StandardValidators.NON_BLANK_VALIDATOR)
.build();
// Relationships
static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("A FlowFile is routed to this relationship after it has been converted to ORC format.")
.build();
static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("A FlowFile is routed to this relationship if it cannot be parsed as Avro or cannot be converted to ORC for any reason")
.build();
private final static List<PropertyDescriptor> propertyDescriptors;
private final static Set<Relationship> relationships;
private volatile Configuration orcConfig;
/*
* Will ensure that the list of property descriptors is built only once.
* Will also create a Set of relationships
*/
static {
List<PropertyDescriptor> _propertyDescriptors = new ArrayList<>();
_propertyDescriptors.add(ORC_CONFIGURATION_RESOURCES);
_propertyDescriptors.add(STRIPE_SIZE);
_propertyDescriptors.add(BUFFER_SIZE);
_propertyDescriptors.add(COMPRESSION_TYPE);
_propertyDescriptors.add(HIVE_TABLE_NAME);
propertyDescriptors = Collections.unmodifiableList(_propertyDescriptors);
Set<Relationship> _relationships = new HashSet<>();
_relationships.add(REL_SUCCESS);
_relationships.add(REL_FAILURE);
relationships = Collections.unmodifiableSet(_relationships);
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propertyDescriptors;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
@OnScheduled
public void setup(ProcessContext context) {
boolean confFileProvided = context.getProperty(ORC_CONFIGURATION_RESOURCES).isSet();
if (confFileProvided) {
final String configFiles = context.getProperty(ORC_CONFIGURATION_RESOURCES).getValue();
orcConfig = HiveJdbcCommon.getConfigurationFromFiles(configFiles);
}
}
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
try {
long startTime = System.currentTimeMillis();
final long stripeSize = context.getProperty(STRIPE_SIZE).asDataSize(DataUnit.B).longValue();
final int bufferSize = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B).intValue();
final CompressionKind compressionType = CompressionKind.valueOf(context.getProperty(COMPRESSION_TYPE).getValue());
final AtomicReference<Schema> hiveAvroSchema = new AtomicReference<>(null);
final AtomicInteger totalRecordCount = new AtomicInteger(0);
final String fileName = flowFile.getAttribute(CoreAttributes.FILENAME.key());
flowFile = session.write(flowFile, (rawIn, rawOut) -> {
try (final InputStream in = new BufferedInputStream(rawIn);
final OutputStream out = new BufferedOutputStream(rawOut);
final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<>())) {
// Create ORC schema from Avro schema
Schema avroSchema = reader.getSchema();
TypeInfo orcSchema = NiFiOrcUtils.getOrcField(avroSchema);
if (orcConfig == null) {
orcConfig = new Configuration();
}
OrcFlowFileWriter orcWriter = NiFiOrcUtils.createWriter(
out,
new Path(fileName),
orcConfig,
orcSchema,
stripeSize,
compressionType,
bufferSize);
try {
int recordCount = 0;
while (reader.hasNext()) {
GenericRecord currRecord = reader.next();
List<Schema.Field> fields = currRecord.getSchema().getFields();
if (fields != null) {
Object[] row = new Object[fields.size()];
for (int i = 0; i < fields.size(); i++) {
Schema.Field field = fields.get(i);
Schema fieldSchema = field.schema();
Object o = currRecord.get(field.name());
try {
row[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), o);
} catch (ArrayIndexOutOfBoundsException aioobe) {
getLogger().error("Index out of bounds at record {} for column {}, type {}, and object {}",
new Object[]{recordCount, i, fieldSchema.getType().getName(), o},
aioobe);
throw new IOException(aioobe);
}
}
orcWriter.addRow(NiFiOrcUtils.createOrcStruct(orcSchema, row));
recordCount++;
}
}
hiveAvroSchema.set(avroSchema);
totalRecordCount.set(recordCount);
} finally {
// finished writing this record, close the writer (which will flush to the flow file)
orcWriter.close();
}
}
});
final String hiveTableName = context.getProperty(HIVE_TABLE_NAME).isSet()
? context.getProperty(HIVE_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue()
: NiFiOrcUtils.normalizeHiveTableName(hiveAvroSchema.get().getFullName());
String hiveDDL = NiFiOrcUtils.generateHiveDDL(hiveAvroSchema.get(), hiveTableName);
// Add attributes and transfer to success
flowFile = session.putAttribute(flowFile, RECORD_COUNT_ATTRIBUTE, Integer.toString(totalRecordCount.get()));
flowFile = session.putAttribute(flowFile, HIVE_DDL_ATTRIBUTE, hiveDDL);
StringBuilder newFilename = new StringBuilder();
int extensionIndex = fileName.lastIndexOf(".");
if (extensionIndex != -1) {
newFilename.append(fileName.substring(0, extensionIndex));
} else {
newFilename.append(fileName);
}
newFilename.append(".orc");
flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), ORC_MIME_TYPE);
flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename.toString());
session.transfer(flowFile, REL_SUCCESS);
session.getProvenanceReporter().modifyContent(flowFile, "Converted " + totalRecordCount.get() + " records", System.currentTimeMillis() - startTime);
} catch (ProcessException | IllegalArgumentException e) {
getLogger().error("Failed to convert {} from Avro to ORC due to {}; transferring to failure", new Object[]{flowFile, e});
session.transfer(flowFile, REL_FAILURE);
}
}
}
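A sketch of the post-processing described in the hive.ddl attribute documentation above (editorial illustration, not part of the removed sources; the HDFS path is supplied by the caller). The attribute holds a partial CREATE TABLE ... STORED AS ORC statement, and a downstream ReplaceText typically appends the LOCATION clause before the statement is sent to PutHiveQL:
class HiveDdlCompletionSketch {
    static String completeDdl(String hiveDdlAttribute, String hdfsDirectory) {
        // hiveDdlAttribute might look like
        // "CREATE EXTERNAL TABLE IF NOT EXISTS users (name STRING, age INT) STORED AS ORC"
        return hiveDdlAttribute + " LOCATION '" + hdfsDirectory + "'";
    }
}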

View File

@ -1,300 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.DeprecationNotice;
import org.apache.nifi.annotation.documentation.SeeAlso;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessSessionFactory;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.processor.util.pattern.ErrorTypes;
import org.apache.nifi.processor.util.pattern.ExceptionHandler;
import org.apache.nifi.processor.util.pattern.ExceptionHandler.OnError;
import org.apache.nifi.processor.util.pattern.PartialFunctions.FetchFlowFiles;
import org.apache.nifi.processor.util.pattern.PartialFunctions.InitConnection;
import org.apache.nifi.processor.util.pattern.Put;
import org.apache.nifi.processor.util.pattern.RollbackOnFailure;
import org.apache.nifi.processor.util.pattern.RoutingResult;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.SQLNonTransientException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
@SeeAlso(SelectHiveQL.class)
@InputRequirement(Requirement.INPUT_REQUIRED)
@Tags({"sql", "hive", "put", "database", "update", "insert"})
@CapabilityDescription("Executes a HiveQL DDL/DML command (UPDATE, INSERT, e.g.). The content of an incoming FlowFile is expected to be the HiveQL command "
+ "to execute. The HiveQL command may use the ? to escape parameters. In this case, the parameters to use must exist as FlowFile attributes "
+ "with the naming convention hiveql.args.N.type and hiveql.args.N.value, where N is a positive integer. The hiveql.args.N.type is expected to be "
+ "a number indicating the JDBC Type. The content of the FlowFile is expected to be in UTF-8 format.")
@ReadsAttributes({
@ReadsAttribute(attribute = "hiveql.args.N.type", description = "Incoming FlowFiles are expected to be parametrized HiveQL statements. The type of each Parameter is specified as an integer "
+ "that represents the JDBC Type of the parameter."),
@ReadsAttribute(attribute = "hiveql.args.N.value", description = "Incoming FlowFiles are expected to be parametrized HiveQL statements. The value of the Parameters are specified as "
+ "hiveql.args.1.value, hiveql.args.2.value, hiveql.args.3.value, and so on. The type of the hiveql.args.1.value Parameter is specified by the hiveql.args.1.type attribute.")
})
@WritesAttributes({
@WritesAttribute(attribute = "query.input.tables", description = "This attribute is written on the flow files routed to the 'success' relationships, "
+ "and contains input table names (if any) in comma delimited 'databaseName.tableName' format."),
@WritesAttribute(attribute = "query.output.tables", description = "This attribute is written on the flow files routed to the 'success' relationships, "
+ "and contains the target table names in 'databaseName.tableName' format.")
})
@DeprecationNotice(classNames = "org.apache.nifi.processors.hive.PutHive3QL")
public class PutHiveQL extends AbstractHiveQLProcessor {
public static final PropertyDescriptor BATCH_SIZE = new PropertyDescriptor.Builder()
.name("hive-batch-size")
.displayName("Batch Size")
.description("The preferred number of FlowFiles to put to the database in a single transaction")
.required(true)
.addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
.defaultValue("100")
.build();
public static final PropertyDescriptor STATEMENT_DELIMITER = new PropertyDescriptor.Builder()
.name("statement-delimiter")
.displayName("Statement Delimiter")
.description("Statement Delimiter used to separate SQL statements in a multiple statement script")
.required(true)
.defaultValue(";")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.NONE)
.build();
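// Editorial note (not part of the removed source): with the default delimiter ";", a FlowFile whose
// content is "CREATE TABLE t (x INT); INSERT INTO t VALUES (1)" is executed as two statements, while
// an escaped delimiter ("\;") is not treated as a statement boundary; see the split regex with the
// negative lookbehind in putFlowFile further below.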
public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("A FlowFile is routed to this relationship after the database is successfully updated")
.build();
public static final Relationship REL_RETRY = new Relationship.Builder()
.name("retry")
.description("A FlowFile is routed to this relationship if the database cannot be updated but attempting the operation again may succeed")
.build();
public static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("A FlowFile is routed to this relationship if the database cannot be updated and retrying the operation will also fail, "
+ "such as an invalid query or an integrity constraint violation")
.build();
private final static List<PropertyDescriptor> propertyDescriptors;
private final static Set<Relationship> relationships;
/*
* Will ensure that the list of property descriptors is built only once.
* Will also create a Set of relationships
*/
static {
List<PropertyDescriptor> _propertyDescriptors = new ArrayList<>();
_propertyDescriptors.add(HIVE_DBCP_SERVICE);
_propertyDescriptors.add(BATCH_SIZE);
_propertyDescriptors.add(CHARSET);
_propertyDescriptors.add(STATEMENT_DELIMITER);
_propertyDescriptors.add(RollbackOnFailure.ROLLBACK_ON_FAILURE);
propertyDescriptors = Collections.unmodifiableList(_propertyDescriptors);
Set<Relationship> _relationships = new HashSet<>();
_relationships.add(REL_SUCCESS);
_relationships.add(REL_FAILURE);
_relationships.add(REL_RETRY);
relationships = Collections.unmodifiableSet(_relationships);
}
private Put<FunctionContext, Connection> process;
private ExceptionHandler<FunctionContext> exceptionHandler;
@OnScheduled
public void constructProcess() {
exceptionHandler = new ExceptionHandler<>();
exceptionHandler.mapException(e -> {
if (e instanceof SQLNonTransientException) {
return ErrorTypes.InvalidInput;
} else if (e instanceof SQLException) {
// Use the SQLException's vendor code for guidance -- see Hive's ErrorMsg class for details on error codes
int errorCode = ((SQLException) e).getErrorCode();
getLogger().debug("Error occurred during Hive operation, Hive returned error code {}", new Object[]{errorCode});
if (errorCode >= 10000 && errorCode < 20000) {
return ErrorTypes.InvalidInput;
} else if (errorCode >= 20000 && errorCode < 30000) {
return ErrorTypes.InvalidInput;
} else if (errorCode >= 30000 && errorCode < 40000) {
return ErrorTypes.TemporalInputFailure;
} else if (errorCode >= 40000 && errorCode < 50000) {
// These are unknown errors (to include some parse errors), but rather than generating an UnknownFailure which causes
// a ProcessException, we'll route to failure via an InvalidInput error type.
return ErrorTypes.InvalidInput;
} else {
// Default unknown errors to TemporalFailure (as they were implemented originally), so they can be routed to failure
// or rolled back depending on the user's setting of Rollback On Failure.
return ErrorTypes.TemporalFailure;
}
} else {
return ErrorTypes.UnknownFailure;
}
});
exceptionHandler.adjustError(RollbackOnFailure.createAdjustError(getLogger()));
process = new Put<>();
process.setLogger(getLogger());
process.initConnection(initConnection);
process.fetchFlowFiles(fetchFlowFiles);
process.putFlowFile(putFlowFile);
process.adjustRoute(RollbackOnFailure.createAdjustRoute(REL_FAILURE, REL_RETRY));
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propertyDescriptors;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
private class FunctionContext extends RollbackOnFailure {
final Charset charset;
final String statementDelimiter;
final long startNanos = System.nanoTime();
String connectionUrl;
private FunctionContext(boolean rollbackOnFailure, Charset charset, String statementDelimiter) {
super(rollbackOnFailure, false);
this.charset = charset;
this.statementDelimiter = statementDelimiter;
}
}
private InitConnection<FunctionContext, Connection> initConnection = (context, session, fc, ffs) -> {
final HiveDBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(HiveDBCPService.class);
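// The first FlowFile's attributes are handed to the controller service so that any connection
// properties supporting Expression Language can (if the service evaluates them) be resolved for this batch.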
final Connection connection = dbcpService.getConnection(ffs == null || ffs.isEmpty() ? Collections.emptyMap() : ffs.get(0).getAttributes());
fc.connectionUrl = dbcpService.getConnectionURL();
return connection;
};
private FetchFlowFiles<FunctionContext> fetchFlowFiles = (context, session, functionContext, result) -> {
final int batchSize = context.getProperty(BATCH_SIZE).asInteger();
return session.get(batchSize);
};
private Put.PutFlowFile<FunctionContext, Connection> putFlowFile = (context, session, fc, conn, flowFile, result) -> {
final String script = getHiveQL(session, flowFile, fc.charset);
String regex = "(?<!\\\\)" + Pattern.quote(fc.statementDelimiter);
String[] hiveQLs = script.split(regex);
final Set<TableName> tableNames = new HashSet<>();
exceptionHandler.execute(fc, flowFile, input -> {
int loc = 1;
for (String hiveQLStr: hiveQLs) {
getLogger().debug("HiveQL: {}", new Object[]{hiveQLStr});
final String hiveQL = hiveQLStr.trim();
if (!StringUtils.isEmpty(hiveQL)) {
try (final PreparedStatement stmt = conn.prepareStatement(hiveQL)) {
// Get ParameterMetadata
// Hive JDBC Doesn't support this yet:
// ParameterMetaData pmd = stmt.getParameterMetaData();
// int paramCount = pmd.getParameterCount();
int paramCount = StringUtils.countMatches(hiveQL, "?");
if (paramCount > 0) {
loc = setParameters(loc, stmt, paramCount, flowFile.getAttributes());
}
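// setParameters (inherited from AbstractHiveQLProcessor) binds values from the hiveql.args.N.* attributes;
// the returned 'loc' carries the running parameter index forward so numbering continues across every
// statement in the script rather than restarting at 1.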
// Parse hiveQL and extract input/output tables
try {
tableNames.addAll(findTableNames(hiveQL));
} catch (Exception e) {
// If we fail to parse the query, just log a warning message and continue.
getLogger().warn("Failed to parse hiveQL: {} due to {}", new Object[]{hiveQL, e}, e);
}
// Execute the statement
stmt.execute();
fc.proceed();
}
}
}
// Emit a Provenance SEND event
final long transmissionMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - fc.startNanos);
final FlowFile updatedFlowFile = session.putAllAttributes(flowFile, toQueryTableAttributes(tableNames));
session.getProvenanceReporter().send(updatedFlowFile, fc.connectionUrl, transmissionMillis, true);
result.routeTo(updatedFlowFile, REL_SUCCESS);
}, onFlowFileError(context, session, result));
};
private OnError<FunctionContext, FlowFile> onFlowFileError(final ProcessContext context, final ProcessSession session, final RoutingResult result) {
OnError<FunctionContext, FlowFile> onFlowFileError = ExceptionHandler.createOnError(context, session, result, REL_FAILURE, REL_RETRY);
onFlowFileError = onFlowFileError.andThen((c, i, r, e) -> {
switch (r.destination()) {
case Failure:
getLogger().error("Failed to update Hive for {} due to {}; routing to failure", new Object[] {i, e}, e);
break;
case Retry:
getLogger().error("Failed to update Hive for {} due to {}; it is possible that retrying the operation will succeed, so routing to retry",
new Object[] {i, e}, e);
break;
case Self:
getLogger().error("Failed to update Hive for {} due to {};", new Object[] {i, e}, e);
break;
}
});
return RollbackOnFailure.createOnError(onFlowFileError);
}
@Override
public void onTrigger(ProcessContext context, ProcessSessionFactory sessionFactory) throws ProcessException {
final Boolean rollbackOnFailure = context.getProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE).asBoolean();
final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
final String statementDelimiter = context.getProperty(STATEMENT_DELIMITER).getValue();
final FunctionContext functionContext = new FunctionContext(rollbackOnFailure, charset, statementDelimiter);
RollbackOnFailure.onTrigger(context, sessionFactory, functionContext, getLogger(), session -> process.onTrigger(context, session, functionContext));
}
}

View File

@ -1,572 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.DeprecationNotice;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessSessionFactory;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.processor.util.pattern.PartialFunctions;
import org.apache.nifi.util.StopWatch;
import org.apache.nifi.util.hive.CsvOutputOptions;
import org.apache.nifi.util.hive.HiveJdbcCommon;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import static org.apache.nifi.util.hive.HiveJdbcCommon.AVRO;
import static org.apache.nifi.util.hive.HiveJdbcCommon.CSV;
import static org.apache.nifi.util.hive.HiveJdbcCommon.CSV_MIME_TYPE;
import static org.apache.nifi.util.hive.HiveJdbcCommon.MIME_TYPE_AVRO_BINARY;
import static org.apache.nifi.util.hive.HiveJdbcCommon.NORMALIZE_NAMES_FOR_AVRO;
@EventDriven
@InputRequirement(Requirement.INPUT_ALLOWED)
@Tags({"hive", "sql", "select", "jdbc", "query", "database"})
@CapabilityDescription("Execute provided HiveQL SELECT query against a Hive database connection. Query result will be converted to Avro or CSV format."
+ " Streaming is used so arbitrarily large result sets are supported. This processor can be scheduled to run on "
+ "a timer, or cron expression, using the standard scheduling methods, or it can be triggered by an incoming FlowFile. "
+ "If it is triggered by an incoming FlowFile, then attributes of that FlowFile will be available when evaluating the "
+ "select query. FlowFile attribute 'selecthiveql.row.count' indicates how many rows were selected.")
@WritesAttributes({
@WritesAttribute(attribute = "mime.type", description = "Sets the MIME type for the outgoing flowfile to application/avro-binary for Avro or text/csv for CSV."),
@WritesAttribute(attribute = "filename", description = "Adds .avro or .csv to the filename attribute depending on which output format is selected."),
@WritesAttribute(attribute = "selecthiveql.row.count", description = "Indicates how many rows were selected/returned by the query."),
@WritesAttribute(attribute = "selecthiveql.query.duration", description = "Combined duration of the query execution time and fetch time in milliseconds. "
+ "If 'Max Rows Per Flow File' is set, then this number will reflect only the fetch time for the rows in the Flow File instead of the entire result set."),
@WritesAttribute(attribute = "selecthiveql.query.executiontime", description = "Duration of the query execution time in milliseconds. "
+ "This number will reflect the query execution time regardless of the 'Max Rows Per Flow File' setting."),
@WritesAttribute(attribute = "selecthiveql.query.fetchtime", description = "Duration of the result set fetch time in milliseconds. "
+ "If 'Max Rows Per Flow File' is set, then this number will reflect only the fetch time for the rows in the Flow File instead of the entire result set."),
@WritesAttribute(attribute = "fragment.identifier", description = "If 'Max Rows Per Flow File' is set then all FlowFiles from the same query result set "
+ "will have the same value for the fragment.identifier attribute. This can then be used to correlate the results."),
@WritesAttribute(attribute = "fragment.count", description = "If 'Max Rows Per Flow File' is set then this is the total number of "
+ "FlowFiles produced by a single ResultSet. This can be used in conjunction with the "
+ "fragment.identifier attribute in order to know how many FlowFiles belonged to the same incoming ResultSet."),
@WritesAttribute(attribute = "fragment.index", description = "If 'Max Rows Per Flow File' is set then the position of this FlowFile in the list of "
+ "outgoing FlowFiles that were all derived from the same result set FlowFile. This can be "
+ "used in conjunction with the fragment.identifier attribute to know which FlowFiles originated from the same query result set and in what order "
+ "FlowFiles were produced"),
@WritesAttribute(attribute = "query.input.tables", description = "Contains input table names in comma delimited 'databaseName.tableName' format.")
})
@DeprecationNotice(classNames = "org.apache.nifi.processors.hive.SelectHive3QL")
public class SelectHiveQL extends AbstractHiveQLProcessor {
public static final String RESULT_ROW_COUNT = "selecthiveql.row.count";
public static final String RESULT_QUERY_DURATION = "selecthiveql.query.duration";
public static final String RESULT_QUERY_EXECUTION_TIME = "selecthiveql.query.executiontime";
public static final String RESULT_QUERY_FETCH_TIME = "selecthiveql.query.fetchtime";
// Relationships
public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("Successfully created FlowFile from HiveQL query result set.")
.build();
public static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("HiveQL query execution failed. Incoming FlowFile will be penalized and routed to this relationship.")
.build();
public static final PropertyDescriptor HIVEQL_PRE_QUERY = new PropertyDescriptor.Builder()
.name("hive-pre-query")
.displayName("HiveQL Pre-Query")
.description("A semicolon-delimited list of queries executed before the main SQL query is executed. "
+ "Example: 'set tez.queue.name=queue1; set hive.exec.orc.split.strategy=ETL; set hive.exec.reducers.bytes.per.reducer=1073741824'. "
+ "Note, the results/outputs of these queries will be suppressed if successfully executed.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_SELECT_QUERY = new PropertyDescriptor.Builder()
.name("hive-query")
.displayName("HiveQL Select Query")
.description("HiveQL SELECT query to execute. If this is not set, the query is assumed to be in the content of an incoming FlowFile.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_POST_QUERY = new PropertyDescriptor.Builder()
.name("hive-post-query")
.displayName("HiveQL Post-Query")
.description("A semicolon-delimited list of queries executed after the main SQL query is executed. "
+ "Note, the results/outputs of these queries will be suppressed if successfully executed.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor FETCH_SIZE = new PropertyDescriptor.Builder()
.name("hive-fetch-size")
.displayName("Fetch Size")
.description("The number of result rows to be fetched from the result set at a time. This is a hint to the driver and may not be "
+ "honored and/or exact. If the value specified is zero, then the hint is ignored.")
.defaultValue("0")
.required(true)
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor MAX_ROWS_PER_FLOW_FILE = new PropertyDescriptor.Builder()
.name("hive-max-rows")
.displayName("Max Rows Per Flow File")
.description("The maximum number of result rows that will be included in a single FlowFile. " +
"This will allow you to break up very large result sets into multiple FlowFiles. If the value specified is zero, then all rows are returned in a single FlowFile.")
.defaultValue("0")
.required(true)
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor MAX_FRAGMENTS = new PropertyDescriptor.Builder()
.name("hive-max-frags")
.displayName("Maximum Number of Fragments")
.description("The maximum number of fragments. If the value specified is zero, then all fragments are returned. " +
"This prevents OutOfMemoryError when this processor ingests huge table.")
.defaultValue("0")
.required(true)
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_CSV_HEADER = new PropertyDescriptor.Builder()
.name("csv-header")
.displayName("CSV Header")
.description("Include Header in Output")
.required(true)
.allowableValues("true", "false")
.defaultValue("true")
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
.build();
public static final PropertyDescriptor HIVEQL_CSV_ALT_HEADER = new PropertyDescriptor.Builder()
.name("csv-alt-header")
.displayName("Alternate CSV Header")
.description("Comma separated list of header fields")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_CSV_DELIMITER = new PropertyDescriptor.Builder()
.name("csv-delimiter")
.displayName("CSV Delimiter")
.description("CSV Delimiter used to separate fields")
.required(true)
.defaultValue(",")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_CSV_QUOTE = new PropertyDescriptor.Builder()
.name("csv-quote")
.displayName("CSV Quote")
.description("Whether to force quoting of CSV fields. Note that this might conflict with the setting for CSV Escape.")
.required(true)
.allowableValues("true", "false")
.defaultValue("true")
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
.build();
public static final PropertyDescriptor HIVEQL_CSV_ESCAPE = new PropertyDescriptor.Builder()
.name("csv-escape")
.displayName("CSV Escape")
.description("Whether to escape CSV strings in output. Note that this might conflict with the setting for CSV Quote.")
.required(true)
.allowableValues("true", "false")
.defaultValue("true")
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
.build();
public static final PropertyDescriptor HIVEQL_OUTPUT_FORMAT = new PropertyDescriptor.Builder()
.name("hive-output-format")
.displayName("Output Format")
.description("How to represent the records coming from Hive (Avro, CSV, e.g.)")
.required(true)
.allowableValues(AVRO, CSV)
.defaultValue(AVRO)
.expressionLanguageSupported(ExpressionLanguageScope.NONE)
.build();
private final static List<PropertyDescriptor> propertyDescriptors;
private final static Set<Relationship> relationships;
/*
* Build the list of property descriptors and the set of relationships once, in a static initializer.
*/
static {
List<PropertyDescriptor> _propertyDescriptors = new ArrayList<>();
_propertyDescriptors.add(HIVE_DBCP_SERVICE);
_propertyDescriptors.add(HIVEQL_PRE_QUERY);
_propertyDescriptors.add(HIVEQL_SELECT_QUERY);
_propertyDescriptors.add(HIVEQL_POST_QUERY);
_propertyDescriptors.add(FETCH_SIZE);
_propertyDescriptors.add(MAX_ROWS_PER_FLOW_FILE);
_propertyDescriptors.add(MAX_FRAGMENTS);
_propertyDescriptors.add(HIVEQL_OUTPUT_FORMAT);
_propertyDescriptors.add(NORMALIZE_NAMES_FOR_AVRO);
_propertyDescriptors.add(HIVEQL_CSV_HEADER);
_propertyDescriptors.add(HIVEQL_CSV_ALT_HEADER);
_propertyDescriptors.add(HIVEQL_CSV_DELIMITER);
_propertyDescriptors.add(HIVEQL_CSV_QUOTE);
_propertyDescriptors.add(HIVEQL_CSV_ESCAPE);
_propertyDescriptors.add(CHARSET);
propertyDescriptors = Collections.unmodifiableList(_propertyDescriptors);
Set<Relationship> _relationships = new HashSet<>();
_relationships.add(REL_SUCCESS);
_relationships.add(REL_FAILURE);
relationships = Collections.unmodifiableSet(_relationships);
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propertyDescriptors;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
@OnScheduled
public void setup(ProcessContext context) {
// If the query is not set, an incoming connection must supply the query; otherwise fail at schedule time
if (!context.getProperty(HIVEQL_SELECT_QUERY).isSet() && !context.hasIncomingConnection()) {
final String errorString = "Either the Select Query must be specified or there must be an incoming connection "
+ "providing flowfile(s) containing a SQL select query";
getLogger().error(errorString);
throw new ProcessException(errorString);
}
}
@Override
public void onTrigger(ProcessContext context, ProcessSessionFactory sessionFactory) throws ProcessException {
PartialFunctions.onTrigger(context, sessionFactory, getLogger(), session -> onTrigger(context, session));
}
private void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile fileToProcess = (context.hasIncomingConnection() ? session.get() : null);
FlowFile flowfile = null;
// If we have no FlowFile, and all incoming connections are self-loops then we can continue on.
// However, if we have no FlowFile and we have connections coming from other Processors, then
// we know that we should run only if we have a FlowFile.
if (context.hasIncomingConnection()) {
if (fileToProcess == null && context.hasNonLoopConnection()) {
return;
}
}
final ComponentLog logger = getLogger();
final HiveDBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(HiveDBCPService.class);
final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
List<String> preQueries = getQueries(context.getProperty(HIVEQL_PRE_QUERY).evaluateAttributeExpressions(fileToProcess).getValue());
List<String> postQueries = getQueries(context.getProperty(HIVEQL_POST_QUERY).evaluateAttributeExpressions(fileToProcess).getValue());
final boolean flowbased = !(context.getProperty(HIVEQL_SELECT_QUERY).isSet());
// Source the SQL
String hqlStatement;
if (context.getProperty(HIVEQL_SELECT_QUERY).isSet()) {
hqlStatement = context.getProperty(HIVEQL_SELECT_QUERY).evaluateAttributeExpressions(fileToProcess).getValue();
} else {
// If the query is not set, then an incoming flow file is required, and expected to contain a valid SQL select query.
// If there is no incoming connection, onTrigger will not be called as the processor will fail when scheduled.
final StringBuilder queryContents = new StringBuilder();
session.read(fileToProcess, in -> queryContents.append(IOUtils.toString(in, charset)));
hqlStatement = queryContents.toString();
}
final Integer fetchSize = context.getProperty(FETCH_SIZE).evaluateAttributeExpressions(fileToProcess).asInteger();
final Integer maxRowsPerFlowFile = context.getProperty(MAX_ROWS_PER_FLOW_FILE).evaluateAttributeExpressions(fileToProcess).asInteger();
final Integer maxFragments = context.getProperty(MAX_FRAGMENTS).isSet()
? context.getProperty(MAX_FRAGMENTS).evaluateAttributeExpressions(fileToProcess).asInteger()
: 0;
final String outputFormat = context.getProperty(HIVEQL_OUTPUT_FORMAT).getValue();
final boolean convertNamesForAvro = context.getProperty(NORMALIZE_NAMES_FOR_AVRO).asBoolean();
final StopWatch stopWatch = new StopWatch(true);
final boolean header = context.getProperty(HIVEQL_CSV_HEADER).asBoolean();
final String altHeader = context.getProperty(HIVEQL_CSV_ALT_HEADER).evaluateAttributeExpressions(fileToProcess).getValue();
final String delimiter = context.getProperty(HIVEQL_CSV_DELIMITER).evaluateAttributeExpressions(fileToProcess).getValue();
final boolean quote = context.getProperty(HIVEQL_CSV_QUOTE).asBoolean();
final boolean escape = context.getProperty(HIVEQL_CSV_ESCAPE).asBoolean();
final String fragmentIdentifier = UUID.randomUUID().toString();
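// When 'Max Rows Per Flow File' is set, every FlowFile produced from this result set carries this
// identifier in the fragment.identifier attribute so downstream processing can correlate the fragments.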
try (final Connection con = dbcpService.getConnection(fileToProcess == null ? Collections.emptyMap() : fileToProcess.getAttributes());
final Statement st = (flowbased ? con.prepareStatement(hqlStatement) : con.createStatement())
) {
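// Pre-queries (typically 'set ...' statements) run on the same connection before the main query;
// a failure here is reported like a failure of the main statement, with the failing pre-query
// substituted into hqlStatement below.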
Pair<String,SQLException> failure = executeConfigStatements(con, preQueries);
if (failure != null) {
// In case of failure, assign the failing config query to "hqlStatement" so the existing error handling applies
hqlStatement = failure.getLeft();
flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
fileToProcess = null;
throw failure.getRight();
}
if (fetchSize != null && fetchSize > 0) {
try {
st.setFetchSize(fetchSize);
} catch (SQLException se) {
// Not all drivers support this, just log the error (at debug level) and move on
logger.debug("Cannot set fetch size to {} due to {}", new Object[]{fetchSize, se.getLocalizedMessage()}, se);
}
}
final List<FlowFile> resultSetFlowFiles = new ArrayList<>();
try {
logger.debug("Executing query {}", new Object[]{hqlStatement});
if (flowbased) {
// Hive JDBC Doesn't Support this yet:
// ParameterMetaData pmd = ((PreparedStatement)st).getParameterMetaData();
// int paramCount = pmd.getParameterCount();
// Alternate way to determine number of params in SQL.
int paramCount = StringUtils.countMatches(hqlStatement, "?");
if (paramCount > 0) {
setParameters(1, (PreparedStatement) st, paramCount, fileToProcess.getAttributes());
}
}
final StopWatch executionTime = new StopWatch(true);
final ResultSet resultSet;
try {
resultSet = (flowbased ? ((PreparedStatement) st).executeQuery() : st.executeQuery(hqlStatement));
} catch (SQLException se) {
// If an error occurs during the query, a flowfile is expected to be routed to failure, so ensure one here
flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
fileToProcess = null;
throw se;
}
long executionTimeElapsed = executionTime.getElapsed(TimeUnit.MILLISECONDS);
int fragmentIndex = 0;
String baseFilename = (fileToProcess != null) ? fileToProcess.getAttribute(CoreAttributes.FILENAME.key()) : null;
while (true) {
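// Each pass writes at most 'Max Rows Per Flow File' rows (or the whole result set when that property
// is 0) into a fresh FlowFile; the loop ends when a pass yields no rows or the configured maximum
// number of fragments has been produced.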
final AtomicLong nrOfRows = new AtomicLong(0L);
final StopWatch fetchTime = new StopWatch(true);
flowfile = (fileToProcess == null) ? session.create() : session.create(fileToProcess);
if (baseFilename == null) {
baseFilename = flowfile.getAttribute(CoreAttributes.FILENAME.key());
}
try {
flowfile = session.write(flowfile, out -> {
try {
if (AVRO.equals(outputFormat)) {
nrOfRows.set(HiveJdbcCommon.convertToAvroStream(resultSet, out, maxRowsPerFlowFile, convertNamesForAvro));
} else if (CSV.equals(outputFormat)) {
CsvOutputOptions options = new CsvOutputOptions(header, altHeader, delimiter, quote, escape, maxRowsPerFlowFile);
nrOfRows.set(HiveJdbcCommon.convertToCsvStream(resultSet, out, options));
} else {
nrOfRows.set(0L);
throw new ProcessException("Unsupported output format: " + outputFormat);
}
} catch (final SQLException | RuntimeException e) {
throw new ProcessException("Error during database query or conversion of records.", e);
}
});
} catch (ProcessException e) {
// Add flowfile to results before rethrowing so it will be removed from session in outer catch
resultSetFlowFiles.add(flowfile);
throw e;
}
long fetchTimeElapsed = fetchTime.getElapsed(TimeUnit.MILLISECONDS);
if (nrOfRows.get() > 0 || resultSetFlowFiles.isEmpty()) {
final Map<String, String> attributes = new HashMap<>();
// Set attribute for how many rows were selected
attributes.put(RESULT_ROW_COUNT, String.valueOf(nrOfRows.get()));
try {
// Set input/output table names by parsing the query
attributes.putAll(toQueryTableAttributes(findTableNames(hqlStatement)));
} catch (Exception e) {
// If we fail to parse the query, just log a warning message and continue.
getLogger().warn("Failed to parse query: {} due to {}", new Object[]{hqlStatement, e}, e);
}
// Set MIME type on output document and add extension to filename
if (AVRO.equals(outputFormat)) {
attributes.put(CoreAttributes.MIME_TYPE.key(), MIME_TYPE_AVRO_BINARY);
attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".avro");
} else if (CSV.equals(outputFormat)) {
attributes.put(CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE);
attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".csv");
}
if (maxRowsPerFlowFile > 0) {
attributes.put("fragment.identifier", fragmentIdentifier);
attributes.put("fragment.index", String.valueOf(fragmentIndex));
}
attributes.put(RESULT_QUERY_DURATION, String.valueOf(executionTimeElapsed + fetchTimeElapsed));
attributes.put(RESULT_QUERY_EXECUTION_TIME, String.valueOf(executionTimeElapsed));
attributes.put(RESULT_QUERY_FETCH_TIME, String.valueOf(fetchTimeElapsed));
flowfile = session.putAllAttributes(flowfile, attributes);
logger.info("{} contains {} " + outputFormat + " records; transferring to 'success'",
new Object[]{flowfile, nrOfRows.get()});
if (context.hasIncomingConnection()) {
// If the flow file came from an incoming connection, issue a Fetch provenance event
session.getProvenanceReporter().fetch(flowfile, dbcpService.getConnectionURL(),
"Retrieved " + nrOfRows.get() + " rows", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
} else {
// If we created a flow file from rows received from Hive, issue a Receive provenance event
session.getProvenanceReporter().receive(flowfile, dbcpService.getConnectionURL(), stopWatch.getElapsed(TimeUnit.MILLISECONDS));
}
resultSetFlowFiles.add(flowfile);
} else {
// If there were no rows returned and the first flow file has already been produced, we're done processing, so remove this flowfile and carry on
session.remove(flowfile);
if (resultSetFlowFiles != null && resultSetFlowFiles.size() > 0) {
flowfile = resultSetFlowFiles.get(resultSetFlowFiles.size() - 1);
}
break;
}
fragmentIndex++;
if (maxFragments > 0 && fragmentIndex >= maxFragments) {
break;
}
}
for (int i = 0; i < resultSetFlowFiles.size(); i++) {
// Set count on all FlowFiles
if (maxRowsPerFlowFile > 0) {
resultSetFlowFiles.set(i,
session.putAttribute(resultSetFlowFiles.get(i), "fragment.count", Integer.toString(fragmentIndex)));
}
}
} catch (final SQLException e) {
throw e;
}
failure = executeConfigStatements(con, postQueries);
if (failure != null) {
hqlStatement = failure.getLeft();
if (resultSetFlowFiles != null) {
resultSetFlowFiles.forEach(ff -> session.remove(ff));
}
flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
fileToProcess = null;
throw failure.getRight();
}
session.transfer(resultSetFlowFiles, REL_SUCCESS);
if (fileToProcess != null) {
session.remove(fileToProcess);
}
} catch (final ProcessException | SQLException e) {
logger.error("Issue processing SQL {} due to {}.", new Object[]{hqlStatement, e});
if (flowfile == null) {
// This can happen if any exceptions occur while setting up the connection, statement, etc.
logger.error("Unable to execute HiveQL select query {} due to {}. No FlowFile to route to failure",
new Object[]{hqlStatement, e});
context.yield();
} else {
if (context.hasIncomingConnection()) {
logger.error("Unable to execute HiveQL select query {} for {} due to {}; routing to failure",
new Object[]{hqlStatement, flowfile, e});
flowfile = session.penalize(flowfile);
} else {
logger.error("Unable to execute HiveQL select query {} due to {}; routing to failure",
new Object[]{hqlStatement, e});
context.yield();
}
session.transfer(flowfile, REL_FAILURE);
}
}
}
/*
* Executes the given queries using the provided connection.
* Returns null on success, or a Pair of the failing query string and its SQLException on failure.
*/
protected Pair<String,SQLException> executeConfigStatements(final Connection con, final List<String> configQueries){
if (configQueries == null || configQueries.isEmpty()) {
return null;
}
for (String confSQL : configQueries) {
try(final Statement st = con.createStatement()){
st.execute(confSQL);
} catch (SQLException e) {
return Pair.of(confSQL, e);
}
}
return null;
}
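// Splits a semicolon-delimited script into individual trimmed statements, e.g.
// "set a=1; set b=2" becomes ["set a=1", "set b=2"]; blank input yields null.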
protected List<String> getQueries(final String value) {
if (value == null || value.length() == 0 || value.trim().length() == 0) {
return null;
}
final List<String> queries = new LinkedList<>();
for (String query : value.split(";")) {
if (query.trim().length() > 0) {
queries.add(query.trim());
}
}
return queries;
}
}

View File

@ -1,769 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
import org.apache.nifi.annotation.behavior.RequiresInstanceClassLoading;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.DeprecationNotice;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.processor.util.pattern.DiscontinuedException;
import org.apache.nifi.processors.hadoop.exception.RecordReaderFactoryException;
import org.apache.nifi.serialization.MalformedRecordException;
import org.apache.nifi.serialization.RecordReader;
import org.apache.nifi.serialization.RecordReaderFactory;
import org.apache.nifi.serialization.RecordSetWriter;
import org.apache.nifi.serialization.RecordSetWriterFactory;
import org.apache.nifi.serialization.SimpleRecordSchema;
import org.apache.nifi.serialization.WriteResult;
import org.apache.nifi.serialization.record.MapRecord;
import org.apache.nifi.serialization.record.Record;
import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordSchema;
import org.apache.nifi.util.StringUtils;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@Tags({"hive", "metadata", "jdbc", "database", "table"})
@CapabilityDescription("This processor uses a Hive JDBC connection and incoming records to generate any Hive 1.2 table changes needed to support the incoming records.")
@ReadsAttributes({
@ReadsAttribute(attribute = "hive.table.management.strategy", description = "This attribute is read if the 'Table Management Strategy' property is configured "
+ "to use the value of this attribute. The value of this attribute should correspond (ignoring case) to a valid option of the 'Table Management Strategy' property.")
})
@WritesAttributes({
@WritesAttribute(attribute = "output.table", description = "This attribute is written on the flow files routed to the 'success' "
+ "and 'failure' relationships, and contains the target table name."),
@WritesAttribute(attribute = "output.path", description = "This attribute is written on the flow files routed to the 'success' "
+ "and 'failure' relationships, and contains the path on the file system to the table (or partition location if the table is partitioned)."),
@WritesAttribute(attribute = "mime.type", description = "Sets the mime.type attribute to the MIME Type specified by the Record Writer, only if a Record Writer is specified "
+ "and Update Field Names is 'true'."),
@WritesAttribute(attribute = "record.count", description = "Sets the number of records in the FlowFile, only if a Record Writer is specified and Update Field Names is 'true'.")
})
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@RequiresInstanceClassLoading
@DeprecationNotice(classNames = "org.apache.nifi.processors.hive.UpdateHive3Table")
public class UpdateHiveTable extends AbstractProcessor {
static final String TEXTFILE = "TEXTFILE";
static final String SEQUENCEFILE = "SEQUENCEFILE";
static final String ORC = "ORC";
static final String PARQUET = "PARQUET";
static final String AVRO = "AVRO";
static final String RCFILE = "RCFILE";
static final AllowableValue TEXTFILE_STORAGE = new AllowableValue(TEXTFILE, TEXTFILE, "Stored as plain text files. TEXTFILE is the default file format, unless the configuration "
+ "parameter hive.default.fileformat has a different setting.");
static final AllowableValue SEQUENCEFILE_STORAGE = new AllowableValue(SEQUENCEFILE, SEQUENCEFILE, "Stored as compressed Sequence Files.");
static final AllowableValue ORC_STORAGE = new AllowableValue(ORC, ORC, "Stored as ORC file format. Supports ACID Transactions & Cost-based Optimizer (CBO). "
+ "Stores column-level metadata.");
static final AllowableValue PARQUET_STORAGE = new AllowableValue(PARQUET, PARQUET, "Stored as Parquet format for the Parquet columnar storage format.");
static final AllowableValue AVRO_STORAGE = new AllowableValue(AVRO, AVRO, "Stored as Avro format.");
static final AllowableValue RCFILE_STORAGE = new AllowableValue(RCFILE, RCFILE, "Stored as Record Columnar File format.");
static final AllowableValue CREATE_IF_NOT_EXISTS = new AllowableValue("Create If Not Exists", "Create If Not Exists",
"Create a table with the given schema if it does not already exist");
static final AllowableValue FAIL_IF_NOT_EXISTS = new AllowableValue("Fail If Not Exists", "Fail If Not Exists",
"If the target does not already exist, log an error and route the flowfile to failure");
static final String TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE = "hive.table.management.strategy";
static final AllowableValue MANAGED_TABLE = new AllowableValue("Managed", "Managed",
"Any tables created by this processor will be managed tables (see Hive documentation for details).");
static final AllowableValue EXTERNAL_TABLE = new AllowableValue("External", "External",
"Any tables created by this processor will be external tables located at the `External Table Location` property value.");
static final AllowableValue ATTRIBUTE_DRIVEN_TABLE = new AllowableValue("Use '" + TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE + "' Attribute",
"Use '" + TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE + "' Attribute",
"Inspects the '" + TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE + "' FlowFile attribute to determine the table management strategy. The value "
+ "of this attribute must be a case-insensitive match to one of the other allowable values (Managed, External, e.g.).");
static final String ATTR_OUTPUT_TABLE = "output.table";
static final String ATTR_OUTPUT_PATH = "output.path";
// Properties
static final PropertyDescriptor RECORD_READER = new PropertyDescriptor.Builder()
.name("record-reader")
.displayName("Record Reader")
.description("The service for reading incoming flow files. The reader is only used to determine the schema of the records, the actual records will not be processed.")
.identifiesControllerService(RecordReaderFactory.class)
.required(true)
.build();
static final PropertyDescriptor HIVE_DBCP_SERVICE = new PropertyDescriptor.Builder()
.name("hive-dbcp-service")
.displayName("Hive Database Connection Pooling Service")
.description("The Hive Controller Service that is used to obtain connection(s) to the Hive database")
.required(true)
.identifiesControllerService(HiveDBCPService.class)
.build();
static final PropertyDescriptor TABLE_NAME = new PropertyDescriptor.Builder()
.name("hive-table-name")
.displayName("Table Name")
.description("The name of the database table to update. If the table does not exist, then it will either be created or an error thrown, depending "
+ "on the value of the Create Table property.")
.required(true)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
static final PropertyDescriptor CREATE_TABLE = new PropertyDescriptor.Builder()
.name("hive-create-table")
.displayName("Create Table Strategy")
.description("Specifies how to process the target table when it does not exist (create it, fail, e.g.).")
.required(true)
.addValidator(Validator.VALID)
.allowableValues(CREATE_IF_NOT_EXISTS, FAIL_IF_NOT_EXISTS)
.defaultValue(FAIL_IF_NOT_EXISTS.getValue())
.build();
static final PropertyDescriptor TABLE_MANAGEMENT_STRATEGY = new PropertyDescriptor.Builder()
.name("hive-create-table-management")
.displayName("Create Table Management Strategy")
.description("Specifies (when a table is to be created) whether the table is a managed table or an external table. Note that when External is specified, the "
+ "'External Table Location' property must be specified. If the '" + TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE + "' value is selected, 'External Table Location' "
+ "must still be specified, but can contain Expression Language or be set to the empty string, and is ignored when the attribute evaluates to 'Managed'.")
.required(true)
.addValidator(Validator.VALID)
.allowableValues(MANAGED_TABLE, EXTERNAL_TABLE, ATTRIBUTE_DRIVEN_TABLE)
.defaultValue(MANAGED_TABLE.getValue())
.dependsOn(CREATE_TABLE, CREATE_IF_NOT_EXISTS)
.build();
static final PropertyDescriptor UPDATE_FIELD_NAMES = new PropertyDescriptor.Builder()
.name("hive-update-field-names")
.displayName("Update Field Names")
.description("This property indicates whether to update the output schema such that the field names are set to the exact column names from the specified "
+ "table. This should be used if the incoming record field names may not match the table's column names in terms of upper- and lower-case. For example, this property should be "
+ "set to true if the output FlowFile (and target table storage) is Avro format, as Hive/Impala expects the field names to match the column names exactly.")
.allowableValues("true", "false")
.defaultValue("false")
.required(true)
.build();
static final PropertyDescriptor RECORD_WRITER_FACTORY = new PropertyDescriptor.Builder()
.name("hive-record-writer")
.displayName("Record Writer")
.description("Specifies the Controller Service to use for writing results to a FlowFile. The Record Writer should use Inherit Schema to emulate the inferred schema behavior, i.e. "
+ "an explicit schema need not be defined in the writer, and will be supplied by the same logic used to infer the schema from the column types. If Create Table Strategy is set "
+ "'Create If Not Exists', the Record Writer's output format must match the Record Reader's format in order for the data to be placed in the created table location. Note that "
+ "this property is only used if 'Update Field Names' is set to true and the field names do not all match the column names exactly. If no "
+ "update is needed for any field names (or 'Update Field Names' is false), the Record Writer is not used and instead the input FlowFile is routed to success or failure "
+ "without modification.")
.identifiesControllerService(RecordSetWriterFactory.class)
.dependsOn(UPDATE_FIELD_NAMES, "true")
.required(true)
.build();
static final PropertyDescriptor EXTERNAL_TABLE_LOCATION = new PropertyDescriptor.Builder()
.name("hive-external-table-location")
.displayName("External Table Location")
.description("Specifies (when an external table is to be created) the file path (in HDFS, e.g.) to store table data.")
.required(true)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
.dependsOn(TABLE_MANAGEMENT_STRATEGY, EXTERNAL_TABLE, ATTRIBUTE_DRIVEN_TABLE)
.build();
static final PropertyDescriptor TABLE_STORAGE_FORMAT = new PropertyDescriptor.Builder()
.name("hive-storage-format")
.displayName("Create Table Storage Format")
.description("If a table is to be created, the specified storage format will be used.")
.required(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.allowableValues(TEXTFILE_STORAGE, SEQUENCEFILE_STORAGE, ORC_STORAGE, PARQUET_STORAGE, AVRO_STORAGE, RCFILE_STORAGE)
.defaultValue(TEXTFILE)
.dependsOn(CREATE_TABLE, CREATE_IF_NOT_EXISTS)
.build();
static final PropertyDescriptor QUERY_TIMEOUT = new PropertyDescriptor.Builder()
.name("hive-query-timeout")
.displayName("Query Timeout")
.description("Sets the number of seconds the driver will wait for a query to execute. "
+ "A value of 0 means no timeout. NOTE: Non-zero values may not be supported by the driver.")
.defaultValue("0")
.required(true)
.addValidator(StandardValidators.INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
static final PropertyDescriptor PARTITION_CLAUSE = new PropertyDescriptor.Builder()
.name("hive-partition-clause")
.displayName("Partition Clause")
.description("Specifies a comma-separated list of attribute names and optional data types corresponding to the partition columns of the target table. Simply put, if the table is "
+ "partitioned or is to be created with partitions, each partition name should be an attribute on the FlowFile and listed in this property. This assumes all incoming records "
+ "belong to the same partition and the partition columns are not fields in the record. An example of specifying this field is if PartitionRecord "
+ "is upstream and two partition columns 'name' (of type string) and 'age' (of type integer) are used, then this property can be set to 'name string, age int'. The data types "
+ "are optional and if partition(s) are to be created they will default to string type if not specified. For non-string primitive types, specifying the data type for existing "
+ "partition columns is helpful for interpreting the partition value(s). If the table exists, the data types need not be specified "
+ "(and are ignored in that case). This property must be set if the table is partitioned, and there must be an attribute for each partition column in the table. "
+ "The values of the attributes will be used as the partition values, and the resulting output.path attribute value will reflect the location of the partition in the filesystem "
+ "(for use downstream in processors such as PutHDFS).")
.required(false)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
// Relationships
public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("A FlowFile containing records routed to this relationship after the record has been successfully transmitted to Hive.")
.build();
public static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("A FlowFile containing records routed to this relationship if the record could not be transmitted to Hive.")
.build();
private List<PropertyDescriptor> propertyDescriptors;
private Set<Relationship> relationships;
@Override
protected void init(ProcessorInitializationContext context) {
List<PropertyDescriptor> props = new ArrayList<>();
props.add(RECORD_READER);
props.add(HIVE_DBCP_SERVICE);
props.add(TABLE_NAME);
props.add(PARTITION_CLAUSE);
props.add(CREATE_TABLE);
props.add(TABLE_MANAGEMENT_STRATEGY);
props.add(EXTERNAL_TABLE_LOCATION);
props.add(TABLE_STORAGE_FORMAT);
props.add(UPDATE_FIELD_NAMES);
props.add(RECORD_WRITER_FACTORY);
props.add(QUERY_TIMEOUT);
propertyDescriptors = Collections.unmodifiableList(props);
Set<Relationship> _relationships = new HashSet<>();
_relationships.add(REL_SUCCESS);
_relationships.add(REL_FAILURE);
relationships = Collections.unmodifiableSet(_relationships);
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propertyDescriptors;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
@Override
protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
List<ValidationResult> validationResults = new ArrayList<>(super.customValidate(validationContext));
final boolean recordWriterFactorySet = validationContext.getProperty(RECORD_WRITER_FACTORY).isSet();
final boolean createIfNotExists = validationContext.getProperty(CREATE_TABLE).getValue().equals(CREATE_IF_NOT_EXISTS.getValue());
final boolean updateFieldNames = validationContext.getProperty(UPDATE_FIELD_NAMES).asBoolean();
if (!recordWriterFactorySet && updateFieldNames) {
validationResults.add(new ValidationResult.Builder().subject(RECORD_WRITER_FACTORY.getDisplayName())
.explanation("Record Writer must be set if 'Update Field Names' is true").valid(false).build());
}
final String tableManagementStrategy = validationContext.getProperty(TABLE_MANAGEMENT_STRATEGY).getValue();
final boolean managedTable;
if (!ATTRIBUTE_DRIVEN_TABLE.getValue().equals(tableManagementStrategy)) {
managedTable = MANAGED_TABLE.getValue().equals(tableManagementStrategy);
// Ensure valid configuration for external tables
if (createIfNotExists && !managedTable && !validationContext.getProperty(EXTERNAL_TABLE_LOCATION).isSet()) {
validationResults.add(new ValidationResult.Builder().subject(EXTERNAL_TABLE_LOCATION.getDisplayName())
.explanation("External Table Location must be set when Table Management Strategy is set to External").valid(false).build());
}
}
return validationResults;
}
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
final RecordReaderFactory recordReaderFactory = context.getProperty(RECORD_READER).asControllerService(RecordReaderFactory.class);
final RecordSetWriterFactory recordWriterFactory = context.getProperty(RECORD_WRITER_FACTORY).asControllerService(RecordSetWriterFactory.class);
final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
final String partitionClauseString = context.getProperty(PARTITION_CLAUSE).evaluateAttributeExpressions(flowFile).getValue();
List<String> partitionClauseElements = null;
if (!StringUtils.isEmpty(partitionClauseString)) {
partitionClauseElements = Arrays.stream(partitionClauseString.split(",")).filter(Objects::nonNull).map(String::trim).collect(Collectors.toList());
}
final ComponentLog log = getLogger();
try {
final RecordReader reader;
try (final InputStream in = session.read(flowFile)) {
// if we fail to create the RecordReader then we want to route to failure, so we need to
// handle this separately from the other IOExceptions which normally route to retry
try {
reader = recordReaderFactory.createRecordReader(flowFile, in, getLogger());
} catch (Exception e) {
throw new RecordReaderFactoryException("Unable to create RecordReader", e);
}
} catch (RecordReaderFactoryException rrfe) {
log.error(
"Failed to create {} for {} - routing to failure",
new Object[]{RecordReader.class.getSimpleName(), flowFile},
rrfe
);
// Since we are wrapping the exceptions above there should always be a cause
// but it's possible it might not have a message. This handles that by logging
// the name of the class thrown.
Throwable c = rrfe.getCause();
if (c != null) {
session.putAttribute(flowFile, "record.error.message", (c.getLocalizedMessage() != null) ? c.getLocalizedMessage() : c.getClass().getCanonicalName() + " Thrown");
} else {
session.putAttribute(flowFile, "record.error.message", rrfe.getClass().getCanonicalName() + " Thrown");
}
session.transfer(flowFile, REL_FAILURE);
return;
}
RecordSchema recordSchema = reader.getSchema();
final boolean createIfNotExists = context.getProperty(CREATE_TABLE).getValue().equals(CREATE_IF_NOT_EXISTS.getValue());
final boolean updateFieldNames = context.getProperty(UPDATE_FIELD_NAMES).asBoolean();
if (recordWriterFactory == null && updateFieldNames) {
throw new ProcessException("Record Writer must be set if 'Update Field Names' is true");
}
final String tableManagementStrategy = context.getProperty(TABLE_MANAGEMENT_STRATEGY).getValue();
final boolean managedTable;
if (ATTRIBUTE_DRIVEN_TABLE.getValue().equals(tableManagementStrategy)) {
String tableManagementStrategyAttribute = flowFile.getAttribute(TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE);
if (MANAGED_TABLE.getValue().equalsIgnoreCase(tableManagementStrategyAttribute)) {
managedTable = true;
} else if (EXTERNAL_TABLE.getValue().equalsIgnoreCase(tableManagementStrategyAttribute)) {
managedTable = false;
} else {
log.error("The '{}' attribute either does not exist or has invalid value: {}. Must be one of (ignoring case): Managed, External. "
+ "Routing flowfile to failure",
new Object[]{TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE, tableManagementStrategyAttribute});
session.transfer(flowFile, REL_FAILURE);
return;
}
} else {
managedTable = MANAGED_TABLE.getValue().equals(tableManagementStrategy);
}
// Ensure valid configuration for external tables
if (createIfNotExists && !managedTable && !context.getProperty(EXTERNAL_TABLE_LOCATION).isSet()) {
throw new IOException("External Table Location must be set when Table Management Strategy is set to External");
}
final String externalTableLocation = managedTable ? null : context.getProperty(EXTERNAL_TABLE_LOCATION).evaluateAttributeExpressions(flowFile).getValue();
if (!managedTable && StringUtils.isEmpty(externalTableLocation)) {
log.error("External Table Location has invalid value: {}. Routing flowfile to failure", new Object[]{externalTableLocation});
session.transfer(flowFile, REL_FAILURE);
return;
}
final String storageFormat = context.getProperty(TABLE_STORAGE_FORMAT).getValue();
final HiveDBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(HiveDBCPService.class);
try (final Connection connection = dbcpService.getConnection()) {
final Map<String,String> attributes = new HashMap<>(flowFile.getAttributes());
OutputMetadataHolder outputMetadataHolder = checkAndUpdateTableSchema(attributes, connection, recordSchema, tableName, partitionClauseElements,
createIfNotExists, externalTableLocation, storageFormat, updateFieldNames);
if (outputMetadataHolder != null) {
// The output schema changed (i.e. field names were updated), so write out the corresponding FlowFile
try {
final FlowFile inputFlowFile = flowFile;
flowFile = session.write(flowFile, (in, out) -> {
// if we fail to create the RecordReader then we want to route to failure, so we need to
// handle this separately from the other IOExceptions which normally route to retry
final RecordReader recordReader;
final RecordSetWriter recordSetWriter;
try {
recordReader = recordReaderFactory.createRecordReader(inputFlowFile, in, getLogger());
recordSetWriter = recordWriterFactory.createWriter(getLogger(), outputMetadataHolder.getOutputSchema(), out, attributes);
} catch (Exception e) {
if(e instanceof IOException) {
throw (IOException) e;
}
throw new IOException(new RecordReaderFactoryException("Unable to create RecordReader", e));
}
WriteResult writeResult = updateRecords(recordSchema, outputMetadataHolder, recordReader, recordSetWriter);
recordSetWriter.flush();
recordSetWriter.close();
attributes.put("record.count", String.valueOf(writeResult.getRecordCount()));
attributes.put(CoreAttributes.MIME_TYPE.key(), recordSetWriter.getMimeType());
attributes.putAll(writeResult.getAttributes());
});
} catch (final Exception e) {
getLogger().error("Failed to process {}; will route to failure", new Object[]{flowFile, e});
// Since we are wrapping the exceptions above there should always be a cause
// but it's possible it might not have a message. This handles that by logging
// the name of the class thrown.
Throwable c = e.getCause();
if (c != null) {
session.putAttribute(flowFile, "record.error.message", (c.getLocalizedMessage() != null) ? c.getLocalizedMessage() : c.getClass().getCanonicalName() + " Thrown");
} else {
session.putAttribute(flowFile, "record.error.message", e.getClass().getCanonicalName() + " Thrown");
}
session.transfer(flowFile, REL_FAILURE);
return;
}
}
attributes.put(ATTR_OUTPUT_TABLE, tableName);
flowFile = session.putAllAttributes(flowFile, attributes);
session.getProvenanceReporter().invokeRemoteProcess(flowFile, dbcpService.getConnectionURL());
session.transfer(flowFile, REL_SUCCESS);
}
} catch (IOException | SQLException e) {
flowFile = session.putAttribute(flowFile, ATTR_OUTPUT_TABLE, tableName);
log.error("Exception while processing {} - routing to failure", new Object[]{flowFile}, e);
session.transfer(flowFile, REL_FAILURE);
} catch (DiscontinuedException e) {
// The input FlowFile processing is discontinued. Keep it in the input queue.
getLogger().warn("Discontinued processing for {} due to {}", new Object[]{flowFile, e}, e);
session.transfer(flowFile, Relationship.SELF);
} catch (Throwable t) {
throw (t instanceof ProcessException) ? (ProcessException) t : new ProcessException(t);
}
}
private synchronized OutputMetadataHolder checkAndUpdateTableSchema(Map<String,String> attributes, final Connection conn, final RecordSchema schema,
final String tableName, List<String> partitionClause, final boolean createIfNotExists,
final String externalTableLocation, final String storageFormat, final boolean updateFieldNames) throws IOException {
// Read in the current table metadata, compare it to the reader's schema, and
// add any columns from the schema that are missing in the table
try (Statement s = conn.createStatement()) {
// Determine whether the table exists
ResultSet tables = s.executeQuery("SHOW TABLES");
List<String> tableNames = new ArrayList<>();
String hiveTableName;
while (tables.next() && StringUtils.isNotEmpty(hiveTableName = tables.getString(1))) {
tableNames.add(hiveTableName);
}
List<String> columnsToAdd = new ArrayList<>();
String outputPath;
boolean tableCreated = false;
if (!tableNames.contains(tableName) && createIfNotExists) {
StringBuilder createTableStatement = new StringBuilder();
for (RecordField recordField : schema.getFields()) {
String recordFieldName = recordField.getFieldName();
// The field does not exist in the table, add it
columnsToAdd.add("`" + recordFieldName + "` " + NiFiOrcUtils.getHiveTypeFromFieldType(recordField.getDataType(), true));
getLogger().debug("Adding column " + recordFieldName + " to table " + tableName);
}
// Handle partition clause
if (partitionClause == null) {
partitionClause = Collections.emptyList();
}
List<String> validatedPartitionClause = new ArrayList<>(partitionClause.size());
for (String partition : partitionClause) {
String[] partitionInfo = partition.split(" ");
if (partitionInfo.length != 2) {
validatedPartitionClause.add("`" + partitionInfo[0] + "` string");
} else {
validatedPartitionClause.add("`" + partitionInfo[0] + "` " + partitionInfo[1]);
}
}
createTableStatement.append("CREATE ")
.append(externalTableLocation == null ? "" : "EXTERNAL ")
.append("TABLE IF NOT EXISTS `")
.append(tableName)
.append("` (")
.append(String.join(", ", columnsToAdd))
.append(") ")
.append(validatedPartitionClause.isEmpty() ? "" : "PARTITIONED BY (" + String.join(", ", validatedPartitionClause) + ") ")
.append("STORED AS ")
.append(storageFormat)
.append(externalTableLocation == null ? "" : " LOCATION '" + externalTableLocation + "'");
String createTableSql = createTableStatement.toString();
if (StringUtils.isNotEmpty(createTableSql)) {
// Perform the table create
getLogger().info("Executing Hive DDL: " + createTableSql);
s.execute(createTableSql);
}
tableCreated = true;
}
// Process the table (columns, partitions, location, etc.)
List<String> hiveColumns = new ArrayList<>();
String describeTable = "DESC FORMATTED `" + tableName + "`";
ResultSet tableInfo = s.executeQuery(describeTable);
// Result is 3 columns, col_name, data_type, comment. Check the first row for a header and skip if so, otherwise add column name
tableInfo.next();
String columnName = tableInfo.getString(1);
if (StringUtils.isNotEmpty(columnName) && !columnName.startsWith("#")) {
hiveColumns.add(columnName);
}
// If the column was a header, check for a blank line to follow and skip it, otherwise add the column name
if (columnName.startsWith("#")) {
tableInfo.next();
columnName = tableInfo.getString(1);
if (StringUtils.isNotEmpty(columnName)) {
hiveColumns.add(columnName);
}
}
// Collect all column names
while (tableInfo.next() && StringUtils.isNotEmpty(columnName = tableInfo.getString(1))) {
hiveColumns.add(columnName);
}
// Collect all partition columns
boolean moreRows = true;
boolean headerFound = false;
while (moreRows && !headerFound) {
String line = tableInfo.getString(1);
if ("# Partition Information".equals(line)) {
headerFound = true;
} else if ("# Detailed Table Information".equals(line)) {
// Not partitioned, exit the loop with headerFound = false
break;
}
moreRows = tableInfo.next();
}
List<String> partitionColumns = new ArrayList<>();
List<String> partitionColumnsEqualsValueList = new ArrayList<>();
List<String> partitionColumnsLocationList = new ArrayList<>();
if (headerFound) {
// If the table is partitioned, construct the partition=value strings for each partition column
String partitionColumnName;
columnName = tableInfo.getString(1);
if (StringUtils.isNotEmpty(columnName) && !columnName.startsWith("#")) {
partitionColumns.add(columnName);
}
// If the column was a header, check for a blank line to follow and skip it, otherwise add the column name
if (columnName.startsWith("#")) {
tableInfo.next();
columnName = tableInfo.getString(1);
if (StringUtils.isNotEmpty(columnName)) {
partitionColumns.add(columnName);
}
}
while (tableInfo.next() && StringUtils.isNotEmpty(partitionColumnName = tableInfo.getString(1))) {
partitionColumns.add(partitionColumnName);
}
final int partitionColumnsSize = partitionColumns.size();
final int partitionClauseSize = (partitionClause == null) ? 0 : partitionClause.size();
if (partitionClauseSize != partitionColumnsSize) {
throw new IOException("Found " + partitionColumnsSize + " partition columns but " + partitionClauseSize + " partition values were supplied");
}
for (int i = 0; i < partitionClauseSize; i++) {
String partitionName = partitionClause.get(i).split(" ")[0];
String partitionValue = attributes.get(partitionName);
if (StringUtils.isEmpty(partitionValue)) {
throw new IOException("No value found for partition value attribute '" + partitionName + "'");
}
if (!partitionColumns.contains(partitionName)) {
throw new IOException("Cannot add partition '" + partitionName + "' to existing table");
}
partitionColumnsEqualsValueList.add("`" + partitionName + "`='" + partitionValue + "'");
// Add unquoted version for the output path
partitionColumnsLocationList.add(partitionName + "=" + partitionValue);
}
}
// Get table location
moreRows = true;
headerFound = false;
while (moreRows && !headerFound) {
String line = tableInfo.getString(1);
if (line.startsWith("Location:")) {
headerFound = true;
continue; // Don't do a next() here, need to get the second column value
}
moreRows = tableInfo.next();
}
String tableLocation = tableInfo.getString(2);
String alterTableSql;
// If the table wasn't newly created, alter it accordingly
if (!tableCreated) {
StringBuilder alterTableStatement = new StringBuilder();
// Handle new columns
for (RecordField recordField : schema.getFields()) {
String recordFieldName = recordField.getFieldName().toLowerCase();
if (!hiveColumns.contains(recordFieldName) && !partitionColumns.contains(recordFieldName)) {
// The field does not exist in the table (and is not a partition column), add it
columnsToAdd.add("`" + recordFieldName + "` " + NiFiOrcUtils.getHiveTypeFromFieldType(recordField.getDataType(), true));
getLogger().info("Adding column " + recordFieldName + " to table " + tableName);
}
}
if (!columnsToAdd.isEmpty()) {
alterTableStatement.append("ALTER TABLE `")
.append(tableName)
.append("` ADD COLUMNS (")
.append(String.join(", ", columnsToAdd))
.append(")");
alterTableSql = alterTableStatement.toString();
if (StringUtils.isNotEmpty(alterTableSql)) {
// Perform the table update
getLogger().info("Executing Hive DDL: " + alterTableSql);
s.execute(alterTableSql);
}
}
}
outputPath = tableLocation;
// Handle new partition values
if (!partitionColumnsEqualsValueList.isEmpty()) {
alterTableSql = "ALTER TABLE `" +
tableName +
"` ADD IF NOT EXISTS PARTITION (" +
String.join(", ", partitionColumnsEqualsValueList) +
")";
if (StringUtils.isNotEmpty(alterTableSql)) {
// Perform the table update
getLogger().info("Executing Hive DDL: " + alterTableSql);
s.execute(alterTableSql);
}
// Add attribute for HDFS location of the partition values
outputPath = tableLocation + "/" + String.join("/", partitionColumnsLocationList);
}
// If updating field names, return a new RecordSchema, otherwise return null
OutputMetadataHolder outputMetadataHolder;
if (updateFieldNames) {
List<RecordField> inputRecordFields = schema.getFields();
List<RecordField> outputRecordFields = new ArrayList<>();
Map<String,String> fieldMap = new HashMap<>();
boolean needsUpdating = false;
for (RecordField inputRecordField : inputRecordFields) {
final String inputRecordFieldName = inputRecordField.getFieldName();
boolean found = false;
for (String hiveColumnName : hiveColumns) {
if (inputRecordFieldName.equalsIgnoreCase(hiveColumnName)) {
// Set a flag if the field name doesn't match the column name exactly. This overall flag will determine whether
// the records need updating (if true) or not (if false)
if (!inputRecordFieldName.equals(hiveColumnName)) {
needsUpdating = true;
}
fieldMap.put(inputRecordFieldName, hiveColumnName);
outputRecordFields.add(new RecordField(hiveColumnName, inputRecordField.getDataType(), inputRecordField.getDefaultValue(), inputRecordField.isNullable()));
found = true;
break;
}
}
if (!found) {
// If the input field wasn't a Hive table column, add it back to the schema as-is
fieldMap.put(inputRecordFieldName, inputRecordFieldName);
}
}
outputMetadataHolder = needsUpdating ? new OutputMetadataHolder(new SimpleRecordSchema(outputRecordFields), fieldMap)
: null;
} else {
outputMetadataHolder = null;
}
attributes.put(ATTR_OUTPUT_PATH, outputPath);
return outputMetadataHolder;
} catch (Exception e) {
throw new IOException(e);
}
}
private synchronized WriteResult updateRecords(final RecordSchema inputRecordSchema, final OutputMetadataHolder outputMetadataHolder,
final RecordReader reader, final RecordSetWriter writer) throws IOException {
try {
writer.beginRecordSet();
Record inputRecord;
while((inputRecord = reader.nextRecord()) != null) {
List<RecordField> inputRecordFields = inputRecordSchema.getFields();
Map<String,Object> outputRecordFields = new HashMap<>(inputRecordFields.size());
// Copy values from input field name to output field name
for(Map.Entry<String,String> mapping : outputMetadataHolder.getFieldMap().entrySet()) {
outputRecordFields.put(mapping.getValue(), inputRecord.getValue(mapping.getKey()));
}
Record outputRecord = new MapRecord(outputMetadataHolder.getOutputSchema(), outputRecordFields);
writer.write(outputRecord);
}
return writer.finishRecordSet();
} catch (MalformedRecordException mre) {
throw new IOException("Error reading records: "+mre.getMessage(), mre);
}
}
private static class OutputMetadataHolder {
private final RecordSchema outputSchema;
private final Map<String,String> fieldMap;
public OutputMetadataHolder(RecordSchema outputSchema, Map<String, String> fieldMap) {
this.outputSchema = outputSchema;
this.fieldMap = fieldMap;
}
public RecordSchema getOutputSchema() {
return outputSchema;
}
public Map<String, String> getFieldMap() {
return fieldMap;
}
}
}
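For context, a minimal sketch of the DDL shapes assembled by checkAndUpdateTableSchema above, using a hypothetical two-column schema with one partition column; the table, column, and location names are illustrative only and not taken from this commit:
import java.util.List;
public class UpdateHiveTableDdlSketch {
    public static void main(String[] args) {
        // Hypothetical inputs derived from the record schema and processor properties
        String tableName = "example_table";
        List<String> columns = List.of("`id` INT", "`name` STRING");
        List<String> partitions = List.of("`dt` string");
        String externalTableLocation = "/tmp/example_table"; // null for managed tables
        // CREATE statement shape when the table does not yet exist
        // (the EXTERNAL keyword and LOCATION clause are only appended for external tables)
        String create = "CREATE EXTERNAL TABLE IF NOT EXISTS `" + tableName + "` ("
                + String.join(", ", columns) + ") "
                + "PARTITIONED BY (" + String.join(", ", partitions) + ") "
                + "STORED AS ORC"
                + " LOCATION '" + externalTableLocation + "'";
        // ALTER statement shapes used when the table already exists
        String addColumns = "ALTER TABLE `" + tableName + "` ADD COLUMNS (`new_col` STRING)";
        String addPartition = "ALTER TABLE `" + tableName + "` ADD IF NOT EXISTS PARTITION (`dt`='2023-02-14')";
        System.out.println(create);
        System.out.println(addColumns);
        System.out.println(addPartition);
    }
}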

View File

@ -1,23 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
public class AuthenticationFailedException extends Exception {
public AuthenticationFailedException(String reason, Exception cause) {
super(reason, cause);
}
}

View File

@ -1,63 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
public class CsvOutputOptions {
private boolean header = true;
private String altHeader = null;
private String delimiter = ",";
private boolean quote = false;
private boolean escape = true;
private int maxRowsPerFlowFile = 0;
public boolean isHeader() {
return header;
}
public String getAltHeader() {
return altHeader;
}
public String getDelimiter() {
return delimiter;
}
public boolean isQuote() {
return quote;
}
public boolean isEscape() {
return escape;
}
public int getMaxRowsPerFlowFile() {
return maxRowsPerFlowFile;
}
public CsvOutputOptions(boolean header, String altHeader, String delimiter, boolean quote, boolean escape, int maxRowsPerFlowFile) {
this.header = header;
this.altHeader = altHeader;
this.delimiter = delimiter;
this.quote = quote;
this.escape = escape;
this.maxRowsPerFlowFile = maxRowsPerFlowFile;
}
}
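A minimal usage sketch of the removed CsvOutputOptions together with HiveJdbcCommon.convertToCsvStream (defined later in this commit); the connection handling and query below are placeholders, not part of the original code:
import java.io.OutputStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import org.apache.nifi.util.hive.CsvOutputOptions;
import org.apache.nifi.util.hive.HiveJdbcCommon;
public class CsvOptionsSketch {
    public static long dumpAsCsv(Connection conn, OutputStream out) throws Exception {
        try (Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery("SELECT * FROM example_table")) { // placeholder query
            // header row enabled, no alternate header, comma delimiter, quote and escape values, no per-FlowFile row limit
            CsvOutputOptions options = new CsvOutputOptions(true, null, ",", true, true, 0);
            return HiveJdbcCommon.convertToCsvStream(rs, out, options);
        }
    }
}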

View File

@ -1,136 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.hadoop.KerberosProperties;
import org.apache.nifi.hadoop.SecurityUtil;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.security.krb.KerberosUser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;
public class HiveConfigurator {
public Collection<ValidationResult> validate(String configFiles, String principal, String keyTab, String password,
AtomicReference<ValidationResources> validationResourceHolder, ComponentLog log) {
final List<ValidationResult> problems = new ArrayList<>();
ValidationResources resources = validationResourceHolder.get();
// if no resources in the holder, or if the holder has different resources loaded,
// then load the Configuration and set the new resources in the holder
if (resources == null || !configFiles.equals(resources.getConfigResources())) {
log.debug("Reloading validation resources");
resources = new ValidationResources(configFiles, getConfigurationFromFiles(configFiles));
validationResourceHolder.set(resources);
}
final Configuration hiveConfig = resources.getConfiguration();
problems.addAll(KerberosProperties.validatePrincipalWithKeytabOrPassword(this.getClass().getSimpleName(), hiveConfig, principal, keyTab, password, log));
return problems;
}
public HiveConf getConfigurationFromFiles(final String configFiles) {
final HiveConf hiveConfig = new HiveConf();
if (StringUtils.isNotBlank(configFiles)) {
for (final String configFile : configFiles.split(",")) {
hiveConfig.addResource(new Path(configFile.trim()));
}
}
return hiveConfig;
}
public void preload(Configuration configuration) {
try {
FileSystem.get(configuration).close();
UserGroupInformation.setConfiguration(configuration);
} catch (IOException ioe) {
// Suppress exception as future uses of this configuration will fail
}
}
/**
* Acquires a {@link UserGroupInformation} using the given {@link Configuration} and {@link KerberosUser}.
* @see SecurityUtil#getUgiForKerberosUser(Configuration, KerberosUser)
* @param hiveConfig The Configuration to apply to the acquired UserGroupInformation
* @param kerberosUser The KerberosUser to authenticate
* @return A UserGroupInformation instance created using the Subject of the given KerberosUser
* @throws AuthenticationFailedException if authentication fails
*/
public UserGroupInformation authenticate(final Configuration hiveConfig, KerberosUser kerberosUser) throws AuthenticationFailedException {
try {
return SecurityUtil.getUgiForKerberosUser(hiveConfig, kerberosUser);
} catch (IOException ioe) {
throw new AuthenticationFailedException("Kerberos Authentication for Hive failed", ioe);
}
}
/**
* As of Apache NiFi 1.5.0, due to changes made to
* {@link SecurityUtil#loginKerberos(Configuration, String, String)}, which is used by this
* class to authenticate a principal with Kerberos, Hive controller services no longer
* attempt relogins explicitly. For more information, please read the documentation for
* {@link SecurityUtil#loginKerberos(Configuration, String, String)}.
* <p/>
* In previous versions of NiFi, a {@link org.apache.nifi.hadoop.KerberosTicketRenewer} was started by
* {@link HiveConfigurator#authenticate(Configuration, String, String, long)} when the Hive
* controller service was enabled. The use of a separate thread to explicitly relogin could cause race conditions
* with the implicit relogin attempts made by hadoop/Hive code on a thread that references the same
* {@link UserGroupInformation} instance. One of these threads could cause the
* {@link javax.security.auth.Subject} in {@link UserGroupInformation} to be cleared or left in an unexpected state
* while the other thread is attempting to use the {@link javax.security.auth.Subject}, resulting in failed
* authentication attempts that would leave the Hive controller service in an unrecoverable state.
*
* @see SecurityUtil#loginKerberos(Configuration, String, String)
* @deprecated Use {@link SecurityUtil#getUgiForKerberosUser(Configuration, KerberosUser)}
*/
@Deprecated
public UserGroupInformation authenticate(final Configuration hiveConfig, String principal, String keyTab) throws AuthenticationFailedException {
UserGroupInformation ugi;
try {
ugi = SecurityUtil.loginKerberos(hiveConfig, principal, keyTab);
} catch (IOException ioe) {
throw new AuthenticationFailedException("Kerberos Authentication for Hive failed", ioe);
}
return ugi;
}
/**
* As of Apache NiFi 1.5.0, this method has been deprecated and is now a wrapper
* method which invokes {@link HiveConfigurator#authenticate(Configuration, String, String)}. It will no longer start a
* {@link org.apache.nifi.hadoop.KerberosTicketRenewer} to perform explicit relogins.
*
* @see HiveConfigurator#authenticate(Configuration, String, String)
*/
@Deprecated
public UserGroupInformation authenticate(final Configuration hiveConfig, String principal, String keyTab, long ticketRenewalPeriod) throws AuthenticationFailedException {
return authenticate(hiveConfig, principal, keyTab);
}
}
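A minimal usage sketch of the removed HiveConfigurator, assuming a KerberosUser has already been created elsewhere; the configuration file paths are placeholders:
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.security.krb.KerberosUser;
import org.apache.nifi.util.hive.AuthenticationFailedException;
import org.apache.nifi.util.hive.HiveConfigurator;
public class HiveConfiguratorSketch {
    public static UserGroupInformation login(KerberosUser kerberosUser) throws AuthenticationFailedException {
        HiveConfigurator configurator = new HiveConfigurator();
        // each comma-separated resource is added to the HiveConf
        HiveConf conf = configurator.getConfigurationFromFiles("/path/to/core-site.xml,/path/to/hive-site.xml");
        configurator.preload(conf);
        // non-deprecated overload: authenticates using the KerberosUser's Subject
        return configurator.authenticate(conf, kerberosUser);
    }
}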

View File

@ -1,462 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.SchemaBuilder.FieldAssembler;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.nifi.components.PropertyDescriptor;
import java.io.IOException;
import java.io.OutputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static java.sql.Types.ARRAY;
import static java.sql.Types.BIGINT;
import static java.sql.Types.BINARY;
import static java.sql.Types.BIT;
import static java.sql.Types.BLOB;
import static java.sql.Types.BOOLEAN;
import static java.sql.Types.CHAR;
import static java.sql.Types.CLOB;
import static java.sql.Types.DATE;
import static java.sql.Types.DECIMAL;
import static java.sql.Types.DOUBLE;
import static java.sql.Types.FLOAT;
import static java.sql.Types.INTEGER;
import static java.sql.Types.JAVA_OBJECT;
import static java.sql.Types.LONGNVARCHAR;
import static java.sql.Types.LONGVARBINARY;
import static java.sql.Types.LONGVARCHAR;
import static java.sql.Types.NCHAR;
import static java.sql.Types.NUMERIC;
import static java.sql.Types.NVARCHAR;
import static java.sql.Types.OTHER;
import static java.sql.Types.REAL;
import static java.sql.Types.ROWID;
import static java.sql.Types.SMALLINT;
import static java.sql.Types.SQLXML;
import static java.sql.Types.STRUCT;
import static java.sql.Types.TIME;
import static java.sql.Types.TIMESTAMP;
import static java.sql.Types.TINYINT;
import static java.sql.Types.VARBINARY;
import static java.sql.Types.VARCHAR;
/**
* JDBC / HiveQL common functions.
*/
public class HiveJdbcCommon {
public static final String AVRO = "Avro";
public static final String CSV = "CSV";
public static final String MIME_TYPE_AVRO_BINARY = "application/avro-binary";
public static final String CSV_MIME_TYPE = "text/csv";
public static final PropertyDescriptor NORMALIZE_NAMES_FOR_AVRO = new PropertyDescriptor.Builder()
.name("hive-normalize-avro")
.displayName("Normalize Table/Column Names")
.description("Whether to change non-Avro-compatible characters in column names to Avro-compatible characters. For example, colons and periods "
+ "will be changed to underscores in order to build a valid Avro record.")
.allowableValues("true", "false")
.defaultValue("false")
.required(true)
.build();
public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, final int maxRows, boolean convertNames) throws SQLException, IOException {
return convertToAvroStream(rs, outStream, null, maxRows, convertNames, null);
}
public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, String recordName, final int maxRows, boolean convertNames, ResultSetRowCallback callback)
throws SQLException, IOException {
final Schema schema = createSchema(rs, recordName, convertNames);
final GenericRecord rec = new GenericData.Record(schema);
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
dataFileWriter.create(schema, outStream);
final ResultSetMetaData meta = rs.getMetaData();
final int nrOfColumns = meta.getColumnCount();
long nrOfRows = 0;
while (rs.next()) {
if (callback != null) {
callback.processRow(rs);
}
for (int i = 1; i <= nrOfColumns; i++) {
final int javaSqlType = meta.getColumnType(i);
Object value = rs.getObject(i);
if (value == null) {
rec.put(i - 1, null);
} else if (javaSqlType == BINARY || javaSqlType == VARBINARY || javaSqlType == LONGVARBINARY || javaSqlType == BLOB || javaSqlType == CLOB) {
// byte arrays require slightly different handling
ByteBuffer bb = null;
if (value instanceof byte[]) {
bb = ByteBuffer.wrap((byte[]) value);
} else if (value instanceof ByteBuffer) {
bb = (ByteBuffer) value;
}
if (bb != null) {
rec.put(i - 1, bb);
} else {
throw new IOException("Could not process binary object of type " + value.getClass().getName());
}
} else if (value instanceof Byte) {
// The tinyint(1) type is returned by the JDBC driver as java.sql.Types.TINYINT,
// but the value is returned by JDBC as java.lang.Byte
// (at least the H2 JDBC driver works this way).
// A direct put into the Avro record results in:
// org.apache.avro.AvroRuntimeException: Unknown datum type java.lang.Byte
rec.put(i - 1, ((Byte) value).intValue());
} else if (value instanceof BigDecimal || value instanceof BigInteger) {
// Avro can't handle BigDecimal and BigInteger as numbers - it will throw an AvroRuntimeException such as: "Unknown datum type: java.math.BigDecimal: 38"
rec.put(i - 1, value.toString());
} else if (value instanceof Number) {
// Need to call the right getXYZ() method (instead of the getObject() method above), since Doubles are sometimes returned
// when the JDBC type is 6 (Float) for example.
if (javaSqlType == FLOAT) {
value = rs.getFloat(i);
} else if (javaSqlType == DOUBLE) {
value = rs.getDouble(i);
} else if (javaSqlType == INTEGER || javaSqlType == TINYINT || javaSqlType == SMALLINT) {
value = rs.getInt(i);
}
rec.put(i - 1, value);
} else if (value instanceof Boolean) {
rec.put(i - 1, value);
} else if (value instanceof java.sql.SQLXML) {
rec.put(i - 1, ((java.sql.SQLXML) value).getString());
} else {
// The different types that we support are numbers (int, long, double, float),
// as well as boolean values and Strings. Since Avro doesn't provide
// timestamp types, we want to convert those to Strings. So we will cast anything other
// than numbers or booleans to strings by using the toString() method.
rec.put(i - 1, value.toString());
}
}
dataFileWriter.append(rec);
nrOfRows += 1;
if (maxRows > 0 && nrOfRows == maxRows)
break;
}
return nrOfRows;
}
}
public static Schema createSchema(final ResultSet rs, boolean convertNames) throws SQLException {
return createSchema(rs, null, convertNames);
}
/**
* Creates an Avro schema from a result set. If the table/record name is known a priori and provided, use that as a
* fallback for the record name if it cannot be retrieved from the result set, and finally fall back to a default value.
*
* @param rs The result set to convert to Avro
* @param recordName The a priori record name to use if it cannot be determined from the result set.
* @param convertNames Whether to convert column/table names to be legal Avro names
* @return A Schema object representing the result set converted to an Avro record
* @throws SQLException if any error occurs during conversion
*/
public static Schema createSchema(final ResultSet rs, String recordName, boolean convertNames) throws SQLException {
final ResultSetMetaData meta = rs.getMetaData();
final int nrOfColumns = meta.getColumnCount();
String tableName = StringUtils.isEmpty(recordName) ? "NiFi_SelectHiveQL_Record" : recordName;
try {
if (nrOfColumns > 0) {
// Hive JDBC doesn't support getTableName, instead it returns table.column for column name. Grab the table name from the first column
String firstColumnNameFromMeta = meta.getColumnName(1);
int tableNameDelimiter = firstColumnNameFromMeta.lastIndexOf(".");
if (tableNameDelimiter > -1) {
String tableNameFromMeta = firstColumnNameFromMeta.substring(0, tableNameDelimiter);
if (!StringUtils.isBlank(tableNameFromMeta)) {
tableName = tableNameFromMeta;
}
}
}
} catch (SQLException se) {
// Not all drivers support getTableName, so just use the previously-set default
}
if (convertNames) {
tableName = normalizeNameForAvro(tableName);
}
final FieldAssembler<Schema> builder = SchemaBuilder.record(tableName).namespace("any.data").fields();
/**
* Some missing Avro types - Decimal, Date types. May need some additional work.
*/
for (int i = 1; i <= nrOfColumns; i++) {
String columnNameFromMeta = meta.getColumnName(i);
// Hive returns table.column for column name. Grab the column name as the string after the last period
int columnNameDelimiter = columnNameFromMeta.lastIndexOf(".");
String columnName = columnNameFromMeta.substring(columnNameDelimiter + 1);
switch (meta.getColumnType(i)) {
case CHAR:
case LONGNVARCHAR:
case LONGVARCHAR:
case NCHAR:
case NVARCHAR:
case VARCHAR:
case ARRAY:
case STRUCT:
case JAVA_OBJECT:
case OTHER:
case SQLXML:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion().noDefault();
break;
case BIT:
case BOOLEAN:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().booleanType().endUnion().noDefault();
break;
case INTEGER:
// Default to signed type unless otherwise noted. Some JDBC drivers don't implement isSigned()
boolean signedType = true;
try {
signedType = meta.isSigned(i);
} catch (SQLException se) {
// Use signed types as default
}
if (signedType) {
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().intType().endUnion().noDefault();
} else {
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().longType().endUnion().noDefault();
}
break;
case SMALLINT:
case TINYINT:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().intType().endUnion().noDefault();
break;
case BIGINT:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().longType().endUnion().noDefault();
break;
// java.sql.RowId is an interface and seems to be database
// implementation specific, so convert it to String
case ROWID:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion().noDefault();
break;
case FLOAT:
case REAL:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().floatType().endUnion().noDefault();
break;
case DOUBLE:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().doubleType().endUnion().noDefault();
break;
// No directly suitable Avro type; map to String for now
case DECIMAL:
case NUMERIC:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion().noDefault();
break;
// No directly suitable Avro type; map to String for now
case DATE:
case TIME:
case TIMESTAMP:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion().noDefault();
break;
case BINARY:
case VARBINARY:
case LONGVARBINARY:
case BLOB:
case CLOB:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().bytesType().endUnion().noDefault();
break;
default:
throw new IllegalArgumentException("createSchema: Unknown SQL type " + meta.getColumnType(i) + " cannot be converted to Avro type");
}
}
return builder.endRecord();
}
public static long convertToCsvStream(final ResultSet rs, final OutputStream outStream, CsvOutputOptions outputOptions) throws SQLException, IOException {
return convertToCsvStream(rs, outStream, null, null, outputOptions);
}
public static long convertToCsvStream(final ResultSet rs, final OutputStream outStream, String recordName, ResultSetRowCallback callback, CsvOutputOptions outputOptions)
throws SQLException, IOException {
final ResultSetMetaData meta = rs.getMetaData();
final int nrOfColumns = meta.getColumnCount();
List<String> columnNames = new ArrayList<>(nrOfColumns);
if (outputOptions.isHeader()) {
if (outputOptions.getAltHeader() == null) {
for (int i = 1; i <= nrOfColumns; i++) {
String columnNameFromMeta = meta.getColumnName(i);
// Hive returns table.column for column name. Grab the column name as the string after the last period
int columnNameDelimiter = columnNameFromMeta.lastIndexOf(".");
columnNames.add(columnNameFromMeta.substring(columnNameDelimiter + 1));
}
} else {
String[] altHeaderNames = outputOptions.getAltHeader().split(",");
columnNames = Arrays.asList(altHeaderNames);
}
}
// Write column names as header row
outStream.write(StringUtils.join(columnNames, outputOptions.getDelimiter()).getBytes(StandardCharsets.UTF_8));
if (outputOptions.isHeader()) {
outStream.write("\n".getBytes(StandardCharsets.UTF_8));
}
// Iterate over the rows
int maxRows = outputOptions.getMaxRowsPerFlowFile();
long nrOfRows = 0;
while (rs.next()) {
if (callback != null) {
callback.processRow(rs);
}
List<String> rowValues = new ArrayList<>(nrOfColumns);
for (int i = 1; i <= nrOfColumns; i++) {
final int javaSqlType = meta.getColumnType(i);
final Object value = rs.getObject(i);
switch (javaSqlType) {
case CHAR:
case LONGNVARCHAR:
case LONGVARCHAR:
case NCHAR:
case NVARCHAR:
case VARCHAR:
String valueString = rs.getString(i);
if (valueString != null) {
// escapeCsv adds quotes itself when required, so quotes are only added explicitly when the quote option is set.
StringBuilder sb = new StringBuilder();
if (outputOptions.isQuote()) {
sb.append("\"");
if (outputOptions.isEscape()) {
sb.append(StringEscapeUtils.escapeCsv(valueString));
} else {
sb.append(valueString);
}
sb.append("\"");
rowValues.add(sb.toString());
} else {
if (outputOptions.isEscape()) {
rowValues.add(StringEscapeUtils.escapeCsv(valueString));
} else {
rowValues.add(valueString);
}
}
} else {
rowValues.add("");
}
break;
case ARRAY:
case STRUCT:
case JAVA_OBJECT:
String complexValueString = rs.getString(i);
if (complexValueString != null) {
rowValues.add(StringEscapeUtils.escapeCsv(complexValueString));
} else {
rowValues.add("");
}
break;
case SQLXML:
if (value != null) {
rowValues.add(StringEscapeUtils.escapeCsv(((java.sql.SQLXML) value).getString()));
} else {
rowValues.add("");
}
break;
default:
if (value != null) {
rowValues.add(value.toString());
} else {
rowValues.add("");
}
}
}
// Write row values
outStream.write(StringUtils.join(rowValues, outputOptions.getDelimiter()).getBytes(StandardCharsets.UTF_8));
outStream.write("\n".getBytes(StandardCharsets.UTF_8));
nrOfRows++;
if (maxRows > 0 && nrOfRows == maxRows)
break;
}
return nrOfRows;
}
public static String normalizeNameForAvro(String inputName) {
String normalizedName = inputName.replaceAll("[^A-Za-z0-9_]", "_");
if (Character.isDigit(normalizedName.charAt(0))) {
normalizedName = "_" + normalizedName;
}
return normalizedName;
}
/**
* An interface for callback methods which allows processing of a row during the convertToXYZStream() processing.
* <b>IMPORTANT:</b> This method should only work on the row pointed at by the current ResultSet reference.
* Advancing the cursor (e.g. by calling next()) can cause rows to be skipped during the transformation.
*/
public interface ResultSetRowCallback {
void processRow(ResultSet resultSet) throws IOException;
}
public static Configuration getConfigurationFromFiles(final String configFiles) {
final Configuration hiveConfig = new HiveConf();
if (StringUtils.isNotBlank(configFiles)) {
for (final String configFile : configFiles.split(",")) {
hiveConfig.addResource(new Path(configFile.trim()));
}
}
return hiveConfig;
}
}
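A minimal usage sketch of the Avro conversion path in the removed HiveJdbcCommon; the connection handling and query are placeholders:
import java.io.OutputStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import org.apache.nifi.util.hive.HiveJdbcCommon;
public class AvroConversionSketch {
    public static long dumpAsAvro(Connection conn, OutputStream out) throws Exception {
        try (Statement st = conn.createStatement();
             ResultSet rs = st.executeQuery("SELECT * FROM example_table")) { // placeholder query
            // maxRows = 0 means no row limit; convertNames = true normalizes the Avro record name via normalizeNameForAvro
            return HiveJdbcCommon.convertToAvroStream(rs, out, 0, true);
        }
    }
}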

View File

@ -1,155 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import java.io.Serializable;
public class HiveOptions implements Serializable {
/**
* Half of the default Config.TOPOLOGY_MESSAGE_TIMEOUT_SECS
*/
private static final int DEFAULT_TICK_TUPLE_INTERVAL_SECS = 15;
protected String databaseName;
protected String tableName;
protected String metaStoreURI;
protected Integer txnsPerBatch = 100;
protected Integer maxOpenConnections = 10;
protected Integer batchSize = 15000;
protected Integer idleTimeout = 60000;
protected Integer callTimeout = 0;
protected Integer heartBeatInterval = 60;
protected Boolean autoCreatePartitions = true;
protected String kerberosPrincipal;
protected String kerberosKeytab;
protected Integer tickTupleInterval = DEFAULT_TICK_TUPLE_INTERVAL_SECS;
public HiveOptions(String metaStoreURI, String databaseName, String tableName) {
this.metaStoreURI = metaStoreURI;
this.databaseName = databaseName;
this.tableName = tableName;
}
public HiveOptions withTickTupleInterval(Integer tickInterval) {
this.tickTupleInterval = tickInterval;
return this;
}
public HiveOptions withTxnsPerBatch(Integer txnsPerBatch) {
this.txnsPerBatch = txnsPerBatch;
return this;
}
public HiveOptions withMaxOpenConnections(Integer maxOpenConnections) {
this.maxOpenConnections = maxOpenConnections;
return this;
}
public HiveOptions withBatchSize(Integer batchSize) {
this.batchSize = batchSize;
return this;
}
public HiveOptions withIdleTimeout(Integer idleTimeout) {
this.idleTimeout = idleTimeout;
return this;
}
public HiveOptions withCallTimeout(Integer callTimeout) {
this.callTimeout = callTimeout;
return this;
}
public HiveOptions withHeartBeatInterval(Integer heartBeatInterval) {
this.heartBeatInterval = heartBeatInterval;
return this;
}
public HiveOptions withAutoCreatePartitions(Boolean autoCreatePartitions) {
this.autoCreatePartitions = autoCreatePartitions;
return this;
}
public HiveOptions withKerberosKeytab(String kerberosKeytab) {
this.kerberosKeytab = kerberosKeytab;
return this;
}
public HiveOptions withKerberosPrincipal(String kerberosPrincipal) {
this.kerberosPrincipal = kerberosPrincipal;
return this;
}
public String getMetaStoreURI() {
return metaStoreURI;
}
public String getDatabaseName() {
return databaseName;
}
public String getTableName() {
return tableName;
}
public String getQualifiedTableName() {
return databaseName + "." + tableName;
}
public Integer getBatchSize() {
return batchSize;
}
public Integer getCallTimeOut() {
return callTimeout;
}
public Integer getHeartBeatInterval() {
return heartBeatInterval;
}
public Integer getMaxOpenConnections() {
return maxOpenConnections;
}
public Integer getIdleTimeout() {
return idleTimeout;
}
public Integer getTxnsPerBatch() {
return txnsPerBatch;
}
public Boolean getAutoCreatePartitions() {
return autoCreatePartitions;
}
public String getKerberosPrincipal() {
return kerberosPrincipal;
}
public String getKerberosKeytab() {
return kerberosKeytab;
}
public Integer getTickTupleInterval() {
return tickTupleInterval;
}
}
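A minimal sketch of the fluent configuration style the removed HiveOptions class provides; the metastore URI, database, and table names are placeholders:
import org.apache.nifi.util.hive.HiveOptions;
public class HiveOptionsSketch {
    public static HiveOptions exampleOptions() {
        return new HiveOptions("thrift://metastore-host:9083", "default", "example_table")
                .withTxnsPerBatch(100)
                .withBatchSize(10000)
                .withCallTimeout(30000)
                .withHeartBeatInterval(60)
                .withAutoCreatePartitions(true);
    }
}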

View File

@ -1,50 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hive.hcatalog.streaming.ConnectionError;
import org.apache.hive.hcatalog.streaming.HiveEndPoint;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
public class HiveUtils {
private static final Logger LOG = LoggerFactory.getLogger(HiveUtils.class);
public static HiveEndPoint makeEndPoint(List<String> partitionVals, HiveOptions options) throws ConnectionError {
return new HiveEndPoint(options.getMetaStoreURI(), options.getDatabaseName(), options.getTableName(), partitionVals);
}
public static HiveWriter makeHiveWriter(HiveEndPoint endPoint, ExecutorService callTimeoutPool, UserGroupInformation ugi, HiveOptions options, HiveConf hiveConf)
throws HiveWriter.ConnectFailure, InterruptedException {
return new HiveWriter(endPoint, options.getTxnsPerBatch(), options.getAutoCreatePartitions(),
options.getCallTimeOut(), callTimeoutPool, ugi, hiveConf);
}
public static void logAllHiveEndPoints(Map<HiveEndPoint, HiveWriter> allWriters) {
for (Map.Entry<HiveEndPoint,HiveWriter> entry : allWriters.entrySet()) {
LOG.info("cached writers {} ", entry.getValue());
}
}
}

View File

@ -1,462 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hive.hcatalog.streaming.HiveEndPoint;
import org.apache.hive.hcatalog.streaming.RecordWriter;
import org.apache.hive.hcatalog.streaming.SerializationError;
import org.apache.hive.hcatalog.streaming.StreamingConnection;
import org.apache.hive.hcatalog.streaming.StreamingException;
import org.apache.hive.hcatalog.streaming.StreamingIOFailure;
import org.apache.hive.hcatalog.streaming.StrictJsonWriter;
import org.apache.hive.hcatalog.streaming.TransactionBatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.UndeclaredThrowableException;
import java.security.PrivilegedExceptionAction;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
public class HiveWriter {
private static final Logger LOG = LoggerFactory.getLogger(HiveWriter.class);
private final HiveEndPoint endPoint;
private final StreamingConnection connection;
private final int txnsPerBatch;
private final RecordWriter recordWriter;
private final ExecutorService callTimeoutPool;
private final long callTimeout;
private final Object txnBatchLock = new Object();
private final UserGroupInformation ugi;
private TransactionBatch txnBatch;
private long lastUsed; // time of last flush on this writer
protected boolean closed; // flag indicating HiveWriter was closed
private int totalRecords = 0;
public HiveWriter(HiveEndPoint endPoint, int txnsPerBatch, boolean autoCreatePartitions, long callTimeout, ExecutorService callTimeoutPool, UserGroupInformation ugi, HiveConf hiveConf)
throws InterruptedException, ConnectFailure {
try {
this.ugi = ugi;
this.callTimeout = callTimeout;
this.callTimeoutPool = callTimeoutPool;
this.endPoint = endPoint;
this.connection = newConnection(endPoint, autoCreatePartitions, hiveConf, ugi);
this.txnsPerBatch = txnsPerBatch;
this.recordWriter = getRecordWriter(endPoint, ugi, hiveConf);
this.txnBatch = nextTxnBatch(recordWriter);
this.closed = false;
this.lastUsed = System.currentTimeMillis();
} catch (InterruptedException | RuntimeException | ConnectFailure e) {
throw e;
} catch (Exception e) {
throw new ConnectFailure(endPoint, e);
}
}
protected RecordWriter getRecordWriter(HiveEndPoint endPoint, UserGroupInformation ugi, HiveConf hiveConf) throws StreamingException, IOException, InterruptedException {
if (ugi == null) {
return new StrictJsonWriter(endPoint, hiveConf);
} else {
try {
return ugi.doAs((PrivilegedExceptionAction<StrictJsonWriter>) () -> new StrictJsonWriter(endPoint, hiveConf));
} catch (UndeclaredThrowableException e) {
Throwable cause = e.getCause();
if (cause instanceof StreamingException) {
throw (StreamingException) cause;
} else {
throw e;
}
}
}
}
@Override
public String toString() {
return "{ endPoint = " + endPoint + ", TransactionBatch = " + txnBatch + " }";
}
/**
* Write the record data to Hive
*
* @throws WriteFailure if an error occurs during the write
* @throws SerializationError if the record cannot be serialized
* @throws InterruptedException if the write operation is interrupted
*/
public synchronized void write(final byte[] record)
throws WriteFailure, SerializationError, InterruptedException {
if (closed) {
throw new IllegalStateException("This hive streaming writer was closed " +
"and thus no longer able to write : " + endPoint);
}
// write the tuple
try {
LOG.debug("Writing event to {}", endPoint);
callWithTimeout(new CallRunner<Void>() {
@Override
public Void call() throws StreamingException, InterruptedException {
txnBatch.write(record);
totalRecords++;
return null;
}
});
} catch (SerializationError se) {
throw new SerializationError(endPoint.toString() + " SerializationError", se);
} catch (StreamingException | TimeoutException e) {
throw new WriteFailure(endPoint, txnBatch.getCurrentTxnId(), e);
}
}
/**
* Commits the current Txn if totalRecords > 0.
* If 'rollToNext' is true, switches to the next Txn in the batch, or to a
* new TxnBatch if the current batch is exhausted
*/
public void flush(boolean rollToNext)
throws CommitFailure, TxnBatchFailure, TxnFailure, InterruptedException {
// if there are no records do not call flush
if (totalRecords <= 0) return;
try {
synchronized (txnBatchLock) {
commitTxn();
nextTxn(rollToNext);
totalRecords = 0;
lastUsed = System.currentTimeMillis();
}
} catch (StreamingException e) {
throw new TxnFailure(txnBatch, e);
}
}
/** Sends a heartbeat on the current and remaining txns in the batch, using the
* callTimeoutPool to bound the duration of the call
*/
public void heartBeat() throws InterruptedException {
// 1) schedule the heartbeat on one thread in pool
synchronized (txnBatchLock) {
try {
callWithTimeout(new CallRunner<Void>() {
@Override
public Void call() throws Exception {
try {
LOG.info("Sending heartbeat on batch " + txnBatch);
txnBatch.heartbeat();
} catch (StreamingException e) {
LOG.warn("Heartbeat error on batch " + txnBatch, e);
}
return null;
}
});
} catch (InterruptedException e) {
throw e;
} catch (Exception e) {
LOG.warn("Unable to send heartbeat on Txn Batch " + txnBatch, e);
// Suppressing exceptions as we don't care for errors on heartbeats
}
}
}
/**
* Returns the total number of records written so far in the current transaction
* @return totalRecords
*/
public int getTotalRecords() {
return totalRecords;
}
/**
* Flush and Close current transactionBatch.
*/
public void flushAndClose() throws TxnBatchFailure, TxnFailure, CommitFailure,
IOException, InterruptedException {
flush(false);
close();
}
/**
* Close the Transaction Batch and connection
* @throws IOException if an error occurs during close
* @throws InterruptedException if the close operation is interrupted
*/
public void close() throws IOException, InterruptedException {
closeTxnBatch();
closeConnection();
closed = true;
}
protected void closeConnection() throws InterruptedException {
LOG.info("Closing connection to end point : {}", endPoint);
try {
callWithTimeout(new CallRunner<Void>() {
@Override
public Void call() throws Exception {
connection.close(); // could block
return null;
}
});
} catch (Exception e) {
LOG.warn("Error closing connection to EndPoint : " + endPoint, e);
// Suppressing exceptions as we don't care for errors on connection close
}
}
protected void commitTxn() throws CommitFailure, InterruptedException {
LOG.debug("Committing Txn id {} to {}", txnBatch.getCurrentTxnId(), endPoint);
try {
callWithTimeout(new CallRunner<Void>() {
@Override
public Void call() throws Exception {
txnBatch.commit(); // could block
return null;
}
});
} catch (StreamingException | TimeoutException e) {
throw new CommitFailure(endPoint, txnBatch.getCurrentTxnId(), e);
}
}
protected StreamingConnection newConnection(HiveEndPoint endPoint, boolean autoCreatePartitions, HiveConf conf, UserGroupInformation ugi) throws InterruptedException, ConnectFailure {
try {
return callWithTimeout(() -> {
return endPoint.newConnection(autoCreatePartitions, conf, ugi); // could block
});
} catch (StreamingException | TimeoutException e) {
throw new ConnectFailure(endPoint, e);
}
}
protected TransactionBatch nextTxnBatch(final RecordWriter recordWriter)
throws InterruptedException, TxnBatchFailure {
LOG.debug("Fetching new Txn Batch for {}", endPoint);
TransactionBatch batch = null;
try {
batch = callWithTimeout(() -> {
return connection.fetchTransactionBatch(txnsPerBatch, recordWriter); // could block
});
batch.beginNextTransaction();
LOG.debug("Acquired {}. Switching to first txn", batch);
} catch (TimeoutException | StreamingException e) {
throw new TxnBatchFailure(endPoint, e);
}
return batch;
}
protected void closeTxnBatch() throws InterruptedException {
try {
LOG.debug("Closing Txn Batch {}", txnBatch);
callWithTimeout(new CallRunner<Void>() {
@Override
public Void call() throws Exception {
if (txnBatch != null) {
txnBatch.close(); // could block
}
return null;
}
});
} catch (InterruptedException e) {
throw e;
} catch (Exception e) {
LOG.warn("Error closing txn batch " + txnBatch, e);
}
}
/**
* Aborts the current Txn and switches to next Txn.
* @throws StreamingException if could not get new Transaction Batch, or switch to next Txn
*/
public void abort() throws StreamingException, TxnBatchFailure, InterruptedException {
synchronized (txnBatchLock) {
abortTxn();
nextTxn(true); // roll to next
}
}
/**
* Aborts current Txn in the txnBatch.
*/
protected void abortTxn() throws InterruptedException {
LOG.info("Aborting Txn id {} on End Point {}", txnBatch.getCurrentTxnId(), endPoint);
try {
callWithTimeout(new CallRunner<Void>() {
@Override
public Void call() throws StreamingException, InterruptedException {
txnBatch.abort(); // could block
return null;
}
});
} catch (InterruptedException e) {
throw e;
} catch (TimeoutException e) {
LOG.warn("Timeout while aborting Txn " + txnBatch.getCurrentTxnId() + " on EndPoint: " + endPoint, e);
} catch (Exception e) {
LOG.warn("Error aborting Txn " + txnBatch.getCurrentTxnId() + " on EndPoint: " + endPoint, e);
// Suppressing exceptions as we don't care for errors on abort
}
}
/**
* If there are remaining transactions in the current txnBatch, begins the next transaction;
* otherwise closes the batch and, if rollToNext is true, fetches a new txnBatch.
* @param rollToNext Whether to roll to the next transaction batch
*/
protected void nextTxn(boolean rollToNext) throws StreamingException, InterruptedException, TxnBatchFailure {
if (txnBatch.remainingTransactions() == 0) {
closeTxnBatch();
txnBatch = null;
if (rollToNext) {
txnBatch = nextTxnBatch(recordWriter);
}
} else if (rollToNext) {
LOG.debug("Switching to next Txn for {}", endPoint);
txnBatch.beginNextTransaction(); // does not block
}
}
/**
* If the current thread has been interrupted, then throws an
* exception.
* @throws InterruptedException if the current thread has been interrupted
*/
protected static void checkAndThrowInterruptedException()
throws InterruptedException {
if (Thread.interrupted()) {
throw new InterruptedException("Timed out before Hive call was made. "
+ "Your callTimeout might be set too low or Hive calls are "
+ "taking too long.");
}
}
/**
* Execute the callable on a separate thread and wait for completion
* for the specified amount of time in milliseconds. In case of timeout,
* cancel the callable and rethrow the TimeoutException
*/
private <T> T callWithTimeout(final CallRunner<T> callRunner)
throws TimeoutException, StreamingException, InterruptedException {
Future<T> future = callTimeoutPool.submit(() -> {
if (ugi == null) {
return callRunner.call();
}
try {
return ugi.doAs((PrivilegedExceptionAction<T>) () -> callRunner.call());
} catch (UndeclaredThrowableException e) {
Throwable cause = e.getCause();
// Unwrap exception so it is thrown the same way as without ugi
if (!(cause instanceof Exception)) {
throw e;
}
throw (Exception)cause;
}
});
try {
if (callTimeout > 0) {
return future.get(callTimeout, TimeUnit.MILLISECONDS);
} else {
return future.get();
}
} catch (TimeoutException eT) {
future.cancel(true);
throw eT;
} catch (ExecutionException e1) {
Throwable cause = e1.getCause();
if (cause instanceof IOException) {
throw new StreamingIOFailure("I/O Failure", (IOException) cause);
} else if (cause instanceof StreamingException) {
throw (StreamingException) cause;
} else if (cause instanceof InterruptedException) {
throw (InterruptedException) cause;
} else if (cause instanceof RuntimeException) {
throw (RuntimeException) cause;
} else if (cause instanceof TimeoutException) {
throw new StreamingException("Operation Timed Out.", (TimeoutException) cause);
} else {
throw new RuntimeException(e1);
}
}
}
public long getLastUsed() {
return lastUsed;
}
private byte[] generateRecord(List<String> tuple) {
StringBuilder buf = new StringBuilder();
for (String o : tuple) {
buf.append(o);
buf.append(",");
}
return buf.toString().getBytes();
}
/**
* Simple interface whose <tt>call</tt> method is called by
* {@link #callWithTimeout} in a new thread inside a
* {@linkplain java.security.PrivilegedExceptionAction#run()} call.
* @param <T> the type of object returned from the call
*/
private interface CallRunner<T> {
T call() throws Exception;
}
public static class Failure extends Exception {
public Failure(String message, Throwable cause) {
super(message, cause);
}
}
public static class WriteFailure extends Failure {
public WriteFailure(HiveEndPoint endPoint, Long currentTxnId, Throwable cause) {
super("Failed writing to : " + endPoint + ". TxnID : " + currentTxnId, cause);
}
}
public static class CommitFailure extends Failure {
public CommitFailure(HiveEndPoint endPoint, Long txnID, Throwable cause) {
super("Commit of Txn " + txnID + " failed on EndPoint: " + endPoint, cause);
}
}
public static class ConnectFailure extends Failure {
public ConnectFailure(HiveEndPoint ep, Throwable cause) {
super("Failed connecting to EndPoint " + ep, cause);
}
}
public static class TxnBatchFailure extends Failure {
public TxnBatchFailure(HiveEndPoint ep, Throwable cause) {
super("Failed acquiring Transaction Batch from EndPoint: " + ep, cause);
}
}
public static class TxnFailure extends Failure {
public TxnFailure(TransactionBatch txnBatch, Throwable cause) {
super("Failed switching to next Txn in TxnBatch " + txnBatch, cause);
}
}
}
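A minimal lifecycle sketch for the removed HiveWriter, created through HiveUtils; it assumes HiveOptions and HiveConf are already configured, runs without Kerberos (ugi = null), and the partition value and JSON payload are placeholders:
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.hcatalog.streaming.HiveEndPoint;
import org.apache.nifi.util.hive.HiveOptions;
import org.apache.nifi.util.hive.HiveUtils;
import org.apache.nifi.util.hive.HiveWriter;
public class HiveWriterLifecycleSketch {
    public static void writeOneRecord(HiveOptions options, HiveConf hiveConf) throws Exception {
        ExecutorService callTimeoutPool = Executors.newSingleThreadExecutor();
        List<String> partitionValues = Collections.singletonList("2023-02-14"); // placeholder partition value
        HiveEndPoint endPoint = HiveUtils.makeEndPoint(partitionValues, options);
        // ugi is null here because this sketch assumes no Kerberos authentication
        HiveWriter writer = HiveUtils.makeHiveWriter(endPoint, callTimeoutPool, null, options, hiveConf);
        writer.write("{\"id\": 1, \"name\": \"example\"}".getBytes(StandardCharsets.UTF_8)); // StrictJsonWriter expects JSON
        writer.flush(true); // commit and switch to the next transaction in the batch
        writer.flushAndClose(); // flush anything remaining, then close the transaction batch and connection
        callTimeoutPool.shutdown();
    }
}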

View File

@ -1,41 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import org.apache.hadoop.conf.Configuration;
/**
* A helper class for holding loaded configurations so they are not reloaded on each use unless necessary
*/
public class ValidationResources {
private final String configResources;
private final Configuration configuration;
public ValidationResources(String configResources, Configuration configuration) {
this.configResources = configResources;
this.configuration = configuration;
}
public String getConfigResources() {
return configResources;
}
public Configuration getConfiguration() {
return configuration;
}
}
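
ValidationResources is effectively a one-entry cache: the configured resource string plus the Hadoop Configuration built from it. Below is a hedged sketch of how a component might use it to avoid reloading configuration files on every validation pass; the method and field names are assumptions for illustration, not the actual processor code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nifi.util.hive.ValidationResources;

public class ConfigCacheSketch {

    private volatile ValidationResources validationResourceHolder;

    // Rebuild the Configuration only when the comma-separated resource list actually changes.
    Configuration getConfigurationForValidation(String configResources) {
        ValidationResources resources = validationResourceHolder;
        if (resources == null || !configResources.equals(resources.getConfigResources())) {
            Configuration conf = new Configuration();
            for (String resource : configResources.split(",")) {
                conf.addResource(new Path(resource.trim()));
            }
            resources = new ValidationResources(configResources, conf);
            validationResourceHolder = resources;
        }
        return resources.getConfiguration();
    }
}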

View File

@ -1,15 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.nifi.dbcp.hive.HiveConnectionPool
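
The file above is a standard Java service-provider configuration file: a resource under META-INF/services named after the service interface, listing one implementation class per line. NiFi's NAR loading reads files of this shape when discovering extensions; as a plain-JDK illustration only (not NiFi's actual ExtensionManager code), such a file can be consumed with java.util.ServiceLoader.

import java.util.ServiceLoader;
import org.apache.nifi.controller.ControllerService;

public class ProviderFileSketch {
    public static void main(String[] args) {
        // Scans META-INF/services/org.apache.nifi.controller.ControllerService on the classpath
        // and instantiates each listed implementation with its no-arg constructor.
        for (ControllerService service : ServiceLoader.load(ControllerService.class)) {
            System.out.println("Discovered implementation: " + service.getClass().getName());
        }
    }
}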

View File

@ -1,19 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.nifi.processors.hive.ConvertAvroToORC
org.apache.nifi.processors.hive.SelectHiveQL
org.apache.nifi.processors.hive.PutHiveQL
org.apache.nifi.processors.hive.PutHiveStreaming
org.apache.nifi.processors.hive.UpdateHiveTable

View File

@ -1,201 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.dbcp.hive;
import org.apache.commons.dbcp2.BasicDataSource;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.hadoop.KerberosProperties;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.registry.VariableDescriptor;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.util.MockConfigurationContext;
import org.apache.nifi.util.MockVariableRegistry;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.UndeclaredThrowableException;
import java.security.PrivilegedExceptionAction;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.mockito.ArgumentMatchers.isA;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
public class HiveConnectionPoolTest {
private UserGroupInformation userGroupInformation;
private HiveConnectionPool hiveConnectionPool;
private BasicDataSource basicDataSource;
private ComponentLog componentLog;
private KerberosProperties kerberosProperties;
private File krb5conf = new File("src/test/resources/krb5.conf");
@BeforeEach
public void setup() throws Exception {
// have to initialize these system properties before anything else
System.setProperty("java.security.krb5.conf", krb5conf.getAbsolutePath());
System.setProperty("java.security.krb5.realm", "nifi.com");
System.setProperty("java.security.krb5.kdc", "nifi.kdc");
userGroupInformation = mock(UserGroupInformation.class);
basicDataSource = mock(BasicDataSource.class);
componentLog = mock(ComponentLog.class);
kerberosProperties = mock(KerberosProperties.class);
when(userGroupInformation.doAs(isA(PrivilegedExceptionAction.class))).thenAnswer(invocation -> {
try {
return ((PrivilegedExceptionAction) invocation.getArguments()[0]).run();
} catch (IOException | Error | RuntimeException | InterruptedException e) {
throw e;
} catch (Throwable e) {
throw new UndeclaredThrowableException(e);
}
});
when(kerberosProperties.getKerberosKeytab()).thenReturn(new PropertyDescriptor.Builder()
.name("Kerberos Keytab")
.addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build());
when(kerberosProperties.getKerberosPrincipal()).thenReturn(new PropertyDescriptor.Builder()
.name("Kerberos Principal")
.addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build());
initPool();
}
private void initPool() throws Exception {
hiveConnectionPool = new HiveConnectionPool();
Field ugiField = HiveConnectionPool.class.getDeclaredField("ugi");
ugiField.setAccessible(true);
ugiField.set(hiveConnectionPool, userGroupInformation);
Field dataSourceField = HiveConnectionPool.class.getDeclaredField("dataSource");
dataSourceField.setAccessible(true);
dataSourceField.set(hiveConnectionPool, basicDataSource);
Field componentLogField = AbstractControllerService.class.getDeclaredField("logger");
componentLogField.setAccessible(true);
componentLogField.set(hiveConnectionPool, componentLog);
Field kerberosPropertiesField = HiveConnectionPool.class.getDeclaredField("kerberosProperties");
kerberosPropertiesField.setAccessible(true);
kerberosPropertiesField.set(hiveConnectionPool, kerberosProperties);
}
@Test
public void testGetConnectionSqlException() throws SQLException {
SQLException sqlException = new SQLException("bad sql");
when(basicDataSource.getConnection()).thenThrow(sqlException);
ProcessException e = assertThrows(ProcessException.class, () -> hiveConnectionPool.getConnection());
assertEquals(sqlException, e.getCause());
}
@Test
public void testExpressionLanguageSupport() throws Exception {
final String URL = "jdbc:hive2://localhost:10000/default";
final String USER = "user";
final String PASS = "pass";
final int MAX_CONN = 7;
final String MAX_CONN_LIFETIME = "1 sec";
final String MAX_WAIT = "10 sec"; // 10000 milliseconds
final String CONF = "/path/to/hive-site.xml";
hiveConnectionPool = new HiveConnectionPool();
Map<PropertyDescriptor, String> props = new HashMap<PropertyDescriptor, String>() {{
put(HiveConnectionPool.DATABASE_URL, "${url}");
put(HiveConnectionPool.DB_USER, "${username}");
put(HiveConnectionPool.DB_PASSWORD, "${password}");
put(HiveConnectionPool.MAX_TOTAL_CONNECTIONS, "${maxconn}");
put(HiveConnectionPool.MAX_CONN_LIFETIME, "${maxconnlifetime}");
put(HiveConnectionPool.MAX_WAIT_TIME, "${maxwait}");
put(HiveConnectionPool.HIVE_CONFIGURATION_RESOURCES, "${hiveconf}");
}};
MockVariableRegistry registry = new MockVariableRegistry();
registry.setVariable(new VariableDescriptor("url"), URL);
registry.setVariable(new VariableDescriptor("username"), USER);
registry.setVariable(new VariableDescriptor("password"), PASS);
registry.setVariable(new VariableDescriptor("maxconn"), Integer.toString(MAX_CONN));
registry.setVariable(new VariableDescriptor("maxconnlifetime"), MAX_CONN_LIFETIME);
registry.setVariable(new VariableDescriptor("maxwait"), MAX_WAIT);
registry.setVariable(new VariableDescriptor("hiveconf"), CONF);
MockConfigurationContext context = new MockConfigurationContext(props, null, registry);
hiveConnectionPool.onConfigured(context);
Field dataSourceField = HiveConnectionPool.class.getDeclaredField("dataSource");
dataSourceField.setAccessible(true);
basicDataSource = (BasicDataSource) dataSourceField.get(hiveConnectionPool);
assertEquals(URL, basicDataSource.getUrl());
assertEquals(USER, basicDataSource.getUsername());
assertEquals(PASS, basicDataSource.getPassword());
assertEquals(MAX_CONN, basicDataSource.getMaxTotal());
assertEquals(1000L, basicDataSource.getMaxConnLifetimeMillis());
assertEquals(10000L, basicDataSource.getMaxWaitMillis());
assertEquals(URL, hiveConnectionPool.getConnectionURL());
}
@EnabledIfSystemProperty(
named = "nifi.test.unstable",
matches = "true",
disabledReason = "Kerberos does not seem to be properly handled in Travis build, but, locally, this test should successfully run")
@Test
public void testKerberosAuthException() {
final String URL = "jdbc:hive2://localhost:10000/default";
final String conf = "src/test/resources/hive-site-security.xml";
final String ktab = "src/test/resources/fake.keytab";
final String kprinc = "bad@PRINCIPAL.COM";
KerberosProperties kerbProperties = new KerberosProperties(krb5conf);
Map<PropertyDescriptor, String> props = new HashMap<PropertyDescriptor, String>() {{
put(HiveConnectionPool.DATABASE_URL, "${url}");
put(HiveConnectionPool.HIVE_CONFIGURATION_RESOURCES, "${conf}");
put(kerbProperties.getKerberosKeytab(), "${ktab}");
put(kerbProperties.getKerberosPrincipal(), "${kprinc}");
}};
MockVariableRegistry registry = new MockVariableRegistry();
registry.setVariable(new VariableDescriptor("url"), URL);
registry.setVariable(new VariableDescriptor("conf"), conf);
registry.setVariable(new VariableDescriptor("ktab"), ktab);
registry.setVariable(new VariableDescriptor("kprinc"), kprinc);
MockConfigurationContext context = new MockConfigurationContext(props, null, registry);
assertThrows(InitializationException.class, () -> hiveConnectionPool.onConfigured(context));
}
}
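
The initPool method above injects mocks into private fields through plain reflection. The sketch below captures that pattern in a small generic helper that walks up the class hierarchy, so superclass fields (such as the logger declared on AbstractControllerService) can be set the same way; the helper name is an assumption, not an existing test utility.

import java.lang.reflect.Field;

public final class ReflectionInjector {

    private ReflectionInjector() {
    }

    // Finds the named field on the target's class or any superclass, makes it accessible and sets it.
    public static void setField(Object target, String fieldName, Object value) throws ReflectiveOperationException {
        Class<?> clazz = target.getClass();
        while (clazz != null) {
            try {
                Field field = clazz.getDeclaredField(fieldName);
                field.setAccessible(true);
                field.set(target, value);
                return;
            } catch (NoSuchFieldException e) {
                clazz = clazz.getSuperclass();
            }
        }
        throw new NoSuchFieldException(fieldName);
    }
}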

View File

@ -1,568 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.ql.io.orc.Reader;
import org.apache.hadoop.hive.ql.io.orc.RecordReader;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.apache.nifi.util.orc.TestNiFiOrcUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Unit tests for ConvertAvroToORC processor
*/
public class TestConvertAvroToORC {
private ConvertAvroToORC processor;
private TestRunner runner;
@BeforeEach
public void setUp() throws Exception {
processor = new ConvertAvroToORC();
runner = TestRunners.newTestRunner(processor);
}
@Test
public void test_onTrigger_routing_to_success_null_type() throws Exception {
String testString = "Hello World";
GenericData.Record record = TestNiFiOrcUtils.buildAvroRecordWithNull(testString);
DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
ByteArrayOutputStream out = new ByteArrayOutputStream();
fileWriter.create(record.getSchema(), out);
fileWriter.append(record);
fileWriter.flush();
fileWriter.close();
out.close();
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test.avro");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS test_record (string STRING, null BOOLEAN) STORED AS ORC",
resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
}
@Test
public void test_onTrigger_routing_to_success_empty_array_type() throws Exception {
String testString = "Hello World";
GenericData.Record record = TestNiFiOrcUtils.buildAvroRecordWithEmptyArray(testString);
DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
ByteArrayOutputStream out = new ByteArrayOutputStream();
fileWriter.create(record.getSchema(), out);
fileWriter.append(record);
fileWriter.flush();
fileWriter.close();
out.close();
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test.avro");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS test_record (string STRING, emptyArray ARRAY<BOOLEAN>) STORED AS ORC",
resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
}
@Test
public void test_onTrigger_routing_to_failure_fixed_type() throws Exception {
String testString = "Hello!";
GenericData.Record record = TestNiFiOrcUtils.buildAvroRecordWithFixed(testString);
DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
ByteArrayOutputStream out = new ByteArrayOutputStream();
fileWriter.create(record.getSchema(), out);
fileWriter.append(record);
fileWriter.flush();
fileWriter.close();
out.close();
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test.avro");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_FAILURE, 1);
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_FAILURE).get(0);
assertEquals("test.avro", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
final InputStream in = new ByteArrayInputStream(resultFlowFile.toByteArray());
final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
assertTrue(dataFileReader.hasNext());
GenericRecord testedRecord = dataFileReader.next();
assertNotNull(testedRecord.get("fixed"));
assertArrayEquals(testString.getBytes(StandardCharsets.UTF_8), ((GenericData.Fixed) testedRecord.get("fixed")).bytes());
}
}
@Test
public void test_onTrigger_primitive_record() throws Exception {
GenericData.Record record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(10, 20L, true, 30.0f, 40, StandardCharsets.UTF_8.encode("Hello"), "World");
DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
ByteArrayOutputStream out = new ByteArrayOutputStream();
fileWriter.create(record.getSchema(), out);
fileWriter.append(record);
// Put another record in
record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(1, 2L, false, 3.0f, 4L, StandardCharsets.UTF_8.encode("I am"), "another record");
fileWriter.append(record);
// And one more
record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(100, 200L, true, 300.0f, 400L, StandardCharsets.UTF_8.encode("Me"), "too!");
fileWriter.append(record);
fileWriter.flush();
fileWriter.close();
out.close();
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test.avro");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
// Write the flow file out to disk, since the ORC Reader needs a path
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS test_record (int INT, long BIGINT, boolean BOOLEAN, float FLOAT, double DOUBLE, bytes BINARY, string STRING)"
+ " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
assertEquals("3", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
FileOutputStream fos = new FileOutputStream("target/test1.orc");
fos.write(resultContents);
fos.flush();
fos.close();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = reader.rows();
Object o = rows.next(null);
assertNotNull(o);
assertTrue(o instanceof OrcStruct);
TypeInfo resultSchema = TestNiFiOrcUtils.buildPrimitiveOrcSchema();
StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);
// Check some fields in the first row
Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("int"));
assertTrue(intFieldObject instanceof IntWritable);
assertEquals(10, ((IntWritable) intFieldObject).get());
Object stringFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("string"));
assertTrue(stringFieldObject instanceof Text);
assertEquals("World", stringFieldObject.toString());
}
@Test
public void test_onTrigger_complex_record() throws Exception {
Map<String, Double> mapData1 = new TreeMap<String, Double>() {{
put("key1", 1.0);
put("key2", 2.0);
}};
BigDecimal sampleBigDecimal = new BigDecimal("12.34");
ByteBuffer bigDecimalAsBytes = ByteBuffer.wrap(sampleBigDecimal.unscaledValue().toByteArray());
GenericData.Record record = TestNiFiOrcUtils.buildComplexAvroRecord(10, mapData1, "DEF", 3.0f, Arrays.asList(10, 20), bigDecimalAsBytes);
DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
ByteArrayOutputStream out = new ByteArrayOutputStream();
fileWriter.create(record.getSchema(), out);
fileWriter.append(record);
// Put another record in
Map<String, Double> mapData2 = new TreeMap<String, Double>() {{
put("key1", 3.0);
put("key2", 4.0);
}};
record = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData2, "XYZ", 4L, Arrays.asList(100, 200), bigDecimalAsBytes);
fileWriter.append(record);
fileWriter.flush();
fileWriter.close();
out.close();
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
// Write the flow file out to disk, since the ORC Reader needs a path
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS complex_record " +
"(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>, myDecimal DECIMAL(10,2))"
+ " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
FileOutputStream fos = new FileOutputStream("target/test1.orc");
fos.write(resultContents);
fos.flush();
fos.close();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = reader.rows();
Object o = rows.next(null);
assertNotNull(o);
assertTrue(o instanceof OrcStruct);
TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);
// Check some fields in the first row
Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myInt"));
assertTrue(intFieldObject instanceof IntWritable);
assertEquals(10, ((IntWritable) intFieldObject).get());
Object mapFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMap"));
assertTrue(mapFieldObject instanceof Map);
Map map = (Map) mapFieldObject;
Object mapValue = map.get(new Text("key1"));
assertNotNull(mapValue);
assertTrue(mapValue instanceof DoubleWritable);
assertEquals(1.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
mapValue = map.get(new Text("key2"));
assertNotNull(mapValue);
assertTrue(mapValue instanceof DoubleWritable);
assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
Object decimalFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myDecimal"));
assertTrue(decimalFieldObject instanceof HiveDecimalWritable);
assertEquals(sampleBigDecimal, ((HiveDecimalWritable) decimalFieldObject).getHiveDecimal().bigDecimalValue());
}
@Test
public void test_onTrigger_complex_records_with_bigdecimals() throws Exception {
Map<String, Double> mapData1 = new TreeMap<String, Double>() {{
put("key1", 1.0);
put("key2", 2.0);
}};
BigDecimal sampleBigDecimal1 = new BigDecimal("3500.12");
BigDecimal sampleBigDecimal2 = new BigDecimal("0.01");
GenericData.Record record1 = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData1, "XYZ", 4L, Arrays.asList(100, 200), toByteBuffer(sampleBigDecimal1));
DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record1.getSchema());
DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
ByteArrayOutputStream out = new ByteArrayOutputStream();
fileWriter.create(record1.getSchema(), out);
fileWriter.append(record1);
fileWriter.append(TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData1, "XYZ", 4L, Arrays.asList(100, 200), toByteBuffer(sampleBigDecimal2)));
fileWriter.flush();
fileWriter.close();
out.close();
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
// Write the flow file out to disk, since the ORC Reader needs a path
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
FileOutputStream fos = new FileOutputStream("target/test1.orc");
fos.write(resultContents);
fos.flush();
fos.close();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = reader.rows();
TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);
Object result1 = rows.next(null);
assertNotNull(result1);
Object decimalFieldObject1 = inspector.getStructFieldData(result1, inspector.getStructFieldRef("myDecimal"));
assertEquals(sampleBigDecimal1, ((HiveDecimalWritable) decimalFieldObject1).getHiveDecimal().bigDecimalValue());
Object result2 = rows.next(null);
assertNotNull(result2);
Object decimalFieldObject2 = inspector.getStructFieldData(result2, inspector.getStructFieldRef("myDecimal"));
assertEquals(sampleBigDecimal2, ((HiveDecimalWritable) decimalFieldObject2).getHiveDecimal().bigDecimalValue());
}
private ByteBuffer toByteBuffer(BigDecimal sampleBigDecimal) {
return ByteBuffer.wrap(sampleBigDecimal.unscaledValue().toByteArray());
}
@Test
public void test_onTrigger_array_of_records() throws Exception {
final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array_of_records.avsc"));
List<GenericRecord> innerRecords = new LinkedList<>();
final GenericRecord outerRecord = new GenericData.Record(schema);
Schema arraySchema = schema.getField("records").schema();
Schema innerRecordSchema = arraySchema.getElementType();
final GenericRecord innerRecord1 = new GenericData.Record(innerRecordSchema);
innerRecord1.put("name", "Joe");
innerRecord1.put("age", 42);
innerRecords.add(innerRecord1);
final GenericRecord innerRecord2 = new GenericData.Record(innerRecordSchema);
innerRecord2.put("name", "Mary");
innerRecord2.put("age", 28);
innerRecords.add(innerRecord2);
GenericData.Array<GenericRecord> array = new GenericData.Array<>(arraySchema, innerRecords);
outerRecord.put("records", array);
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
ByteArrayOutputStream out = new ByteArrayOutputStream();
try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
dataFileWriter.create(schema, out);
dataFileWriter.append(outerRecord);
}
out.close();
// Build a flow file from the Avro record
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
// Write the flow file out to disk, since the ORC Reader needs a path
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS org_apache_nifi_outer_record " +
"(records ARRAY<STRUCT<name:STRING, age:INT>>)"
+ " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
assertEquals("1", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
FileOutputStream fos = new FileOutputStream("target/test1.orc");
fos.write(resultContents);
fos.flush();
fos.close();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = reader.rows();
Object o = rows.next(null);
assertNotNull(o);
assertTrue(o instanceof OrcStruct);
StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(schema));
// Verify the record contains an array
Object arrayFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("records"));
assertTrue(arrayFieldObject instanceof ArrayList);
ArrayList<?> arrayField = (ArrayList<?>) arrayFieldObject;
assertEquals(2, arrayField.size());
// Verify the first element. Should be a record with two fields "name" and "age"
Object element = arrayField.get(0);
assertTrue(element instanceof OrcStruct);
StructObjectInspector elementInspector = (StructObjectInspector) OrcStruct.createObjectInspector(NiFiOrcUtils.getOrcField(innerRecordSchema));
Object nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
assertTrue(nameObject instanceof Text);
assertEquals("Joe", nameObject.toString());
Object ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
assertTrue(ageObject instanceof IntWritable);
assertEquals(42, ((IntWritable) ageObject).get());
// Verify the second element. Should be a record with two fields "name" and "age"
element = arrayField.get(1);
assertTrue(element instanceof OrcStruct);
nameObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("name"));
assertTrue(nameObject instanceof Text);
assertEquals("Mary", nameObject.toString());
ageObject = elementInspector.getStructFieldData(element, elementInspector.getStructFieldRef("age"));
assertTrue(ageObject instanceof IntWritable);
assertEquals(28, ((IntWritable) ageObject).get());
}
@Test
public void test_onTrigger_nested_complex_record() throws Exception {
Map<String, List<Double>> mapData1 = new TreeMap<String, List<Double>>() {{
put("key1", Arrays.asList(1.0, 2.0));
put("key2", Arrays.asList(3.0, 4.0));
}};
Map<String, String> arrayMap11 = new TreeMap<String, String>() {{
put("key1", "v1");
put("key2", "v2");
}};
Map<String, String> arrayMap12 = new TreeMap<String, String>() {{
put("key3", "v3");
put("key4", "v4");
}};
GenericData.Record record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData1, Arrays.asList(arrayMap11, arrayMap12));
DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
ByteArrayOutputStream out = new ByteArrayOutputStream();
fileWriter.create(record.getSchema(), out);
fileWriter.append(record);
// Put another record in
Map<String, List<Double>> mapData2 = new TreeMap<String, List<Double>>() {{
put("key1", Arrays.asList(-1.0, -2.0));
put("key2", Arrays.asList(-3.0, -4.0));
}};
Map<String, String> arrayMap21 = new TreeMap<String, String>() {{
put("key1", "v-1");
put("key2", "v-2");
}};
Map<String, String> arrayMap22 = new TreeMap<String, String>() {{
put("key3", "v-3");
put("key4", "v-4");
}};
record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData2, Arrays.asList(arrayMap21, arrayMap22));
fileWriter.append(record);
fileWriter.flush();
fileWriter.close();
out.close();
Map<String, String> attributes = new HashMap<String, String>() {{
put(CoreAttributes.FILENAME.key(), "test");
}};
runner.enqueue(out.toByteArray(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);
// Write the flow file out to disk, since the ORC Reader needs a path
MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS nested_complex_record " +
"(myMapOfArray MAP<STRING, ARRAY<DOUBLE>>, myArrayOfMap ARRAY<MAP<STRING, STRING>>)"
+ " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
FileOutputStream fos = new FileOutputStream("target/test1.orc");
fos.write(resultContents);
fos.flush();
fos.close();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
RecordReader rows = reader.rows();
Object o = rows.next(null);
assertNotNull(o);
assertTrue(o instanceof OrcStruct);
TypeInfo resultSchema = TestNiFiOrcUtils.buildNestedComplexOrcSchema();
StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);
// check values
Object myMapOfArray = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMapOfArray"));
assertTrue(myMapOfArray instanceof Map);
Map map = (Map) myMapOfArray;
Object mapValue = map.get(new Text("key1"));
assertNotNull(mapValue);
assertTrue(mapValue instanceof List);
assertEquals(Arrays.asList(new DoubleWritable(1.0), new DoubleWritable(2.0)), mapValue);
Object myArrayOfMap = inspector.getStructFieldData(o, inspector.getStructFieldRef("myArrayOfMap"));
assertTrue(myArrayOfMap instanceof List);
List list = (List) myArrayOfMap;
Object el0 = list.get(0);
assertNotNull(el0);
assertTrue(el0 instanceof Map);
assertEquals(new Text("v1"), ((Map) el0).get(new Text("key1")));
}
}
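
Each test above repeats the same boilerplate for serializing GenericRecords into Avro file bytes before enqueueing them on the TestRunner. A hedged sketch of a reusable helper doing the same thing with the standard Avro API follows; the class and method names are assumptions, not part of the test above.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.io.DatumWriter;

public final class AvroTestBytes {

    private AvroTestBytes() {
    }

    // Serializes records sharing a schema into an in-memory Avro data file,
    // ready to be passed to TestRunner.enqueue(byte[], attributes).
    public static byte[] toAvroFileBytes(List<GenericData.Record> records) throws IOException {
        DatumWriter<GenericData.Record> datumWriter = new GenericDatumWriter<>(records.get(0).getSchema());
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(datumWriter)) {
            fileWriter.create(records.get(0).getSchema(), out);
            for (GenericData.Record record : records) {
                fileWriter.append(record);
            }
        }
        return out.toByteArray();
    }
}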

View File

@ -1,292 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSessionFactory;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.util.MockProcessContext;
import org.apache.nifi.util.MockProcessorInitializationContext;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestHiveParser extends AbstractHiveQLProcessor {
@BeforeEach
public void initialize() {
final MockProcessContext processContext = new MockProcessContext(this);
final ProcessorInitializationContext initializationContext = new MockProcessorInitializationContext(this, processContext);
initialize(initializationContext);
}
@Override
public void onTrigger(ProcessContext context, ProcessSessionFactory sessionFactory) throws ProcessException {
}
@Test
public void parseSelect() {
String query = "select a.empid, to_something(b.saraly) from " +
"company.emp a inner join default.salary b where a.empid = b.empid";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(2, tableNames.size());
assertTrue(tableNames.contains(new TableName("company", "emp", true)));
assertTrue(tableNames.contains(new TableName("default", "salary", true)));
}
@Test
public void parseSelectPrepared() {
String query = "select empid from company.emp a where a.firstName = ?";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName("company", "emp", true)));
}
@Test
public void parseLongSelect() {
String query = "select\n" +
"\n" +
" i_item_id,\n" +
"\n" +
" i_item_desc,\n" +
"\n" +
" s_state,\n" +
"\n" +
" count(ss_quantity) as store_sales_quantitycount,\n" +
"\n" +
" avg(ss_quantity) as store_sales_quantityave,\n" +
"\n" +
" stddev_samp(ss_quantity) as store_sales_quantitystdev,\n" +
"\n" +
" stddev_samp(ss_quantity) / avg(ss_quantity) as store_sales_quantitycov,\n" +
"\n" +
" count(sr_return_quantity) as store_returns_quantitycount,\n" +
"\n" +
" avg(sr_return_quantity) as store_returns_quantityave,\n" +
"\n" +
" stddev_samp(sr_return_quantity) as store_returns_quantitystdev,\n" +
"\n" +
" stddev_samp(sr_return_quantity) / avg(sr_return_quantity) as store_returns_quantitycov,\n" +
"\n" +
" count(cs_quantity) as catalog_sales_quantitycount,\n" +
"\n" +
" avg(cs_quantity) as catalog_sales_quantityave,\n" +
"\n" +
" stddev_samp(cs_quantity) / avg(cs_quantity) as catalog_sales_quantitystdev,\n" +
"\n" +
" stddev_samp(cs_quantity) / avg(cs_quantity) as catalog_sales_quantitycov\n" +
"\n" +
"from\n" +
"\n" +
" store_sales,\n" +
"\n" +
" store_returns,\n" +
"\n" +
" catalog_sales,\n" +
"\n" +
" date_dim d1,\n" +
"\n" +
" date_dim d2,\n" +
"\n" +
" date_dim d3,\n" +
"\n" +
" store,\n" +
"\n" +
" item\n" +
"\n" +
"where\n" +
"\n" +
" d1.d_quarter_name = '2000Q1'\n" +
"\n" +
" and d1.d_date_sk = ss_sold_date_sk\n" +
"\n" +
" and i_item_sk = ss_item_sk\n" +
"\n" +
" and s_store_sk = ss_store_sk\n" +
"\n" +
" and ss_customer_sk = sr_customer_sk\n" +
"\n" +
" and ss_item_sk = sr_item_sk\n" +
"\n" +
" and ss_ticket_number = sr_ticket_number\n" +
"\n" +
" and sr_returned_date_sk = d2.d_date_sk\n" +
"\n" +
" and d2.d_quarter_name in ('2000Q1' , '2000Q2', '2000Q3')\n" +
"\n" +
" and sr_customer_sk = cs_bill_customer_sk\n" +
"\n" +
" and sr_item_sk = cs_item_sk\n" +
"\n" +
" and cs_sold_date_sk = d3.d_date_sk\n" +
"\n" +
" and d3.d_quarter_name in ('2000Q1' , '2000Q2', '2000Q3')\n" +
"\n" +
"group by i_item_id , i_item_desc , s_state\n" +
"\n" +
"order by i_item_id , i_item_desc , s_state\n" +
"\n" +
"limit 100";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(6, tableNames.size());
AtomicInteger cnt = new AtomicInteger(0);
for (TableName tableName : tableNames) {
if (tableName.equals(new TableName(null, "store_sales", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "store_returns", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "catalog_sales", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "date_dim", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "store", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "item", true))) {
cnt.incrementAndGet();
}
}
assertEquals(6, cnt.get());
}
@Test
public void parseSelectInsert() {
String query = "insert into databaseA.tableA select key, max(value) from databaseA.tableA where category = 'x'";
// The same database.tableName can appear twice, once as an input table and once as an output table.
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(2, tableNames.size());
AtomicInteger cnt = new AtomicInteger(0);
tableNames.forEach(tableName -> {
if (tableName.equals(new TableName("databaseA", "tableA", false))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName("databaseA", "tableA", true))) {
cnt.incrementAndGet();
}
});
assertEquals(2, cnt.get());
}
@Test
public void parseInsert() {
String query = "insert into databaseB.tableB1 select something from tableA1 a1 inner join tableA2 a2 where a1.id = a2.id";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(3, tableNames.size());
AtomicInteger cnt = new AtomicInteger(0);
tableNames.forEach(tableName -> {
if (tableName.equals(new TableName("databaseB", "tableB1", false))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "tableA1", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "tableA2", true))) {
cnt.incrementAndGet();
}
});
assertEquals(3, cnt.get());
}
@Test
public void parseUpdate() {
String query = "update table_a set y = 'updated' where x > 100";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName(null, "table_a", false)));
}
@Test
public void parseDelete() {
String query = "delete from table_a where x > 100";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName(null, "table_a", false)));
}
@Test
public void parseDDL() {
String query = "CREATE TABLE IF NOT EXISTS EMPLOYEES(\n" +
"EmployeeID INT,FirstName STRING, Title STRING,\n" +
"State STRING, Laptop STRING)\n" +
"COMMENT 'Employee Names'\n" +
"STORED AS ORC";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName(null, "EMPLOYEES", false)));
}
@Test
public void parseSetProperty() {
String query = " set 'hive.exec.dynamic.partition.mode'=nonstrict";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(0, tableNames.size());
}
@Test
public void parseSetRole() {
String query = "set role all";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(0, tableNames.size());
}
@Test
public void parseShowRoles() {
String query = "show roles";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(0, tableNames.size());
}
@Test
public void parseMsck() {
String query = "msck repair table table_a";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName(null, "table_a", false)));
}
@Test
public void parseAddJar() {
String query = "ADD JAR hdfs:///tmp/my_jar.jar";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(0, tableNames.size());
}
}

View File

@ -1,846 +0,0 @@
package org.apache.nifi.processors.hive;/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.dbcp.DBCPService;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.pattern.RollbackOnFailure;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledOnOs;
import org.junit.jupiter.api.condition.OS;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mockito;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
@DisabledOnOs(OS.WINDOWS)
public class TestPutHiveQL {
private static final String createPersons = "CREATE TABLE PERSONS (id integer primary key, name varchar(100), code integer)";
private static final String createPersonsAutoId = "CREATE TABLE PERSONS (id INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY (START WITH 1), name VARCHAR(100), code INTEGER check(code <= 100))";
@BeforeAll
public static void setup() {
System.setProperty("derby.stream.error.file", "target/derby.log");
}
@Test
public void testDirectStatements(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS (ID, NAME, CODE) VALUES (1, 'Mark', 84)".getBytes());
runner.run();
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_SUCCESS, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
runner.enqueue("UPDATE PERSONS SET NAME='George' WHERE ID=1".getBytes());
runner.run();
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("George", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
}
@Test
public void testFailInMiddleWithBadStatement(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', 84)".getBytes());
runner.enqueue("INSERT INTO PERSONS".getBytes()); // intentionally wrong syntax
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Tom', 3)".getBytes());
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Harry', 44)".getBytes());
runner.run();
runner.assertTransferCount(PutHiveQL.REL_FAILURE, 1);
runner.assertTransferCount(PutHiveQL.REL_SUCCESS, 3);
runner.getFlowFilesForRelationship(PutHiveQL.REL_SUCCESS)
.forEach(f -> f.assertAttributeEquals(PutHiveQL.ATTR_OUTPUT_TABLES, "PERSONS"));
}
@Test
public void testFailInMiddleWithBadStatementRollbackOnFailure(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE, "true");
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', 84)".getBytes());
runner.enqueue("INSERT INTO PERSONS".getBytes()); // intentionally wrong syntax
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Tom', 3)".getBytes());
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Harry', 44)".getBytes());
runner.run();
// The first FlowFile should be routed to success; the others should stay in the queue.
assertEquals(3, runner.getQueueSize().getObjectCount());
runner.assertTransferCount(PutHiveQL.REL_FAILURE, 0);
runner.assertTransferCount(PutHiveQL.REL_SUCCESS, 1);
}
@Test
public void testFailAtBeginning(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS".getBytes()); // intentionally wrong syntax
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Tom', 3)".getBytes());
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Harry', 44)".getBytes());
runner.run();
runner.assertTransferCount(PutHiveQL.REL_FAILURE, 1);
runner.assertTransferCount(PutHiveQL.REL_SUCCESS, 2);
}
@Test
public void testFailAtBeginningRollbackOnFailure(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE, "true");
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS".getBytes()); // intentionally wrong syntax
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Tom', 3)".getBytes());
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Harry', 44)".getBytes());
AssertionError e = assertThrows(AssertionError.class, () -> runner.run());
assertTrue(e.getCause() instanceof ProcessException);
assertEquals(3, runner.getQueueSize().getObjectCount());
runner.assertTransferCount(PutHiveQL.REL_FAILURE, 0);
runner.assertTransferCount(PutHiveQL.REL_SUCCESS, 0);
}
@Test
public void testFailInMiddleWithBadParameterType(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> goodAttributes = new HashMap<>();
goodAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
goodAttributes.put("hiveql.args.1.value", "84");
final Map<String, String> badAttributes = new HashMap<>();
badAttributes.put("hiveql.args.1.type", String.valueOf(Types.VARCHAR));
badAttributes.put("hiveql.args.1.value", "hello");
final byte[] data = "INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', ?)".getBytes();
runner.enqueue(data, goodAttributes);
runner.enqueue(data, badAttributes);
runner.enqueue(data, goodAttributes);
runner.enqueue(data, goodAttributes);
runner.run();
runner.assertTransferCount(PutHiveQL.REL_FAILURE, 1);
runner.assertTransferCount(PutHiveQL.REL_SUCCESS, 3);
}
@Test
public void testFailInMiddleWithBadParameterValue(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> goodAttributes = new HashMap<>();
goodAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
goodAttributes.put("hiveql.args.1.value", "84");
final Map<String, String> badAttributes = new HashMap<>();
badAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
badAttributes.put("hiveql.args.1.value", "101"); // Constraint violation, up to 100
final byte[] data = "INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', ?)".getBytes();
runner.enqueue(data, goodAttributes);
runner.enqueue(data, badAttributes);
runner.enqueue(data, goodAttributes);
runner.enqueue(data, goodAttributes);
runner.run();
runner.assertTransferCount(PutHiveQL.REL_SUCCESS, 3);
runner.assertTransferCount(PutHiveQL.REL_FAILURE, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertTrue(rs.next());
assertTrue(rs.next());
assertFalse(rs.next());
}
}
}
@Test
public void testFailInMiddleWithBadNumberFormat(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> goodAttributes = new HashMap<>();
goodAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
goodAttributes.put("hiveql.args.1.value", "84");
final Map<String, String> badAttributes = new HashMap<>();
badAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
badAttributes.put("hiveql.args.1.value", "NOT_NUMBER");
final byte[] data = "INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', ?)".getBytes();
runner.enqueue(data, goodAttributes);
runner.enqueue(data, badAttributes);
runner.enqueue(data, goodAttributes);
runner.enqueue(data, goodAttributes);
runner.run();
runner.assertTransferCount(PutHiveQL.REL_SUCCESS, 3);
runner.assertTransferCount(PutHiveQL.REL_FAILURE, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertTrue(rs.next());
assertTrue(rs.next());
assertFalse(rs.next());
}
}
}
@Test
public void testUsingSqlDataTypesWithNegativeValues(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate("CREATE TABLE PERSONS (id integer primary key, name varchar(100), code bigint)");
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", "-5");
attributes.put("hiveql.args.1.value", "84");
runner.enqueue("INSERT INTO PERSONS (ID, NAME, CODE) VALUES (1, 'Mark', ?)".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_SUCCESS, 1);
runner.getFlowFilesForRelationship(PutHiveQL.REL_SUCCESS).get(0).assertAttributeEquals(PutHiveQL.ATTR_OUTPUT_TABLES, "PERSONS");
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
}
@Test
public void testStatementsWithPreparedParameters(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
runner.enqueue("INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?)".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_SUCCESS, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
runner.clearTransferState();
attributes.clear();
attributes.put("hiveql.args.1.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.1.value", "George");
attributes.put("hiveql.args.2.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.2.value", "1");
runner.enqueue("UPDATE PERSONS SET NAME=? WHERE ID=?".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_SUCCESS, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("George", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
}
@Test
public void testMultipleStatementsWithinFlowFile(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// Both statements in the multi-statement script should execute, and the FlowFile should be routed to success
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_SUCCESS, 1);
runner.getFlowFilesForRelationship(PutHiveQL.REL_SUCCESS)
.forEach(f -> f.assertAttributeEquals(PutHiveQL.ATTR_OUTPUT_TABLES, "PERSONS"));
// Now we can check that the values were inserted by the multi-statement script.
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1), "Record ID mismatch");
assertEquals("George", rs.getString(2), "Record NAME mismatch");
}
}
}
@Test
public void testMultipleStatementsWithinFlowFilePlusEmbeddedDelimiter(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George\\;' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// The backslash-escaped semicolon is not treated as a statement delimiter, so both statements should execute successfully
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_SUCCESS, 1);
// Now we can check that the values were inserted by the multi-statement script.
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1), "Record ID mismatch");
assertEquals( "George\\;", rs.getString(2), "Record NAME mismatch");
}
}
}
@Test
public void testWithNullParameter(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
runner.enqueue("INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?)".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_SUCCESS, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(0, rs.getInt(3));
assertFalse(rs.next());
}
}
}
@Test
public void testInvalidStatement(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE SOME_RANDOM_TABLE NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// should fail because the target table is invalid
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_FAILURE, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
}
}
}
@Test
public void testRetryableFailure() throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final DBCPService service = new SQLExceptionService(null);
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// should be routed to retry because there isn't a valid connection and the tables don't exist
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_RETRY, 1);
}
@Test
public void testRetryableFailureRollbackOnFailure() throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final DBCPService service = new SQLExceptionService(null);
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
runner.setProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE, "true");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
AssertionError e = assertThrows(AssertionError.class, () -> runner.run());
assertTrue(e.getCause() instanceof ProcessException);
assertEquals(1, runner.getQueueSize().getObjectCount());
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_RETRY, 0);
}
@Test
public void testUnknownFailure() throws InitializationException, ProcessException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final SQLExceptionService service = new SQLExceptionService(null);
service.setErrorCode(2);
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// should be routed to retry because there isn't a valid connection and the tables don't exist
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_RETRY, 1);
}
@Test
public void testUnknownFailureRollbackOnFailure() throws InitializationException, ProcessException {
final TestRunner runner = TestRunners.newTestRunner(PutHiveQL.class);
final SQLExceptionService service = new SQLExceptionService(null);
service.setErrorCode(0);
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(PutHiveQL.HIVE_DBCP_SERVICE, "dbcp");
runner.setProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE, "true");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
AssertionError e = assertThrows(AssertionError.class, () -> runner.run());
assertTrue(e.getCause() instanceof ProcessException);
assertEquals(1, runner.getQueueSize().getObjectCount());
runner.assertAllFlowFilesTransferred(PutHiveQL.REL_RETRY, 0);
}
/**
* Simple implementation only for testing purposes
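* Backed by an embedded Derby database at the supplied location, standing in for a Hive connection pool.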
*/
private static class MockDBCPService extends AbstractControllerService implements HiveDBCPService {
private final String dbLocation;
MockDBCPService(final String dbLocation) {
this.dbLocation = dbLocation;
}
@Override
public String getIdentifier() {
return "dbcp";
}
@Override
public Connection getConnection() throws ProcessException {
try {
Class.forName("org.apache.derby.jdbc.EmbeddedDriver");
return DriverManager.getConnection("jdbc:derby:" + dbLocation + ";create=true");
} catch (final Exception e) {
e.printStackTrace();
throw new ProcessException("getConnection failed: " + e);
}
}
@Override
public String getConnectionURL() {
return "jdbc:derby:" + dbLocation + ";create=true";
}
}
/**
* Simple implementation only for testing purposes
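* Wraps an optional delegate and, once the allowed number of successful calls is exceeded, returns mocked
* connections whose prepared statements throw SQLExceptions carrying a configurable error code.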
*/
private static class SQLExceptionService extends AbstractControllerService implements HiveDBCPService {
private final HiveDBCPService service;
private int allowedBeforeFailure = 0;
private int successful = 0;
private int errorCode = 30000; // Default to a retryable exception code
SQLExceptionService(final HiveDBCPService service) {
this.service = service;
}
@Override
public String getIdentifier() {
return "dbcp";
}
@Override
public Connection getConnection() throws ProcessException {
try {
if (++successful > allowedBeforeFailure) {
final Connection conn = Mockito.mock(Connection.class);
Mockito.when(conn.prepareStatement(Mockito.any(String.class))).thenThrow(new SQLException("Unit Test Generated SQLException", "42000", errorCode));
return conn;
} else {
return service.getConnection();
}
} catch (final Exception e) {
e.printStackTrace();
throw new ProcessException("getConnection failed: " + e);
}
}
@Override
public String getConnectionURL() {
return service != null ? service.getConnectionURL() : null;
}
void setErrorCode(int errorCode) {
this.errorCode = errorCode;
}
}
}

View File

@ -1,736 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.dbcp.DBCPService;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.provenance.ProvenanceEventRecord;
import org.apache.nifi.provenance.ProvenanceEventType;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.apache.nifi.util.hive.HiveJdbcCommon;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Types;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import static org.apache.nifi.processors.hive.SelectHiveQL.HIVEQL_OUTPUT_FORMAT;
import static org.apache.nifi.util.hive.HiveJdbcCommon.AVRO;
import static org.apache.nifi.util.hive.HiveJdbcCommon.CSV;
import static org.apache.nifi.util.hive.HiveJdbcCommon.CSV_MIME_TYPE;
import static org.apache.nifi.util.hive.HiveJdbcCommon.MIME_TYPE_AVRO_BINARY;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestSelectHiveQL {
private static final Logger LOGGER;
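// Referenced through Expression Language (${maxRows}) to set MAX_ROWS_PER_FLOW_FILE in the paging tests below.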
private final static String MAX_ROWS_KEY = "maxRows";
private final int NUM_OF_ROWS = 100;
static {
System.setProperty("org.slf4j.simpleLogger.defaultLogLevel", "info");
System.setProperty("org.slf4j.simpleLogger.showDateTime", "true");
System.setProperty("org.slf4j.simpleLogger.log.nifi.io.nio", "debug");
System.setProperty("org.slf4j.simpleLogger.log.nifi.processors.hive.SelectHiveQL", "debug");
System.setProperty("org.slf4j.simpleLogger.log.nifi.processors.hive.TestSelectHiveQL", "debug");
LOGGER = LoggerFactory.getLogger(TestSelectHiveQL.class);
}
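// The tests run against an embedded Derby database (standing in for Hive) created at this location.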
private final static String DB_LOCATION = "target/db";
private final static String QUERY_WITH_EL = "select "
+ " PER.ID as PersonId, PER.NAME as PersonName, PER.CODE as PersonCode"
+ " from persons PER"
+ " where PER.ID > ${person.id}";
private final static String QUERY_WITHOUT_EL = "select "
+ " PER.ID as PersonId, PER.NAME as PersonName, PER.CODE as PersonCode"
+ " from persons PER"
+ " where PER.ID > 10";
@BeforeAll
public static void setupClass() {
System.setProperty("derby.stream.error.file", "target/derby.log");
}
private TestRunner runner;
@BeforeEach
public void setup() throws InitializationException {
final DBCPService dbcp = new DBCPServiceSimpleImpl();
final Map<String, String> dbcpProperties = new HashMap<>();
runner = TestRunners.newTestRunner(SelectHiveQL.class);
runner.addControllerService("dbcp", dbcp, dbcpProperties);
runner.enableControllerService(dbcp);
runner.setProperty(SelectHiveQL.HIVE_DBCP_SERVICE, "dbcp");
}
@Test
public void testIncomingConnectionWithNoFlowFile() throws InitializationException {
runner.setIncomingConnection(true);
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT * FROM persons");
runner.run();
runner.assertTransferCount(SelectHiveQL.REL_SUCCESS, 0);
runner.assertTransferCount(SelectHiveQL.REL_FAILURE, 0);
}
@Test
public void testNoIncomingConnection() throws ClassNotFoundException, SQLException, InitializationException, IOException {
runner.setIncomingConnection(false);
invokeOnTrigger(QUERY_WITHOUT_EL, false, "Avro");
final List<ProvenanceEventRecord> provenanceEvents = runner.getProvenanceEvents();
final ProvenanceEventRecord provenance0 = provenanceEvents.get(0);
assertEquals(ProvenanceEventType.RECEIVE, provenance0.getEventType());
assertEquals("jdbc:derby:target/db;create=true", provenance0.getTransitUri());
}
@Test
public void testNoTimeLimit() throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITH_EL, true, "Avro");
final List<ProvenanceEventRecord> provenanceEvents = runner.getProvenanceEvents();
assertEquals(4, provenanceEvents.size());
final ProvenanceEventRecord provenance0 = provenanceEvents.get(0);
assertEquals(ProvenanceEventType.FORK, provenance0.getEventType());
final ProvenanceEventRecord provenance1 = provenanceEvents.get(1);
assertEquals(ProvenanceEventType.FETCH, provenance1.getEventType());
assertEquals("jdbc:derby:target/db;create=true", provenance1.getTransitUri());
final ProvenanceEventRecord provenance2 = provenanceEvents.get(2);
assertEquals(ProvenanceEventType.FORK, provenance2.getEventType());
// The last one was removed as empty
final ProvenanceEventRecord provenance3 = provenanceEvents.get(3);
assertEquals(ProvenanceEventType.DROP, provenance3.getEventType());
}
@Test
public void testWithNullIntColumn() throws SQLException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
try {
stmt.execute("drop table TEST_NULL_INT");
} catch (final SQLException sqle) {
// Nothing to do, probably means the table didn't exist
}
stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, constraint my_pk primary key (id))");
stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (0, NULL, 1)");
stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (1, 1, 1)");
runner.setIncomingConnection(false);
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT * FROM TEST_NULL_INT");
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_ROW_COUNT);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_DURATION);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_EXECUTION_TIME);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_FETCH_TIME);
final List<MockFlowFile> flowfiles = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS);
flowfiles.get(0).assertAttributeEquals(SelectHiveQL.RESULT_ROW_COUNT, "2");
final long executionTime = Long.parseLong(flowfiles.get(0).getAttribute(SelectHiveQL.RESULT_QUERY_EXECUTION_TIME));
final long fetchTime = Long.parseLong(flowfiles.get(0).getAttribute(SelectHiveQL.RESULT_QUERY_FETCH_TIME));
final long durationTime = Long.parseLong(flowfiles.get(0).getAttribute(SelectHiveQL.RESULT_QUERY_DURATION));
assertEquals(durationTime, fetchTime + executionTime);
}
@Test
public void testWithSqlException() throws SQLException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
try {
stmt.execute("drop table TEST_NO_ROWS");
} catch (final SQLException sqle) {
// Nothing to do, probably means the table didn't exist
}
stmt.execute("create table TEST_NO_ROWS (id integer)");
runner.setIncomingConnection(false);
// Try a valid SQL statement that will generate an error (e.g. val1 does not exist)
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT val1 FROM TEST_NO_ROWS");
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
}
@Test
public void invokeOnTriggerExceptionInPreQueriesNoIncomingFlows()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
doOnTrigger(QUERY_WITHOUT_EL, false, CSV,
"select 'no exception' from persons; select exception from persons",
null);
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
}
@Test
public void invokeOnTriggerExceptionInPreQueriesWithIncomingFlows()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
doOnTrigger(QUERY_WITHOUT_EL, true, CSV,
"select 'no exception' from persons; select exception from persons",
null);
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
}
@Test
public void invokeOnTriggerExceptionInPostQueriesNoIncomingFlows()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
doOnTrigger(QUERY_WITHOUT_EL, false, CSV,
null,
"select 'no exception' from persons; select exception from persons");
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
}
@Test
public void invokeOnTriggerExceptionInPostQueriesWithIncomingFlows()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
doOnTrigger(QUERY_WITHOUT_EL, true, CSV,
null,
"select 'no exception' from persons; select exception from persons");
// with incoming connections, it should be rolled back
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
}
@Test
public void testWithBadSQL() throws SQLException {
final String BAD_SQL = "create table TEST_NO_ROWS (id integer)";
// Test with incoming flow file (it should be routed to failure intact, i.e. same content and no parent)
runner.setIncomingConnection(true);
// This non-query statement will generate an error when executed by SelectHiveQL
runner.enqueue(BAD_SQL);
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
MockFlowFile flowFile = runner.getFlowFilesForRelationship(SelectHiveQL.REL_FAILURE).get(0);
flowFile.assertContentEquals(BAD_SQL);
flowFile.assertAttributeEquals("parentIds", null);
runner.clearTransferState();
// Test with no incoming flow file (an empty flow file is transferred)
runner.setIncomingConnection(false);
// The same non-query statement should also fail when supplied via the HiveQL Select Query property
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, BAD_SQL);
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_FAILURE, 1);
flowFile = runner.getFlowFilesForRelationship(SelectHiveQL.REL_FAILURE).get(0);
flowFile.assertContentEquals("");
}
@Test
public void invokeOnTriggerWithCsv()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, CSV);
}
@Test
public void invokeOnTriggerWithAvro()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, AVRO);
}
@Test
public void invokeOnTriggerWithValidPreQueries()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, CSV,
"select '1' from persons; select '2' from persons", //should not be 'select'. But Derby driver doesn't support "set param=val" format.
null);
}
@Test
public void invokeOnTriggerWithValidPostQueries()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, CSV,
null,
//should not be 'select'. But Derby driver doesn't support "set param=val" format,
//so just providing any "compilable" query.
" select '4' from persons; \nselect '5' from persons");
}
@Test
public void invokeOnTriggerWithValidPrePostQueries()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, CSV,
//should not be 'select'. But Derby driver doesn't support "set param=val" format,
//so just providing any "compilable" query.
"select '1' from persons; select '2' from persons",
" select '4' from persons; \nselect '5' from persons");
}
public void invokeOnTrigger(final String query, final boolean incomingFlowFile, String outputFormat)
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(query, incomingFlowFile, outputFormat, null, null);
}
public void invokeOnTrigger(final String query, final boolean incomingFlowFile, String outputFormat,
String preQueries, String postQueries)
throws InitializationException, ClassNotFoundException, SQLException, IOException {
TestRunner runner = doOnTrigger(query, incomingFlowFile, outputFormat, preQueries, postQueries);
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_ROW_COUNT);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_DURATION);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_EXECUTION_TIME);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_FETCH_TIME);
final List<MockFlowFile> flowfiles = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS);
MockFlowFile flowFile = flowfiles.get(0);
final InputStream in = new ByteArrayInputStream(flowFile.toByteArray());
long recordsFromStream = 0;
if (AVRO.equals(outputFormat)) {
assertEquals(MIME_TYPE_AVRO_BINARY, flowFile.getAttribute(CoreAttributes.MIME_TYPE.key()));
final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
GenericRecord record = null;
while (dataFileReader.hasNext()) {
// Reuse record object by passing it to next(). This saves us from
// allocating and garbage collecting many objects for files with
// many items.
record = dataFileReader.next(record);
recordsFromStream++;
}
}
} else {
assertEquals(CSV_MIME_TYPE, flowFile.getAttribute(CoreAttributes.MIME_TYPE.key()));
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String headerRow = br.readLine();
// Derby capitalizes column names
assertEquals("PERSONID,PERSONNAME,PERSONCODE", headerRow);
// Validate rows
String line;
while ((line = br.readLine()) != null) {
recordsFromStream++;
String[] values = line.split(",");
if (recordsFromStream < (NUM_OF_ROWS - 10)) {
assertEquals(3, values.length);
assertTrue(values[1].startsWith("\""));
assertTrue(values[1].endsWith("\""));
} else {
assertEquals(2, values.length); // Middle value is null
}
}
}
final long executionTime = Long.parseLong(flowFile.getAttribute(SelectHiveQL.RESULT_QUERY_EXECUTION_TIME));
final long fetchTime = Long.parseLong(flowFile.getAttribute(SelectHiveQL.RESULT_QUERY_FETCH_TIME));
final long durationTime = Long.parseLong(flowFile.getAttribute(SelectHiveQL.RESULT_QUERY_DURATION));
assertEquals(NUM_OF_ROWS - 10, recordsFromStream);
assertEquals(recordsFromStream, Integer.parseInt(flowFile.getAttribute(SelectHiveQL.RESULT_ROW_COUNT)));
assertEquals(durationTime, fetchTime + executionTime);
flowFile.assertAttributeEquals(AbstractHiveQLProcessor.ATTR_INPUT_TABLES, "persons");
}
public TestRunner doOnTrigger(final String query, final boolean incomingFlowFile, String outputFormat,
String preQueries, String postQueries)
throws InitializationException, ClassNotFoundException, SQLException, IOException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((HiveDBCPService) runner.getControllerService("dbcp")).getConnection();
final Statement stmt = con.createStatement();
try {
stmt.execute("drop table persons");
} catch (final SQLException sqle) {
// Nothing to do here, the table didn't exist
}
stmt.execute("create table persons (id integer, name varchar(100), code integer)");
Random rng = new Random(53496);
stmt.executeUpdate("insert into persons values (1, 'Joe Smith', " + rng.nextInt(469947) + ")");
for (int i = 2; i < NUM_OF_ROWS; i++) {
stmt.executeUpdate("insert into persons values (" + i + ", 'Someone Else', " + rng.nextInt(469947) + ")");
}
stmt.executeUpdate("insert into persons values (" + NUM_OF_ROWS + ", 'Last Person', NULL)");
LOGGER.info("test data loaded");
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, query);
runner.setProperty(HIVEQL_OUTPUT_FORMAT, outputFormat);
if (preQueries != null) {
runner.setProperty(SelectHiveQL.HIVEQL_PRE_QUERY, preQueries);
}
if (postQueries != null) {
runner.setProperty(SelectHiveQL.HIVEQL_POST_QUERY, postQueries);
}
if (incomingFlowFile) {
// incoming FlowFile content is not used, but attributes are used
final Map<String, String> attributes = new HashMap<>();
attributes.put("person.id", "10");
runner.enqueue("Hello".getBytes(), attributes);
}
runner.setIncomingConnection(incomingFlowFile);
runner.run();
return runner;
}
@Test
public void testMaxRowsPerFlowFileAvro() throws ClassNotFoundException, SQLException, InitializationException, IOException {
// load test data to database
final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
InputStream in;
MockFlowFile mff;
try {
stmt.execute("drop table TEST_QUERY_DB_TABLE");
} catch (final SQLException sqle) {
// Ignore this error, probably a "table does not exist" since Derby doesn't yet support DROP IF EXISTS [DERBY-4842]
}
stmt.execute("create table TEST_QUERY_DB_TABLE (id integer not null, name varchar(100), scale float, created_on timestamp, bignum bigint default 0)");
int rowCount = 0;
//create larger row set
for (int batch = 0; batch < 100; batch++) {
stmt.execute("insert into TEST_QUERY_DB_TABLE (id, name, scale, created_on) VALUES (" + rowCount + ", 'Joe Smith', 1.0, '1962-09-23 03:23:34.234')");
rowCount++;
}
runner.setIncomingConnection(false);
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT * FROM TEST_QUERY_DB_TABLE");
runner.setProperty(SelectHiveQL.MAX_ROWS_PER_FLOW_FILE, "${" + MAX_ROWS_KEY + "}");
runner.setProperty(SelectHiveQL.HIVEQL_OUTPUT_FORMAT, HiveJdbcCommon.AVRO);
runner.setVariable(MAX_ROWS_KEY, "9");
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 12);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_ROW_COUNT);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_DURATION);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_EXECUTION_TIME);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_FETCH_TIME);
//ensure all but the last file have 9 records each
for (int ff = 0; ff < 11; ff++) {
mff = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(ff);
final long executionTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_EXECUTION_TIME));
final long fetchTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_FETCH_TIME));
final long durationTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_DURATION));
assertEquals(durationTime, fetchTime + executionTime);
in = new ByteArrayInputStream(mff.toByteArray());
assertEquals(9, getNumberOfRecordsFromStream(in));
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(ff), mff.getAttribute("fragment.index"));
assertEquals("12", mff.getAttribute("fragment.count"));
}
//last file should have 1 record
mff = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(11);
final long executionTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_EXECUTION_TIME));
final long fetchTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_FETCH_TIME));
final long durationTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_DURATION));
assertEquals(durationTime, fetchTime + executionTime);
in = new ByteArrayInputStream(mff.toByteArray());
assertEquals(1, getNumberOfRecordsFromStream(in));
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(11), mff.getAttribute("fragment.index"));
assertEquals("12", mff.getAttribute("fragment.count"));
runner.clearTransferState();
}
@Test
public void testParametrizedQuery() throws ClassNotFoundException, SQLException, InitializationException, IOException {
// load test data to database
final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
try {
stmt.execute("drop table TEST_QUERY_DB_TABLE");
} catch (final SQLException sqle) {
// Ignore this error, probably a "table does not exist" since Derby doesn't yet support DROP IF EXISTS [DERBY-4842]
}
stmt.execute("create table TEST_QUERY_DB_TABLE (id integer not null, name varchar(100), scale float, created_on timestamp, bignum bigint default 0)");
int rowCount = 0;
//create larger row set
for (int batch = 0; batch < 100; batch++) {
stmt.execute("insert into TEST_QUERY_DB_TABLE (id, name, scale, created_on) VALUES (" + rowCount + ", 'Joe Smith', 1.0, '1962-09-23 03:23:34.234')");
rowCount++;
}
runner.setIncomingConnection(true);
runner.setProperty(SelectHiveQL.MAX_ROWS_PER_FLOW_FILE, "${" + MAX_ROWS_KEY + "}");
runner.setProperty(SelectHiveQL.HIVEQL_OUTPUT_FORMAT, HiveJdbcCommon.AVRO);
runner.setVariable(MAX_ROWS_KEY, "9");
Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
runner.enqueue("SELECT * FROM TEST_QUERY_DB_TABLE WHERE id = ?", attributes );
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 1);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_ROW_COUNT);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_DURATION);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_EXECUTION_TIME);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_FETCH_TIME);
MockFlowFile flowFile = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(0);
final long executionTime = Long.parseLong(flowFile.getAttribute(SelectHiveQL.RESULT_QUERY_EXECUTION_TIME));
final long fetchTime = Long.parseLong(flowFile.getAttribute(SelectHiveQL.RESULT_QUERY_FETCH_TIME));
final long durationTime = Long.parseLong(flowFile.getAttribute(SelectHiveQL.RESULT_QUERY_DURATION));
assertEquals(durationTime, fetchTime + executionTime);
// Assert the attributes from the incoming flow file are preserved in the outgoing flow file(s)
flowFile.assertAttributeEquals("hiveql.args.1.value", "1");
flowFile.assertAttributeEquals("hiveql.args.1.type", String.valueOf(Types.INTEGER));
runner.clearTransferState();
}
@Test
public void testMaxRowsPerFlowFileCSV() throws ClassNotFoundException, SQLException, InitializationException, IOException {
// load test data to database
final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
InputStream in;
MockFlowFile mff;
try {
stmt.execute("drop table TEST_QUERY_DB_TABLE");
} catch (final SQLException sqle) {
// Ignore this error, probably a "table does not exist" since Derby doesn't yet support DROP IF EXISTS [DERBY-4842]
}
stmt.execute("create table TEST_QUERY_DB_TABLE (id integer not null, name varchar(100), scale float, created_on timestamp, bignum bigint default 0)");
int rowCount = 0;
//create larger row set
for (int batch = 0; batch < 100; batch++) {
stmt.execute("insert into TEST_QUERY_DB_TABLE (id, name, scale, created_on) VALUES (" + rowCount + ", 'Joe Smith', 1.0, '1962-09-23 03:23:34.234')");
rowCount++;
}
runner.setIncomingConnection(true);
runner.setProperty(SelectHiveQL.MAX_ROWS_PER_FLOW_FILE, "${" + MAX_ROWS_KEY + "}");
runner.setProperty(SelectHiveQL.HIVEQL_OUTPUT_FORMAT, HiveJdbcCommon.CSV);
runner.enqueue("SELECT * FROM TEST_QUERY_DB_TABLE", new HashMap<String, String>() {{
put(MAX_ROWS_KEY, "9");
}});
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, 12);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_ROW_COUNT);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_DURATION);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_EXECUTION_TIME);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_FETCH_TIME);
//ensure all but the last file have 9 records (10 lines = 9 records + header) each
for (int ff = 0; ff < 11; ff++) {
mff = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(ff);
final long executionTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_EXECUTION_TIME));
final long fetchTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_FETCH_TIME));
final long durationTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_DURATION));
assertEquals(durationTime, fetchTime + executionTime);
in = new ByteArrayInputStream(mff.toByteArray());
BufferedReader br = new BufferedReader(new InputStreamReader(in));
assertEquals(10, br.lines().count());
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(ff), mff.getAttribute("fragment.index"));
assertEquals("12", mff.getAttribute("fragment.count"));
}
//last file should have 1 record (2 lines = 1 record + header)
mff = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(11);
final long executionTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_EXECUTION_TIME));
final long fetchTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_FETCH_TIME));
final long durationTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_DURATION));
assertEquals(durationTime, fetchTime + executionTime);
in = new ByteArrayInputStream(mff.toByteArray());
BufferedReader br = new BufferedReader(new InputStreamReader(in));
assertEquals(2, br.lines().count());
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(11), mff.getAttribute("fragment.index"));
assertEquals("12", mff.getAttribute("fragment.count"));
runner.clearTransferState();
}
@Test
public void testMaxRowsPerFlowFileWithMaxFragments() throws ClassNotFoundException, SQLException, InitializationException, IOException {
// load test data to database
final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
InputStream in;
MockFlowFile mff;
try {
stmt.execute("drop table TEST_QUERY_DB_TABLE");
} catch (final SQLException sqle) {
// Ignore this error, probably a "table does not exist" since Derby doesn't yet support DROP IF EXISTS [DERBY-4842]
}
stmt.execute("create table TEST_QUERY_DB_TABLE (id integer not null, name varchar(100), scale float, created_on timestamp, bignum bigint default 0)");
int rowCount = 0;
//create larger row set
for (int batch = 0; batch < 100; batch++) {
stmt.execute("insert into TEST_QUERY_DB_TABLE (id, name, scale, created_on) VALUES (" + rowCount + ", 'Joe Smith', 1.0, '1962-09-23 03:23:34.234')");
rowCount++;
}
runner.setIncomingConnection(false);
runner.setProperty(SelectHiveQL.HIVEQL_SELECT_QUERY, "SELECT * FROM TEST_QUERY_DB_TABLE");
runner.setProperty(SelectHiveQL.MAX_ROWS_PER_FLOW_FILE, "9");
Integer maxFragments = 3;
runner.setProperty(SelectHiveQL.MAX_FRAGMENTS, maxFragments.toString());
runner.run();
runner.assertAllFlowFilesTransferred(SelectHiveQL.REL_SUCCESS, maxFragments);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_ROW_COUNT);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_DURATION);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_EXECUTION_TIME);
runner.assertAllFlowFilesContainAttribute(SelectHiveQL.REL_SUCCESS, SelectHiveQL.RESULT_QUERY_FETCH_TIME);
for (int i = 0; i < maxFragments; i++) {
mff = runner.getFlowFilesForRelationship(SelectHiveQL.REL_SUCCESS).get(i);
final long executionTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_EXECUTION_TIME));
final long fetchTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_FETCH_TIME));
final long durationTime = Long.parseLong(mff.getAttribute(SelectHiveQL.RESULT_QUERY_DURATION));
assertEquals(durationTime, fetchTime + executionTime);
in = new ByteArrayInputStream(mff.toByteArray());
assertEquals(9, getNumberOfRecordsFromStream(in));
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(i), mff.getAttribute("fragment.index"));
assertEquals(maxFragments.toString(), mff.getAttribute("fragment.count"));
}
runner.clearTransferState();
}
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
GenericRecord record = null;
long recordsFromStream = 0;
while (dataFileReader.hasNext()) {
// Reuse record object by passing it to next(). This saves us from
// allocating and garbage collecting many objects for files with
// many items.
record = dataFileReader.next(record);
recordsFromStream += 1;
}
return recordsFromStream;
}
}
/**
* Simple implementation only for SelectHiveQL processor testing.
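* Provides connections to the embedded Derby database at DB_LOCATION.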
*/
private class DBCPServiceSimpleImpl extends AbstractControllerService implements HiveDBCPService {
@Override
public String getIdentifier() {
return "dbcp";
}
@Override
public Connection getConnection() throws ProcessException {
try {
Class.forName("org.apache.derby.jdbc.EmbeddedDriver");
return DriverManager.getConnection("jdbc:derby:" + DB_LOCATION + ";create=true");
} catch (final Exception e) {
throw new ProcessException("getConnection failed: " + e);
}
}
@Override
public String getConnectionURL() {
return "jdbc:derby:" + DB_LOCATION + ";create=true";
}
}
}

View File

@ -1,449 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.dbcp.DBCPService;
import org.apache.nifi.dbcp.hive.HiveDBCPService;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.schema.access.SchemaNotFoundException;
import org.apache.nifi.serialization.RecordReader;
import org.apache.nifi.serialization.SimpleRecordSchema;
import org.apache.nifi.serialization.record.MockRecordParser;
import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordFieldType;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledOnOs;
import org.junit.jupiter.api.condition.OS;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.stubbing.Answer;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@DisabledOnOs(OS.WINDOWS)
public class TestUpdateHiveTable {
private static final String TEST_CONF_PATH = "src/test/resources/core-site.xml";
private static final String TARGET_HIVE = "target/hive";
private static final String[] SHOW_TABLES_COLUMN_NAMES = new String[]{"tab_name"};
private static final String[][] SHOW_TABLES_RESULTSET = new String[][]{
new String[]{"messages"},
new String[]{"users"},
};
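// Canned result sets mimicking the output of Hive DESC queries; the "Location:" rows supply the table paths
// that the tests assert against (e.g. the ATTR_OUTPUT_PATH attribute).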
private static final String[] DESC_MESSAGES_TABLE_COLUMN_NAMES = new String[]{"id", "msg"};
private static final String[][] DESC_MESSAGES_TABLE_RESULTSET = new String[][]{
new String[]{"# col_name", "data_type", "comment"},
new String[]{"", null, null},
new String[]{"id", "int", ""},
new String[]{"msg", "string", ""},
new String[]{"", null, null},
new String[]{"# Partition Information", null, null},
new String[]{"# col_name", "data_type", "comment"},
new String[]{"", null, null},
new String[]{"continent", "string", ""},
new String[]{"country", "string", ""},
new String[]{"", null, null},
new String[]{"# Detailed Table Information", null, null},
new String[]{"Location:", "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/messages", null}
};
private static final String[] DESC_USERS_TABLE_COLUMN_NAMES = new String[]{"name", "favorite_number", "favorite_color", "scale"};
private static final String[][] DESC_USERS_TABLE_RESULTSET = new String[][]{
new String[]{"name", "string", ""},
new String[]{"favorite_number", "int", ""},
new String[]{"favorite_color", "string", ""},
new String[]{"scale", "double", ""},
new String[]{"", null, null},
new String[]{"# Detailed Table Information", null, null},
new String[]{"Location:", "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/users", null}
};
private static final String[][] DESC_EXTERNAL_USERS_TABLE_RESULTSET = new String[][]{
new String[]{"name", "string", ""},
new String[]{"favorite_number", "int", ""},
new String[]{"favorite_color", "string", ""},
new String[]{"scale", "double", ""},
new String[]{"", null, null},
new String[]{"# Detailed Table Information", null, null},
new String[]{"Location:", "hdfs://mycluster:8020/path/to/users", null}
};
private static final String[] DESC_NEW_TABLE_COLUMN_NAMES = DESC_USERS_TABLE_COLUMN_NAMES;
private static final String[][] DESC_NEW_TABLE_RESULTSET = new String[][]{
new String[]{"# col_name", "data_type", "comment"},
new String[]{"name", "string", ""},
new String[]{"favorite_number", "int", ""},
new String[]{"favorite_color", "string", ""},
new String[]{"scale", "double", ""},
new String[]{"", null, null},
new String[]{"# Detailed Table Information", null, null},
new String[]{"Location:", "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/_newTable", null}
};
private TestRunner runner;
private MockUpdateHiveTable processor;
@BeforeEach
public void setUp() {
Configuration testConf = new Configuration();
testConf.addResource(new Path(TEST_CONF_PATH));
// Delete any temp files from previous tests
try {
FileUtils.deleteDirectory(new File(TARGET_HIVE));
} catch (IOException ioe) {
// Do nothing, directory may not have existed
}
processor = new MockUpdateHiveTable();
}
private void configure(final UpdateHiveTable processor, final int numUsers) throws InitializationException {
configure(processor, numUsers, false, -1);
}
private void configure(final UpdateHiveTable processor, final int numUsers, boolean failOnCreateReader, int failAfter) throws InitializationException {
configure(processor, numUsers, failOnCreateReader, failAfter, null);
}
private void configure(final UpdateHiveTable processor, final int numUsers, final boolean failOnCreateReader, final int failAfter,
final BiFunction<Integer, MockRecordParser, Void> recordGenerator) throws InitializationException {
runner = TestRunners.newTestRunner(processor);
MockRecordParser readerFactory = new MockRecordParser() {
@Override
public RecordReader createRecordReader(Map<String, String> variables, InputStream in, long inputLength, ComponentLog logger) throws IOException, SchemaNotFoundException {
if (failOnCreateReader) {
throw new SchemaNotFoundException("test");
}
return super.createRecordReader(variables, in, inputLength, logger);
}
};
List<RecordField> fields = Arrays.asList(
new RecordField("name", RecordFieldType.STRING.getDataType()),
new RecordField("favorite_number", RecordFieldType.INT.getDataType()),
new RecordField("favorite_color", RecordFieldType.STRING.getDataType()),
new RecordField("scale", RecordFieldType.DOUBLE.getDataType())
);
final SimpleRecordSchema recordSchema = new SimpleRecordSchema(fields);
for (final RecordField recordField : recordSchema.getFields()) {
readerFactory.addSchemaField(recordField.getFieldName(), recordField.getDataType().getFieldType(), recordField.isNullable());
}
if (recordGenerator == null) {
for (int i = 0; i < numUsers; i++) {
readerFactory.addRecord("name" + i, i, "blue" + i, i * 10.0);
}
} else {
recordGenerator.apply(numUsers, readerFactory);
}
readerFactory.failAfter(failAfter);
runner.addControllerService("mock-reader-factory", readerFactory);
runner.enableControllerService(readerFactory);
runner.setProperty(UpdateHiveTable.RECORD_READER, "mock-reader-factory");
}
@Test
public void testSetup(@TempDir java.nio.file.Path tempDir) throws Exception {
configure(processor, 0);
runner.assertNotValid();
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockHiveConnectionPool(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHiveTable.HIVE_DBCP_SERVICE, "dbcp");
runner.assertNotValid();
runner.setProperty(UpdateHiveTable.TABLE_NAME, "users");
runner.assertValid();
runner.run();
}
@Test
public void testNoStatementsExecuted() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHiveTable.TABLE_NAME, "users");
final MockHiveConnectionPool service = new MockHiveConnectionPool("test");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHiveTable.HIVE_DBCP_SERVICE, "dbcp");
runner.setProperty(UpdateHiveTable.PARTITION_CLAUSE, "continent, country");
HashMap<String,String> attrs = new HashMap<>();
attrs.put("continent", "Asia");
attrs.put("country", "China");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHiveTable.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHiveTable.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_TABLE, "users");
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/users");
assertTrue(service.getExecutedStatements().isEmpty());
}
@Test
public void testCreateManagedTable() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHiveTable.TABLE_NAME, "${table.name}");
runner.setProperty(UpdateHiveTable.CREATE_TABLE, UpdateHiveTable.CREATE_IF_NOT_EXISTS);
runner.setProperty(UpdateHiveTable.TABLE_STORAGE_FORMAT, UpdateHiveTable.PARQUET);
final MockHiveConnectionPool service = new MockHiveConnectionPool("_newTable");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHiveTable.HIVE_DBCP_SERVICE, "dbcp");
Map<String, String> attrs = new HashMap<>();
attrs.put("db.name", "default");
attrs.put("table.name", "_newTable");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHiveTable.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHiveTable.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_TABLE, "_newTable");
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/_newTable");
List<String> statements = service.getExecutedStatements();
assertEquals(1, statements.size());
assertEquals("CREATE TABLE IF NOT EXISTS `_newTable` (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE) STORED AS PARQUET",
statements.get(0));
}
@Test
public void testCreateManagedTableWithPartition() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHiveTable.TABLE_NAME, "${table.name}");
runner.setProperty(UpdateHiveTable.CREATE_TABLE, UpdateHiveTable.CREATE_IF_NOT_EXISTS);
runner.setProperty(UpdateHiveTable.PARTITION_CLAUSE, "age int");
runner.setProperty(UpdateHiveTable.TABLE_STORAGE_FORMAT, UpdateHiveTable.PARQUET);
final MockHiveConnectionPool service = new MockHiveConnectionPool("_newTable");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHiveTable.HIVE_DBCP_SERVICE, "dbcp");
Map<String, String> attrs = new HashMap<>();
attrs.put("db.name", "default");
attrs.put("table.name", "_newTable");
attrs.put("age", "23");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHiveTable.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHiveTable.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_TABLE, "_newTable");
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/_newTable");
List<String> statements = service.getExecutedStatements();
assertEquals(1, statements.size());
assertEquals("CREATE TABLE IF NOT EXISTS `_newTable` (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE) PARTITIONED BY (`age` int) STORED AS PARQUET",
statements.get(0));
}
@Test
public void testCreateExternalTable() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHiveTable.TABLE_NAME, "${table.name}");
runner.setProperty(UpdateHiveTable.CREATE_TABLE, UpdateHiveTable.CREATE_IF_NOT_EXISTS);
runner.setProperty(UpdateHiveTable.TABLE_MANAGEMENT_STRATEGY, UpdateHiveTable.EXTERNAL_TABLE);
runner.setProperty(UpdateHiveTable.TABLE_STORAGE_FORMAT, UpdateHiveTable.PARQUET);
final MockHiveConnectionPool service = new MockHiveConnectionPool("ext_users");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHiveTable.HIVE_DBCP_SERVICE, "dbcp");
runner.assertNotValid(); // Needs location specified
runner.setProperty(UpdateHiveTable.EXTERNAL_TABLE_LOCATION, "/path/to/users");
runner.assertValid();
Map<String, String> attrs = new HashMap<>();
attrs.put("db.name", "default");
attrs.put("table.name", "ext_users");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHiveTable.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHiveTable.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_TABLE, "ext_users");
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/path/to/users");
List<String> statements = service.getExecutedStatements();
assertEquals(1, statements.size());
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS `ext_users` (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE) STORED AS PARQUET "
+ "LOCATION '/path/to/users'",
statements.get(0));
}
@Test
public void testAddColumnsAndPartition() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHiveTable.TABLE_NAME, "messages");
final MockHiveConnectionPool service = new MockHiveConnectionPool("test");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHiveTable.HIVE_DBCP_SERVICE, "dbcp");
runner.setProperty(UpdateHiveTable.PARTITION_CLAUSE, "continent, country");
HashMap<String,String> attrs = new HashMap<>();
attrs.put("continent", "Asia");
attrs.put("country", "China");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHiveTable.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHiveTable.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_TABLE, "messages");
flowFile.assertAttributeEquals(UpdateHiveTable.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/messages/continent=Asia/country=China");
List<String> statements = service.getExecutedStatements();
assertEquals(2, statements.size());
// All columns from users table/data should be added to the table, and a new partition should be added
assertEquals("ALTER TABLE `messages` ADD COLUMNS (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE)",
statements.get(0));
assertEquals("ALTER TABLE `messages` ADD IF NOT EXISTS PARTITION (`continent`='Asia', `country`='China')",
statements.get(1));
}
@Test
public void testMissingPartitionValues() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHiveTable.TABLE_NAME, "messages");
final DBCPService service = new MockHiveConnectionPool("test");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHiveTable.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue(new byte[0]);
runner.run();
runner.assertTransferCount(UpdateHiveTable.REL_SUCCESS, 0);
runner.assertTransferCount(UpdateHiveTable.REL_FAILURE, 1);
}
private static final class MockUpdateHiveTable extends UpdateHiveTable {
}
/**
* Simple implementation only for testing purposes
*/
private static class MockHiveConnectionPool extends AbstractControllerService implements HiveDBCPService {
private final String dbLocation;
private final List<String> executedStatements = new ArrayList<>();
MockHiveConnectionPool(final String dbLocation) {
this.dbLocation = dbLocation;
}
@Override
public String getIdentifier() {
return "dbcp";
}
@Override
public Connection getConnection() throws ProcessException {
try {
Connection conn = mock(Connection.class);
Statement s = mock(Statement.class);
when(conn.createStatement()).thenReturn(s);
when(s.executeQuery(anyString())).thenAnswer((Answer<ResultSet>) invocation -> {
final String query = invocation.getArgument(0);
if ("SHOW TABLES".equals(query)) {
return new MockResultSet(SHOW_TABLES_COLUMN_NAMES, SHOW_TABLES_RESULTSET).createResultSet();
} else if ("DESC FORMATTED `messages`".equals(query)) {
return new MockResultSet(DESC_MESSAGES_TABLE_COLUMN_NAMES, DESC_MESSAGES_TABLE_RESULTSET).createResultSet();
} else if ("DESC FORMATTED `users`".equals(query)) {
return new MockResultSet(DESC_USERS_TABLE_COLUMN_NAMES, DESC_USERS_TABLE_RESULTSET).createResultSet();
} else if ("DESC FORMATTED `ext_users`".equals(query)) {
return new MockResultSet(DESC_USERS_TABLE_COLUMN_NAMES, DESC_EXTERNAL_USERS_TABLE_RESULTSET).createResultSet();
} else if ("DESC FORMATTED `_newTable`".equals(query)) {
return new MockResultSet(DESC_NEW_TABLE_COLUMN_NAMES, DESC_NEW_TABLE_RESULTSET).createResultSet();
} else {
return new MockResultSet(new String[]{}, new String[][]{new String[]{}}).createResultSet();
}
});
when(s.execute(anyString())).thenAnswer((Answer<Boolean>) invocation -> {
executedStatements.add(invocation.getArgument(0));
return false;
});
return conn;
} catch (final Exception e) {
e.printStackTrace();
throw new ProcessException("getConnection failed: " + e);
}
}
@Override
public String getConnectionURL() {
return "jdbc:fake:" + dbLocation;
}
List<String> getExecutedStatements() {
return executedStatements;
}
}
private static class MockResultSet {
String[] colNames;
String[][] data;
int currentRow;
MockResultSet(String[] colNames, String[][] data) {
this.colNames = colNames;
this.data = data;
currentRow = 0;
}
ResultSet createResultSet() throws SQLException {
ResultSet rs = mock(ResultSet.class);
when(rs.next()).thenAnswer((Answer<Boolean>) invocation -> (data != null) && (++currentRow <= data.length));
when(rs.getString(anyInt())).thenAnswer((Answer<String>) invocation -> {
final int index = invocation.getArgument(0);
if (index < 1) {
throw new SQLException("Columns start with index 1");
}
if (currentRow > data.length) {
throw new SQLException("This result set is already closed");
}
return data[currentRow - 1][index - 1];
});
return rs;
}
}
}
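As a point of reference, the statement-capturing technique used by MockHiveConnectionPool above can be reduced to a few lines of Mockito. The sketch below is illustrative only; the helper class and method names are hypothetical and are not part of the deleted test, and nothing beyond java.sql and Mockito is assumed.
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import org.mockito.stubbing.Answer;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
// Hypothetical helper: returns a mocked JDBC Connection whose Statement appends every
// string passed to execute() into the supplied list, so a test can assert on the exact DDL issued.
final class StatementCapturingConnection {
    static Connection create(final List<String> executedStatements) throws SQLException {
        final Connection connection = mock(Connection.class);
        final Statement statement = mock(Statement.class);
        when(connection.createStatement()).thenReturn(statement);
        when(statement.execute(anyString())).thenAnswer((Answer<Boolean>) invocation -> {
            executedStatements.add(invocation.getArgument(0)); // record the DDL text
            return false;                                      // execute() produced no ResultSet
        });
        return connection;
    }
}
In the test above, the same idea is wrapped in a HiveDBCPService implementation so the processor under test sees an ordinary controller service.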

View File

@ -1,137 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import com.google.common.util.concurrent.UncheckedExecutionException;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hive.hcatalog.streaming.HiveEndPoint;
import org.apache.hive.hcatalog.streaming.InvalidTable;
import org.apache.hive.hcatalog.streaming.RecordWriter;
import org.apache.hive.hcatalog.streaming.StreamingConnection;
import org.apache.hive.hcatalog.streaming.StreamingException;
import org.apache.hive.hcatalog.streaming.TransactionBatch;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.stubbing.Answer;
import java.io.IOException;
import java.lang.reflect.UndeclaredThrowableException;
import java.security.PrivilegedExceptionAction;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyLong;
import static org.mockito.ArgumentMatchers.isA;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
public class HiveWriterTest {
private HiveEndPoint hiveEndPoint;
private int txnsPerBatch;
private boolean autoCreatePartitions;
private int callTimeout;
private ExecutorService executorService;
private UserGroupInformation userGroupInformation;
private HiveConf hiveConf;
private HiveWriter hiveWriter;
private StreamingConnection streamingConnection;
private RecordWriter recordWriter;
private Callable<RecordWriter> recordWriterCallable;
private TransactionBatch transactionBatch;
@BeforeEach
public void setup() throws Exception {
hiveEndPoint = mock(HiveEndPoint.class);
txnsPerBatch = 100;
autoCreatePartitions = true;
callTimeout = 0;
executorService = mock(ExecutorService.class);
streamingConnection = mock(StreamingConnection.class);
transactionBatch = mock(TransactionBatch.class);
userGroupInformation = mock(UserGroupInformation.class);
hiveConf = mock(HiveConf.class);
recordWriter = mock(RecordWriter.class);
recordWriterCallable = mock(Callable.class);
when(recordWriterCallable.call()).thenReturn(recordWriter);
when(hiveEndPoint.newConnection(autoCreatePartitions, hiveConf, userGroupInformation)).thenReturn(streamingConnection);
when(streamingConnection.fetchTransactionBatch(txnsPerBatch, recordWriter)).thenReturn(transactionBatch);
when(executorService.submit(isA(Callable.class))).thenAnswer(invocation -> {
Future future = mock(Future.class);
Answer<Object> answer = i -> ((Callable) invocation.getArguments()[0]).call();
when(future.get()).thenAnswer(answer);
when(future.get(anyLong(), any(TimeUnit.class))).thenAnswer(answer);
return future;
});
when(userGroupInformation.doAs(isA(PrivilegedExceptionAction.class))).thenAnswer(invocation -> {
try {
try {
return ((PrivilegedExceptionAction) invocation.getArguments()[0]).run();
} catch (UncheckedExecutionException e) {
// Creation of the strict JSON writer will fail due to external dependencies; this gives us a chance to catch it
for (StackTraceElement stackTraceElement : e.getStackTrace()) {
if (stackTraceElement.toString().startsWith("org.apache.hive.hcatalog.streaming.StrictJsonWriter.<init>(")) {
return recordWriterCallable.call();
}
}
throw e;
}
} catch (IOException | Error | RuntimeException | InterruptedException e) {
throw e;
} catch (Throwable e) {
throw new UndeclaredThrowableException(e);
}
});
initWriter();
}
private void initWriter() throws Exception {
hiveWriter = new HiveWriter(hiveEndPoint, txnsPerBatch, autoCreatePartitions, callTimeout, executorService, userGroupInformation, hiveConf);
}
@Test
public void testNormal() {
assertNotNull(hiveWriter);
}
@Test
public void testNewConnectionInvalidTable() throws Exception {
hiveEndPoint = mock(HiveEndPoint.class);
InvalidTable invalidTable = new InvalidTable("badDb", "badTable");
when(hiveEndPoint.newConnection(autoCreatePartitions, hiveConf, userGroupInformation)).thenThrow(invalidTable);
HiveWriter.ConnectFailure e = assertThrows(HiveWriter.ConnectFailure.class, () -> initWriter());
assertEquals(invalidTable, e.getCause());
}
@Test
public void testRecordWriterStreamingException() throws Exception {
recordWriterCallable = mock(Callable.class);
StreamingException streamingException = new StreamingException("Test Exception");
when(recordWriterCallable.call()).thenThrow(streamingException);
HiveWriter.ConnectFailure e = assertThrows(HiveWriter.ConnectFailure.class, () -> initWriter());
assertEquals(streamingException, e.getCause());
}
}

View File

@ -1,467 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.orc;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObject;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.junit.jupiter.api.Test;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Unit tests for the NiFiOrcUtils helper class
*/
public class TestNiFiOrcUtils {
@Test
public void test_getOrcField_primitive() throws Exception {
// Expected ORC types
TypeInfo[] expectedTypes = {
TypeInfoFactory.getPrimitiveTypeInfo("int"),
TypeInfoFactory.getPrimitiveTypeInfo("bigint"),
TypeInfoFactory.getPrimitiveTypeInfo("boolean"),
TypeInfoFactory.getPrimitiveTypeInfo("float"),
TypeInfoFactory.getPrimitiveTypeInfo("double"),
TypeInfoFactory.getPrimitiveTypeInfo("binary"),
TypeInfoFactory.getPrimitiveTypeInfo("string")
};
// Build a fake Avro record with all types
Schema testSchema = buildPrimitiveAvroSchema();
List<Schema.Field> fields = testSchema.getFields();
for (int i = 0; i < fields.size(); i++) {
assertEquals(expectedTypes[i], NiFiOrcUtils.getOrcField(fields.get(i).schema()));
}
}
@Test
public void test_getOrcField_union_optional_type() throws Exception {
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("testRecord").namespace("any.data").fields();
builder.name("union").type().unionOf().nullBuilder().endNull().and().booleanType().endUnion().noDefault();
Schema testSchema = builder.endRecord();
TypeInfo orcType = NiFiOrcUtils.getOrcField(testSchema.getField("union").schema());
assertEquals(TypeInfoCreator.createBoolean(), orcType);
}
@Test
public void test_getOrcField_union() throws Exception {
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("testRecord").namespace("any.data").fields();
builder.name("union").type().unionOf().intType().and().booleanType().endUnion().noDefault();
Schema testSchema = builder.endRecord();
TypeInfo orcType = NiFiOrcUtils.getOrcField(testSchema.getField("union").schema());
assertEquals(
TypeInfoFactory.getUnionTypeInfo(Arrays.asList(
TypeInfoCreator.createInt(),
TypeInfoCreator.createBoolean())),
orcType);
}
@Test
public void test_getOrcField_map() throws Exception {
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("testRecord").namespace("any.data").fields();
builder.name("map").type().map().values().doubleType().noDefault();
Schema testSchema = builder.endRecord();
TypeInfo orcType = NiFiOrcUtils.getOrcField(testSchema.getField("map").schema());
assertEquals(
TypeInfoFactory.getMapTypeInfo(
TypeInfoCreator.createString(),
TypeInfoCreator.createDouble()),
orcType);
}
@Test
public void test_getOrcField_nested_map() throws Exception {
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("testRecord").namespace("any.data").fields();
builder.name("map").type().map().values().map().values().doubleType().noDefault();
Schema testSchema = builder.endRecord();
TypeInfo orcType = NiFiOrcUtils.getOrcField(testSchema.getField("map").schema());
assertEquals(
TypeInfoFactory.getMapTypeInfo(TypeInfoCreator.createString(),
TypeInfoFactory.getMapTypeInfo(TypeInfoCreator.createString(), TypeInfoCreator.createDouble())),
orcType);
}
@Test
public void test_getOrcField_array() throws Exception {
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("testRecord").namespace("any.data").fields();
builder.name("array").type().array().items().longType().noDefault();
Schema testSchema = builder.endRecord();
TypeInfo orcType = NiFiOrcUtils.getOrcField(testSchema.getField("array").schema());
assertEquals(
TypeInfoFactory.getListTypeInfo(TypeInfoCreator.createLong()),
orcType);
}
@Test
public void test_getOrcField_complex_array() throws Exception {
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("testRecord").namespace("any.data").fields();
builder.name("array").type().array().items().map().values().floatType().noDefault();
Schema testSchema = builder.endRecord();
TypeInfo orcType = NiFiOrcUtils.getOrcField(testSchema.getField("array").schema());
assertEquals(
TypeInfoFactory.getListTypeInfo(TypeInfoFactory.getMapTypeInfo(TypeInfoCreator.createString(), TypeInfoCreator.createFloat())),
orcType);
}
@Test
public void test_getOrcField_record() throws Exception {
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("testRecord").namespace("any.data").fields();
builder.name("int").type().intType().noDefault();
builder.name("long").type().longType().longDefault(1L);
builder.name("array").type().array().items().stringType().noDefault();
Schema testSchema = builder.endRecord();
TypeInfo orcType = NiFiOrcUtils.getOrcField(testSchema);
assertEquals(
TypeInfoFactory.getStructTypeInfo(
Arrays.asList("int", "long", "array"),
Arrays.asList(
TypeInfoCreator.createInt(),
TypeInfoCreator.createLong(),
TypeInfoFactory.getListTypeInfo(TypeInfoCreator.createString()))),
orcType);
}
@Test
public void test_getOrcField_enum() throws Exception {
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("testRecord").namespace("any.data").fields();
builder.name("enumField").type().enumeration("enum").symbols("a", "b", "c").enumDefault("a");
Schema testSchema = builder.endRecord();
TypeInfo orcType = NiFiOrcUtils.getOrcField(testSchema.getField("enumField").schema());
assertEquals(TypeInfoCreator.createString(), orcType);
}
@Test
public void test_getPrimitiveOrcTypeFromPrimitiveAvroType() throws Exception {
// Expected ORC types
TypeInfo[] expectedTypes = {
TypeInfoCreator.createInt(),
TypeInfoCreator.createLong(),
TypeInfoCreator.createBoolean(),
TypeInfoCreator.createFloat(),
TypeInfoCreator.createDouble(),
TypeInfoCreator.createBinary(),
TypeInfoCreator.createString(),
};
Schema testSchema = buildPrimitiveAvroSchema();
List<Schema.Field> fields = testSchema.getFields();
for (int i = 0; i < fields.size(); i++) {
assertEquals(expectedTypes[i], NiFiOrcUtils.getPrimitiveOrcTypeFromPrimitiveAvroType(fields.get(i).schema().getType()));
}
}
@Test
public void test_getPrimitiveOrcTypeFromPrimitiveAvroType_badType() throws Exception {
Schema.Type nonPrimitiveType = Schema.Type.ARRAY;
assertThrows(IllegalArgumentException.class, () -> NiFiOrcUtils.getPrimitiveOrcTypeFromPrimitiveAvroType(nonPrimitiveType));
}
@Test
public void test_getWritable() throws Exception {
assertTrue(NiFiOrcUtils.convertToORCObject(null, 1) instanceof IntWritable);
assertTrue(NiFiOrcUtils.convertToORCObject(null, 1L) instanceof LongWritable);
assertTrue(NiFiOrcUtils.convertToORCObject(null, 1.0f) instanceof FloatWritable);
assertTrue(NiFiOrcUtils.convertToORCObject(null, 1.0) instanceof DoubleWritable);
assertTrue(NiFiOrcUtils.convertToORCObject(null, BigDecimal.valueOf(1.0D)) instanceof HiveDecimalWritable);
assertTrue(NiFiOrcUtils.convertToORCObject(null, new int[]{1, 2, 3}) instanceof List);
assertTrue(NiFiOrcUtils.convertToORCObject(null, Arrays.asList(1, 2, 3)) instanceof List);
Map<String, Float> map = new HashMap<>();
map.put("Hello", 1.0f);
map.put("World", 2.0f);
Object convMap = NiFiOrcUtils.convertToORCObject(TypeInfoUtils.getTypeInfoFromTypeString("map<string,float>"), map);
assertTrue(convMap instanceof Map);
((Map) convMap).forEach((key, value) -> {
assertTrue(key instanceof Text);
assertTrue(value instanceof FloatWritable);
});
}
@Test
public void test_getHiveTypeFromAvroType_primitive() throws Exception {
// Expected Hive types
String[] expectedTypes = {
"INT",
"BIGINT",
"BOOLEAN",
"FLOAT",
"DOUBLE",
"BINARY",
"STRING",
};
Schema testSchema = buildPrimitiveAvroSchema();
List<Schema.Field> fields = testSchema.getFields();
for (int i = 0; i < fields.size(); i++) {
assertEquals(expectedTypes[i], NiFiOrcUtils.getHiveTypeFromAvroType(fields.get(i).schema()));
}
}
@Test
public void test_getHiveTypeFromAvroType_complex() throws Exception {
// Expected Hive types
String[] expectedTypes = {
"INT",
"MAP<STRING, DOUBLE>",
"STRING",
"UNIONTYPE<BIGINT, FLOAT>",
"ARRAY<INT>",
"DECIMAL(10,2)"
};
Schema testSchema = buildComplexAvroSchema();
List<Schema.Field> fields = testSchema.getFields();
for (int i = 0; i < fields.size(); i++) {
assertEquals(expectedTypes[i], NiFiOrcUtils.getHiveTypeFromAvroType(fields.get(i).schema()));
}
assertEquals("STRUCT<myInt:INT, myMap:MAP<STRING, DOUBLE>, myEnum:STRING, myLongOrFloat:UNIONTYPE<BIGINT, FLOAT>, myIntList:ARRAY<INT>, myDecimal:DECIMAL(10,2)>",
NiFiOrcUtils.getHiveTypeFromAvroType(testSchema));
}
@Test
public void test_generateHiveDDL_primitive() throws Exception {
Schema avroSchema = buildPrimitiveAvroSchema();
String ddl = NiFiOrcUtils.generateHiveDDL(avroSchema, "myHiveTable");
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS myHiveTable (int INT, long BIGINT, boolean BOOLEAN, float FLOAT, double DOUBLE, bytes BINARY, string STRING)"
+ " STORED AS ORC", ddl);
}
@Test
public void test_generateHiveDDL_complex() throws Exception {
Schema avroSchema = buildComplexAvroSchema();
String ddl = NiFiOrcUtils.generateHiveDDL(avroSchema, "myHiveTable");
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS myHiveTable "
+ "(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>, myDecimal DECIMAL(10,2))"
+ " STORED AS ORC", ddl);
}
@Test
public void test_convertToORCObject() {
Schema schema = SchemaBuilder.enumeration("myEnum").symbols("x", "y", "z");
List<Object> objects = Arrays.asList(new Utf8("Hello"), new GenericData.EnumSymbol(schema, "x"));
objects.forEach((avroObject) -> {
Object o = NiFiOrcUtils.convertToORCObject(TypeInfoUtils.getTypeInfoFromTypeString("uniontype<bigint,string>"), avroObject);
assertTrue(o instanceof UnionObject);
UnionObject uo = (UnionObject) o;
assertTrue(uo.getObject() instanceof Text);
});
}
@Test
public void test_convertToORCObjectBadUnion() {
assertThrows(IllegalArgumentException.class, () -> NiFiOrcUtils.convertToORCObject(TypeInfoUtils.getTypeInfoFromTypeString("uniontype<bigint,long>"), "Hello"));
}
//////////////////
// Helper methods
//////////////////
public static Schema buildPrimitiveAvroSchema() {
// Build a fake Avro record with all primitive types
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("test.record").namespace("any.data").fields();
builder.name("int").type().intType().noDefault();
builder.name("long").type().longType().longDefault(1L);
builder.name("boolean").type().booleanType().booleanDefault(true);
builder.name("float").type().floatType().floatDefault(0.0f);
builder.name("double").type().doubleType().doubleDefault(0.0);
builder.name("bytes").type().bytesType().noDefault();
builder.name("string").type().stringType().stringDefault("default");
return builder.endRecord();
}
public static Schema buildAvroSchemaWithNull() {
// Build a fake Avro record which contains null
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("test.record").namespace("any.data").fields();
builder.name("string").type().stringType().stringDefault("default");
builder.name("null").type().nullType().noDefault();
return builder.endRecord();
}
public static Schema buildAvroSchemaWithEmptyArray() {
// Build a fake Avro record which contains empty array
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("test.record").namespace("any.data").fields();
builder.name("string").type().stringType().stringDefault("default");
builder.name("emptyArray").type().array().items().nullType().noDefault();
return builder.endRecord();
}
public static Schema buildAvroSchemaWithFixed() {
// Build a fake Avro record which contains a fixed field
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("test.record").namespace("any.data").fields();
builder.name("fixed").type().fixed("fixedField").size(6).fixedDefault("123456");
return builder.endRecord();
}
public static GenericData.Record buildPrimitiveAvroRecord(int i, long l, boolean b, float f, double d, ByteBuffer bytes, String string) {
Schema schema = buildPrimitiveAvroSchema();
GenericData.Record row = new GenericData.Record(schema);
row.put("int", i);
row.put("long", l);
row.put("boolean", b);
row.put("float", f);
row.put("double", d);
row.put("bytes", bytes);
row.put("string", string);
return row;
}
public static TypeInfo buildPrimitiveOrcSchema() {
return TypeInfoFactory.getStructTypeInfo(Arrays.asList("int", "long", "boolean", "float", "double", "bytes", "string"),
Arrays.asList(
TypeInfoCreator.createInt(),
TypeInfoCreator.createLong(),
TypeInfoCreator.createBoolean(),
TypeInfoCreator.createFloat(),
TypeInfoCreator.createDouble(),
TypeInfoCreator.createBinary(),
TypeInfoCreator.createString()));
}
public static Schema buildComplexAvroSchema() {
// Build a fake Avro record with nested types
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("complex.record").namespace("any.data").fields();
builder.name("myInt").type().unionOf().nullType().and().intType().endUnion().nullDefault();
builder.name("myMap").type().map().values().doubleType().noDefault();
builder.name("myEnum").type().enumeration("myEnum").symbols("ABC", "DEF", "XYZ").enumDefault("ABC");
builder.name("myLongOrFloat").type().unionOf().longType().and().floatType().endUnion().noDefault();
builder.name("myIntList").type().array().items().intType().noDefault();
builder.name("myDecimal").type().bytesBuilder()
.prop("logicalType", "decimal")
.prop("precision", "10")
.prop("scale", "2")
.endBytes().noDefault();
return builder.endRecord();
}
public static GenericData.Record buildComplexAvroRecord(Integer i, Map<String, Double> m, String e, Object unionVal, List<Integer> intArray, ByteBuffer decimal) {
Schema schema = buildComplexAvroSchema();
Schema enumSchema = schema.getField("myEnum").schema();
GenericData.Record row = new GenericData.Record(schema);
row.put("myInt", i);
row.put("myMap", m);
row.put("myEnum", new GenericData.EnumSymbol(enumSchema, e));
row.put("myLongOrFloat", unionVal);
row.put("myIntList", intArray);
row.put("myDecimal", decimal);
return row;
}
public static GenericData.Record buildAvroRecordWithNull(String string) {
Schema schema = buildAvroSchemaWithNull();
GenericData.Record row = new GenericData.Record(schema);
row.put("string", string);
row.put("null", null);
return row;
}
public static GenericData.Record buildAvroRecordWithEmptyArray(String string) {
Schema schema = buildAvroSchemaWithEmptyArray();
GenericData.Record row = new GenericData.Record(schema);
row.put("string", string);
row.put("emptyArray", Collections.emptyList());
return row;
}
public static GenericData.Record buildAvroRecordWithFixed(String string) {
Schema schema = buildAvroSchemaWithFixed();
GenericData.Record row = new GenericData.Record(schema);
row.put("fixed", new GenericData.Fixed(schema, string.getBytes(StandardCharsets.UTF_8)));
return row;
}
public static TypeInfo buildComplexOrcSchema() {
return TypeInfoUtils.getTypeInfoFromTypeString("struct<myInt:int,myMap:map<string,double>,myEnum:string,myLongOrFloat:uniontype<int>,myIntList:array<int>,myDecimal:decimal(10,2)>");
}
public static Schema buildNestedComplexAvroSchema() {
// Build a fake Avro record with nested complex types
final SchemaBuilder.FieldAssembler<Schema> builder = SchemaBuilder.record("nested.complex.record").namespace("any.data").fields();
builder.name("myMapOfArray").type().map().values().array().items().doubleType().noDefault();
builder.name("myArrayOfMap").type().array().items().map().values().stringType().noDefault();
return builder.endRecord();
}
public static GenericData.Record buildNestedComplexAvroRecord(Map<String, List<Double>> m, List<Map<String, String>> a) {
Schema schema = buildNestedComplexAvroSchema();
GenericData.Record row = new GenericData.Record(schema);
row.put("myMapOfArray", m);
row.put("myArrayOfMap", a);
return row;
}
public static TypeInfo buildNestedComplexOrcSchema() {
return TypeInfoUtils.getTypeInfoFromTypeString("struct<myMapOfArray:map<string,array<double>>,myArrayOfMap:array<map<string,string>>>");
}
private static class TypeInfoCreator {
static TypeInfo createInt() {
return TypeInfoFactory.getPrimitiveTypeInfo("int");
}
static TypeInfo createLong() {
return TypeInfoFactory.getPrimitiveTypeInfo("bigint");
}
static TypeInfo createBoolean() {
return TypeInfoFactory.getPrimitiveTypeInfo("boolean");
}
static TypeInfo createFloat() {
return TypeInfoFactory.getPrimitiveTypeInfo("float");
}
static TypeInfo createDouble() {
return TypeInfoFactory.getPrimitiveTypeInfo("double");
}
static TypeInfo createBinary() {
return TypeInfoFactory.getPrimitiveTypeInfo("binary");
}
static TypeInfo createString() {
return TypeInfoFactory.getPrimitiveTypeInfo("string");
}
}
}
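For orientation, here is a minimal usage sketch of the helper these tests exercise, assuming Avro and the (now removed) NiFiOrcUtils class are on the classpath; the schema string, class name, and table name are made up for illustration.
import org.apache.avro.Schema;
import org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils;
public class OrcDdlSketch {
    public static void main(String[] args) {
        // Minimal Avro record schema with two primitive fields
        final String avroJson = "{\"type\":\"record\",\"name\":\"example\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"int\"},"
                + "{\"name\":\"msg\",\"type\":\"string\"}]}";
        final Schema schema = new Schema.Parser().parse(avroJson);
        // Per the assertions above, the generated DDL should resemble:
        // CREATE EXTERNAL TABLE IF NOT EXISTS example_table (id INT, msg STRING) STORED AS ORC
        System.out.println(NiFiOrcUtils.generateHiveDDL(schema, "example_table"));
    }
}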

View File

@ -1,38 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace" : "org.apache.nifi",
"name" : "outer_record",
"type" : "record",
"fields" : [ {
"name" : "records",
"type" : {
"type" : "array",
"items" : {
"type" : "record",
"name" : "inner_record",
"fields" : [ {
"name" : "name",
"type" : "string"
}, {
"name" : "age",
"type" : "int"
} ]
}
}
} ]
}

View File

@ -1,30 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://hive</value>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>kerberos</value>
</property>
<property>
<name>hadoop.security.authorization</name>
<value>true</value>
</property>
</configuration>
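A small sketch of how a test might consume a secured core-site resource like the one above, assuming it is saved under a hypothetical path such as src/test/resources/core-site-security.xml; only standard Hadoop APIs are used, and the class name is made up.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;
public class SecurityConfigCheck {
    public static void main(String[] args) {
        // Load only the test resource (path is hypothetical), skipping the default resources
        final Configuration conf = new Configuration(false);
        conf.addResource(new Path("src/test/resources/core-site-security.xml"));
        // Make UserGroupInformation honour hadoop.security.authentication=kerberos
        UserGroupInformation.setConfiguration(conf);
        System.out.println("Kerberos enabled: " + UserGroupInformation.isSecurityEnabled());
    }
}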

View File

@ -1,22 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://hive</value>
</property>
</configuration>

View File

@ -1,30 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://hive</value>
</property>
<property>
<name>hive.server2.authentication</name>
<value>KERBEROS</value>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>kerberos</value>
</property>
</configuration>

View File

@ -1,22 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>fs.default.name</name>
<value>file:///</value>
</property>
</configuration>

View File

@ -1,10 +0,0 @@
[libdefaults]
default_realm = EXAMPLE.COM
dns_lookup_kdc = false
dns_lookup_realm = false
[realms]
EXAMPLE.COM = {
kdc = kerberos.example.com
admin_server = kerberos.example.com
}

View File

@ -1,26 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{"namespace": "example.avro",
"type": "record",
"name": "User",
"fields": [
{"name": "name", "type": "string"},
{"name": "favorite_number", "type": ["int", "null"]},
{"name": "favorite_color", "type": ["string", "null"]},
{"name": "scale", "type": ["double", "null"]}
]
}

View File

@ -1,31 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.dbcp.hive;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.Tags;
/**
* Definition for Hive 1.1 Database Connection Pooling Service.
*
*/
@Tags({"hive", "dbcp", "jdbc", "database", "connection", "pooling", "store"})
@CapabilityDescription("Provides a Database Connection Pooling Service for Apache Hive 1.1.x. Connections can be requested from the pool and returned after usage.")
public interface Hive_1_1DBCPService extends HiveDBCPService {
public String getConnectionURL();
}
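For context, a minimal, hypothetical caller of such a service, assuming only the java.sql API and a Hive_1_1DBCPService reference obtained elsewhere (for example from a processor property). This is an illustrative sketch, not code from the removed bundle, and the class and method names are invented.
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
class HiveQuerySketch {
    // Borrows a pooled connection, lists the tables, and lets
    // try-with-resources close (i.e. return) everything afterwards.
    static void printTables(final Hive_1_1DBCPService dbcpService) throws SQLException {
        try (Connection connection = dbcpService.getConnection();
             Statement statement = connection.createStatement();
             ResultSet tables = statement.executeQuery("SHOW TABLES")) {
            while (tables.next()) {
                System.out.println(tables.getString(1));
            }
        }
    }
}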

View File

@ -1,47 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-bundle</artifactId>
<version>2.0.0-SNAPSHOT</version>
</parent>
<artifactId>nifi-hive_1_1-nar</artifactId>
<packaging>nar</packaging>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
<source.skip>true</source.skip>
<!-- Need to override hadoop.version here, for Hive and hadoop-client transitive dependencies -->
<hadoop.version>${hive11.hadoop.version}</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-services-api-nar</artifactId>
<version>2.0.0-SNAPSHOT</version>
<type>nar</type>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive_1_1-processors</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
</dependencies>
</project>

View File

@ -1,329 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
APACHE NIFI SUBCOMPONENTS:
The Apache NiFi project contains subcomponents with separate copyright
notices and license terms. Your use of the source code for these
subcomponents is subject to the terms and conditions of the following
licenses.
The binary distribution of this product bundles 'Bouncy Castle JDK 1.5'
under an MIT style license.
Copyright (c) 2000 - 2015 The Legion of the Bouncy Castle Inc. (http://www.bouncycastle.org)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
The binary distribution of this product includes modules from Groovy which bundles ANTLR
SOFTWARE RIGHTS
ANTLR 1989-2006 Developed by Terence Parr
Partially supported by University of San Francisco & jGuru.com
We reserve no legal rights to the ANTLR--it is fully in the
public domain. An individual or company may do whatever
they wish with source code distributed with ANTLR or the
code generated by ANTLR, including the incorporation of
ANTLR, or its output, into commercial software.
We encourage users to develop software with ANTLR. However,
we do ask that credit is given to us for developing
ANTLR. By "credit", we mean that if you use ANTLR or
incorporate any source code into one of your programs
(commercial product, research project, or otherwise) that
you acknowledge this fact somewhere in the documentation,
research report, etc... If you like ANTLR and have
developed a nice tool with the output, please mention that
you developed it using ANTLR. In addition, we ask that the
headers remain intact in our source code. As long as these
guidelines are kept, we expect to continue enhancing this
system and expect to make other tools available as they are
completed.
The primary ANTLR guy:
Terence Parr
parrt@cs.usfca.edu
parrt@antlr.org
The binary distribution of this product includes modules from Groovy which bundles ASM
/***
* http://asm.objectweb.org/
*
* ASM: a very small and fast Java bytecode manipulation framework
* Copyright (c) 2000-2005 INRIA, France Telecom
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
The binary distribution of this product includes modules from Groovy which bundles source from JSR-223
The following notice applies to the files:
src/main/org/codehaus/groovy/jsr223/GroovyCompiledScript.java
src/main/org/codehaus/groovy/jsr223/GroovyScriptEngineFactory.java
src/main/org/codehaus/groovy/jsr223/GroovyScriptEngineImpl.java
/*
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
* Redistribution and use in source and binary forms, with or without modification, are
* permitted provided that the following conditions are met: Redistributions of source code
* must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution. Neither the name of the Sun Microsystems nor the names of
* is contributors may be used to endorse or promote products derived from this software
* without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
View File
@ -1,288 +0,0 @@
nifi-hive_1_1-nar
Copyright 2014-2023 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
This includes derived works from the Apache Storm (ASLv2 licensed) project (https://github.com/apache/storm):
Copyright 2015 The Apache Software Foundation
The derived work is adapted from
org/apache/storm/hive/common/HiveWriter.java
org/apache/storm/hive/common/HiveOptions.java
and can be found in the org.apache.nifi.util.hive package
===========================================
Apache Software License v2
===========================================
The following binary components are provided under the Apache Software License v2
(ASLv2) Apache Ant
The following NOTICE information applies:
Apache Ant
Copyright 1999-2016 The Apache Software Foundation
(ASLv2) Apache Commons Codec
The following NOTICE information applies:
Apache Commons Codec
Copyright 2002-2014 The Apache Software Foundation
src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java
contains test data from http://aspell.net/test/orig/batch0.tab.
Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org)
===============================================================================
The content of package org.apache.commons.codec.language.bm has been translated
from the original php source code available at http://stevemorse.org/phoneticinfo.htm
with permission from the original authors.
Original source copyright:
Copyright (c) 2008 Alexander Beider & Stephen P. Morse.
(ASLv2) Apache Commons DBCP
The following NOTICE information applies:
Apache Commons DBCP
Copyright 2001-2015 The Apache Software Foundation.
(ASLv2) Apache HttpComponents
The following NOTICE information applies:
Apache HttpComponents Client
Copyright 1999-2016 The Apache Software Foundation
Apache HttpComponents Core - HttpCore
Copyright 2006-2009 The Apache Software Foundation
(ASLv2) Apache Commons Pool
The following NOTICE information applies:
Apache Commons Pool
Copyright 1999-2009 The Apache Software Foundation.
(ASLv2) Apache Commons IO
The following NOTICE information applies:
Apache Commons IO
Copyright 2002-2016 The Apache Software Foundation
(ASLv2) Apache Hive
The following NOTICE information applies:
Apache Hive
Copyright 2008-2015 The Apache Software Foundation
This product includes software developed by The Apache Software
Foundation (http://www.apache.org/).
This product includes Jersey (https://jersey.java.net/)
Copyright (c) 2010-2014 Oracle and/or its affiliates.
This project includes software copyrighted by Microsoft Corporation and
licensed under the Apache License, Version 2.0.
This project includes software copyrighted by Dell SecureWorks and
licensed under the Apache License, Version 2.0.
(ASLv2) Jackson JSON processor
The following NOTICE information applies:
# Jackson JSON processor
Jackson is a high-performance, Free/Open Source JSON processing library.
It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has
been in development since 2007.
It is currently developed by a community of developers, as well as supported
commercially by FasterXML.com.
## Licensing
Jackson core and extension components may be licensed under different licenses.
To find the details that apply to this artifact see the accompanying LICENSE file.
For more information, including possible other licensing options, contact
FasterXML.com (http://fasterxml.com).
## Credits
A list of contributors may be found from CREDITS file, which is included
in some artifacts (usually source distributions); but is always available
from the source code management (SCM) system project uses.
(ASLv2) BoneCP
The following NOTICE information applies:
BoneCP
Copyright 2010 Wallace Wadge
(ASLv2) Apache Hadoop
The following NOTICE information applies:
The binary distribution of this product bundles binaries of
org.iq80.leveldb:leveldb-api (https://github.com/dain/leveldb), which has the
following notices:
* Copyright 2011 Dain Sundstrom <dain@iq80.com>
* Copyright 2011 FuseSource Corp. http://fusesource.com
The binary distribution of this product bundles binaries of
org.fusesource.hawtjni:hawtjni-runtime (https://github.com/fusesource/hawtjni),
which has the following notices:
* This product includes software developed by FuseSource Corp.
http://fusesource.com
* This product includes software developed at
Progress Software Corporation and/or its subsidiaries or affiliates.
* This product includes software developed by IBM Corporation and others.
(ASLv2) Apache Commons Lang
The following NOTICE information applies:
Apache Commons Lang
Copyright 2001-2015 The Apache Software Foundation
(ASLv2) Apache Curator
The following NOTICE information applies:
Apache Curator
Copyright 2013-2014 The Apache Software Foundation
(ASLv2) Apache Derby
The following NOTICE information applies:
Apache Derby
Copyright 2004-2014 Apache, Apache DB, Apache Derby, Apache Torque, Apache JDO, Apache DDLUtils,
the Derby hat logo, the Apache JDO logo, and the Apache feather logo are trademarks of The Apache Software Foundation.
(ASLv2) Apache DS
The following NOTICE information applies:
ApacheDS
Copyright 2003-2015 The Apache Software Foundation
(ASLv2) Apache Geronimo
The following NOTICE information applies:
Apache Geronimo
Copyright 2003-2008 The Apache Software Foundation
(ASLv2) HTrace Core
The following NOTICE information applies:
In addition, this product includes software dependencies. See
the accompanying LICENSE.txt for a listing of dependencies
that are NOT Apache licensed (with pointers to their licensing)
Apache HTrace includes an Apache Thrift connector to Zipkin. Zipkin
is a distributed tracing system that is Apache 2.0 Licensed.
Copyright 2012 Twitter, Inc.
(ASLv2) Jettison
The following NOTICE information applies:
Copyright 2006 Envoi Solutions LLC
(ASLv2) Jetty
The following NOTICE information applies:
Jetty Web Container
Copyright 1995-2019 Mort Bay Consulting Pty Ltd.
(ASLv2) Apache log4j
The following NOTICE information applies:
Apache log4j
Copyright 2007 The Apache Software Foundation
(ASLv2) Parquet MR
The following NOTICE information applies:
Parquet MR
Copyright 2012 Twitter, Inc.
This project includes code from https://github.com/lemire/JavaFastPFOR
parquet-column/src/main/java/parquet/column/values/bitpacking/LemireBitPacking.java
Apache License Version 2.0 http://www.apache.org/licenses/.
(c) Daniel Lemire, http://lemire.me/en/
(ASLv2) Apache Thrift
The following NOTICE information applies:
Apache Thrift
Copyright 2006-2010 The Apache Software Foundation.
(ASLv2) Apache Twill
The following NOTICE information applies:
Apache Twill
Copyright 2013-2016 The Apache Software Foundation
(ASLv2) Dropwizard Metrics
The following NOTICE information applies:
Metrics
Copyright 2010-2013 Coda Hale and Yammer, Inc.
This product includes code derived from the JSR-166 project (ThreadLocalRandom, Striped64,
LongAdder), which was released with the following comments:
Written by Doug Lea with assistance from members of JCP JSR-166
Expert Group and released to the public domain, as explained at
http://creativecommons.org/publicdomain/zero/1.0/
(ASLv2) Joda Time
The following NOTICE information applies:
This product includes software developed by
Joda.org (http://www.joda.org/).
(ASLv2) The Netty Project
The following NOTICE information applies:
The Netty Project
Copyright 2011 The Netty Project
(ASLv2) Apache Tomcat
The following NOTICE information applies:
Apache Tomcat
Copyright 2007 The Apache Software Foundation
Java Management Extensions (JMX) support is provided by
the MX4J package, which is open source software. The
original software and related information is available
at http://mx4j.sourceforge.net.
Java compilation software for JSP pages is provided by Eclipse,
which is open source software. The original software and
related information is available at
http://www.eclipse.org.
(ASLv2) Apache ZooKeeper
The following NOTICE information applies:
Apache ZooKeeper
Copyright 2009-2012 The Apache Software Foundation
(ASLv2) Google GSON
The following NOTICE information applies:
Copyright 2008 Google Inc.
(ASLv2) Groovy (org.codehaus.groovy:groovy-all:jar:2.1.6 - http://www.groovy-lang.org)
The following NOTICE information applies:
Groovy Language
Copyright 2003-2012 The respective authors and developers
Developers and Contributors are listed in the project POM file
and Gradle build file
This product includes software developed by
The Groovy community (http://groovy.codehaus.org/).
(ASLv2) JPam
The following NOTICE information applies:
Copyright 2003-2006 Greg Luck
************************
Common Development and Distribution License 1.1
************************
The following binary components are provided under the Common Development and Distribution License 1.1. See project link for details.
(CDDL 1.1) (GPL2 w/ CPE) jersey-client (com.sun.jersey:jersey-client:jar:1.9 - https://jersey.java.net)
(CDDL 1.1) (GPL2 w/ CPE) jersey-core (com.sun.jersey:jersey-core:jar:1.9 - https://jersey.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-json (com.sun.jersey:jersey-json:jar:1.9 - https://jersey.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-server (com.sun.jersey:jersey-server:jar:1.9 - https://jersey.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) jersey-guice (com.sun.jersey.contribs:jersey-guice:jar:1.9 - https://jersey.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) Java Architecture For XML Binding (javax.xml.bind:jaxb-api:jar:2.2.2 - https://jaxb.dev.java.net/)
(CDDL 1.1) (GPL2 w/ CPE) JavaMail API (compat) (javax.mail:mail:jar:1.4.7 - http://kenai.com/projects/javamail/mail)
************************
Common Development and Distribution License 1.0
************************
The following binary components are provided under the Common Development and Distribution License 1.0. See project link for details.
(CDDL 1.0) JavaServlet(TM) Specification (javax.servlet:servlet-api:jar:2.5 - no url available)
(CDDL 1.0) (GPL3) Streaming API For XML (javax.xml.stream:stax-api:jar:1.0-2 - no url provided)
(CDDL 1.0) JavaBeans Activation Framework (JAF) (javax.activation:activation:jar:1.1 - http://java.sun.com/products/javabeans/jaf/index.jsp)
(CDDL 1.0) JavaServer Pages(TM) API (javax.servlet.jsp:jsp-api:jar:2.1 - http://jsp.java.net)
*****************
Public Domain
*****************
The following binary components are provided to the 'Public Domain'. See project link for details.
(Public Domain) AOP Alliance 1.0 (http://aopalliance.sourceforge.net/)
View File
@ -1,225 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-bundle</artifactId>
<version>2.0.0-SNAPSHOT</version>
</parent>
<artifactId>nifi-hive_1_1-processors</artifactId>
<packaging>jar</packaging>
<properties>
<!-- Need to override hadoop.version here, for Hive and hadoop-client transitive dependencies -->
<hadoop.version>${hive11.hadoop.version}</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-api</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-put-pattern</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-security-kerberos</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-dbcp-service-api</artifactId>
<version>2.0.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hive-services-api</artifactId>
<version>2.0.0-SNAPSHOT</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-kerberos-credentials-service-api</artifactId>
<scope>provided</scope>
</dependency>
<!-- Override groovy-all:2.1.6 from Hive -->
<dependency>
<groupId>org.codehaus.groovy</groupId>
<artifactId>groovy-all</artifactId>
<version>2.4.21</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive11.version}</version>
<exclusions>
<exclusion>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</exclusion>
<exclusion>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>apache-log4j-extras</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hive.hcatalog</groupId>
<artifactId>hive-hcatalog-streaming</artifactId>
<version>${hive11.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>apache-log4j-extras</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hive.hcatalog</groupId>
<artifactId>hive-hcatalog-core</artifactId>
<version>${hive11.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>com.google.code.findbugs</groupId>
<artifactId>jsr305</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>apache-log4j-extras</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hadoop-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-hadoop-record-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-record-serialization-service-api</artifactId>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-record</artifactId>
</dependency>
<dependency>
<groupId>com.github.stephenc.findbugs</groupId>
<artifactId>findbugs-annotations</artifactId>
<version>1.3.9-1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-dbcp2</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>log4j-over-slf4j</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-mock</artifactId>
<version>2.0.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.nifi</groupId>
<artifactId>nifi-mock-record-utils</artifactId>
<version>2.0.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
View File
@ -1,453 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.dbcp.hive;
import org.apache.commons.dbcp2.BasicDataSource;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hive.jdbc.HiveDriver;
import org.apache.nifi.annotation.behavior.RequiresInstanceClassLoading;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.DeprecationNotice;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnDisabled;
import org.apache.nifi.annotation.lifecycle.OnEnabled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.PropertyValue;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.resource.ResourceCardinality;
import org.apache.nifi.components.resource.ResourceType;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.controller.ConfigurationContext;
import org.apache.nifi.controller.ControllerServiceInitializationContext;
import org.apache.nifi.dbcp.DBCPValidator;
import org.apache.nifi.expression.AttributeExpression;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.hadoop.SecurityUtil;
import org.apache.nifi.kerberos.KerberosCredentialsService;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.security.krb.KerberosKeytabUser;
import org.apache.nifi.security.krb.KerberosLoginException;
import org.apache.nifi.security.krb.KerberosPasswordUser;
import org.apache.nifi.security.krb.KerberosUser;
import org.apache.nifi.util.hive.AuthenticationFailedException;
import org.apache.nifi.util.hive.HiveConfigurator;
import org.apache.nifi.util.hive.ValidationResources;
import java.io.IOException;
import java.lang.reflect.UndeclaredThrowableException;
import java.security.PrivilegedExceptionAction;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
/**
* Implementation for Database Connection Pooling Service used for Apache Hive 1.1
* connections. Apache DBCP is used for connection pooling functionality.
*/
@RequiresInstanceClassLoading
@Tags({"hive", "dbcp", "jdbc", "database", "connection", "pooling", "store"})
@CapabilityDescription("Provides Database Connection Pooling Service for Apache Hive 1.1.x. Connections can be asked from pool and returned after usage.")
@DeprecationNotice(classNames = "org.apache.nifi.dbcp.hive.Hive3ConnectionPool")
public class Hive_1_1ConnectionPool extends AbstractControllerService implements Hive_1_1DBCPService {
private static final String DEFAULT_MAX_CONN_LIFETIME = "-1";
public static final PropertyDescriptor DATABASE_URL = new PropertyDescriptor.Builder()
.name("hive-db-connect-url")
.displayName("Database Connection URL")
.description("A database connection URL used to connect to a database. May contain database system name, host, port, database name and some parameters."
+ " The exact syntax of a database connection URL is specified by the Hive documentation. For example, the server principal is often included "
+ "as a connection parameter when connecting to a secure Hive server.")
.defaultValue(null)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.required(true)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor HIVE_CONFIGURATION_RESOURCES = new PropertyDescriptor.Builder()
.name("hive-config-resources")
.displayName("Hive Configuration Resources")
.description("A file or comma separated list of files which contains the Hive configuration (hive-site.xml, e.g.). Without this, Hadoop "
+ "will search the classpath for a 'hive-site.xml' file or will revert to a default configuration. Note that to enable authentication "
+ "with Kerberos e.g., the appropriate properties must be set in the configuration files. Please see the Hive documentation for more details.")
.required(false)
.identifiesExternalResource(ResourceCardinality.MULTIPLE, ResourceType.FILE)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor DB_USER = new PropertyDescriptor.Builder()
.name("hive-db-user")
.displayName("Database User")
.description("Database user name")
.defaultValue(null)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor DB_PASSWORD = new PropertyDescriptor.Builder()
.name("hive-db-password")
.displayName("Password")
.description("The password for the database user")
.defaultValue(null)
.required(false)
.sensitive(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor MAX_WAIT_TIME = new PropertyDescriptor.Builder()
.name("hive-max-wait-time")
.displayName("Max Wait Time")
.description("The maximum amount of time that the pool will wait (when there are no available connections) "
+ " for a connection to be returned before failing, or -1 to wait indefinitely. ")
.defaultValue("500 millis")
.required(true)
.addValidator(StandardValidators.TIME_PERIOD_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor MAX_TOTAL_CONNECTIONS = new PropertyDescriptor.Builder()
.name("hive-max-total-connections")
.displayName("Max Total Connections")
.description("The maximum number of active connections that can be allocated from this pool at the same time, "
+ "or negative for no limit.")
.defaultValue("8")
.required(true)
.addValidator(StandardValidators.INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor MAX_CONN_LIFETIME = new PropertyDescriptor.Builder()
.displayName("Max Connection Lifetime")
.name("hive-max-conn-lifetime")
.description("The maximum lifetime in milliseconds of a connection. After this time is exceeded the " +
"connection pool will invalidate the connection. A value of zero or -1 " +
"means the connection has an infinite lifetime.")
.defaultValue(DEFAULT_MAX_CONN_LIFETIME)
.required(true)
.addValidator(DBCPValidator.CUSTOM_TIME_PERIOD_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
public static final PropertyDescriptor VALIDATION_QUERY = new PropertyDescriptor.Builder()
.name("Validation-query")
.displayName("Validation query")
.description("Validation query used to validate connections before returning them. "
+ "When a borrowed connection is invalid, it gets dropped and a new valid connection will be returned. "
+ "NOTE: Using validation may have a performance penalty.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
static final PropertyDescriptor KERBEROS_CREDENTIALS_SERVICE = new PropertyDescriptor.Builder()
.name("kerberos-credentials-service")
.displayName("Kerberos Credentials Service")
.description("Specifies the Kerberos Credentials Controller Service that should be used for authenticating with Kerberos")
.identifiesControllerService(KerberosCredentialsService.class)
.required(false)
.build();
static final PropertyDescriptor KERBEROS_PRINCIPAL = new PropertyDescriptor.Builder()
.name("kerberos-principal")
.displayName("Kerberos Principal")
.description("The principal to use when specifying the principal and password directly in the processor for authenticating via Kerberos.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.addValidator(StandardValidators.createAttributeExpressionLanguageValidator(AttributeExpression.ResultType.STRING))
.expressionLanguageSupported(ExpressionLanguageScope.VARIABLE_REGISTRY)
.build();
static final PropertyDescriptor KERBEROS_PASSWORD = new PropertyDescriptor.Builder()
.name("kerberos-password")
.displayName("Kerberos Password")
.description("The password to use when specifying the principal and password directly in the processor for authenticating via Kerberos.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.sensitive(true)
.build();
private List<PropertyDescriptor> properties;
private String connectionUrl = "unknown";
// Holder of cached Configuration information so validation does not reload the same config over and over
private final AtomicReference<ValidationResources> validationResourceHolder = new AtomicReference<>();
private volatile BasicDataSource dataSource;
private volatile HiveConfigurator hiveConfigurator = new HiveConfigurator();
private volatile UserGroupInformation ugi;
private final AtomicReference<KerberosUser> kerberosUserReference = new AtomicReference<>();
@Override
protected void init(final ControllerServiceInitializationContext context) {
List<PropertyDescriptor> props = new ArrayList<>();
props.add(DATABASE_URL);
props.add(HIVE_CONFIGURATION_RESOURCES);
props.add(DB_USER);
props.add(DB_PASSWORD);
props.add(MAX_WAIT_TIME);
props.add(MAX_TOTAL_CONNECTIONS);
props.add(MAX_CONN_LIFETIME);
props.add(VALIDATION_QUERY);
props.add(KERBEROS_CREDENTIALS_SERVICE);
props.add(KERBEROS_PRINCIPAL);
props.add(KERBEROS_PASSWORD);
properties = props;
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return properties;
}
@Override
protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
boolean confFileProvided = validationContext.getProperty(HIVE_CONFIGURATION_RESOURCES).isSet();
final List<ValidationResult> problems = new ArrayList<>();
if (confFileProvided) {
final KerberosCredentialsService credentialsService = validationContext.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
final String explicitPrincipal = validationContext.getProperty(KERBEROS_PRINCIPAL).evaluateAttributeExpressions().getValue();
final String explicitPassword = validationContext.getProperty(KERBEROS_PASSWORD).getValue();
final String resolvedPrincipal;
final String resolvedKeytab;
if (credentialsService != null) {
resolvedPrincipal = credentialsService.getPrincipal();
resolvedKeytab = credentialsService.getKeytab();
} else {
resolvedPrincipal = explicitPrincipal;
resolvedKeytab = null;
}
final String configFiles = validationContext.getProperty(HIVE_CONFIGURATION_RESOURCES).evaluateAttributeExpressions().getValue();
problems.addAll(hiveConfigurator.validate(configFiles, resolvedPrincipal, resolvedKeytab, explicitPassword, validationResourceHolder, getLogger()));
if (credentialsService != null && (explicitPrincipal != null || explicitPassword != null)) {
problems.add(new ValidationResult.Builder()
.subject(KERBEROS_CREDENTIALS_SERVICE.getDisplayName())
.valid(false)
.explanation("kerberos principal/password and kerberos credential service cannot be configured at the same time")
.build());
}
}
return problems;
}
/**
* Configures connection pool by creating an instance of the
* {@link BasicDataSource} based on configuration provided with
* {@link ConfigurationContext}.
* <p>
* This operation makes no guarantees that the actual connection could be
* made since the underlying system may still go off-line during normal
* operation of the connection pool.
* <p/>
* As of Apache NiFi 1.5.0, due to changes made to
* {@link SecurityUtil#loginKerberos(Configuration, String, String)}, which is used by this class invoking
* {@link HiveConfigurator#authenticate(Configuration, String, String)}
* to authenticate a principal with Kerberos, Hive controller services no longer use a separate thread to
* relogin, and instead call {@link UserGroupInformation#checkTGTAndReloginFromKeytab()} from
* {@link Hive_1_1ConnectionPool#getConnection()}. The relogin request is performed in a synchronized block to prevent
* threads from requesting concurrent relogins. For more information, please read the documentation for
* {@link SecurityUtil#loginKerberos(Configuration, String, String)}.
* <p/>
* In previous versions of NiFi, a {@link org.apache.nifi.hadoop.KerberosTicketRenewer} was started by
* {@link HiveConfigurator#authenticate(Configuration, String, String, long)} when the Hive
* controller service was enabled. The use of a separate thread to explicitly relogin could cause race conditions
* with the implicit relogin attempts made by hadoop/Hive code on a thread that references the same
* {@link UserGroupInformation} instance. One of these threads could leave the
* {@link javax.security.auth.Subject} in {@link UserGroupInformation} to be cleared or in an unexpected state
* while the other thread is attempting to use the {@link javax.security.auth.Subject}, resulting in failed
* authentication attempts that would leave the Hive controller service in an unrecoverable state.
*
* @see SecurityUtil#loginKerberos(Configuration, String, String)
* @see HiveConfigurator#authenticate(Configuration, String, String)
* @see HiveConfigurator#authenticate(Configuration, String, String, long)
* @param context the configuration context
* @throws InitializationException if unable to create a database connection
*/
@OnEnabled
public void onConfigured(final ConfigurationContext context) throws InitializationException {
ComponentLog log = getLogger();
final String configFiles = context.getProperty(HIVE_CONFIGURATION_RESOURCES).evaluateAttributeExpressions().getValue();
final Configuration hiveConfig = hiveConfigurator.getConfigurationFromFiles(configFiles);
final String validationQuery = context.getProperty(VALIDATION_QUERY).evaluateAttributeExpressions().getValue();
// add any dynamic properties to the Hive configuration
for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
final PropertyDescriptor descriptor = entry.getKey();
if (descriptor.isDynamic()) {
hiveConfig.set(descriptor.getName(), context.getProperty(descriptor).evaluateAttributeExpressions().getValue());
}
}
final String drv = HiveDriver.class.getName();
if (SecurityUtil.isSecurityEnabled(hiveConfig)) {
final String explicitPrincipal = context.getProperty(KERBEROS_PRINCIPAL).evaluateAttributeExpressions().getValue();
final String explicitPassword = context.getProperty(KERBEROS_PASSWORD).getValue();
final KerberosCredentialsService credentialsService = context.getProperty(KERBEROS_CREDENTIALS_SERVICE).asControllerService(KerberosCredentialsService.class);
final String resolvedPrincipal;
final String resolvedKeytab;
if (credentialsService != null) {
resolvedPrincipal = credentialsService.getPrincipal();
resolvedKeytab = credentialsService.getKeytab();
} else {
resolvedPrincipal = explicitPrincipal;
resolvedKeytab = null;
}
if (resolvedKeytab != null) {
kerberosUserReference.set(new KerberosKeytabUser(resolvedPrincipal, resolvedKeytab));
log.info("Hive Security Enabled, logging in as principal {} with keytab {}", new Object[] {resolvedPrincipal, resolvedKeytab});
} else if (explicitPassword != null) {
kerberosUserReference.set(new KerberosPasswordUser(resolvedPrincipal, explicitPassword));
log.info("Hive Security Enabled, logging in as principal {} with password", new Object[] {resolvedPrincipal});
} else {
throw new InitializationException("Unable to authenticate with Kerberos, no keytab or password was provided");
}
try {
ugi = hiveConfigurator.authenticate(hiveConfig, kerberosUserReference.get());
} catch (AuthenticationFailedException ae) {
log.error(ae.getMessage(), ae);
throw new InitializationException(ae);
}
getLogger().info("Successfully logged in as principal " + resolvedPrincipal);
}
final String user = context.getProperty(DB_USER).evaluateAttributeExpressions().getValue();
final String passw = context.getProperty(DB_PASSWORD).evaluateAttributeExpressions().getValue();
final Long maxWaitMillis = context.getProperty(MAX_WAIT_TIME).evaluateAttributeExpressions().asTimePeriod(TimeUnit.MILLISECONDS);
final Integer maxTotal = context.getProperty(MAX_TOTAL_CONNECTIONS).evaluateAttributeExpressions().asInteger();
final long maxConnectionLifetimeMillis = extractMillisWithInfinite(context.getProperty(MAX_CONN_LIFETIME).evaluateAttributeExpressions());
dataSource = new BasicDataSource();
dataSource.setDriverClassName(drv);
connectionUrl = context.getProperty(DATABASE_URL).evaluateAttributeExpressions().getValue();
dataSource.setMaxWaitMillis(maxWaitMillis);
dataSource.setMaxTotal(maxTotal);
dataSource.setMaxConnLifetimeMillis(maxConnectionLifetimeMillis);
if (validationQuery != null && !validationQuery.isEmpty()) {
dataSource.setValidationQuery(validationQuery);
dataSource.setTestOnBorrow(true);
}
dataSource.setUrl(connectionUrl);
dataSource.setUsername(user);
dataSource.setPassword(passw);
}
/**
* Shutdown pool, close all open connections.
*/
@OnDisabled
public void shutdown() {
try {
if(dataSource != null) {
dataSource.close();
}
} catch (final SQLException e) {
throw new ProcessException(e);
}
}
@Override
public Connection getConnection() throws ProcessException {
try {
if (ugi != null) {
/*
* Explicitly check the TGT and relogin if necessary with the KerberosUser instance. No synchronization
* is necessary in the client code, since AbstractKerberosUser's checkTGTAndRelogin method is synchronized.
*/
getLogger().trace("getting UGI instance");
if (kerberosUserReference.get() != null) {
// if there's a KerberosUser associated with this UGI, check the TGT and relogin if it is close to expiring
KerberosUser kerberosUser = kerberosUserReference.get();
getLogger().debug("kerberosUser is " + kerberosUser);
try {
getLogger().debug("checking TGT on kerberosUser " + kerberosUser);
kerberosUser.checkTGTAndRelogin();
} catch (final KerberosLoginException e) {
throw new ProcessException("Unable to relogin with kerberos credentials for " + kerberosUser.getPrincipal(), e);
}
} else {
getLogger().debug("kerberosUser was null, will not refresh TGT with KerberosUser");
// no synchronization is needed for UserGroupInformation.checkTGTAndReloginFromKeytab; UGI handles the synchronization internally
ugi.checkTGTAndReloginFromKeytab();
}
try {
return ugi.doAs((PrivilegedExceptionAction<Connection>) () -> dataSource.getConnection());
} catch (UndeclaredThrowableException e) {
Throwable cause = e.getCause();
if (cause instanceof SQLException) {
throw (SQLException) cause;
} else {
throw e;
}
}
} else {
getLogger().info("Simple Authentication");
return dataSource.getConnection();
}
} catch (SQLException | IOException | InterruptedException e) {
getLogger().error("Error getting Hive connection", e);
throw new ProcessException(e);
}
}
@Override
public String toString() {
return "HiveConnectionPool[id=" + getIdentifier() + "]";
}
@Override
public String getConnectionURL() {
return connectionUrl;
}
private long extractMillisWithInfinite(PropertyValue prop) {
if (prop.getValue() == null || DEFAULT_MAX_CONN_LIFETIME.equals(prop.getValue())) {
return -1;
} else {
return prop.asTimePeriod(TimeUnit.MILLISECONDS);
}
}
}
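The getConnection() method above hides the Kerberos handling from callers: the TGT check and relogin happen inside the service, and the DBCP borrow is wrapped in ugi.doAs(). A minimal usage sketch follows, assuming the Hive_1_1DBCPService interface from this bundle; the helper class name, method name and statement text are invented for illustration and are not part of the removed code.
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
import org.apache.nifi.processor.exception.ProcessException;
// Hypothetical helper, for illustration only. In a processor the service instance would
// come from a PropertyDescriptor via context.getProperty(...).asControllerService(...).
final class HiveQlSketch {
    static void execute(final Hive_1_1DBCPService dbcpService, final String hiveQl) {
        // getConnection() performs the TGT check/relogin and the doAs() internally,
        // so the caller only has to close the connection when finished.
        try (final Connection connection = dbcpService.getConnection();
             final Statement statement = connection.createStatement()) {
            statement.execute(hiveQl);
        } catch (final SQLException e) {
            throw new ProcessException("HiveQL execution failed against " + dbcpService.getConnectionURL(), e);
        }
    }
}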
View File
@ -1,344 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.antlr.runtime.tree.CommonTree;
import org.apache.hadoop.hive.ql.parse.ASTNode;
import org.apache.hadoop.hive.ql.parse.ParseDriver;
import org.apache.hadoop.hive.ql.parse.ParseException;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.AbstractSessionFactoryProcessor;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.stream.io.StreamUtils;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.charset.Charset;
import java.sql.Date;
import java.sql.PreparedStatement;
import java.sql.SQLDataException;
import java.sql.SQLException;
import java.sql.Time;
import java.sql.Timestamp;
import java.sql.Types;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* An abstract base class for HiveQL processors to share common data, methods, etc.
*/
public abstract class AbstractHive_1_1QLProcessor extends AbstractSessionFactoryProcessor {
protected static final Pattern HIVEQL_TYPE_ATTRIBUTE_PATTERN = Pattern.compile("hiveql\\.args\\.(\\d+)\\.type");
protected static final Pattern NUMBER_PATTERN = Pattern.compile("-?\\d+");
static String ATTR_INPUT_TABLES = "query.input.tables";
static String ATTR_OUTPUT_TABLES = "query.output.tables";
public static final PropertyDescriptor HIVE_DBCP_SERVICE = new PropertyDescriptor.Builder()
.name("Hive Database Connection Pooling Service")
.description("The Hive Controller Service that is used to obtain connection(s) to the Hive database")
.required(true)
.identifiesControllerService(Hive_1_1DBCPService.class)
.build();
public static final PropertyDescriptor CHARSET = new PropertyDescriptor.Builder()
.name("hive-charset")
.displayName("Character Set")
.description("Specifies the character set of the record data.")
.required(true)
.defaultValue("UTF-8")
.addValidator(StandardValidators.CHARACTER_SET_VALIDATOR)
.build();
/**
* Determines the HiveQL statement that should be executed for the given FlowFile
*
* @param session the session that can be used to access the given FlowFile
* @param flowFile the FlowFile whose HiveQL statement should be executed
* @return the HiveQL that is associated with the given FlowFile
*/
protected String getHiveQL(final ProcessSession session, final FlowFile flowFile, final Charset charset) {
// Read the HiveQL from the FlowFile's content
final byte[] buffer = new byte[(int) flowFile.getSize()];
session.read(flowFile, new InputStreamCallback() {
@Override
public void process(final InputStream in) throws IOException {
StreamUtils.fillBuffer(in, buffer);
}
});
// Decode the buffer into the HiveQL string for this FlowFile.
return new String(buffer, charset);
}
private class ParameterHolder {
String attributeName;
int jdbcType;
String value;
}
/**
* Sets all of the appropriate parameters on the given PreparedStatement, based on the given FlowFile attributes.
*
* @param stmt the statement to set the parameters on
* @param attributes the attributes from which to derive parameter indices, values, and types
* @throws SQLException if the PreparedStatement throws a SQLException when the appropriate setter is called
*/
protected int setParameters(int base, final PreparedStatement stmt, int paramCount, final Map<String, String> attributes) throws SQLException {
Map<Integer, ParameterHolder> parmMap = new TreeMap<Integer, ParameterHolder>();
for (final Map.Entry<String, String> entry : attributes.entrySet()) {
final String key = entry.getKey();
final Matcher matcher = HIVEQL_TYPE_ATTRIBUTE_PATTERN.matcher(key);
if (matcher.matches()) {
final int parameterIndex = Integer.parseInt(matcher.group(1));
if (parameterIndex >= base && parameterIndex < base + paramCount) {
final boolean isNumeric = NUMBER_PATTERN.matcher(entry.getValue()).matches();
if (!isNumeric) {
throw new SQLDataException("Value of the " + key + " attribute is '" + entry.getValue() + "', which is not a valid JDBC numeral jdbcType");
}
final String valueAttrName = "hiveql.args." + parameterIndex + ".value";
ParameterHolder ph = new ParameterHolder();
int realIndexLoc = parameterIndex - base +1;
ph.jdbcType = Integer.parseInt(entry.getValue());
ph.value = attributes.get(valueAttrName);
ph.attributeName = valueAttrName;
parmMap.put(realIndexLoc, ph);
}
}
}
// Now that we've retrieved the correct number of parameters and they're sorted, let's set them.
for (final Map.Entry<Integer, ParameterHolder> entry : parmMap.entrySet()) {
final Integer index = entry.getKey();
final ParameterHolder ph = entry.getValue();
try {
setParameter(stmt, ph.attributeName, index, ph.value, ph.jdbcType);
} catch (final NumberFormatException nfe) {
throw new SQLDataException("The value of the " + ph.attributeName + " is '" + ph.value + "', which cannot be converted into the necessary data jdbcType", nfe);
}
}
return base + paramCount;
}
/**
* Determines how to map the given value to the appropriate JDBC data jdbcType and sets the parameter on the
* provided PreparedStatement
*
* @param stmt the PreparedStatement to set the parameter on
* @param attrName the name of the attribute that the parameter is coming from - for logging purposes
* @param parameterIndex the index of the HiveQL parameter to set
* @param parameterValue the value of the HiveQL parameter to set
* @param jdbcType the JDBC Type of the HiveQL parameter to set
* @throws SQLException if the PreparedStatement throws a SQLException when calling the appropriate setter
*/
protected void setParameter(final PreparedStatement stmt, final String attrName, final int parameterIndex, final String parameterValue, final int jdbcType) throws SQLException {
if (parameterValue == null) {
stmt.setNull(parameterIndex, jdbcType);
} else {
try {
switch (jdbcType) {
case Types.BIT:
case Types.BOOLEAN:
stmt.setBoolean(parameterIndex, Boolean.parseBoolean(parameterValue));
break;
case Types.TINYINT:
stmt.setByte(parameterIndex, Byte.parseByte(parameterValue));
break;
case Types.SMALLINT:
stmt.setShort(parameterIndex, Short.parseShort(parameterValue));
break;
case Types.INTEGER:
stmt.setInt(parameterIndex, Integer.parseInt(parameterValue));
break;
case Types.BIGINT:
stmt.setLong(parameterIndex, Long.parseLong(parameterValue));
break;
case Types.REAL:
stmt.setFloat(parameterIndex, Float.parseFloat(parameterValue));
break;
case Types.FLOAT:
case Types.DOUBLE:
stmt.setDouble(parameterIndex, Double.parseDouble(parameterValue));
break;
case Types.DECIMAL:
case Types.NUMERIC:
stmt.setBigDecimal(parameterIndex, new BigDecimal(parameterValue));
break;
case Types.DATE:
stmt.setDate(parameterIndex, new Date(Long.parseLong(parameterValue)));
break;
case Types.TIME:
stmt.setTime(parameterIndex, new Time(Long.parseLong(parameterValue)));
break;
case Types.TIMESTAMP:
stmt.setTimestamp(parameterIndex, new Timestamp(Long.parseLong(parameterValue)));
break;
case Types.CHAR:
case Types.VARCHAR:
case Types.LONGNVARCHAR:
case Types.LONGVARCHAR:
stmt.setString(parameterIndex, parameterValue);
break;
default:
stmt.setObject(parameterIndex, parameterValue, jdbcType);
break;
}
} catch (SQLException e) {
// Log which attribute/parameter had an error, then rethrow to be handled at the top level
getLogger().error("Error setting parameter {} to value from {} ({})", new Object[]{parameterIndex, attrName, parameterValue}, e);
throw e;
}
}
}
protected static class TableName {
private final String database;
private final String table;
private final boolean input;
TableName(String database, String table, boolean input) {
this.database = database;
this.table = table;
this.input = input;
}
public String getDatabase() {
return database;
}
public String getTable() {
return table;
}
public boolean isInput() {
return input;
}
@Override
public String toString() {
return database == null || database.isEmpty() ? table : database + '.' + table;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TableName tableName = (TableName) o;
if (input != tableName.input) return false;
if (database != null ? !database.equals(tableName.database) : tableName.database != null) return false;
return table.equals(tableName.table);
}
@Override
public int hashCode() {
int result = database != null ? database.hashCode() : 0;
result = 31 * result + table.hashCode();
result = 31 * result + (input ? 1 : 0);
return result;
}
}
protected Set<TableName> findTableNames(final String query) {
final ASTNode node;
try {
node = new ParseDriver().parse(normalize(query));
} catch (ParseException e) {
// If failed to parse the query, just log a message, but continue.
getLogger().debug("Failed to parse query: {} due to {}", new Object[]{query, e}, e);
return Collections.emptySet();
}
final HashSet<TableName> tableNames = new HashSet<>();
findTableNames(node, tableNames);
return tableNames;
}
/**
* Normalize query.
* Hive resolves prepared statement parameters before executing a query,
* see {@link org.apache.hive.jdbc.HivePreparedStatement#updateSql(String, HashMap)} for detail.
* HiveParser does not expect '?' to be in a query string, and throws an Exception if there is one.
* In this normalize method, '?' is replaced to 'x' to avoid that.
*/
private String normalize(String query) {
return query.replace('?', 'x');
}
private void findTableNames(final Object obj, final Set<TableName> tableNames) {
if (!(obj instanceof CommonTree)) {
return;
}
final CommonTree tree = (CommonTree) obj;
final int childCount = tree.getChildCount();
if ("TOK_TABNAME".equals(tree.getText())) {
final TableName tableName;
final boolean isInput = "TOK_TABREF".equals(tree.getParent().getText());
switch (childCount) {
case 1 :
tableName = new TableName(null, tree.getChild(0).getText(), isInput);
break;
case 2:
tableName = new TableName(tree.getChild(0).getText(), tree.getChild(1).getText(), isInput);
break;
default:
throw new IllegalStateException("TOK_TABNAME does not have expected children, childCount=" + childCount);
}
// If parent is TOK_TABREF, then it is an input table.
tableNames.add(tableName);
return;
}
for (int i = 0; i < childCount; i++) {
findTableNames(tree.getChild(i), tableNames);
}
}
protected Map<String, String> toQueryTableAttributes(Set<TableName> tableNames) {
final Map<String, String> attributes = new HashMap<>();
for (TableName tableName : tableNames) {
final String attributeName = tableName.isInput() ? ATTR_INPUT_TABLES : ATTR_OUTPUT_TABLES;
if (attributes.containsKey(attributeName)) {
attributes.put(attributeName, attributes.get(attributeName) + "," + tableName);
} else {
attributes.put(attributeName, tableName.toString());
}
}
return attributes;
}
}
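setParameters() above resolves positional JDBC parameters from FlowFile attributes named hiveql.args.N.type and hiveql.args.N.value, where the type attribute holds an integer java.sql.Types constant. A small illustration of that attribute layout is sketched below; the statement, class name, table and values are invented for the example and are not taken from the removed code.
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;
// Illustrative only: the attributes a FlowFile might carry for a statement such as
//   INSERT INTO users (id, name) VALUES (?, ?)
final class HiveQlArgsSketch {
    static Map<String, String> exampleAttributes() {
        final Map<String, String> attributes = new HashMap<>();
        attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));  // "4"
        attributes.put("hiveql.args.1.value", "42");
        attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));  // "12"
        attributes.put("hiveql.args.2.value", "alice");
        // With these attributes, setParameters(1, stmt, 2, attributes) would call
        // stmt.setInt(1, 42) and stmt.setString(2, "alice") before the statement is executed.
        return attributes;
    }
}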
View File
@ -1,300 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.commons.lang3.StringUtils;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.DeprecationNotice;
import org.apache.nifi.annotation.documentation.SeeAlso;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessSessionFactory;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.processor.util.pattern.ErrorTypes;
import org.apache.nifi.processor.util.pattern.ExceptionHandler;
import org.apache.nifi.processor.util.pattern.ExceptionHandler.OnError;
import org.apache.nifi.processor.util.pattern.PartialFunctions.FetchFlowFiles;
import org.apache.nifi.processor.util.pattern.PartialFunctions.InitConnection;
import org.apache.nifi.processor.util.pattern.Put;
import org.apache.nifi.processor.util.pattern.RollbackOnFailure;
import org.apache.nifi.processor.util.pattern.RoutingResult;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.SQLNonTransientException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
@SeeAlso(SelectHive_1_1QL.class)
@InputRequirement(Requirement.INPUT_REQUIRED)
@Tags({"sql", "hive", "put", "database", "update", "insert"})
@CapabilityDescription("Executes a HiveQL DDL/DML command (UPDATE, INSERT, e.g.). The content of an incoming FlowFile is expected to be the HiveQL command "
+ "to execute. The HiveQL command may use the ? to escape parameters. In this case, the parameters to use must exist as FlowFile attributes "
+ "with the naming convention hiveql.args.N.type and hiveql.args.N.value, where N is a positive integer. The hiveql.args.N.type is expected to be "
+ "a number indicating the JDBC Type. The content of the FlowFile is expected to be in UTF-8 format.")
@ReadsAttributes({
@ReadsAttribute(attribute = "hiveql.args.N.type", description = "Incoming FlowFiles are expected to be parametrized HiveQL statements. The type of each Parameter is specified as an integer "
+ "that represents the JDBC Type of the parameter."),
@ReadsAttribute(attribute = "hiveql.args.N.value", description = "Incoming FlowFiles are expected to be parametrized HiveQL statements. The value of the Parameters are specified as "
+ "hiveql.args.1.value, hiveql.args.2.value, hiveql.args.3.value, and so on. The type of the hiveql.args.1.value Parameter is specified by the hiveql.args.1.type attribute.")
})
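// Illustrative example (hypothetical values): a FlowFile whose content is
//   INSERT INTO users VALUES (?, ?)
// might carry the attributes hiveql.args.1.type=12, hiveql.args.1.value=alice,
// hiveql.args.2.type=4, hiveql.args.2.value=42, where 12 and 4 are the JDBC type codes
// for VARCHAR and INTEGER respectively.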
@WritesAttributes({
@WritesAttribute(attribute = "query.input.tables", description = "This attribute is written on the flow files routed to the 'success' relationships, "
+ "and contains input table names (if any) in comma delimited 'databaseName.tableName' format."),
@WritesAttribute(attribute = "query.output.tables", description = "This attribute is written on the flow files routed to the 'success' relationships, "
+ "and contains the target table names in 'databaseName.tableName' format.")
})
@DeprecationNotice(classNames = "org.apache.nifi.processors.hive.PutHive3QL")
public class PutHive_1_1QL extends AbstractHive_1_1QLProcessor {
public static final PropertyDescriptor BATCH_SIZE = new PropertyDescriptor.Builder()
.name("hive-batch-size")
.displayName("Batch Size")
.description("The preferred number of FlowFiles to put to the database in a single transaction")
.required(true)
.addValidator(StandardValidators.POSITIVE_INTEGER_VALIDATOR)
.defaultValue("100")
.build();
public static final PropertyDescriptor STATEMENT_DELIMITER = new PropertyDescriptor.Builder()
.name("statement-delimiter")
.displayName("Statement Delimiter")
.description("Statement Delimiter used to separate SQL statements in a multiple statement script")
.required(true)
.defaultValue(";")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.NONE)
.build();
public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("A FlowFile is routed to this relationship after the database is successfully updated")
.build();
public static final Relationship REL_RETRY = new Relationship.Builder()
.name("retry")
.description("A FlowFile is routed to this relationship if the database cannot be updated but attempting the operation again may succeed")
.build();
public static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("A FlowFile is routed to this relationship if the database cannot be updated and retrying the operation will also fail, "
+ "such as an invalid query or an integrity constraint violation")
.build();
private final static List<PropertyDescriptor> propertyDescriptors;
private final static Set<Relationship> relationships;
/*
* Will ensure that the list of property descriptors is built only once.
* Will also create a Set of relationships
*/
static {
List<PropertyDescriptor> _propertyDescriptors = new ArrayList<>();
_propertyDescriptors.add(HIVE_DBCP_SERVICE);
_propertyDescriptors.add(BATCH_SIZE);
_propertyDescriptors.add(CHARSET);
_propertyDescriptors.add(STATEMENT_DELIMITER);
_propertyDescriptors.add(RollbackOnFailure.ROLLBACK_ON_FAILURE);
propertyDescriptors = Collections.unmodifiableList(_propertyDescriptors);
Set<Relationship> _relationships = new HashSet<>();
_relationships.add(REL_SUCCESS);
_relationships.add(REL_FAILURE);
_relationships.add(REL_RETRY);
relationships = Collections.unmodifiableSet(_relationships);
}
private Put<FunctionContext, Connection> process;
private ExceptionHandler<FunctionContext> exceptionHandler;
@OnScheduled
public void constructProcess() {
exceptionHandler = new ExceptionHandler<>();
exceptionHandler.mapException(e -> {
if (e instanceof SQLNonTransientException) {
return ErrorTypes.InvalidInput;
} else if (e instanceof SQLException) {
// Use the SQLException's vendor code for guidance -- see Hive's ErrorMsg class for details on error codes
int errorCode = ((SQLException) e).getErrorCode();
getLogger().debug("Error occurred during Hive operation, Hive returned error code {}", new Object[]{errorCode});
if (errorCode >= 10000 && errorCode < 20000) {
return ErrorTypes.InvalidInput;
} else if (errorCode >= 20000 && errorCode < 30000) {
return ErrorTypes.InvalidInput;
} else if (errorCode >= 30000 && errorCode < 40000) {
return ErrorTypes.TemporalInputFailure;
} else if (errorCode >= 40000 && errorCode < 50000) {
// These are unknown errors (including some parse errors), but rather than generating an UnknownFailure which causes
// a ProcessException, we'll route to failure via an InvalidInput error type.
return ErrorTypes.InvalidInput;
} else {
// Default unknown errors to TemporalFailure (as they were implemented originally), so they can be routed to failure
// or rolled back depending on the user's setting of Rollback On Failure.
return ErrorTypes.TemporalFailure;
}
} else {
return ErrorTypes.UnknownFailure;
}
});
exceptionHandler.adjustError(RollbackOnFailure.createAdjustError(getLogger()));
process = new Put<>();
process.setLogger(getLogger());
process.initConnection(initConnection);
process.fetchFlowFiles(fetchFlowFiles);
process.putFlowFile(putFlowFile);
process.adjustRoute(RollbackOnFailure.createAdjustRoute(REL_FAILURE, REL_RETRY));
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propertyDescriptors;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
private class FunctionContext extends RollbackOnFailure {
final Charset charset;
final String statementDelimiter;
final long startNanos = System.nanoTime();
String connectionUrl;
private FunctionContext(boolean rollbackOnFailure, Charset charset, String statementDelimiter) {
super(rollbackOnFailure, false);
this.charset = charset;
this.statementDelimiter = statementDelimiter;
}
}
private InitConnection<FunctionContext, Connection> initConnection = (context, session, fc, ffs) -> {
final Hive_1_1DBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(Hive_1_1DBCPService.class);
final Connection connection = dbcpService.getConnection(ffs == null || ffs.isEmpty() ? Collections.emptyMap() : ffs.get(0).getAttributes());
fc.connectionUrl = dbcpService.getConnectionURL();
return connection;
};
private FetchFlowFiles<FunctionContext> fetchFlowFiles = (context, session, functionContext, result) -> {
final int batchSize = context.getProperty(BATCH_SIZE).asInteger();
return session.get(batchSize);
};
private Put.PutFlowFile<FunctionContext, Connection> putFlowFile = (context, session, fc, conn, flowFile, result) -> {
final String script = getHiveQL(session, flowFile, fc.charset);
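// Split the script into individual statements on the configured delimiter; the negative lookbehind
// (?<!\\) ensures a delimiter preceded by a backslash is treated as escaped and not as a statement boundary.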
String regex = "(?<!\\\\)" + Pattern.quote(fc.statementDelimiter);
String[] hiveQLs = script.split(regex);
final Set<TableName> tableNames = new HashSet<>();
exceptionHandler.execute(fc, flowFile, input -> {
int loc = 1;
for (String hiveQLStr: hiveQLs) {
getLogger().debug("HiveQL: {}", new Object[]{hiveQLStr});
final String hiveQL = hiveQLStr.trim();
if (!StringUtils.isEmpty(hiveQL)) {
try (final PreparedStatement stmt = conn.prepareStatement(hiveQL)) {
// Get ParameterMetadata
// Hive JDBC Doesn't support this yet:
// ParameterMetaData pmd = stmt.getParameterMetaData();
// int paramCount = pmd.getParameterCount();
int paramCount = StringUtils.countMatches(hiveQL, "?");
if (paramCount > 0) {
loc = setParameters(loc, stmt, paramCount, flowFile.getAttributes());
}
// Parse hiveQL and extract input/output tables
try {
tableNames.addAll(findTableNames(hiveQL));
} catch (Exception e) {
// If failed to parse the query, just log a warning message, but continue.
getLogger().warn("Failed to parse hiveQL: {} due to {}", new Object[]{hiveQL, e}, e);
}
// Execute the statement
stmt.execute();
fc.proceed();
}
}
}
// Emit a Provenance SEND event
final long transmissionMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - fc.startNanos);
final FlowFile updatedFlowFile = session.putAllAttributes(flowFile, toQueryTableAttributes(tableNames));
session.getProvenanceReporter().send(updatedFlowFile, fc.connectionUrl, transmissionMillis, true);
result.routeTo(flowFile, REL_SUCCESS);
}, onFlowFileError(context, session, result));
};
private OnError<FunctionContext, FlowFile> onFlowFileError(final ProcessContext context, final ProcessSession session, final RoutingResult result) {
OnError<FunctionContext, FlowFile> onFlowFileError = ExceptionHandler.createOnError(context, session, result, REL_FAILURE, REL_RETRY);
onFlowFileError = onFlowFileError.andThen((c, i, r, e) -> {
switch (r.destination()) {
case Failure:
getLogger().error("Failed to update Hive for {} due to {}; routing to failure", new Object[] {i, e}, e);
break;
case Retry:
getLogger().error("Failed to update Hive for {} due to {}; it is possible that retrying the operation will succeed, so routing to retry",
new Object[] {i, e}, e);
break;
case Self:
getLogger().error("Failed to update Hive for {} due to {};", new Object[] {i, e}, e);
break;
}
});
return RollbackOnFailure.createOnError(onFlowFileError);
}
@Override
public void onTrigger(ProcessContext context, ProcessSessionFactory sessionFactory) throws ProcessException {
final Boolean rollbackOnFailure = context.getProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE).asBoolean();
final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
final String statementDelimiter = context.getProperty(STATEMENT_DELIMITER).getValue();
final FunctionContext functionContext = new FunctionContext(rollbackOnFailure, charset, statementDelimiter);
RollbackOnFailure.onTrigger(context, sessionFactory, functionContext, getLogger(), session -> process.onTrigger(context, session, functionContext));
}
}

View File

@ -1,554 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.nifi.annotation.behavior.EventDriven;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.InputRequirement.Requirement;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.DeprecationNotice;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.annotation.lifecycle.OnScheduled;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessSessionFactory;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.processor.util.pattern.PartialFunctions;
import org.apache.nifi.util.StopWatch;
import org.apache.nifi.util.hive.CsvOutputOptions;
import org.apache.nifi.util.hive.HiveJdbcCommon;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import static org.apache.nifi.util.hive.HiveJdbcCommon.AVRO;
import static org.apache.nifi.util.hive.HiveJdbcCommon.CSV;
import static org.apache.nifi.util.hive.HiveJdbcCommon.CSV_MIME_TYPE;
import static org.apache.nifi.util.hive.HiveJdbcCommon.MIME_TYPE_AVRO_BINARY;
import static org.apache.nifi.util.hive.HiveJdbcCommon.NORMALIZE_NAMES_FOR_AVRO;
@EventDriven
@InputRequirement(Requirement.INPUT_ALLOWED)
@Tags({"hive", "sql", "select", "jdbc", "query", "database"})
@CapabilityDescription("Execute provided HiveQL SELECT query against a Hive database connection. Query result will be converted to Avro or CSV format."
+ " Streaming is used so arbitrarily large result sets are supported. This processor can be scheduled to run on "
+ "a timer, or cron expression, using the standard scheduling methods, or it can be triggered by an incoming FlowFile. "
+ "If it is triggered by an incoming FlowFile, then attributes of that FlowFile will be available when evaluating the "
+ "select query. FlowFile attribute 'selecthiveql.row.count' indicates how many rows were selected.")
@WritesAttributes({
@WritesAttribute(attribute = "mime.type", description = "Sets the MIME type for the outgoing flowfile to application/avro-binary for Avro or text/csv for CSV."),
@WritesAttribute(attribute = "filename", description = "Adds .avro or .csv to the filename attribute depending on which output format is selected."),
@WritesAttribute(attribute = "selecthiveql.row.count", description = "Indicates how many rows were selected/returned by the query."),
@WritesAttribute(attribute = "fragment.identifier", description = "If 'Max Rows Per Flow File' is set then all FlowFiles from the same query result set "
+ "will have the same value for the fragment.identifier attribute. This can then be used to correlate the results."),
@WritesAttribute(attribute = "fragment.count", description = "If 'Max Rows Per Flow File' is set then this is the total number of "
+ "FlowFiles produced by a single ResultSet. This can be used in conjunction with the "
+ "fragment.identifier attribute in order to know how many FlowFiles belonged to the same incoming ResultSet."),
@WritesAttribute(attribute = "fragment.index", description = "If 'Max Rows Per Flow File' is set then the position of this FlowFile in the list of "
+ "outgoing FlowFiles that were all derived from the same result set FlowFile. This can be "
+ "used in conjunction with the fragment.identifier attribute to know which FlowFiles originated from the same query result set and in what order "
+ "FlowFiles were produced"),
@WritesAttribute(attribute = "query.input.tables", description = "Contains input table names in comma delimited 'databaseName.tableName' format.")
})
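// Illustrative example: with 'Max Rows Per Flow File' set to 1000, a 2,500-row result set yields three
// FlowFiles that share one fragment.identifier, carry fragment.index 0, 1 and 2, and each receive
// fragment.count=3 once the result set has been fully consumed.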
@DeprecationNotice(classNames = "org.apache.nifi.processors.hive.SelectHive3QL")
public class SelectHive_1_1QL extends AbstractHive_1_1QLProcessor {
public static final String RESULT_ROW_COUNT = "selecthiveql.row.count";
// Relationships
public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("Successfully created FlowFile from HiveQL query result set.")
.build();
public static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("HiveQL query execution failed. Incoming FlowFile will be penalized and routed to this relationship.")
.build();
public static final PropertyDescriptor HIVEQL_PRE_QUERY = new PropertyDescriptor.Builder()
.name("hive-pre-query")
.displayName("HiveQL Pre-Query")
.description("A semicolon-delimited list of queries executed before the main SQL query is executed. "
+ "Example: 'set tez.queue.name=queue1; set hive.exec.orc.split.strategy=ETL; set hive.exec.reducers.bytes.per.reducer=1073741824'. "
+ "Note, the results/outputs of these queries will be suppressed if successfully executed.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_SELECT_QUERY = new PropertyDescriptor.Builder()
.name("hive-query")
.displayName("HiveQL Select Query")
.description("HiveQL SELECT query to execute. If this is not set, the query is assumed to be in the content of an incoming FlowFile.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_POST_QUERY = new PropertyDescriptor.Builder()
.name("hive-post-query")
.displayName("HiveQL Post-Query")
.description("A semicolon-delimited list of queries executed after the main SQL query is executed. "
+ "Note, the results/outputs of these queries will be suppressed if successfully executed.")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor FETCH_SIZE = new PropertyDescriptor.Builder()
.name("hive-fetch-size")
.displayName("Fetch Size")
.description("The number of result rows to be fetched from the result set at a time. This is a hint to the driver and may not be "
+ "honored and/or exact. If the value specified is zero, then the hint is ignored.")
.defaultValue("0")
.required(true)
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor MAX_ROWS_PER_FLOW_FILE = new PropertyDescriptor.Builder()
.name("hive-max-rows")
.displayName("Max Rows Per Flow File")
.description("The maximum number of result rows that will be included in a single FlowFile. " +
"This will allow you to break up very large result sets into multiple FlowFiles. If the value specified is zero, then all rows are returned in a single FlowFile.")
.defaultValue("0")
.required(true)
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor MAX_FRAGMENTS = new PropertyDescriptor.Builder()
.name("hive-max-frags")
.displayName("Maximum Number of Fragments")
.description("The maximum number of fragments. If the value specified is zero, then all fragments are returned. " +
"This prevents OutOfMemoryError when this processor ingests huge table.")
.defaultValue("0")
.required(true)
.addValidator(StandardValidators.NON_NEGATIVE_INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_CSV_HEADER = new PropertyDescriptor.Builder()
.name("csv-header")
.displayName("CSV Header")
.description("Include Header in Output")
.required(true)
.allowableValues("true", "false")
.defaultValue("true")
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
.build();
public static final PropertyDescriptor HIVEQL_CSV_ALT_HEADER = new PropertyDescriptor.Builder()
.name("csv-alt-header")
.displayName("Alternate CSV Header")
.description("Comma separated list of header fields")
.required(false)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_CSV_DELIMITER = new PropertyDescriptor.Builder()
.name("csv-delimiter")
.displayName("CSV Delimiter")
.description("CSV Delimiter used to separate fields")
.required(true)
.defaultValue(",")
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
public static final PropertyDescriptor HIVEQL_CSV_QUOTE = new PropertyDescriptor.Builder()
.name("csv-quote")
.displayName("CSV Quote")
.description("Whether to force quoting of CSV fields. Note that this might conflict with the setting for CSV Escape.")
.required(true)
.allowableValues("true", "false")
.defaultValue("true")
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
.build();
public static final PropertyDescriptor HIVEQL_CSV_ESCAPE = new PropertyDescriptor.Builder()
.name("csv-escape")
.displayName("CSV Escape")
.description("Whether to escape CSV strings in output. Note that this might conflict with the setting for CSV Quote.")
.required(true)
.allowableValues("true", "false")
.defaultValue("true")
.addValidator(StandardValidators.BOOLEAN_VALIDATOR)
.build();
public static final PropertyDescriptor HIVEQL_OUTPUT_FORMAT = new PropertyDescriptor.Builder()
.name("hive-output-format")
.displayName("Output Format")
.description("How to represent the records coming from Hive (Avro, CSV, e.g.)")
.required(true)
.allowableValues(AVRO, CSV)
.defaultValue(AVRO)
.expressionLanguageSupported(ExpressionLanguageScope.NONE)
.build();
private final static List<PropertyDescriptor> propertyDescriptors;
private final static Set<Relationship> relationships;
/*
* Will ensure that the list of property descriptors is built only once.
* Will also create a Set of relationships
*/
static {
List<PropertyDescriptor> _propertyDescriptors = new ArrayList<>();
_propertyDescriptors.add(HIVE_DBCP_SERVICE);
_propertyDescriptors.add(HIVEQL_PRE_QUERY);
_propertyDescriptors.add(HIVEQL_SELECT_QUERY);
_propertyDescriptors.add(HIVEQL_POST_QUERY);
_propertyDescriptors.add(FETCH_SIZE);
_propertyDescriptors.add(MAX_ROWS_PER_FLOW_FILE);
_propertyDescriptors.add(MAX_FRAGMENTS);
_propertyDescriptors.add(HIVEQL_OUTPUT_FORMAT);
_propertyDescriptors.add(NORMALIZE_NAMES_FOR_AVRO);
_propertyDescriptors.add(HIVEQL_CSV_HEADER);
_propertyDescriptors.add(HIVEQL_CSV_ALT_HEADER);
_propertyDescriptors.add(HIVEQL_CSV_DELIMITER);
_propertyDescriptors.add(HIVEQL_CSV_QUOTE);
_propertyDescriptors.add(HIVEQL_CSV_ESCAPE);
_propertyDescriptors.add(CHARSET);
propertyDescriptors = Collections.unmodifiableList(_propertyDescriptors);
Set<Relationship> _relationships = new HashSet<>();
_relationships.add(REL_SUCCESS);
_relationships.add(REL_FAILURE);
relationships = Collections.unmodifiableSet(_relationships);
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propertyDescriptors;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
@OnScheduled
public void setup(ProcessContext context) {
// If the query property is not set, an incoming connection must supply the query; if neither is available, fail initialization
if (!context.getProperty(HIVEQL_SELECT_QUERY).isSet() && !context.hasIncomingConnection()) {
final String errorString = "Either the Select Query must be specified or there must be an incoming connection "
+ "providing flowfile(s) containing a SQL select query";
getLogger().error(errorString);
throw new ProcessException(errorString);
}
}
@Override
public void onTrigger(ProcessContext context, ProcessSessionFactory sessionFactory) throws ProcessException {
PartialFunctions.onTrigger(context, sessionFactory, getLogger(), session -> onTrigger(context, session));
}
private void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
FlowFile fileToProcess = (context.hasIncomingConnection() ? session.get() : null);
FlowFile flowfile = null;
// If we have no FlowFile, and all incoming connections are self-loops then we can continue on.
// However, if we have no FlowFile and we have connections coming from other Processors, then
// we know that we should run only if we have a FlowFile.
if (context.hasIncomingConnection()) {
if (fileToProcess == null && context.hasNonLoopConnection()) {
return;
}
}
final ComponentLog logger = getLogger();
final Hive_1_1DBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(Hive_1_1DBCPService.class);
final Charset charset = Charset.forName(context.getProperty(CHARSET).getValue());
List<String> preQueries = getQueries(context.getProperty(HIVEQL_PRE_QUERY).evaluateAttributeExpressions(fileToProcess).getValue());
List<String> postQueries = getQueries(context.getProperty(HIVEQL_POST_QUERY).evaluateAttributeExpressions(fileToProcess).getValue());
final boolean flowbased = !(context.getProperty(HIVEQL_SELECT_QUERY).isSet());
// Source the SQL
String hqlStatement;
if (context.getProperty(HIVEQL_SELECT_QUERY).isSet()) {
hqlStatement = context.getProperty(HIVEQL_SELECT_QUERY).evaluateAttributeExpressions(fileToProcess).getValue();
} else {
// If the query is not set, then an incoming flow file is required, and expected to contain a valid SQL select query.
// If there is no incoming connection, onTrigger will not be called as the processor will fail when scheduled.
final StringBuilder queryContents = new StringBuilder();
session.read(fileToProcess, in -> queryContents.append(IOUtils.toString(in, charset)));
hqlStatement = queryContents.toString();
}
final Integer fetchSize = context.getProperty(FETCH_SIZE).evaluateAttributeExpressions(fileToProcess).asInteger();
final Integer maxRowsPerFlowFile = context.getProperty(MAX_ROWS_PER_FLOW_FILE).evaluateAttributeExpressions(fileToProcess).asInteger();
final Integer maxFragments = context.getProperty(MAX_FRAGMENTS).isSet()
? context.getProperty(MAX_FRAGMENTS).evaluateAttributeExpressions(fileToProcess).asInteger()
: 0;
final String outputFormat = context.getProperty(HIVEQL_OUTPUT_FORMAT).getValue();
final boolean convertNamesForAvro = context.getProperty(NORMALIZE_NAMES_FOR_AVRO).asBoolean();
final StopWatch stopWatch = new StopWatch(true);
final boolean header = context.getProperty(HIVEQL_CSV_HEADER).asBoolean();
final String altHeader = context.getProperty(HIVEQL_CSV_ALT_HEADER).evaluateAttributeExpressions(fileToProcess).getValue();
final String delimiter = context.getProperty(HIVEQL_CSV_DELIMITER).evaluateAttributeExpressions(fileToProcess).getValue();
final boolean quote = context.getProperty(HIVEQL_CSV_QUOTE).asBoolean();
final boolean escape = context.getProperty(HIVEQL_CSV_ESCAPE).asBoolean();
final String fragmentIdentifier = UUID.randomUUID().toString();
try (final Connection con = dbcpService.getConnection(fileToProcess == null ? Collections.emptyMap() : fileToProcess.getAttributes());
final Statement st = (flowbased ? con.prepareStatement(hqlStatement) : con.createStatement())
) {
Pair<String,SQLException> failure = executeConfigStatements(con, preQueries);
if (failure != null) {
// In case of failure, assign the failed config query to hqlStatement so the existing error handling applies
hqlStatement = failure.getLeft();
flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
fileToProcess = null;
throw failure.getRight();
}
if (fetchSize != null && fetchSize > 0) {
try {
st.setFetchSize(fetchSize);
} catch (SQLException se) {
// Not all drivers support this, just log the error (at debug level) and move on
logger.debug("Cannot set fetch size to {} due to {}", new Object[]{fetchSize, se.getLocalizedMessage()}, se);
}
}
final List<FlowFile> resultSetFlowFiles = new ArrayList<>();
try {
logger.debug("Executing query {}", new Object[]{hqlStatement});
if (flowbased) {
// Hive JDBC Doesn't Support this yet:
// ParameterMetaData pmd = ((PreparedStatement)st).getParameterMetaData();
// int paramCount = pmd.getParameterCount();
// Alternate way to determine number of params in SQL.
int paramCount = StringUtils.countMatches(hqlStatement, "?");
if (paramCount > 0) {
setParameters(1, (PreparedStatement) st, paramCount, fileToProcess.getAttributes());
}
}
final ResultSet resultSet;
try {
resultSet = (flowbased ? ((PreparedStatement) st).executeQuery() : st.executeQuery(hqlStatement));
} catch (SQLException se) {
// If an error occurs during the query, a flowfile is expected to be routed to failure, so ensure one here
flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
fileToProcess = null;
throw se;
}
int fragmentIndex = 0;
String baseFilename = (fileToProcess != null) ? fileToProcess.getAttribute(CoreAttributes.FILENAME.key()) : null;
while (true) {
final AtomicLong nrOfRows = new AtomicLong(0L);
flowfile = (fileToProcess == null) ? session.create() : session.create(fileToProcess);
if (baseFilename == null) {
baseFilename = flowfile.getAttribute(CoreAttributes.FILENAME.key());
}
try {
flowfile = session.write(flowfile, out -> {
try {
if (AVRO.equals(outputFormat)) {
nrOfRows.set(HiveJdbcCommon.convertToAvroStream(resultSet, out, maxRowsPerFlowFile, convertNamesForAvro));
} else if (CSV.equals(outputFormat)) {
CsvOutputOptions options = new CsvOutputOptions(header, altHeader, delimiter, quote, escape, maxRowsPerFlowFile);
nrOfRows.set(HiveJdbcCommon.convertToCsvStream(resultSet, out, options));
} else {
nrOfRows.set(0L);
throw new ProcessException("Unsupported output format: " + outputFormat);
}
} catch (final SQLException | RuntimeException e) {
throw new ProcessException("Error during database query or conversion of records.", e);
}
});
} catch (ProcessException e) {
// Add flowfile to results before rethrowing so it will be removed from session in outer catch
resultSetFlowFiles.add(flowfile);
throw e;
}
if (nrOfRows.get() > 0 || resultSetFlowFiles.isEmpty()) {
final Map<String, String> attributes = new HashMap<>();
// Set attribute for how many rows were selected
attributes.put(RESULT_ROW_COUNT, String.valueOf(nrOfRows.get()));
try {
// Set input/output table names by parsing the query
attributes.putAll(toQueryTableAttributes(findTableNames(hqlStatement)));
} catch (Exception e) {
// If failed to parse the query, just log a warning message, but continue.
getLogger().warn("Failed to parse query: {} due to {}", new Object[]{hqlStatement, e}, e);
}
// Set MIME type on output document and add extension to filename
if (AVRO.equals(outputFormat)) {
attributes.put(CoreAttributes.MIME_TYPE.key(), MIME_TYPE_AVRO_BINARY);
attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".avro");
} else if (CSV.equals(outputFormat)) {
attributes.put(CoreAttributes.MIME_TYPE.key(), CSV_MIME_TYPE);
attributes.put(CoreAttributes.FILENAME.key(), baseFilename + "." + fragmentIndex + ".csv");
}
if (maxRowsPerFlowFile > 0) {
attributes.put("fragment.identifier", fragmentIdentifier);
attributes.put("fragment.index", String.valueOf(fragmentIndex));
}
flowfile = session.putAllAttributes(flowfile, attributes);
logger.info("{} contains {} " + outputFormat + " records; transferring to 'success'",
new Object[]{flowfile, nrOfRows.get()});
if (context.hasIncomingConnection()) {
// If the flow file came from an incoming connection, issue a Fetch provenance event
session.getProvenanceReporter().fetch(flowfile, dbcpService.getConnectionURL(),
"Retrieved " + nrOfRows.get() + " rows", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
} else {
// If we created a flow file from rows received from Hive, issue a Receive provenance event
session.getProvenanceReporter().receive(flowfile, dbcpService.getConnectionURL(), stopWatch.getElapsed(TimeUnit.MILLISECONDS));
}
resultSetFlowFiles.add(flowfile);
} else {
// If no rows were returned and the first flow file has already been sent, we're done processing, so remove this flowfile and carry on
session.remove(flowfile);
if (resultSetFlowFiles != null && resultSetFlowFiles.size() > 0) {
flowfile = resultSetFlowFiles.get(resultSetFlowFiles.size() - 1);
}
}
break;
}
fragmentIndex++;
if (maxFragments > 0 && fragmentIndex >= maxFragments) {
break;
}
}
for (int i = 0; i < resultSetFlowFiles.size(); i++) {
// Set count on all FlowFiles
if (maxRowsPerFlowFile > 0) {
resultSetFlowFiles.set(i,
session.putAttribute(resultSetFlowFiles.get(i), "fragment.count", Integer.toString(fragmentIndex)));
}
}
} catch (final SQLException e) {
throw e;
}
failure = executeConfigStatements(con, postQueries);
if (failure != null) {
hqlStatement = failure.getLeft();
if (resultSetFlowFiles != null) {
resultSetFlowFiles.forEach(ff -> session.remove(ff));
}
flowfile = (fileToProcess == null) ? session.create() : fileToProcess;
fileToProcess = null;
throw failure.getRight();
}
session.transfer(resultSetFlowFiles, REL_SUCCESS);
if (fileToProcess != null) {
session.remove(fileToProcess);
}
} catch (final ProcessException | SQLException e) {
logger.error("Issue processing SQL {} due to {}.", new Object[]{hqlStatement, e});
if (flowfile == null) {
// This can happen if any exceptions occur while setting up the connection, statement, etc.
logger.error("Unable to execute HiveQL select query {} due to {}. No FlowFile to route to failure",
new Object[]{hqlStatement, e});
context.yield();
} else {
if (context.hasIncomingConnection()) {
logger.error("Unable to execute HiveQL select query {} for {} due to {}; routing to failure",
new Object[]{hqlStatement, flowfile, e});
flowfile = session.penalize(flowfile);
} else {
logger.error("Unable to execute HiveQL select query {} due to {}; routing to failure",
new Object[]{hqlStatement, e});
context.yield();
}
session.transfer(flowfile, REL_FAILURE);
}
}
}
/*
* Executes the given queries using the provided connection.
* Returns null on success, or a Pair of the failed query string and its SQLException on failure.
*/
protected Pair<String,SQLException> executeConfigStatements(final Connection con, final List<String> configQueries){
if (configQueries == null || configQueries.isEmpty()) {
return null;
}
for (String confSQL : configQueries) {
try(final Statement st = con.createStatement()){
st.execute(confSQL);
} catch (SQLException e) {
return Pair.of(confSQL, e);
}
}
return null;
}
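// Splits a semicolon-delimited script (e.g. the Pre-/Post-Query property values) into individual
// trimmed statements; returns null when the input is null or blank.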
protected List<String> getQueries(final String value) {
if (value == null || value.length() == 0 || value.trim().length() == 0) {
return null;
}
final List<String> queries = new LinkedList<>();
for (String query : value.split(";")) {
if (query.trim().length() > 0) {
queries.add(query.trim());
}
}
return queries;
}
}

View File

@ -1,853 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.nifi.annotation.behavior.InputRequirement;
import org.apache.nifi.annotation.behavior.ReadsAttribute;
import org.apache.nifi.annotation.behavior.ReadsAttributes;
import org.apache.nifi.annotation.behavior.RequiresInstanceClassLoading;
import org.apache.nifi.annotation.behavior.WritesAttribute;
import org.apache.nifi.annotation.behavior.WritesAttributes;
import org.apache.nifi.annotation.documentation.CapabilityDescription;
import org.apache.nifi.annotation.documentation.DeprecationNotice;
import org.apache.nifi.annotation.documentation.Tags;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.ValidationContext;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.components.Validator;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
import org.apache.nifi.expression.ExpressionLanguageScope;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.StandardValidators;
import org.apache.nifi.processor.util.pattern.DiscontinuedException;
import org.apache.nifi.processors.hadoop.exception.RecordReaderFactoryException;
import org.apache.nifi.serialization.MalformedRecordException;
import org.apache.nifi.serialization.RecordReader;
import org.apache.nifi.serialization.RecordReaderFactory;
import org.apache.nifi.serialization.RecordSetWriter;
import org.apache.nifi.serialization.RecordSetWriterFactory;
import org.apache.nifi.serialization.SimpleRecordSchema;
import org.apache.nifi.serialization.WriteResult;
import org.apache.nifi.serialization.record.DataType;
import org.apache.nifi.serialization.record.MapRecord;
import org.apache.nifi.serialization.record.Record;
import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordFieldType;
import org.apache.nifi.serialization.record.RecordSchema;
import org.apache.nifi.serialization.record.type.ArrayDataType;
import org.apache.nifi.serialization.record.type.ChoiceDataType;
import org.apache.nifi.serialization.record.type.MapDataType;
import org.apache.nifi.serialization.record.type.RecordDataType;
import org.apache.nifi.util.StringUtils;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
@Tags({"hive", "metadata", "jdbc", "database", "table"})
@CapabilityDescription("This processor uses a Hive JDBC connection and incoming records to generate any Hive 1.1 table changes needed to support the incoming records.")
@ReadsAttributes({
@ReadsAttribute(attribute = "hive.table.management.strategy", description = "This attribute is read if the 'Table Management Strategy' property is configured "
+ "to use the value of this attribute. The value of this attribute should correspond (ignoring case) to a valid option of the 'Table Management Strategy' property.")
})
@WritesAttributes({
@WritesAttribute(attribute = "output.table", description = "This attribute is written on the flow files routed to the 'success' "
+ "and 'failure' relationships, and contains the target table name."),
@WritesAttribute(attribute = "output.path", description = "This attribute is written on the flow files routed to the 'success' "
+ "and 'failure' relationships, and contains the path on the file system to the table (or partition location if the table is partitioned)."),
@WritesAttribute(attribute = "mime.type", description = "Sets the mime.type attribute to the MIME Type specified by the Record Writer, only if a Record Writer is specified "
+ "and Update Field Names is 'true'."),
@WritesAttribute(attribute = "record.count", description = "Sets the number of records in the FlowFile, only if a Record Writer is specified and Update Field Names is 'true'.")
})
@InputRequirement(InputRequirement.Requirement.INPUT_REQUIRED)
@RequiresInstanceClassLoading
@DeprecationNotice(classNames = "org.apache.nifi.processors.hive.UpdateHive3Table")
public class UpdateHive_1_1Table extends AbstractProcessor {
static final String TEXTFILE = "TEXTFILE";
static final String SEQUENCEFILE = "SEQUENCEFILE";
static final String ORC = "ORC";
static final String PARQUET = "PARQUET";
static final String AVRO = "AVRO";
static final String RCFILE = "RCFILE";
static final AllowableValue TEXTFILE_STORAGE = new AllowableValue(TEXTFILE, TEXTFILE, "Stored as plain text files. TEXTFILE is the default file format, unless the configuration "
+ "parameter hive.default.fileformat has a different setting.");
static final AllowableValue SEQUENCEFILE_STORAGE = new AllowableValue(SEQUENCEFILE, SEQUENCEFILE, "Stored as compressed Sequence Files.");
static final AllowableValue ORC_STORAGE = new AllowableValue(ORC, ORC, "Stored as ORC file format. Supports ACID Transactions & Cost-based Optimizer (CBO). "
+ "Stores column-level metadata.");
static final AllowableValue PARQUET_STORAGE = new AllowableValue(PARQUET, PARQUET, "Stored as Parquet format for the Parquet columnar storage format.");
static final AllowableValue AVRO_STORAGE = new AllowableValue(AVRO, AVRO, "Stored as Avro format.");
static final AllowableValue RCFILE_STORAGE = new AllowableValue(RCFILE, RCFILE, "Stored as Record Columnar File format.");
static final AllowableValue CREATE_IF_NOT_EXISTS = new AllowableValue("Create If Not Exists", "Create If Not Exists",
"Create a table with the given schema if it does not already exist");
static final AllowableValue FAIL_IF_NOT_EXISTS = new AllowableValue("Fail If Not Exists", "Fail If Not Exists",
"If the target does not already exist, log an error and route the flowfile to failure");
static final String TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE = "hive.table.management.strategy";
static final AllowableValue MANAGED_TABLE = new AllowableValue("Managed", "Managed",
"Any tables created by this processor will be managed tables (see Hive documentation for details).");
static final AllowableValue EXTERNAL_TABLE = new AllowableValue("External", "External",
"Any tables created by this processor will be external tables located at the `External Table Location` property value.");
static final AllowableValue ATTRIBUTE_DRIVEN_TABLE = new AllowableValue("Use '" + TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE + "' Attribute",
"Use '" + TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE + "' Attribute",
"Inspects the '" + TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE + "' FlowFile attribute to determine the table management strategy. The value "
+ "of this attribute must be a case-insensitive match to one of the other allowable values (Managed, External, e.g.).");
static final String ATTR_OUTPUT_TABLE = "output.table";
static final String ATTR_OUTPUT_PATH = "output.path";
// Properties
static final PropertyDescriptor RECORD_READER = new PropertyDescriptor.Builder()
.name("record-reader")
.displayName("Record Reader")
.description("The service for reading incoming flow files. The reader is only used to determine the schema of the records, the actual records will not be processed.")
.identifiesControllerService(RecordReaderFactory.class)
.required(true)
.build();
static final PropertyDescriptor HIVE_DBCP_SERVICE = new PropertyDescriptor.Builder()
.name("hive11-dbcp-service")
.displayName("Hive Database Connection Pooling Service")
.description("The Hive Controller Service that is used to obtain connection(s) to the Hive database")
.required(true)
.identifiesControllerService(Hive_1_1DBCPService.class)
.build();
static final PropertyDescriptor TABLE_NAME = new PropertyDescriptor.Builder()
.name("hive11-table-name")
.displayName("Table Name")
.description("The name of the database table to update. If the table does not exist, then it will either be created or an error thrown, depending "
+ "on the value of the Create Table property.")
.required(true)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
static final PropertyDescriptor CREATE_TABLE = new PropertyDescriptor.Builder()
.name("hive11-create-table")
.displayName("Create Table Strategy")
.description("Specifies how to process the target table when it does not exist (create it, fail, e.g.).")
.required(true)
.addValidator(Validator.VALID)
.allowableValues(CREATE_IF_NOT_EXISTS, FAIL_IF_NOT_EXISTS)
.defaultValue(FAIL_IF_NOT_EXISTS.getValue())
.build();
static final PropertyDescriptor TABLE_MANAGEMENT_STRATEGY = new PropertyDescriptor.Builder()
.name("hive11-create-table-management")
.displayName("Create Table Management Strategy")
.description("Specifies (when a table is to be created) whether the table is a managed table or an external table. Note that when External is specified, the "
+ "'External Table Location' property must be specified. If the '" + TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE + "' value is selected, 'External Table Location' "
+ "must still be specified, but can contain Expression Language or be set to the empty string, and is ignored when the attribute evaluates to 'Managed'.")
.required(true)
.addValidator(Validator.VALID)
.allowableValues(MANAGED_TABLE, EXTERNAL_TABLE, ATTRIBUTE_DRIVEN_TABLE)
.defaultValue(MANAGED_TABLE.getValue())
.dependsOn(CREATE_TABLE, CREATE_IF_NOT_EXISTS)
.build();
static final PropertyDescriptor UPDATE_FIELD_NAMES = new PropertyDescriptor.Builder()
.name("hive11-update-field-names")
.displayName("Update Field Names")
.description("This property indicates whether to update the output schema such that the field names are set to the exact column names from the specified "
+ "table. This should be used if the incoming record field names may not match the table's column names in terms of upper- and lower-case. For example, this property should be "
+ "set to true if the output FlowFile (and target table storage) is Avro format, as Hive/Impala expects the field names to match the column names exactly.")
.allowableValues("true", "false")
.defaultValue("false")
.required(true)
.build();
static final PropertyDescriptor RECORD_WRITER_FACTORY = new PropertyDescriptor.Builder()
.name("hive11-record-writer")
.displayName("Record Writer")
.description("Specifies the Controller Service to use for writing results to a FlowFile. The Record Writer should use Inherit Schema to emulate the inferred schema behavior, i.e. "
+ "an explicit schema need not be defined in the writer, and will be supplied by the same logic used to infer the schema from the column types. If Create Table Strategy is set "
+ "'Create If Not Exists', the Record Writer's output format must match the Record Reader's format in order for the data to be placed in the created table location. Note that "
+ "this property is only used if 'Update Field Names' is set to true and the field names do not all match the column names exactly. If no "
+ "update is needed for any field names (or 'Update Field Names' is false), the Record Writer is not used and instead the input FlowFile is routed to success or failure "
+ "without modification.")
.identifiesControllerService(RecordSetWriterFactory.class)
.dependsOn(UPDATE_FIELD_NAMES, "true")
.required(true)
.build();
static final PropertyDescriptor EXTERNAL_TABLE_LOCATION = new PropertyDescriptor.Builder()
.name("hive11-external-table-location")
.displayName("External Table Location")
.description("Specifies (when an external table is to be created) the file path (in HDFS, e.g.) to store table data.")
.required(true)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.addValidator(StandardValidators.ATTRIBUTE_EXPRESSION_LANGUAGE_VALIDATOR)
.dependsOn(TABLE_MANAGEMENT_STRATEGY, EXTERNAL_TABLE, ATTRIBUTE_DRIVEN_TABLE)
.build();
static final PropertyDescriptor TABLE_STORAGE_FORMAT = new PropertyDescriptor.Builder()
.name("hive11-storage-format")
.displayName("Create Table Storage Format")
.description("If a table is to be created, the specified storage format will be used.")
.required(true)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.allowableValues(TEXTFILE_STORAGE, SEQUENCEFILE_STORAGE, ORC_STORAGE, PARQUET_STORAGE, AVRO_STORAGE, RCFILE_STORAGE)
.defaultValue(TEXTFILE)
.dependsOn(CREATE_TABLE, CREATE_IF_NOT_EXISTS)
.build();
static final PropertyDescriptor QUERY_TIMEOUT = new PropertyDescriptor.Builder()
.name("hive11query-timeout")
.displayName("Query Timeout")
.description("Sets the number of seconds the driver will wait for a query to execute. "
+ "A value of 0 means no timeout. NOTE: Non-zero values may not be supported by the driver.")
.defaultValue("0")
.required(true)
.addValidator(StandardValidators.INTEGER_VALIDATOR)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.build();
static final PropertyDescriptor PARTITION_CLAUSE = new PropertyDescriptor.Builder()
.name("hive11-partition-clause")
.displayName("Partition Clause")
.description("Specifies a comma-separated list of attribute names and optional data types corresponding to the partition columns of the target table. Simply put, if the table is "
+ "partitioned or is to be created with partitions, each partition name should be an attribute on the FlowFile and listed in this property. This assumes all incoming records "
+ "belong to the same partition and the partition columns are not fields in the record. An example of specifying this field is if PartitionRecord "
+ "is upstream and two partition columns 'name' (of type string) and 'age' (of type integer) are used, then this property can be set to 'name string, age int'. The data types "
+ "are optional and if partition(s) are to be created they will default to string type if not specified. For non-string primitive types, specifying the data type for existing "
+ "partition columns is helpful for interpreting the partition value(s). If the table exists, the data types need not be specified "
+ "(and are ignored in that case). This property must be set if the table is partitioned, and there must be an attribute for each partition column in the table. "
+ "The values of the attributes will be used as the partition values, and the resulting output.path attribute value will reflect the location of the partition in the filesystem "
+ "(for use downstream in processors such as PutHDFS).")
.required(false)
.expressionLanguageSupported(ExpressionLanguageScope.FLOWFILE_ATTRIBUTES)
.addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
.build();
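// Illustrative example (hypothetical attributes): with Partition Clause set to 'name string, age int', a
// FlowFile carrying attributes name=alice and age=42 is resolved to the partition (name='alice', age=42),
// and the resulting output.path attribute points at that partition's location in the filesystem.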
// Relationships
public static final Relationship REL_SUCCESS = new Relationship.Builder()
.name("success")
.description("A FlowFile containing records routed to this relationship after the record has been successfully transmitted to Hive.")
.build();
public static final Relationship REL_FAILURE = new Relationship.Builder()
.name("failure")
.description("A FlowFile containing records routed to this relationship if the record could not be transmitted to Hive.")
.build();
private List<PropertyDescriptor> propertyDescriptors;
private Set<Relationship> relationships;
@Override
protected void init(ProcessorInitializationContext context) {
List<PropertyDescriptor> props = new ArrayList<>();
props.add(RECORD_READER);
props.add(HIVE_DBCP_SERVICE);
props.add(TABLE_NAME);
props.add(PARTITION_CLAUSE);
props.add(CREATE_TABLE);
props.add(TABLE_MANAGEMENT_STRATEGY);
props.add(EXTERNAL_TABLE_LOCATION);
props.add(TABLE_STORAGE_FORMAT);
props.add(UPDATE_FIELD_NAMES);
props.add(RECORD_WRITER_FACTORY);
props.add(QUERY_TIMEOUT);
propertyDescriptors = Collections.unmodifiableList(props);
Set<Relationship> _relationships = new HashSet<>();
_relationships.add(REL_SUCCESS);
_relationships.add(REL_FAILURE);
relationships = Collections.unmodifiableSet(_relationships);
}
@Override
protected List<PropertyDescriptor> getSupportedPropertyDescriptors() {
return propertyDescriptors;
}
@Override
public Set<Relationship> getRelationships() {
return relationships;
}
@Override
protected Collection<ValidationResult> customValidate(ValidationContext validationContext) {
List<ValidationResult> validationResults = new ArrayList<>(super.customValidate(validationContext));
final boolean recordWriterFactorySet = validationContext.getProperty(RECORD_WRITER_FACTORY).isSet();
final boolean createIfNotExists = validationContext.getProperty(CREATE_TABLE).getValue().equals(CREATE_IF_NOT_EXISTS.getValue());
final boolean updateFieldNames = validationContext.getProperty(UPDATE_FIELD_NAMES).asBoolean();
if (!recordWriterFactorySet && updateFieldNames) {
validationResults.add(new ValidationResult.Builder().subject(RECORD_WRITER_FACTORY.getDisplayName())
.explanation("Record Writer must be set if 'Update Field Names' is true").valid(false).build());
}
final String tableManagementStrategy = validationContext.getProperty(TABLE_MANAGEMENT_STRATEGY).getValue();
final boolean managedTable;
if (!ATTRIBUTE_DRIVEN_TABLE.getValue().equals(tableManagementStrategy)) {
managedTable = MANAGED_TABLE.getValue().equals(tableManagementStrategy);
// Ensure valid configuration for external tables
if (createIfNotExists && !managedTable && !validationContext.getProperty(EXTERNAL_TABLE_LOCATION).isSet()) {
validationResults.add(new ValidationResult.Builder().subject(EXTERNAL_TABLE_LOCATION.getDisplayName())
.explanation("External Table Location must be set when Table Management Strategy is set to External").valid(false).build());
}
}
return validationResults;
}
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
final RecordReaderFactory recordReaderFactory = context.getProperty(RECORD_READER).asControllerService(RecordReaderFactory.class);
final RecordSetWriterFactory recordWriterFactory = context.getProperty(RECORD_WRITER_FACTORY).asControllerService(RecordSetWriterFactory.class);
final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
final String partitionClauseString = context.getProperty(PARTITION_CLAUSE).evaluateAttributeExpressions(flowFile).getValue();
List<String> partitionClauseElements = null;
if (!StringUtils.isEmpty(partitionClauseString)) {
partitionClauseElements = Arrays.stream(partitionClauseString.split(",")).filter(Objects::nonNull).map(String::trim).collect(Collectors.toList());
}
final ComponentLog log = getLogger();
try {
final RecordReader reader;
try (final InputStream in = session.read(flowFile)) {
// if we fail to create the RecordReader then we want to route to failure, so we need to
// handle this separately from the other IOExceptions which normally route to retry
try {
reader = recordReaderFactory.createRecordReader(flowFile, in, getLogger());
} catch (Exception e) {
throw new RecordReaderFactoryException("Unable to create RecordReader", e);
}
} catch (RecordReaderFactoryException rrfe) {
log.error(
"Failed to create {} for {} - routing to failure",
new Object[]{RecordReader.class.getSimpleName(), flowFile},
rrfe
);
// Since we are wrapping the exceptions above there should always be a cause
// but it's possible it might not have a message. This handles that by logging
// the name of the class thrown.
Throwable c = rrfe.getCause();
if (c != null) {
session.putAttribute(flowFile, "record.error.message", (c.getLocalizedMessage() != null) ? c.getLocalizedMessage() : c.getClass().getCanonicalName() + " Thrown");
} else {
session.putAttribute(flowFile, "record.error.message", rrfe.getClass().getCanonicalName() + " Thrown");
}
session.transfer(flowFile, REL_FAILURE);
return;
}
final RecordSchema recordSchema = reader.getSchema();
final boolean createIfNotExists = context.getProperty(CREATE_TABLE).getValue().equals(CREATE_IF_NOT_EXISTS.getValue());
final boolean updateFieldNames = context.getProperty(UPDATE_FIELD_NAMES).asBoolean();
if (recordWriterFactory == null && updateFieldNames) {
throw new ProcessException("Record Writer must be set if 'Update Field Names' is true");
}
final String tableManagementStrategy = context.getProperty(TABLE_MANAGEMENT_STRATEGY).getValue();
final boolean managedTable;
if (ATTRIBUTE_DRIVEN_TABLE.getValue().equals(tableManagementStrategy)) {
String tableManagementStrategyAttribute = flowFile.getAttribute(TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE);
if (MANAGED_TABLE.getValue().equalsIgnoreCase(tableManagementStrategyAttribute)) {
managedTable = true;
} else if (EXTERNAL_TABLE.getValue().equalsIgnoreCase(tableManagementStrategyAttribute)) {
managedTable = false;
} else {
log.error("The '{}' attribute either does not exist or has invalid value: {}. Must be one of (ignoring case): Managed, External. "
+ "Routing flowfile to failure",
new Object[]{TABLE_MANAGEMENT_STRATEGY_ATTRIBUTE, tableManagementStrategyAttribute});
session.transfer(flowFile, REL_FAILURE);
return;
}
} else {
managedTable = MANAGED_TABLE.getValue().equals(tableManagementStrategy);
}
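// Illustrative note (not part of the original code): with the attribute-driven strategy, a FlowFile whose
// strategy attribute is "external" (any case) is treated as an external table here, "managed" selects a
// managed table, and any other value has already been routed to failure above.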
// Ensure valid configuration for external tables
if (createIfNotExists && !managedTable && !context.getProperty(EXTERNAL_TABLE_LOCATION).isSet()) {
throw new IOException("External Table Location must be set when Table Management Strategy is set to External");
}
final String externalTableLocation = managedTable ? null : context.getProperty(EXTERNAL_TABLE_LOCATION).evaluateAttributeExpressions(flowFile).getValue();
if (!managedTable && StringUtils.isEmpty(externalTableLocation)) {
log.error("External Table Location has invalid value: {}. Routing flowfile to failure", new Object[]{externalTableLocation});
session.transfer(flowFile, REL_FAILURE);
return;
}
final String storageFormat = context.getProperty(TABLE_STORAGE_FORMAT).getValue();
final Hive_1_1DBCPService dbcpService = context.getProperty(HIVE_DBCP_SERVICE).asControllerService(Hive_1_1DBCPService.class);
try (final Connection connection = dbcpService.getConnection()) {
final Map<String,String> attributes = new HashMap<>(flowFile.getAttributes());
OutputMetadataHolder outputMetadataHolder = checkAndUpdateTableSchema(attributes, connection, recordSchema, tableName, partitionClauseElements,
createIfNotExists, externalTableLocation, storageFormat, updateFieldNames);
if (outputMetadataHolder != null) {
// The output schema changed (i.e. field names were updated), so write out the corresponding FlowFile
try {
final FlowFile inputFlowFile = flowFile;
flowFile = session.write(flowFile, (in, out) -> {
// if we fail to create the RecordReader then we want to route to failure, so we need to
// handle this separately from the other IOExceptions which normally route to retry
final RecordReader recordReader;
final RecordSetWriter recordSetWriter;
try {
recordReader = recordReaderFactory.createRecordReader(inputFlowFile, in, getLogger());
recordSetWriter = recordWriterFactory.createWriter(getLogger(), outputMetadataHolder.getOutputSchema(), out, attributes);
} catch (Exception e) {
if(e instanceof IOException) {
throw (IOException) e;
}
throw new IOException(new RecordReaderFactoryException("Unable to create RecordReader", e));
}
WriteResult writeResult = updateRecords(recordSchema, outputMetadataHolder, recordReader, recordSetWriter);
recordSetWriter.flush();
recordSetWriter.close();
attributes.put("record.count", String.valueOf(writeResult.getRecordCount()));
attributes.put(CoreAttributes.MIME_TYPE.key(), recordSetWriter.getMimeType());
attributes.putAll(writeResult.getAttributes());
});
} catch (final Exception e) {
getLogger().error("Failed to process {}; will route to failure", new Object[]{flowFile, e});
// Since we are wrapping the exceptions above, there should always be a cause,
// but it might not have a message. This handles that by logging the name of
// the thrown class instead.
Throwable c = e.getCause();
if (c != null) {
session.putAttribute(flowFile, "record.error.message", (c.getLocalizedMessage() != null) ? c.getLocalizedMessage() : c.getClass().getCanonicalName() + " Thrown");
} else {
session.putAttribute(flowFile, "record.error.message", e.getClass().getCanonicalName() + " Thrown");
}
session.transfer(flowFile, REL_FAILURE);
return;
}
}
attributes.put(ATTR_OUTPUT_TABLE, tableName);
flowFile = session.putAllAttributes(flowFile, attributes);
session.getProvenanceReporter().invokeRemoteProcess(flowFile, dbcpService.getConnectionURL());
session.transfer(flowFile, REL_SUCCESS);
}
} catch (IOException | SQLException e) {
flowFile = session.putAttribute(flowFile, ATTR_OUTPUT_TABLE, tableName);
log.error("Exception while processing {} - routing to failure", new Object[]{flowFile}, e);
session.transfer(flowFile, REL_FAILURE);
} catch (DiscontinuedException e) {
// The input FlowFile processing is discontinued. Keep it in the input queue.
getLogger().warn("Discontinued processing for {} due to {}", new Object[]{flowFile, e}, e);
session.transfer(flowFile, Relationship.SELF);
} catch (Throwable t) {
throw (t instanceof ProcessException) ? (ProcessException) t : new ProcessException(t);
}
}
private synchronized OutputMetadataHolder checkAndUpdateTableSchema(Map<String,String> attributes, final Connection conn, final RecordSchema schema,
final String tableName, List<String> partitionClause, final boolean createIfNotExists,
final String externalTableLocation, final String storageFormat, final boolean updateFieldNames) throws IOException {
// Read in the current table metadata, compare it to the reader's schema, and
// add any columns from the schema that are missing in the table
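// If the table does not exist and creation is enabled, a CREATE statement is generated instead.
// Illustrative sketch of that DDL (table name, columns and location are assumed, not from the original
// source): for a reader schema (name string, age int), partition clause "dt string", storage format ORC
// and an external location, the generated statement would look roughly like:
//   CREATE EXTERNAL TABLE IF NOT EXISTS `users` (`name` STRING, `age` INT)
//     PARTITIONED BY (`dt` string) STORED AS ORC LOCATION '/data/users'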
try (Statement s = conn.createStatement()) {
// Determine whether the table exists
ResultSet tables = s.executeQuery("SHOW TABLES");
List<String> tableNames = new ArrayList<>();
String hiveTableName;
while (tables.next() && StringUtils.isNotEmpty(hiveTableName = tables.getString(1))) {
tableNames.add(hiveTableName);
}
List<String> columnsToAdd = new ArrayList<>();
String outputPath;
boolean tableCreated = false;
if (!tableNames.contains(tableName) && createIfNotExists) {
StringBuilder createTableStatement = new StringBuilder();
for (RecordField recordField : schema.getFields()) {
String recordFieldName = recordField.getFieldName();
// The field does not exist in the table, add it
columnsToAdd.add("`" + recordFieldName + "` " + getHiveTypeFromFieldType(recordField.getDataType(), true));
getLogger().debug("Adding column " + recordFieldName + " to table " + tableName);
}
// Handle partition clause
if (partitionClause == null) {
partitionClause = Collections.emptyList();
}
List<String> validatedPartitionClause = new ArrayList<>(partitionClause.size());
for (String partition : partitionClause) {
String[] partitionInfo = partition.split(" ");
if (partitionInfo.length != 2) {
validatedPartitionClause.add("`" + partitionInfo[0] + "` string");
} else {
validatedPartitionClause.add("`" + partitionInfo[0] + "` " + partitionInfo[1]);
}
}
createTableStatement.append("CREATE ")
.append(externalTableLocation == null ? "" : "EXTERNAL ")
.append("TABLE IF NOT EXISTS `")
.append(tableName)
.append("` (")
.append(String.join(", ", columnsToAdd))
.append(") ")
.append(validatedPartitionClause.isEmpty() ? "" : "PARTITIONED BY (" + String.join(", ", validatedPartitionClause) + ") ")
.append("STORED AS ")
.append(storageFormat)
.append(externalTableLocation == null ? "" : " LOCATION '" + externalTableLocation + "'");
String createTableSql = createTableStatement.toString();
if (StringUtils.isNotEmpty(createTableSql)) {
// Perform the table create
getLogger().info("Executing Hive DDL: " + createTableSql);
s.execute(createTableSql);
}
tableCreated = true;
}
// Process the table (columns, partitions, location, etc.)
List<String> hiveColumns = new ArrayList<>();
String describeTable = "DESC FORMATTED `" + tableName + "`";
ResultSet tableInfo = s.executeQuery(describeTable);
// Result is 3 columns, col_name, data_type, comment. Check the first row for a header and skip if so, otherwise add column name
tableInfo.next();
String columnName = tableInfo.getString(1);
if (StringUtils.isNotEmpty(columnName) && !columnName.startsWith("#")) {
hiveColumns.add(columnName);
}
// If the column was a header, check for a blank line to follow and skip it, otherwise add the column name
if (columnName.startsWith("#")) {
tableInfo.next();
columnName = tableInfo.getString(1);
if (StringUtils.isNotEmpty(columnName)) {
hiveColumns.add(columnName);
}
}
// Collect all column names
while (tableInfo.next() && StringUtils.isNotEmpty(columnName = tableInfo.getString(1))) {
hiveColumns.add(columnName);
}
// Collect all partition columns
boolean moreRows = true;
boolean headerFound = false;
while (moreRows && !headerFound) {
String line = tableInfo.getString(1);
if ("# Partition Information".equals(line)) {
headerFound = true;
} else if ("# Detailed Table Information".equals(line)) {
// Not partitioned, exit the loop with headerFound = false
break;
}
moreRows = tableInfo.next();
}
List<String> partitionColumns = new ArrayList<>();
List<String> partitionColumnsEqualsValueList = new ArrayList<>();
List<String> partitionColumnsLocationList = new ArrayList<>();
if (headerFound) {
// If the table is partitioned, construct the partition=value strings for each partition column
String partitionColumnName;
columnName = tableInfo.getString(1);
if (StringUtils.isNotEmpty(columnName) && !columnName.startsWith("#")) {
partitionColumns.add(columnName);
}
// If the column was a header, check for a blank line to follow and skip it, otherwise add the column name
if (columnName.startsWith("#")) {
tableInfo.next();
columnName = tableInfo.getString(1);
if (StringUtils.isNotEmpty(columnName)) {
partitionColumns.add(columnName);
}
}
while (tableInfo.next() && StringUtils.isNotEmpty(partitionColumnName = tableInfo.getString(1))) {
partitionColumns.add(partitionColumnName);
}
final int partitionColumnsSize = partitionColumns.size();
final int partitionClauseSize = (partitionClause == null) ? 0 : partitionClause.size();
if (partitionClauseSize != partitionColumnsSize) {
throw new IOException("Found " + partitionColumnsSize + " partition columns but " + partitionClauseSize + " partition values were supplied");
}
for (int i = 0; i < partitionClauseSize; i++) {
String partitionName = partitionClause.get(i).split(" ")[0];
String partitionValue = attributes.get(partitionName);
if (StringUtils.isEmpty(partitionValue)) {
throw new IOException("No value found for partition value attribute '" + partitionName + "'");
}
if (!partitionColumns.contains(partitionName)) {
throw new IOException("Cannot add partition '" + partitionName + "' to existing table");
}
partitionColumnsEqualsValueList.add("`" + partitionName + "`='" + partitionValue + "'");
// Add unquoted version for the output path
partitionColumnsLocationList.add(partitionName + "=" + partitionValue);
}
}
// Get table location
moreRows = true;
headerFound = false;
while (moreRows && !headerFound) {
String line = tableInfo.getString(1);
if (line.startsWith("Location:")) {
headerFound = true;
continue; // Don't do a next() here, need to get the second column value
}
moreRows = tableInfo.next();
}
String tableLocation = tableInfo.getString(2);
String alterTableSql;
// If the table wasn't newly created, alter it accordingly
if (!tableCreated) {
StringBuilder alterTableStatement = new StringBuilder();
// Handle new columns
for (RecordField recordField : schema.getFields()) {
String recordFieldName = recordField.getFieldName().toLowerCase();
if (!hiveColumns.contains(recordFieldName) && !partitionColumns.contains(recordFieldName)) {
// The field does not exist in the table (and is not a partition column), add it
columnsToAdd.add("`" + recordFieldName + "` " + getHiveTypeFromFieldType(recordField.getDataType(), true));
getLogger().info("Adding column " + recordFieldName + " to table " + tableName);
}
}
if (!columnsToAdd.isEmpty()) {
alterTableStatement.append("ALTER TABLE `")
.append(tableName)
.append("` ADD COLUMNS (")
.append(String.join(", ", columnsToAdd))
.append(")");
alterTableSql = alterTableStatement.toString();
if (StringUtils.isNotEmpty(alterTableSql)) {
// Perform the table update
getLogger().info("Executing Hive DDL: " + alterTableSql);
s.execute(alterTableSql);
}
}
}
outputPath = tableLocation;
// Handle new partition values
if (!partitionColumnsEqualsValueList.isEmpty()) {
alterTableSql = "ALTER TABLE `" +
tableName +
"` ADD IF NOT EXISTS PARTITION (" +
String.join(", ", partitionColumnsEqualsValueList) +
")";
if (StringUtils.isNotEmpty(alterTableSql)) {
// Perform the table update
getLogger().info("Executing Hive DDL: " + alterTableSql);
s.execute(alterTableSql);
}
// Add attribute for HDFS location of the partition values
outputPath = tableLocation + "/" + String.join("/", partitionColumnsLocationList);
}
// If updating field names, return a new RecordSchema, otherwise return null
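// Illustrative example (assumed names): an input field "EmpId" that matches Hive column "empid"
// case-insensitively but not exactly produces the mapping EmpId -> empid and sets needsUpdating,
// so the records are rewritten with the Hive column names.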
OutputMetadataHolder outputMetadataHolder;
if (updateFieldNames) {
List<RecordField> inputRecordFields = schema.getFields();
List<RecordField> outputRecordFields = new ArrayList<>();
Map<String,String> fieldMap = new HashMap<>();
boolean needsUpdating = false;
for (RecordField inputRecordField : inputRecordFields) {
final String inputRecordFieldName = inputRecordField.getFieldName();
boolean found = false;
for (String hiveColumnName : hiveColumns) {
if (inputRecordFieldName.equalsIgnoreCase(hiveColumnName)) {
// Set a flag if the field name doesn't match the column name exactly. This overall flag will determine whether
// the records need updating (if true) or not (if false)
if (!inputRecordFieldName.equals(hiveColumnName)) {
needsUpdating = true;
}
fieldMap.put(inputRecordFieldName, hiveColumnName);
outputRecordFields.add(new RecordField(hiveColumnName, inputRecordField.getDataType(), inputRecordField.getDefaultValue(), inputRecordField.isNullable()));
found = true;
break;
}
}
if (!found) {
// If the input field wasn't a Hive table column, add it back to the schema as-is
fieldMap.put(inputRecordFieldName, inputRecordFieldName);
}
}
outputMetadataHolder = needsUpdating ? new OutputMetadataHolder(new SimpleRecordSchema(outputRecordFields), fieldMap)
: null;
} else {
outputMetadataHolder = null;
}
attributes.put(ATTR_OUTPUT_PATH, outputPath);
return outputMetadataHolder;
} catch (Exception e) {
throw new IOException(e);
}
}
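// Illustrative mappings produced by getHiveTypeFromFieldType below (derived from its cases):
//   INT -> INT, LONG -> BIGINT, ARRAY<BYTE> -> BINARY, ARRAY<STRING> -> ARRAY<STRING>,
//   MAP<STRING, LONG> -> MAP<STRING, BIGINT>, RECORD(name: STRING, age: INT) -> STRUCT<`name`:STRING, `age`:INT>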
public static String getHiveTypeFromFieldType(DataType rawDataType, boolean hiveFieldNames) {
if (rawDataType == null) {
throw new IllegalArgumentException("Field type is null");
}
RecordFieldType dataType = rawDataType.getFieldType();
if (RecordFieldType.INT.equals(dataType)) {
return "INT";
}
if (RecordFieldType.LONG.equals(dataType)) {
return "BIGINT";
}
if (RecordFieldType.BOOLEAN.equals(dataType)) {
return "BOOLEAN";
}
if (RecordFieldType.DOUBLE.equals(dataType)) {
return "DOUBLE";
}
if (RecordFieldType.FLOAT.equals(dataType)) {
return "FLOAT";
}
if (RecordFieldType.DECIMAL.equals(dataType)) {
return "DECIMAL";
}
if (RecordFieldType.STRING.equals(dataType) || RecordFieldType.ENUM.equals(dataType)) {
return "STRING";
}
if (RecordFieldType.DATE.equals(dataType)) {
return "DATE";
}
if (RecordFieldType.TIME.equals(dataType)) {
return "INT";
}
if (RecordFieldType.TIMESTAMP.equals(dataType)) {
return "TIMESTAMP";
}
if (RecordFieldType.ARRAY.equals(dataType)) {
ArrayDataType arrayDataType = (ArrayDataType) rawDataType;
if (RecordFieldType.BYTE.getDataType().equals(arrayDataType.getElementType())) {
return "BINARY";
}
return "ARRAY<" + getHiveTypeFromFieldType(arrayDataType.getElementType(), hiveFieldNames) + ">";
}
if (RecordFieldType.MAP.equals(dataType)) {
MapDataType mapDataType = (MapDataType) rawDataType;
return "MAP<STRING, " + getHiveTypeFromFieldType(mapDataType.getValueType(), hiveFieldNames) + ">";
}
if (RecordFieldType.CHOICE.equals(dataType)) {
ChoiceDataType choiceDataType = (ChoiceDataType) rawDataType;
List<DataType> unionFieldSchemas = choiceDataType.getPossibleSubTypes();
if (unionFieldSchemas != null) {
// Ignore null types in union
List<String> hiveFields = unionFieldSchemas.stream()
.map((it) -> getHiveTypeFromFieldType(it, hiveFieldNames))
.collect(Collectors.toList());
// Flatten the field if the union only has one non-null element
return (hiveFields.size() == 1)
? hiveFields.get(0)
: "UNIONTYPE<" + org.apache.commons.lang3.StringUtils.join(hiveFields, ", ") + ">";
}
return null;
}
if (RecordFieldType.RECORD.equals(dataType)) {
RecordDataType recordDataType = (RecordDataType) rawDataType;
List<RecordField> recordFields = recordDataType.getChildSchema().getFields();
if (recordFields != null) {
List<String> hiveFields = recordFields.stream().map(
recordField -> ("`" + (hiveFieldNames ? recordField.getFieldName().toLowerCase() : recordField.getFieldName()) + "`:"
+ getHiveTypeFromFieldType(recordField.getDataType(), hiveFieldNames))).collect(Collectors.toList());
return "STRUCT<" + org.apache.commons.lang3.StringUtils.join(hiveFields, ", ") + ">";
}
return null;
}
throw new IllegalArgumentException("Error converting Avro type " + dataType.name() + " to Hive type");
}
private synchronized WriteResult updateRecords(final RecordSchema inputRecordSchema, final OutputMetadataHolder outputMetadataHolder,
final RecordReader reader, final RecordSetWriter writer) throws IOException {
try {
writer.beginRecordSet();
Record inputRecord;
while((inputRecord = reader.nextRecord()) != null) {
List<RecordField> inputRecordFields = inputRecordSchema.getFields();
Map<String,Object> outputRecordFields = new HashMap<>(inputRecordFields.size());
// Copy values from input field name to output field name
for(Map.Entry<String,String> mapping : outputMetadataHolder.getFieldMap().entrySet()) {
outputRecordFields.put(mapping.getValue(), inputRecord.getValue(mapping.getKey()));
}
Record outputRecord = new MapRecord(outputMetadataHolder.getOutputSchema(), outputRecordFields);
writer.write(outputRecord);
}
return writer.finishRecordSet();
} catch (MalformedRecordException mre) {
throw new IOException("Error reading records: "+mre.getMessage(), mre);
}
}
private static class OutputMetadataHolder {
private final RecordSchema outputSchema;
private final Map<String,String> fieldMap;
public OutputMetadataHolder(RecordSchema outputSchema, Map<String, String> fieldMap) {
this.outputSchema = outputSchema;
this.fieldMap = fieldMap;
}
public RecordSchema getOutputSchema() {
return outputSchema;
}
public Map<String, String> getFieldMap() {
return fieldMap;
}
}
}

View File

@ -1,23 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
public class AuthenticationFailedException extends Exception {
public AuthenticationFailedException(String reason, Exception cause) {
super(reason, cause);
}
}

View File

@ -1,63 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
public class CsvOutputOptions {
private boolean header = true;
private String altHeader = null;
private String delimiter = ",";
private boolean quote = false;
private boolean escape = true;
private int maxRowsPerFlowFile = 0;
public boolean isHeader() {
return header;
}
public String getAltHeader() {
return altHeader;
}
public String getDelimiter() {
return delimiter;
}
public boolean isQuote() {
return quote;
}
public boolean isEscape() {
return escape;
}
public int getMaxRowsPerFlowFile() {
return maxRowsPerFlowFile;
}
public CsvOutputOptions(boolean header, String altHeader, String delimiter, boolean quote, boolean escape, int maxRowsPerFlowFile) {
this.header = header;
this.altHeader = altHeader;
this.delimiter = delimiter;
this.quote = quote;
this.escape = escape;
this.maxRowsPerFlowFile = maxRowsPerFlowFile;
}
}

View File

@ -1,136 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.components.ValidationResult;
import org.apache.nifi.hadoop.KerberosProperties;
import org.apache.nifi.hadoop.SecurityUtil;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.security.krb.KerberosUser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicReference;
public class HiveConfigurator {
public Collection<ValidationResult> validate(String configFiles, String principal, String keyTab, String password,
AtomicReference<ValidationResources> validationResourceHolder, ComponentLog log) {
final List<ValidationResult> problems = new ArrayList<>();
ValidationResources resources = validationResourceHolder.get();
// if no resources in the holder, or if the holder has different resources loaded,
// then load the Configuration and set the new resources in the holder
if (resources == null || !configFiles.equals(resources.getConfigResources())) {
log.debug("Reloading validation resources");
resources = new ValidationResources(configFiles, getConfigurationFromFiles(configFiles));
validationResourceHolder.set(resources);
}
final Configuration hiveConfig = resources.getConfiguration();
problems.addAll(KerberosProperties.validatePrincipalWithKeytabOrPassword(this.getClass().getSimpleName(), hiveConfig, principal, keyTab, password, log));
return problems;
}
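// Illustrative usage of getConfigurationFromFiles below (paths are assumed): a comma-separated list such as
//   "/etc/hive/conf/hive-site.xml,/etc/hadoop/conf/core-site.xml"
// is split on commas and each entry is added to the HiveConf as a resource.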
public HiveConf getConfigurationFromFiles(final String configFiles) {
final HiveConf hiveConfig = new HiveConf();
if (StringUtils.isNotBlank(configFiles)) {
for (final String configFile : configFiles.split(",")) {
hiveConfig.addResource(new Path(configFile.trim()));
}
}
return hiveConfig;
}
public void preload(Configuration configuration) {
try {
FileSystem.get(configuration).close();
UserGroupInformation.setConfiguration(configuration);
} catch (IOException ioe) {
// Suppress exception as future uses of this configuration will fail
}
}
/**
* Acquires a {@link UserGroupInformation} using the given {@link Configuration} and {@link KerberosUser}.
* @see SecurityUtil#getUgiForKerberosUser(Configuration, KerberosUser)
* @param hiveConfig The Configuration to apply to the acquired UserGroupInformation
* @param kerberosUser The KerberosUser to authenticate
* @return A UserGroupInformation instance created using the Subject of the given KerberosUser
* @throws AuthenticationFailedException if authentication fails
*/
public UserGroupInformation authenticate(final Configuration hiveConfig, KerberosUser kerberosUser) throws AuthenticationFailedException {
try {
return SecurityUtil.getUgiForKerberosUser(hiveConfig, kerberosUser);
} catch (IOException ioe) {
throw new AuthenticationFailedException("Kerberos Authentication for Hive failed", ioe);
}
}
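// Minimal usage sketch (assumed caller code, not from the original source): a controller service would
// typically authenticate once and then run JDBC work as the resulting UGI, e.g.
//   UserGroupInformation ugi = hiveConfigurator.authenticate(hiveConfig, kerberosUser);
//   Connection conn = ugi.doAs((PrivilegedExceptionAction<Connection>) dataSource::getConnection);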
/**
* As of Apache NiFi 1.5.0, due to changes made to
* {@link SecurityUtil#loginKerberos(Configuration, String, String)}, which is used by this
* class to authenticate a principal with Kerberos, Hive controller services no longer
* attempt relogins explicitly. For more information, please read the documentation for
* {@link SecurityUtil#loginKerberos(Configuration, String, String)}.
* <p/>
* In previous versions of NiFi, a {@link org.apache.nifi.hadoop.KerberosTicketRenewer} was started by
* {@link HiveConfigurator#authenticate(Configuration, String, String, long)} when the Hive
* controller service was enabled. The use of a separate thread to explicitly relogin could cause race conditions
* with the implicit relogin attempts made by hadoop/Hive code on a thread that references the same
* {@link UserGroupInformation} instance. One of these threads could leave the
* {@link javax.security.auth.Subject} in {@link UserGroupInformation} to be cleared or in an unexpected state
* while the other thread is attempting to use the {@link javax.security.auth.Subject}, resulting in failed
* authentication attempts that would leave the Hive controller service in an unrecoverable state.
*
* @see SecurityUtil#loginKerberos(Configuration, String, String)
* @deprecated Use {@link SecurityUtil#getUgiForKerberosUser(Configuration, KerberosUser)}
*/
@Deprecated
public UserGroupInformation authenticate(final Configuration hiveConfig, String principal, String keyTab) throws AuthenticationFailedException {
UserGroupInformation ugi;
try {
ugi = SecurityUtil.loginKerberos(hiveConfig, principal, keyTab);
} catch (IOException ioe) {
throw new AuthenticationFailedException("Kerberos Authentication for Hive failed", ioe);
}
return ugi;
}
/**
* As of Apache NiFi 1.5.0, this method has been deprecated and is now a wrapper
* method which invokes {@link HiveConfigurator#authenticate(Configuration, String, String)}. It will no longer start a
* {@link org.apache.nifi.hadoop.KerberosTicketRenewer} to perform explicit relogins.
*
* @see HiveConfigurator#authenticate(Configuration, String, String)
*/
@Deprecated
public UserGroupInformation authenticate(final Configuration hiveConfig, String principal, String keyTab, long ticketRenewalPeriod) throws AuthenticationFailedException {
return authenticate(hiveConfig, principal, keyTab);
}
}

View File

@ -1,462 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.SchemaBuilder.FieldAssembler;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.StringEscapeUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.nifi.components.PropertyDescriptor;
import java.io.IOException;
import java.io.OutputStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static java.sql.Types.ARRAY;
import static java.sql.Types.BIGINT;
import static java.sql.Types.BINARY;
import static java.sql.Types.BIT;
import static java.sql.Types.BLOB;
import static java.sql.Types.BOOLEAN;
import static java.sql.Types.CHAR;
import static java.sql.Types.CLOB;
import static java.sql.Types.DATE;
import static java.sql.Types.DECIMAL;
import static java.sql.Types.DOUBLE;
import static java.sql.Types.FLOAT;
import static java.sql.Types.INTEGER;
import static java.sql.Types.JAVA_OBJECT;
import static java.sql.Types.LONGNVARCHAR;
import static java.sql.Types.LONGVARBINARY;
import static java.sql.Types.LONGVARCHAR;
import static java.sql.Types.NCHAR;
import static java.sql.Types.NUMERIC;
import static java.sql.Types.NVARCHAR;
import static java.sql.Types.OTHER;
import static java.sql.Types.REAL;
import static java.sql.Types.ROWID;
import static java.sql.Types.SMALLINT;
import static java.sql.Types.SQLXML;
import static java.sql.Types.STRUCT;
import static java.sql.Types.TIME;
import static java.sql.Types.TIMESTAMP;
import static java.sql.Types.TINYINT;
import static java.sql.Types.VARBINARY;
import static java.sql.Types.VARCHAR;
/**
* JDBC / HiveQL common functions.
*/
public class HiveJdbcCommon {
public static final String AVRO = "Avro";
public static final String CSV = "CSV";
public static final String MIME_TYPE_AVRO_BINARY = "application/avro-binary";
public static final String CSV_MIME_TYPE = "text/csv";
public static final PropertyDescriptor NORMALIZE_NAMES_FOR_AVRO = new PropertyDescriptor.Builder()
.name("hive-normalize-avro")
.displayName("Normalize Table/Column Names")
.description("Whether to change non-Avro-compatible characters in column names to Avro-compatible characters. For example, colons and periods "
+ "will be changed to underscores in order to build a valid Avro record.")
.allowableValues("true", "false")
.defaultValue("false")
.required(true)
.build();
public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, final int maxRows, boolean convertNames) throws SQLException, IOException {
return convertToAvroStream(rs, outStream, null, maxRows, convertNames, null);
}
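// Illustrative usage of the full overload below (names are assumed): stream a query result to Avro,
// using "my_table" as the record name, no row limit (0), name normalization enabled and no row callback:
//   long rows = HiveJdbcCommon.convertToAvroStream(resultSet, out, "my_table", 0, true, null);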
public static long convertToAvroStream(final ResultSet rs, final OutputStream outStream, String recordName, final int maxRows, boolean convertNames, ResultSetRowCallback callback)
throws SQLException, IOException {
final Schema schema = createSchema(rs, recordName, convertNames);
final GenericRecord rec = new GenericData.Record(schema);
final DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
try (final DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter)) {
dataFileWriter.create(schema, outStream);
final ResultSetMetaData meta = rs.getMetaData();
final int nrOfColumns = meta.getColumnCount();
long nrOfRows = 0;
while (rs.next()) {
if (callback != null) {
callback.processRow(rs);
}
for (int i = 1; i <= nrOfColumns; i++) {
final int javaSqlType = meta.getColumnType(i);
Object value = rs.getObject(i);
if (value == null) {
rec.put(i - 1, null);
} else if (javaSqlType == BINARY || javaSqlType == VARBINARY || javaSqlType == LONGVARBINARY || javaSqlType == BLOB || javaSqlType == CLOB) {
// bytes requires little bit different handling
ByteBuffer bb = null;
if (value instanceof byte[]) {
bb = ByteBuffer.wrap((byte[]) value);
} else if (value instanceof ByteBuffer) {
bb = (ByteBuffer) value;
}
if (bb != null) {
rec.put(i - 1, bb);
} else {
throw new IOException("Could not process binary object of type " + value.getClass().getName());
}
} else if (value instanceof Byte) {
// A tinyint(1) column is reported by the JDBC driver as java.sql.Types.TINYINT,
// but the value is returned as java.lang.Byte (at least the H2 JDBC driver works this way).
// Putting the Byte directly into the Avro record results in:
// org.apache.avro.AvroRuntimeException: Unknown datum type java.lang.Byte
rec.put(i - 1, ((Byte) value).intValue());
} else if (value instanceof BigDecimal || value instanceof BigInteger) {
// Avro can't handle BigDecimal and BigInteger as numbers - it will throw an AvroRuntimeException such as: "Unknown datum type: java.math.BigDecimal: 38"
rec.put(i - 1, value.toString());
} else if (value instanceof Number) {
// Need to call the right getXYZ() method (instead of the getObject() method above), since Doubles are sometimes returned
// when the JDBC type is 6 (Float) for example.
if (javaSqlType == FLOAT) {
value = rs.getFloat(i);
} else if (javaSqlType == DOUBLE) {
value = rs.getDouble(i);
} else if (javaSqlType == INTEGER || javaSqlType == TINYINT || javaSqlType == SMALLINT) {
value = rs.getInt(i);
}
rec.put(i - 1, value);
} else if (value instanceof Boolean) {
rec.put(i - 1, value);
} else if (value instanceof java.sql.SQLXML) {
rec.put(i - 1, ((java.sql.SQLXML) value).getString());
} else {
// The different types that we support are numbers (int, long, double, float),
// as well as boolean values and Strings. Since Avro doesn't provide
// timestamp types, we want to convert those to Strings. So we will cast anything other
// than numbers or booleans to strings by using the toString() method.
rec.put(i - 1, value.toString());
}
}
dataFileWriter.append(rec);
nrOfRows += 1;
if (maxRows > 0 && nrOfRows == maxRows)
break;
}
return nrOfRows;
}
}
public static Schema createSchema(final ResultSet rs, boolean convertNames) throws SQLException {
return createSchema(rs, null, convertNames);
}
/**
* Creates an Avro schema from a result set. If a record name is provided, it is used as a fallback when the
* table name cannot be retrieved from the result set; otherwise a default record name is used.
*
* @param rs The result set to convert to Avro
* @param recordName The a priori record name to use if it cannot be determined from the result set.
* @param convertNames Whether to convert column/table names to be legal Avro names
* @return A Schema object representing the result set converted to an Avro record
* @throws SQLException if any error occurs during conversion
*/
public static Schema createSchema(final ResultSet rs, String recordName, boolean convertNames) throws SQLException {
final ResultSetMetaData meta = rs.getMetaData();
final int nrOfColumns = meta.getColumnCount();
String tableName = StringUtils.isEmpty(recordName) ? "NiFi_SelectHiveQL_Record" : recordName;
try {
if (nrOfColumns > 0) {
// Hive JDBC doesn't support getTableName, instead it returns table.column for column name. Grab the table name from the first column
String firstColumnNameFromMeta = meta.getColumnName(1);
int tableNameDelimiter = firstColumnNameFromMeta.lastIndexOf(".");
if (tableNameDelimiter > -1) {
String tableNameFromMeta = firstColumnNameFromMeta.substring(0, tableNameDelimiter);
if (!StringUtils.isBlank(tableNameFromMeta)) {
tableName = tableNameFromMeta;
}
}
}
} catch (SQLException se) {
// Not all drivers support getTableName, so just use the previously-set default
}
if (convertNames) {
tableName = normalizeNameForAvro(tableName);
}
final FieldAssembler<Schema> builder = SchemaBuilder.record(tableName).namespace("any.data").fields();
// Some Avro types (e.g. Decimal and Date) have no direct mapping here and may need additional work.
for (int i = 1; i <= nrOfColumns; i++) {
String columnNameFromMeta = meta.getColumnName(i);
// Hive returns table.column for column name. Grab the column name as the string after the last period
int columnNameDelimiter = columnNameFromMeta.lastIndexOf(".");
String columnName = columnNameFromMeta.substring(columnNameDelimiter + 1);
switch (meta.getColumnType(i)) {
case CHAR:
case LONGNVARCHAR:
case LONGVARCHAR:
case NCHAR:
case NVARCHAR:
case VARCHAR:
case ARRAY:
case STRUCT:
case JAVA_OBJECT:
case OTHER:
case SQLXML:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion().noDefault();
break;
case BIT:
case BOOLEAN:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().booleanType().endUnion().noDefault();
break;
case INTEGER:
// Default to signed type unless otherwise noted. Some JDBC drivers don't implement isSigned()
boolean signedType = true;
try {
signedType = meta.isSigned(i);
} catch (SQLException se) {
// Use signed types as default
}
if (signedType) {
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().intType().endUnion().noDefault();
} else {
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().longType().endUnion().noDefault();
}
break;
case SMALLINT:
case TINYINT:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().intType().endUnion().noDefault();
break;
case BIGINT:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().longType().endUnion().noDefault();
break;
// java.sql.RowId is an interface and appears to be database-implementation
// specific, so convert it to String
case ROWID:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion().noDefault();
break;
case FLOAT:
case REAL:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().floatType().endUnion().noDefault();
break;
case DOUBLE:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().doubleType().endUnion().noDefault();
break;
// No directly suitable Avro type was found; map to String for now
case DECIMAL:
case NUMERIC:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion().noDefault();
break;
// No directly suitable Avro type was found; map to String for now
case DATE:
case TIME:
case TIMESTAMP:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().stringType().endUnion().noDefault();
break;
case BINARY:
case VARBINARY:
case LONGVARBINARY:
case BLOB:
case CLOB:
builder.name(columnName).type().unionOf().nullBuilder().endNull().and().bytesType().endUnion().noDefault();
break;
default:
throw new IllegalArgumentException("createSchema: Unknown SQL type " + meta.getColumnType(i) + " cannot be converted to Avro type");
}
}
return builder.endRecord();
}
public static long convertToCsvStream(final ResultSet rs, final OutputStream outStream, CsvOutputOptions outputOptions) throws SQLException, IOException {
return convertToCsvStream(rs, outStream, null, null, outputOptions);
}
public static long convertToCsvStream(final ResultSet rs, final OutputStream outStream, String recordName, ResultSetRowCallback callback, CsvOutputOptions outputOptions)
throws SQLException, IOException {
final ResultSetMetaData meta = rs.getMetaData();
final int nrOfColumns = meta.getColumnCount();
List<String> columnNames = new ArrayList<>(nrOfColumns);
if (outputOptions.isHeader()) {
if (outputOptions.getAltHeader() == null) {
for (int i = 1; i <= nrOfColumns; i++) {
String columnNameFromMeta = meta.getColumnName(i);
// Hive returns table.column for column name. Grab the column name as the string after the last period
int columnNameDelimiter = columnNameFromMeta.lastIndexOf(".");
columnNames.add(columnNameFromMeta.substring(columnNameDelimiter + 1));
}
} else {
String[] altHeaderNames = outputOptions.getAltHeader().split(",");
columnNames = Arrays.asList(altHeaderNames);
}
}
// Write column names as header row
outStream.write(StringUtils.join(columnNames, outputOptions.getDelimiter()).getBytes(StandardCharsets.UTF_8));
if (outputOptions.isHeader()) {
outStream.write("\n".getBytes(StandardCharsets.UTF_8));
}
// Iterate over the rows
int maxRows = outputOptions.getMaxRowsPerFlowFile();
long nrOfRows = 0;
while (rs.next()) {
if (callback != null) {
callback.processRow(rs);
}
List<String> rowValues = new ArrayList<>(nrOfColumns);
for (int i = 1; i <= nrOfColumns; i++) {
final int javaSqlType = meta.getColumnType(i);
final Object value = rs.getObject(i);
switch (javaSqlType) {
case CHAR:
case LONGNVARCHAR:
case LONGVARCHAR:
case NCHAR:
case NVARCHAR:
case VARCHAR:
String valueString = rs.getString(i);
if (valueString != null) {
// Note: escapeCsv adds its own surrounding quotes when the value requires them, so no extra quoting is applied in the escape-only branch.
StringBuilder sb = new StringBuilder();
if (outputOptions.isQuote()) {
sb.append("\"");
if (outputOptions.isEscape()) {
sb.append(StringEscapeUtils.escapeCsv(valueString));
} else {
sb.append(valueString);
}
sb.append("\"");
rowValues.add(sb.toString());
} else {
if (outputOptions.isEscape()) {
rowValues.add(StringEscapeUtils.escapeCsv(valueString));
} else {
rowValues.add(valueString);
}
}
} else {
rowValues.add("");
}
break;
case ARRAY:
case STRUCT:
case JAVA_OBJECT:
String complexValueString = rs.getString(i);
if (complexValueString != null) {
rowValues.add(StringEscapeUtils.escapeCsv(complexValueString));
} else {
rowValues.add("");
}
break;
case SQLXML:
if (value != null) {
rowValues.add(StringEscapeUtils.escapeCsv(((java.sql.SQLXML) value).getString()));
} else {
rowValues.add("");
}
break;
default:
if (value != null) {
rowValues.add(value.toString());
} else {
rowValues.add("");
}
}
}
// Write row values
outStream.write(StringUtils.join(rowValues, outputOptions.getDelimiter()).getBytes(StandardCharsets.UTF_8));
outStream.write("\n".getBytes(StandardCharsets.UTF_8));
nrOfRows++;
if (maxRows > 0 && nrOfRows == maxRows)
break;
}
return nrOfRows;
}
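// Illustrative examples for normalizeNameForAvro below (derived from its logic):
//   "sales.total-amount" -> "sales_total_amount", "1st_col" -> "_1st_col"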
public static String normalizeNameForAvro(String inputName) {
String normalizedName = inputName.replaceAll("[^A-Za-z0-9_]", "_");
if (Character.isDigit(normalizedName.charAt(0))) {
normalizedName = "_" + normalizedName;
}
return normalizedName;
}
/**
* An interface for callback methods that allows a row to be processed during the convertToXYZStream() methods.
* <b>IMPORTANT:</b> Implementations should only operate on the row currently pointed to by the ResultSet.
* Advancing the cursor (e.g. by calling next()) can cause rows to be skipped during conversion.
*/
public interface ResultSetRowCallback {
void processRow(ResultSet resultSet) throws IOException;
}
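// Minimal sketch of a callback implementation (assumed use case, not from the original source), where
// maxId is an AtomicLong defined by the caller and only the current row is read:
//   ResultSetRowCallback maxIdTracker = resultSet -> {
//       try {
//           maxId.set(Math.max(maxId.get(), resultSet.getLong("id")));
//       } catch (SQLException e) {
//           throw new IOException(e);
//       }
//   };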
public static Configuration getConfigurationFromFiles(final String configFiles) {
final Configuration hiveConfig = new HiveConf();
if (StringUtils.isNotBlank(configFiles)) {
for (final String configFile : configFiles.split(",")) {
hiveConfig.addResource(new Path(configFile.trim()));
}
}
return hiveConfig;
}
}

View File

@ -1,41 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.util.hive;
import org.apache.hadoop.conf.Configuration;
/**
* A helper class for maintaining loaded configurations (to avoid reloading on use unless necessary)
*/
public class ValidationResources {
private final String configResources;
private final Configuration configuration;
public ValidationResources(String configResources, Configuration configuration) {
this.configResources = configResources;
this.configuration = configuration;
}
public String getConfigResources() {
return configResources;
}
public Configuration getConfiguration() {
return configuration;
}
}

View File

@ -1,15 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.nifi.dbcp.hive.Hive_1_1ConnectionPool

View File

@ -1,17 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.nifi.processors.hive.SelectHive_1_1QL
org.apache.nifi.processors.hive.PutHive_1_1QL
org.apache.nifi.processors.hive.UpdateHive_1_1Table

View File

@ -1,184 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.dbcp.hive;
import org.apache.commons.dbcp2.BasicDataSource;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.kerberos.KerberosCredentialsService;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.registry.VariableDescriptor;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.util.MockConfigurationContext;
import org.apache.nifi.util.MockControllerServiceLookup;
import org.apache.nifi.util.MockVariableRegistry;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Field;
import java.lang.reflect.UndeclaredThrowableException;
import java.security.PrivilegedExceptionAction;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.mockito.ArgumentMatchers.isA;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
public class Hive_1_1ConnectionPoolTest {
private UserGroupInformation userGroupInformation;
private Hive_1_1ConnectionPool hiveConnectionPool;
private BasicDataSource basicDataSource;
private ComponentLog componentLog;
private File krb5conf = new File("src/test/resources/krb5.conf");
@BeforeEach
public void setup() throws Exception {
// have to initialize this system property before anything else
System.setProperty("java.security.krb5.conf", krb5conf.getAbsolutePath());
System.setProperty("java.security.krb5.realm", "nifi.com");
System.setProperty("java.security.krb5.kdc", "nifi.kdc");
userGroupInformation = mock(UserGroupInformation.class);
basicDataSource = mock(BasicDataSource.class);
componentLog = mock(ComponentLog.class);
when(userGroupInformation.doAs(isA(PrivilegedExceptionAction.class))).thenAnswer(invocation -> {
try {
return ((PrivilegedExceptionAction) invocation.getArguments()[0]).run();
} catch (IOException | Error | RuntimeException | InterruptedException e) {
throw e;
} catch (Throwable e) {
throw new UndeclaredThrowableException(e);
}
});
initPool();
}
private void initPool() throws Exception {
hiveConnectionPool = new Hive_1_1ConnectionPool();
Field ugiField = Hive_1_1ConnectionPool.class.getDeclaredField("ugi");
ugiField.setAccessible(true);
ugiField.set(hiveConnectionPool, userGroupInformation);
Field dataSourceField = Hive_1_1ConnectionPool.class.getDeclaredField("dataSource");
dataSourceField.setAccessible(true);
dataSourceField.set(hiveConnectionPool, basicDataSource);
Field componentLogField = AbstractControllerService.class.getDeclaredField("logger");
componentLogField.setAccessible(true);
componentLogField.set(hiveConnectionPool, componentLog);
}
@Test
public void testGetConnectionSqlException() throws SQLException {
SQLException sqlException = new SQLException("bad sql");
when(basicDataSource.getConnection()).thenThrow(sqlException);
ProcessException e = assertThrows(ProcessException.class, () -> hiveConnectionPool.getConnection());
assertEquals(sqlException, e.getCause());
}
@Test
public void testExpressionLanguageSupport() throws Exception {
final String URL = "jdbc:hive2://localhost:10000/default";
final String USER = "user";
final String PASS = "pass";
final int MAX_CONN = 7;
final String MAX_CONN_LIFETIME = "1 sec";
final String MAX_WAIT = "10 sec"; // 10000 milliseconds
final String CONF = "/path/to/hive-site.xml";
hiveConnectionPool = new Hive_1_1ConnectionPool();
Map<PropertyDescriptor, String> props = new HashMap<PropertyDescriptor, String>() {{
put(Hive_1_1ConnectionPool.DATABASE_URL, "${url}");
put(Hive_1_1ConnectionPool.DB_USER, "${username}");
put(Hive_1_1ConnectionPool.DB_PASSWORD, "${password}");
put(Hive_1_1ConnectionPool.MAX_TOTAL_CONNECTIONS, "${maxconn}");
put(Hive_1_1ConnectionPool.MAX_CONN_LIFETIME, "${maxconnlifetime}");
put(Hive_1_1ConnectionPool.MAX_WAIT_TIME, "${maxwait}");
put(Hive_1_1ConnectionPool.HIVE_CONFIGURATION_RESOURCES, "${hiveconf}");
}};
MockVariableRegistry registry = new MockVariableRegistry();
registry.setVariable(new VariableDescriptor("url"), URL);
registry.setVariable(new VariableDescriptor("username"), USER);
registry.setVariable(new VariableDescriptor("password"), PASS);
registry.setVariable(new VariableDescriptor("maxconn"), Integer.toString(MAX_CONN));
registry.setVariable(new VariableDescriptor("maxconnlifetime"), MAX_CONN_LIFETIME);
registry.setVariable(new VariableDescriptor("maxwait"), MAX_WAIT);
registry.setVariable(new VariableDescriptor("hiveconf"), CONF);
MockConfigurationContext context = new MockConfigurationContext(props, null, registry);
hiveConnectionPool.onConfigured(context);
Field dataSourceField = Hive_1_1ConnectionPool.class.getDeclaredField("dataSource");
dataSourceField.setAccessible(true);
basicDataSource = (BasicDataSource) dataSourceField.get(hiveConnectionPool);
assertEquals(URL, basicDataSource.getUrl());
assertEquals(USER, basicDataSource.getUsername());
assertEquals(PASS, basicDataSource.getPassword());
assertEquals(MAX_CONN, basicDataSource.getMaxTotal());
assertEquals(1000L, basicDataSource.getMaxConnLifetimeMillis());
assertEquals(10000L, basicDataSource.getMaxWaitMillis());
assertEquals(URL, hiveConnectionPool.getConnectionURL());
}
@EnabledIfSystemProperty(
named = "nifi.test.unstable",
matches = "true",
disabledReason = "Kerberos does not seem to be properly handled in Travis build, but, locally, this test should successfully run")
@Test
public void testKerberosAuthException() {
final String URL = "jdbc:hive2://localhost:10000/default";
final String conf = "src/test/resources/hive-site-security.xml";
final String ktab = "src/test/resources/fake.keytab";
final String kprinc = "bad@PRINCIPAL.COM";
final String kerberosCredentialsServiceId = UUID.randomUUID().toString();
Map<PropertyDescriptor, String> props = new HashMap<PropertyDescriptor, String>() {{
put(Hive_1_1ConnectionPool.DATABASE_URL, "${url}");
put(Hive_1_1ConnectionPool.HIVE_CONFIGURATION_RESOURCES, "${conf}");
put(Hive_1_1ConnectionPool.KERBEROS_CREDENTIALS_SERVICE, kerberosCredentialsServiceId);
}};
MockVariableRegistry registry = new MockVariableRegistry();
registry.setVariable(new VariableDescriptor("url"), URL);
registry.setVariable(new VariableDescriptor("conf"), conf);
MockControllerServiceLookup mockControllerServiceLookup = new MockControllerServiceLookup() {};
KerberosCredentialsService kerberosCredentialsService = mock(KerberosCredentialsService.class);
when(kerberosCredentialsService.getKeytab()).thenReturn(ktab);
when(kerberosCredentialsService.getPrincipal()).thenReturn(kprinc);
mockControllerServiceLookup.addControllerService(kerberosCredentialsService, kerberosCredentialsServiceId);
MockConfigurationContext context = new MockConfigurationContext(props, mockControllerServiceLookup, registry);
assertThrows(InitializationException.class, () -> hiveConnectionPool.onConfigured(context));
}
}

View File

@ -1,292 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSessionFactory;
import org.apache.nifi.processor.ProcessorInitializationContext;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.util.MockProcessContext;
import org.apache.nifi.util.MockProcessorInitializationContext;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestHiveParser extends AbstractHive_1_1QLProcessor {
@BeforeEach
public void initialize() {
final MockProcessContext processContext = new MockProcessContext(this);
final ProcessorInitializationContext initializationContext = new MockProcessorInitializationContext(this, processContext);
initialize(initializationContext);
}
@Override
public void onTrigger(ProcessContext context, ProcessSessionFactory sessionFactory) throws ProcessException {
}
@Test
public void parseSelect() {
String query = "select a.empid, to_something(b.saraly) from " +
"company.emp a inner join default.salary b where a.empid = b.empid";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(2, tableNames.size());
assertTrue(tableNames.contains(new TableName("company", "emp", true)));
assertTrue(tableNames.contains(new TableName("default", "salary", true)));
}
@Test
public void parseSelectPrepared() {
String query = "select empid from company.emp a where a.firstName = ?";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName("company", "emp", true)));
}
@Test
public void parseLongSelect() {
String query = "select\n" +
"\n" +
" i_item_id,\n" +
"\n" +
" i_item_desc,\n" +
"\n" +
" s_state,\n" +
"\n" +
" count(ss_quantity) as store_sales_quantitycount,\n" +
"\n" +
" avg(ss_quantity) as store_sales_quantityave,\n" +
"\n" +
" stddev_samp(ss_quantity) as store_sales_quantitystdev,\n" +
"\n" +
" stddev_samp(ss_quantity) / avg(ss_quantity) as store_sales_quantitycov,\n" +
"\n" +
" count(sr_return_quantity) as store_returns_quantitycount,\n" +
"\n" +
" avg(sr_return_quantity) as store_returns_quantityave,\n" +
"\n" +
" stddev_samp(sr_return_quantity) as store_returns_quantitystdev,\n" +
"\n" +
" stddev_samp(sr_return_quantity) / avg(sr_return_quantity) as store_returns_quantitycov,\n" +
"\n" +
" count(cs_quantity) as catalog_sales_quantitycount,\n" +
"\n" +
" avg(cs_quantity) as catalog_sales_quantityave,\n" +
"\n" +
" stddev_samp(cs_quantity) / avg(cs_quantity) as catalog_sales_quantitystdev,\n" +
"\n" +
" stddev_samp(cs_quantity) / avg(cs_quantity) as catalog_sales_quantitycov\n" +
"\n" +
"from\n" +
"\n" +
" store_sales,\n" +
"\n" +
" store_returns,\n" +
"\n" +
" catalog_sales,\n" +
"\n" +
" date_dim d1,\n" +
"\n" +
" date_dim d2,\n" +
"\n" +
" date_dim d3,\n" +
"\n" +
" store,\n" +
"\n" +
" item\n" +
"\n" +
"where\n" +
"\n" +
" d1.d_quarter_name = '2000Q1'\n" +
"\n" +
" and d1.d_date_sk = ss_sold_date_sk\n" +
"\n" +
" and i_item_sk = ss_item_sk\n" +
"\n" +
" and s_store_sk = ss_store_sk\n" +
"\n" +
" and ss_customer_sk = sr_customer_sk\n" +
"\n" +
" and ss_item_sk = sr_item_sk\n" +
"\n" +
" and ss_ticket_number = sr_ticket_number\n" +
"\n" +
" and sr_returned_date_sk = d2.d_date_sk\n" +
"\n" +
" and d2.d_quarter_name in ('2000Q1' , '2000Q2', '2000Q3')\n" +
"\n" +
" and sr_customer_sk = cs_bill_customer_sk\n" +
"\n" +
" and sr_item_sk = cs_item_sk\n" +
"\n" +
" and cs_sold_date_sk = d3.d_date_sk\n" +
"\n" +
" and d3.d_quarter_name in ('2000Q1' , '2000Q2', '2000Q3')\n" +
"\n" +
"group by i_item_id , i_item_desc , s_state\n" +
"\n" +
"order by i_item_id , i_item_desc , s_state\n" +
"\n" +
"limit 100";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(6, tableNames.size());
AtomicInteger cnt = new AtomicInteger(0);
for (TableName tableName : tableNames) {
if (tableName.equals(new TableName(null, "store_sales", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "store_returns", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "catalog_sales", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "date_dim", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "store", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "item", true))) {
cnt.incrementAndGet();
}
}
assertEquals(6, cnt.get());
}
@Test
public void parseSelectInsert() {
String query = "insert into databaseA.tableA select key, max(value) from databaseA.tableA where category = 'x'";
// The same database.tableName can appear twice: once as an input and once as an output.
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(2, tableNames.size());
AtomicInteger cnt = new AtomicInteger(0);
tableNames.forEach(tableName -> {
if (tableName.equals(new TableName("databaseA", "tableA", false))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName("databaseA", "tableA", true))) {
cnt.incrementAndGet();
}
});
assertEquals(2, cnt.get());
}
@Test
public void parseInsert() {
String query = "insert into databaseB.tableB1 select something from tableA1 a1 inner join tableA2 a2 where a1.id = a2.id";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(3, tableNames.size());
AtomicInteger cnt = new AtomicInteger(0);
tableNames.forEach(tableName -> {
if (tableName.equals(new TableName("databaseB", "tableB1", false))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "tableA1", true))) {
cnt.incrementAndGet();
} else if (tableName.equals(new TableName(null, "tableA2", true))) {
cnt.incrementAndGet();
}
});
assertEquals(3, cnt.get());
}
@Test
public void parseUpdate() {
String query = "update table_a set y = 'updated' where x > 100";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName(null, "table_a", false)));
}
@Test
public void parseDelete() {
String query = "delete from table_a where x > 100";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName(null, "table_a", false)));
}
@Test
public void parseDDL() {
String query = "CREATE TABLE IF NOT EXISTS EMPLOYEES(\n" +
"EmployeeID INT,FirstName STRING, Title STRING,\n" +
"State STRING, Laptop STRING)\n" +
"COMMENT 'Employee Names'\n" +
"STORED AS ORC";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName(null, "EMPLOYEES", false)));
}
@Test
public void parseSetProperty() {
String query = " set 'hive.exec.dynamic.partition.mode'=nonstrict";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(0, tableNames.size());
}
@Test
public void parseSetRole() {
String query = "set role all";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(0, tableNames.size());
}
@Test
public void parseShowRoles() {
String query = "show roles";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(0, tableNames.size());
}
@Test
public void parseMsck() {
String query = "msck repair table table_a";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(1, tableNames.size());
assertTrue(tableNames.contains(new TableName(null, "table_a", false)));
}
@Test
public void parseAddJar() {
String query = "ADD JAR hdfs:///tmp/my_jar.jar";
final Set<TableName> tableNames = findTableNames(query);
System.out.printf("tableNames=%s\n", tableNames);
assertEquals(0, tableNames.size());
}
}
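/*
 * A minimal sketch added for illustration (not part of the original file): it shows how the
 * table-name extraction exercised above can be driven directly. Only the protected
 * findTableNames(String) method and the TableName(database, table, isInput) constructor already
 * used in TestHiveParser are assumed; the class and method names below are hypothetical.
 */
class TableNameUsageSketch extends AbstractHive_1_1QLProcessor {
    @Override
    public void onTrigger(ProcessContext context, ProcessSessionFactory sessionFactory) throws ProcessException {
        // No-op: only the parsing helper is exercised here.
    }

    public static void main(String[] args) {
        final TableNameUsageSketch parser = new TableNameUsageSketch();
        // For an INSERT ... SELECT, the target table is reported with input=false and the
        // source table with input=true, matching the expectations in parseSelectInsert() above.
        final Set<TableName> tables = parser.findTableNames(
                "insert into databaseA.tableA select key, max(value) from databaseA.tableA where category = 'x'");
        tables.forEach(t -> System.out.println(t));
    }
}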

View File

@ -1,820 +0,0 @@
package org.apache.nifi.processors.hive;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.dbcp.DBCPService;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.processor.util.pattern.RollbackOnFailure;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledOnOs;
import org.junit.jupiter.api.condition.OS;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mockito;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
@DisabledOnOs(OS.WINDOWS)
public class TestPutHive_1_1QL {
private static final String createPersons = "CREATE TABLE PERSONS (id integer primary key, name varchar(100), code integer)";
private static final String createPersonsAutoId = "CREATE TABLE PERSONS (id INTEGER NOT NULL GENERATED ALWAYS AS IDENTITY (START WITH 1), name VARCHAR(100), code INTEGER check(code <= 100))";
@BeforeAll
public static void setup() {
System.setProperty("derby.stream.error.file", "target/derby.log");
}
@Test
public void testDirectStatements(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS (ID, NAME, CODE) VALUES (1, 'Mark', 84)".getBytes());
runner.run();
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_SUCCESS, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
runner.enqueue("UPDATE PERSONS SET NAME='George' WHERE ID=1".getBytes());
runner.run();
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("George", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
}
@Test
public void testFailInMiddleWithBadStatementRollbackOnFailure(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE, "true");
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', 84)".getBytes());
runner.enqueue("INSERT INTO PERSONS".getBytes()); // intentionally wrong syntax
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Tom', 3)".getBytes());
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Harry', 44)".getBytes());
runner.run();
// The 1st one should be routed to success, others should stay in queue.
assertEquals(3, runner.getQueueSize().getObjectCount());
runner.assertTransferCount(PutHive_1_1QL.REL_FAILURE, 0);
runner.assertTransferCount(PutHive_1_1QL.REL_SUCCESS, 1);
}
@Test
public void testFailAtBeginning(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS".getBytes()); // intentionally wrong syntax
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Tom', 3)".getBytes());
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Harry', 44)".getBytes());
runner.run();
runner.assertTransferCount(PutHive_1_1QL.REL_FAILURE, 1);
runner.assertTransferCount(PutHive_1_1QL.REL_SUCCESS, 2);
}
@Test
public void testFailAtBeginningRollbackOnFailure(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE, "true");
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue("INSERT INTO PERSONS".getBytes()); // intentionally wrong syntax
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Tom', 3)".getBytes());
runner.enqueue("INSERT INTO PERSONS (NAME, CODE) VALUES ('Harry', 44)".getBytes());
AssertionError e = assertThrows(AssertionError.class, () -> runner.run());
assertTrue(e.getCause() instanceof ProcessException);
assertEquals(3, runner.getQueueSize().getObjectCount());
runner.assertTransferCount(PutHive_1_1QL.REL_FAILURE, 0);
runner.assertTransferCount(PutHive_1_1QL.REL_SUCCESS, 0);
}
@Test
public void testFailInMiddleWithBadParameterType(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> goodAttributes = new HashMap<>();
goodAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
goodAttributes.put("hiveql.args.1.value", "84");
final Map<String, String> badAttributes = new HashMap<>();
badAttributes.put("hiveql.args.1.type", String.valueOf(Types.VARCHAR));
badAttributes.put("hiveql.args.1.value", "hello");
final byte[] data = "INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', ?)".getBytes();
runner.enqueue(data, goodAttributes);
runner.enqueue(data, badAttributes);
runner.enqueue(data, goodAttributes);
runner.enqueue(data, goodAttributes);
runner.run();
runner.assertTransferCount(PutHive_1_1QL.REL_FAILURE, 1);
runner.assertTransferCount(PutHive_1_1QL.REL_SUCCESS, 3);
}
@Test
public void testFailInMiddleWithBadParameterValue(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> goodAttributes = new HashMap<>();
goodAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
goodAttributes.put("hiveql.args.1.value", "84");
final Map<String, String> badAttributes = new HashMap<>();
badAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
badAttributes.put("hiveql.args.1.value", "101"); // Constraint violation, up to 100
final byte[] data = "INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', ?)".getBytes();
runner.enqueue(data, goodAttributes);
runner.enqueue(data, badAttributes);
runner.enqueue(data, goodAttributes);
runner.enqueue(data, goodAttributes);
runner.run();
runner.assertTransferCount(PutHive_1_1QL.REL_SUCCESS, 3);
runner.assertTransferCount(PutHive_1_1QL.REL_FAILURE, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertTrue(rs.next());
assertTrue(rs.next());
assertFalse(rs.next());
}
}
}
@Test
public void testFailInMiddleWithBadNumberFormat(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersonsAutoId);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> goodAttributes = new HashMap<>();
goodAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
goodAttributes.put("hiveql.args.1.value", "84");
final Map<String, String> badAttributes = new HashMap<>();
badAttributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
badAttributes.put("hiveql.args.1.value", "NOT_NUMBER");
final byte[] data = "INSERT INTO PERSONS (NAME, CODE) VALUES ('Mark', ?)".getBytes();
runner.enqueue(data, goodAttributes);
runner.enqueue(data, badAttributes);
runner.enqueue(data, goodAttributes);
runner.enqueue(data, goodAttributes);
runner.run();
runner.assertTransferCount(PutHive_1_1QL.REL_SUCCESS, 3);
runner.assertTransferCount(PutHive_1_1QL.REL_FAILURE, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertTrue(rs.next());
assertTrue(rs.next());
assertFalse(rs.next());
}
}
}
@Test
public void testUsingSqlDataTypesWithNegativeValues(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate("CREATE TABLE PERSONS (id integer primary key, name varchar(100), code bigint)");
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", "-5");
attributes.put("hiveql.args.1.value", "84");
runner.enqueue("INSERT INTO PERSONS VALUES (1, 'Mark', ?)".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_SUCCESS, 1);
runner.getFlowFilesForRelationship(PutHive_1_1QL.REL_SUCCESS).get(0).assertAttributeEquals(PutHive_1_1QL.ATTR_OUTPUT_TABLES, "PERSONS");
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
}
@Test
public void testStatementsWithPreparedParameters(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
runner.enqueue("INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?)".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_SUCCESS, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
runner.clearTransferState();
attributes.clear();
attributes.put("hiveql.args.1.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.1.value", "George");
attributes.put("hiveql.args.2.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.2.value", "1");
runner.enqueue("UPDATE PERSONS SET NAME=? WHERE ID=?".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_SUCCESS, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("George", rs.getString(2));
assertEquals(84, rs.getInt(3));
assertFalse(rs.next());
}
}
}
@Test
public void testMultipleStatementsWithinFlowFile(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// Multiple statements are split on semicolons and executed individually, so the FlowFile is routed to success
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_SUCCESS, 1);
runner.getFlowFilesForRelationship(PutHive_1_1QL.REL_SUCCESS)
.forEach(f -> f.assertAttributeEquals(PutHive_1_1QL.ATTR_OUTPUT_TABLES, "PERSONS"));
// Now we can check that the values were inserted by the multi-statement script.
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1), "Record ID mismatch");
assertEquals( "George", rs.getString(2), "Record NAME mismatch");
}
}
}
@Test
public void testMultipleStatementsWithinFlowFilePlusEmbeddedDelimiter(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George\\;' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// The statements are split on unescaped semicolons; the escaped semicolon is preserved and the FlowFile is routed to success
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_SUCCESS, 1);
// Now we can check that the values were inserted by the multi-statement script.
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1), "Record ID mismatch");
assertEquals( "George\\;", rs.getString(2), "Record NAME mismatch");
}
}
}
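// NOTE (assumption, not asserted by the original tests in these exact terms): the two
// multi-statement tests above rely on PutHive_1_1QL splitting the FlowFile content on unescaped
// semicolons and executing each fragment as its own prepared statement; the embedded "\;" in the
// second test is apparently not treated as a delimiter, which is why both FlowFiles still reach
// REL_SUCCESS with the UPDATE applied.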
@Test
public void testWithNullParameter(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
runner.enqueue("INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?)".getBytes(), attributes);
runner.run();
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_SUCCESS, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
assertEquals(1, rs.getInt(1));
assertEquals("Mark", rs.getString(2));
assertEquals(0, rs.getInt(3));
assertFalse(rs.next());
}
}
}
@Test
public void testInvalidStatement(@TempDir Path tempDir) throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockDBCPService(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
stmt.executeUpdate(createPersons);
}
}
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE SOME_RANDOM_TABLE NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// should fail because the table is invalid
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_FAILURE, 1);
try (final Connection conn = service.getConnection()) {
try (final Statement stmt = conn.createStatement()) {
final ResultSet rs = stmt.executeQuery("SELECT * FROM PERSONS");
assertTrue(rs.next());
}
}
}
@Test
public void testRetryableFailure() throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final DBCPService service = new SQLExceptionService(null);
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// the mocked connection throws a retryable SQLException on prepareStatement, so the FlowFile is routed to retry
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_RETRY, 1);
}
@Test
public void testRetryableFailureRollbackOnFailure() throws InitializationException, ProcessException, SQLException, IOException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final DBCPService service = new SQLExceptionService(null);
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
runner.setProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE, "true");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
AssertionError e = assertThrows(AssertionError.class, () -> runner.run());
assertTrue(e.getCause() instanceof ProcessException);
assertEquals(1, runner.getQueueSize().getObjectCount());
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_RETRY, 0);
}
@Test
public void testUnknownFailure() throws InitializationException, ProcessException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final SQLExceptionService service = new SQLExceptionService(null);
service.setErrorCode(2);
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
runner.run();
// the mocked connection throws a SQLException on prepareStatement, so the FlowFile is routed to retry
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_RETRY, 1);
}
@Test
public void testUnknownFailureRollbackOnFailure() throws InitializationException, ProcessException {
final TestRunner runner = TestRunners.newTestRunner(PutHive_1_1QL.class);
final SQLExceptionService service = new SQLExceptionService(null);
service.setErrorCode(0);
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(PutHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
runner.setProperty(RollbackOnFailure.ROLLBACK_ON_FAILURE, "true");
final String sql = "INSERT INTO PERSONS (ID, NAME, CODE) VALUES (?, ?, ?); " +
"UPDATE PERSONS SET NAME='George' WHERE ID=?; ";
final Map<String, String> attributes = new HashMap<>();
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.2.type", String.valueOf(Types.VARCHAR));
attributes.put("hiveql.args.2.value", "Mark");
attributes.put("hiveql.args.3.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.3.value", "84");
attributes.put("hiveql.args.4.type", String.valueOf(Types.INTEGER));
attributes.put("hiveql.args.4.value", "1");
runner.enqueue(sql.getBytes(), attributes);
AssertionError e = assertThrows(AssertionError.class, () -> runner.run());
assertTrue(e.getCause() instanceof ProcessException);
assertEquals(1, runner.getQueueSize().getObjectCount());
runner.assertAllFlowFilesTransferred(PutHive_1_1QL.REL_RETRY, 0);
}
/**
* Simple implementation only for testing purposes
*/
private static class MockDBCPService extends AbstractControllerService implements Hive_1_1DBCPService {
private final String dbLocation;
MockDBCPService(final String dbLocation) {
this.dbLocation = dbLocation;
}
@Override
public String getIdentifier() {
return "dbcp";
}
@Override
public Connection getConnection() throws ProcessException {
try {
Class.forName("org.apache.derby.jdbc.EmbeddedDriver");
return DriverManager.getConnection("jdbc:derby:" + dbLocation + ";create=true");
} catch (final Exception e) {
e.printStackTrace();
throw new ProcessException("getConnection failed: " + e);
}
}
@Override
public String getConnectionURL() {
return "jdbc:derby:" + dbLocation + ";create=true";
}
}
/**
* Simple implementation only for testing purposes
*/
private static class SQLExceptionService extends AbstractControllerService implements Hive_1_1DBCPService {
private final Hive_1_1DBCPService service;
private int allowedBeforeFailure = 0;
private int successful = 0;
private int errorCode = 30000; // Default to a retryable exception code
SQLExceptionService(final Hive_1_1DBCPService service) {
this.service = service;
}
@Override
public String getIdentifier() {
return "dbcp";
}
@Override
public Connection getConnection() throws ProcessException {
try {
if (++successful > allowedBeforeFailure) {
final Connection conn = Mockito.mock(Connection.class);
Mockito.when(conn.prepareStatement(Mockito.any(String.class))).thenThrow(new SQLException("Unit Test Generated SQLException", "42000", errorCode));
return conn;
} else {
return service.getConnection();
}
} catch (final Exception e) {
e.printStackTrace();
throw new ProcessException("getConnection failed: " + e);
}
}
@Override
public String getConnectionURL() {
return service != null ? service.getConnectionURL() : null;
}
void setErrorCode(int errorCode) {
this.errorCode = errorCode;
}
}
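// NOTE (assumption): SQLExceptionService lets the tests above control the error code carried by the
// SQLException thrown from prepareStatement(); the values exercised here (the default 30000, plus 2
// and 0 via setErrorCode) presumably determine whether PutHive_1_1QL treats the failure as
// retryable, which is why testRetryableFailure and testUnknownFailure both assert REL_RETRY.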
}

View File

@ -1,661 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.dbcp.DBCPService;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
import org.apache.nifi.flowfile.attributes.CoreAttributes;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.provenance.ProvenanceEventRecord;
import org.apache.nifi.provenance.ProvenanceEventType;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.apache.nifi.util.hive.HiveJdbcCommon;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Types;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import static org.apache.nifi.processors.hive.SelectHive_1_1QL.HIVEQL_OUTPUT_FORMAT;
import static org.apache.nifi.util.hive.HiveJdbcCommon.AVRO;
import static org.apache.nifi.util.hive.HiveJdbcCommon.CSV;
import static org.apache.nifi.util.hive.HiveJdbcCommon.CSV_MIME_TYPE;
import static org.apache.nifi.util.hive.HiveJdbcCommon.MIME_TYPE_AVRO_BINARY;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestSelectHive_1_1QL {
private static final Logger LOGGER;
private final static String MAX_ROWS_KEY = "maxRows";
private final int NUM_OF_ROWS = 100;
static {
System.setProperty("org.slf4j.simpleLogger.defaultLogLevel", "info");
System.setProperty("org.slf4j.simpleLogger.showDateTime", "true");
System.setProperty("org.slf4j.simpleLogger.log.nifi.io.nio", "debug");
System.setProperty("org.slf4j.simpleLogger.log.nifi.processors.hive.SelectHive_1_1QL", "debug");
System.setProperty("org.slf4j.simpleLogger.log.nifi.processors.hive.TestSelectHive_1_1QL", "debug");
LOGGER = LoggerFactory.getLogger(TestSelectHive_1_1QL.class);
}
private final static String DB_LOCATION = "target/db";
private final static String QUERY_WITH_EL = "select "
+ " PER.ID as PersonId, PER.NAME as PersonName, PER.CODE as PersonCode"
+ " from persons PER"
+ " where PER.ID > ${person.id}";
private final static String QUERY_WITHOUT_EL = "select "
+ " PER.ID as PersonId, PER.NAME as PersonName, PER.CODE as PersonCode"
+ " from persons PER"
+ " where PER.ID > 10";
@BeforeAll
public static void setupClass() {
System.setProperty("derby.stream.error.file", "target/derby.log");
}
private TestRunner runner;
@BeforeEach
public void setup() throws InitializationException {
final DBCPService dbcp = new DBCPServiceSimpleImpl();
final Map<String, String> dbcpProperties = new HashMap<>();
runner = TestRunners.newTestRunner(SelectHive_1_1QL.class);
runner.addControllerService("dbcp", dbcp, dbcpProperties);
runner.enableControllerService(dbcp);
runner.setProperty(SelectHive_1_1QL.HIVE_DBCP_SERVICE, "dbcp");
}
@Test
public void testIncomingConnectionWithNoFlowFile() {
runner.setIncomingConnection(true);
runner.setProperty(SelectHive_1_1QL.HIVEQL_SELECT_QUERY, "SELECT * FROM persons");
runner.run();
runner.assertTransferCount(SelectHive_1_1QL.REL_SUCCESS, 0);
runner.assertTransferCount(SelectHive_1_1QL.REL_FAILURE, 0);
}
@Test
public void testNoIncomingConnection() throws ClassNotFoundException, SQLException, InitializationException, IOException {
runner.setIncomingConnection(false);
invokeOnTrigger(QUERY_WITHOUT_EL, false, "Avro");
final List<ProvenanceEventRecord> provenanceEvents = runner.getProvenanceEvents();
final ProvenanceEventRecord provenance0 = provenanceEvents.get(0);
assertEquals(ProvenanceEventType.RECEIVE, provenance0.getEventType());
assertEquals("jdbc:derby:target/db;create=true", provenance0.getTransitUri());
}
@Test
public void testNoTimeLimit() throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITH_EL, true, "Avro");
final List<ProvenanceEventRecord> provenanceEvents = runner.getProvenanceEvents();
assertEquals(4, provenanceEvents.size());
final ProvenanceEventRecord provenance0 = provenanceEvents.get(0);
assertEquals(ProvenanceEventType.FORK, provenance0.getEventType());
final ProvenanceEventRecord provenance1 = provenanceEvents.get(1);
assertEquals(ProvenanceEventType.FETCH, provenance1.getEventType());
assertEquals("jdbc:derby:target/db;create=true", provenance1.getTransitUri());
final ProvenanceEventRecord provenance2 = provenanceEvents.get(2);
assertEquals(ProvenanceEventType.FORK, provenance2.getEventType());
final ProvenanceEventRecord provenance3 = provenanceEvents.get(3);
assertEquals(ProvenanceEventType.DROP, provenance3.getEventType());
}
@Test
public void testWithNullIntColumn() throws SQLException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((Hive_1_1DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
try {
stmt.execute("drop table TEST_NULL_INT");
} catch (final SQLException sqle) {
// Nothing to do, probably means the table didn't exist
}
stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, constraint my_pk primary key (id))");
stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (0, NULL, 1)");
stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (1, 1, 1)");
runner.setIncomingConnection(false);
runner.setProperty(SelectHive_1_1QL.HIVEQL_SELECT_QUERY, "SELECT * FROM TEST_NULL_INT");
runner.run();
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_SUCCESS, 1);
runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_SUCCESS).get(0).assertAttributeEquals(SelectHive_1_1QL.RESULT_ROW_COUNT, "2");
}
@Test
public void testWithSqlException() throws SQLException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((Hive_1_1DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
try {
stmt.execute("drop table TEST_NO_ROWS");
} catch (final SQLException sqle) {
// Nothing to do, probably means the table didn't exist
}
stmt.execute("create table TEST_NO_ROWS (id integer)");
runner.setIncomingConnection(false);
// Try a valid SQL statement that will generate an error (val1 does not exist, e.g.)
runner.setProperty(SelectHive_1_1QL.HIVEQL_SELECT_QUERY, "SELECT val1 FROM TEST_NO_ROWS");
runner.run();
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_FAILURE, 1);
}
@Test
public void invokeOnTriggerExceptionInPreQueriesNoIncomingFlows()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
doOnTrigger(QUERY_WITHOUT_EL, false, CSV,
"select 'no exception' from persons; select exception from persons",
null);
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_FAILURE, 1);
}
@Test
public void invokeOnTriggerExceptionInPreQueriesWithIncomingFlows()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
doOnTrigger(QUERY_WITHOUT_EL, true, CSV,
"select 'no exception' from persons; select exception from persons",
null);
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_FAILURE, 1);
}
@Test
public void invokeOnTriggerExceptionInPostQueriesNoIncomingFlows()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
doOnTrigger(QUERY_WITHOUT_EL, false, CSV,
null,
"select 'no exception' from persons; select exception from persons");
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_FAILURE, 1);
}
@Test
public void invokeOnTriggerExceptionInPostQueriesWithIncomingFlows()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
doOnTrigger(QUERY_WITHOUT_EL, true, CSV,
null,
"select 'no exception' from persons; select exception from persons");
// with incoming connections, it should be rolled back
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_FAILURE, 1);
}
@Test
public void testWithBadSQL() throws SQLException {
final String BAD_SQL = "create table TEST_NO_ROWS (id integer)";
// Test with incoming flow file (it should be routed to failure intact, i.e. same content and no parent)
runner.setIncomingConnection(true);
// Enqueue a statement that is valid SQL but not a SELECT query, which generates an error
runner.enqueue(BAD_SQL);
runner.run();
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_FAILURE, 1);
MockFlowFile flowFile = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_FAILURE).get(0);
flowFile.assertContentEquals(BAD_SQL);
flowFile.assertAttributeEquals("parentIds", null);
runner.clearTransferState();
// Test with no incoming flow file (an empty flow file is transferred)
runner.setIncomingConnection(false);
// Try the same non-SELECT statement, this time configured as the query property
runner.setProperty(SelectHive_1_1QL.HIVEQL_SELECT_QUERY, BAD_SQL);
runner.run();
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_FAILURE, 1);
flowFile = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_FAILURE).get(0);
flowFile.assertContentEquals("");
}
@Test
public void invokeOnTriggerWithCsv()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, CSV);
}
@Test
public void invokeOnTriggerWithAvro()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, AVRO);
}
@Test
public void invokeOnTriggerWithValidPreQueries()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, CSV,
"select '1' from persons; select '2' from persons", //should not be 'select'. But Derby driver doesn't support "set param=val" format.
null);
}
@Test
public void invokeOnTriggerWithValidPostQueries()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, CSV,
null,
// post-queries would normally be "set param=val" statements, but the Derby driver doesn't support
// that form, so any compilable query is provided instead.
" select '4' from persons; \nselect '5' from persons");
}
@Test
public void invokeOnTriggerWithValidPrePostQueries()
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(QUERY_WITHOUT_EL, false, CSV,
// pre/post queries would normally be "set param=val" statements, but the Derby driver doesn't support
// that form, so any compilable queries are provided instead.
"select '1' from persons; select '2' from persons",
" select '4' from persons; \nselect '5' from persons");
}
public void invokeOnTrigger(final String query, final boolean incomingFlowFile, String outputFormat)
throws InitializationException, ClassNotFoundException, SQLException, IOException {
invokeOnTrigger(query, incomingFlowFile, outputFormat, null, null);
}
public void invokeOnTrigger(final String query, final boolean incomingFlowFile, String outputFormat,
String preQueries, String postQueries)
throws InitializationException, ClassNotFoundException, SQLException, IOException {
TestRunner runner = doOnTrigger(query, incomingFlowFile, outputFormat, preQueries, postQueries);
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_SUCCESS, 1);
final List<MockFlowFile> flowfiles = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_SUCCESS);
MockFlowFile flowFile = flowfiles.get(0);
final InputStream in = new ByteArrayInputStream(flowFile.toByteArray());
long recordsFromStream = 0;
if (AVRO.equals(outputFormat)) {
assertEquals(MIME_TYPE_AVRO_BINARY, flowFile.getAttribute(CoreAttributes.MIME_TYPE.key()));
final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
GenericRecord record = null;
while (dataFileReader.hasNext()) {
// Reuse record object by passing it to next(). This saves us from
// allocating and garbage collecting many objects for files with
// many items.
record = dataFileReader.next(record);
recordsFromStream++;
}
}
} else {
assertEquals(CSV_MIME_TYPE, flowFile.getAttribute(CoreAttributes.MIME_TYPE.key()));
BufferedReader br = new BufferedReader(new InputStreamReader(in));
String headerRow = br.readLine();
// Derby capitalizes column names
assertEquals("PERSONID,PERSONNAME,PERSONCODE", headerRow);
// Validate rows
String line;
while ((line = br.readLine()) != null) {
recordsFromStream++;
String[] values = line.split(",");
if (recordsFromStream < (NUM_OF_ROWS - 10)) {
assertEquals(3, values.length);
assertTrue(values[1].startsWith("\""));
assertTrue(values[1].endsWith("\""));
} else {
assertEquals(2, values.length); // The last row's CODE is null, so the trailing CSV field is dropped
}
}
}
assertEquals(NUM_OF_ROWS - 10, recordsFromStream);
assertEquals(recordsFromStream, Integer.parseInt(flowFile.getAttribute(SelectHive_1_1QL.RESULT_ROW_COUNT)));
flowFile.assertAttributeEquals(AbstractHive_1_1QLProcessor.ATTR_INPUT_TABLES, "persons");
}
public TestRunner doOnTrigger(final String query, final boolean incomingFlowFile, String outputFormat,
String preQueries, String postQueries)
throws InitializationException, ClassNotFoundException, SQLException, IOException {
// remove previous test database, if any
final File dbLocation = new File(DB_LOCATION);
dbLocation.delete();
// load test data to database
final Connection con = ((Hive_1_1DBCPService) runner.getControllerService("dbcp")).getConnection();
final Statement stmt = con.createStatement();
try {
stmt.execute("drop table persons");
} catch (final SQLException sqle) {
// Nothing to do here, the table didn't exist
}
stmt.execute("create table persons (id integer, name varchar(100), code integer)");
Random rng = new Random(53496);
stmt.executeUpdate("insert into persons values (1, 'Joe Smith', " + rng.nextInt(469947) + ")");
for (int i = 2; i < NUM_OF_ROWS; i++) {
stmt.executeUpdate("insert into persons values (" + i + ", 'Someone Else', " + rng.nextInt(469947) + ")");
}
stmt.executeUpdate("insert into persons values (" + NUM_OF_ROWS + ", 'Last Person', NULL)");
LOGGER.info("test data loaded");
runner.setProperty(SelectHive_1_1QL.HIVEQL_SELECT_QUERY, query);
runner.setProperty(HIVEQL_OUTPUT_FORMAT, outputFormat);
if (preQueries != null) {
runner.setProperty(SelectHive_1_1QL.HIVEQL_PRE_QUERY, preQueries);
}
if (postQueries != null) {
runner.setProperty(SelectHive_1_1QL.HIVEQL_POST_QUERY, postQueries);
}
if (incomingFlowFile) {
// incoming FlowFile content is not used, but attributes are used
final Map<String, String> attributes = new HashMap<>();
attributes.put("person.id", "10");
runner.enqueue("Hello".getBytes(), attributes);
}
runner.setIncomingConnection(incomingFlowFile);
runner.run();
return runner;
}
@Test
public void testMaxRowsPerFlowFileAvro() throws ClassNotFoundException, SQLException, InitializationException, IOException {
// load test data to database
final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
InputStream in;
MockFlowFile mff;
try {
stmt.execute("drop table TEST_QUERY_DB_TABLE");
} catch (final SQLException sqle) {
// Ignore this error, probably a "table does not exist" since Derby doesn't yet support DROP IF EXISTS [DERBY-4842]
}
stmt.execute("create table TEST_QUERY_DB_TABLE (id integer not null, name varchar(100), scale float, created_on timestamp, bignum bigint default 0)");
int rowCount = 0;
//create larger row set
for (int batch = 0; batch < 100; batch++) {
stmt.execute("insert into TEST_QUERY_DB_TABLE (id, name, scale, created_on) VALUES (" + rowCount + ", 'Joe Smith', 1.0, '1962-09-23 03:23:34.234')");
rowCount++;
}
runner.setIncomingConnection(false);
runner.setProperty(SelectHive_1_1QL.HIVEQL_SELECT_QUERY, "SELECT * FROM TEST_QUERY_DB_TABLE");
runner.setProperty(SelectHive_1_1QL.MAX_ROWS_PER_FLOW_FILE, "${" + MAX_ROWS_KEY + "}");
runner.setProperty(SelectHive_1_1QL.HIVEQL_OUTPUT_FORMAT, HiveJdbcCommon.AVRO);
runner.setVariable(MAX_ROWS_KEY, "9");
runner.run();
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_SUCCESS, 12);
//ensure all but the last file have 9 records each
for (int ff = 0; ff < 11; ff++) {
mff = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_SUCCESS).get(ff);
in = new ByteArrayInputStream(mff.toByteArray());
assertEquals(9, getNumberOfRecordsFromStream(in));
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(ff), mff.getAttribute("fragment.index"));
assertEquals("12", mff.getAttribute("fragment.count"));
}
//last file should have 1 record
mff = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_SUCCESS).get(11);
in = new ByteArrayInputStream(mff.toByteArray());
assertEquals(1, getNumberOfRecordsFromStream(in));
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(11), mff.getAttribute("fragment.index"));
assertEquals("12", mff.getAttribute("fragment.count"));
runner.clearTransferState();
}
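// A hedged sketch (assumption: the real getNumberOfRecordsFromStream helper, presumably defined
// further down in this file, behaves like this) of counting Avro records in a FlowFile, mirroring
// the reading loop already used in invokeOnTrigger() above:
private long getNumberOfRecordsFromStreamSketch(InputStream in) throws IOException {
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    long count = 0;
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            // Reuse the record object to avoid allocating one per row
            record = dataFileReader.next(record);
            count++;
        }
    }
    return count;
}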
@Test
public void testParametrizedQuery() throws ClassNotFoundException, SQLException, InitializationException, IOException {
// load test data to database
final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
try {
stmt.execute("drop table TEST_QUERY_DB_TABLE");
} catch (final SQLException sqle) {
// Ignore this error, probably a "table does not exist" since Derby doesn't yet support DROP IF EXISTS [DERBY-4842]
}
stmt.execute("create table TEST_QUERY_DB_TABLE (id integer not null, name varchar(100), scale float, created_on timestamp, bignum bigint default 0)");
int rowCount = 0;
//create larger row set
for (int batch = 0; batch < 100; batch++) {
stmt.execute("insert into TEST_QUERY_DB_TABLE (id, name, scale, created_on) VALUES (" + rowCount + ", 'Joe Smith', 1.0, '1962-09-23 03:23:34.234')");
rowCount++;
}
runner.setIncomingConnection(true);
runner.setProperty(SelectHive_1_1QL.MAX_ROWS_PER_FLOW_FILE, "${" + MAX_ROWS_KEY + "}");
runner.setProperty(SelectHive_1_1QL.HIVEQL_OUTPUT_FORMAT, HiveJdbcCommon.AVRO);
runner.setVariable(MAX_ROWS_KEY, "9");
final Map<String, String> attributes = new HashMap<>();
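// Positional JDBC parameters are read from paired hiveql.args.N.type / hiveql.args.N.value attributes, where N is the 1-based parameter index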
attributes.put("hiveql.args.1.value", "1");
attributes.put("hiveql.args.1.type", String.valueOf(Types.INTEGER));
runner.enqueue("SELECT * FROM TEST_QUERY_DB_TABLE WHERE id = ?", attributes );
runner.run();
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_SUCCESS, 1);
MockFlowFile flowFile = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_SUCCESS).get(0);
// Assert the attributes from the incoming flow file are preserved in the outgoing flow file(s)
flowFile.assertAttributeEquals("hiveql.args.1.value", "1");
flowFile.assertAttributeEquals("hiveql.args.1.type", String.valueOf(Types.INTEGER));
runner.clearTransferState();
}
@Test
public void testMaxRowsPerFlowFileCSV() throws ClassNotFoundException, SQLException, InitializationException, IOException {
// load test data to database
final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
InputStream in;
MockFlowFile mff;
try {
stmt.execute("drop table TEST_QUERY_DB_TABLE");
} catch (final SQLException sqle) {
// Ignore this error; it is most likely "table does not exist" because Derby does not support DROP TABLE IF EXISTS [DERBY-4842]
}
stmt.execute("create table TEST_QUERY_DB_TABLE (id integer not null, name varchar(100), scale float, created_on timestamp, bignum bigint default 0)");
int rowCount = 0;
//create larger row set
for (int batch = 0; batch < 100; batch++) {
stmt.execute("insert into TEST_QUERY_DB_TABLE (id, name, scale, created_on) VALUES (" + rowCount + ", 'Joe Smith', 1.0, '1962-09-23 03:23:34.234')");
rowCount++;
}
runner.setIncomingConnection(true);
runner.setProperty(SelectHive_1_1QL.MAX_ROWS_PER_FLOW_FILE, "${" + MAX_ROWS_KEY + "}");
runner.setProperty(SelectHive_1_1QL.HIVEQL_OUTPUT_FORMAT, HiveJdbcCommon.CSV);
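// Here the max-rows value is supplied as a FlowFile attribute rather than through the variable registry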
runner.enqueue("SELECT * FROM TEST_QUERY_DB_TABLE", new HashMap<String, String>() {{
put(MAX_ROWS_KEY, "9");
}});
runner.run();
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_SUCCESS, 12);
//ensure all but the last file have 9 records (10 lines = 9 records + header) each
for (int ff = 0; ff < 11; ff++) {
mff = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_SUCCESS).get(ff);
in = new ByteArrayInputStream(mff.toByteArray());
BufferedReader br = new BufferedReader(new InputStreamReader(in));
assertEquals(10, br.lines().count());
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(ff), mff.getAttribute("fragment.index"));
assertEquals("12", mff.getAttribute("fragment.count"));
}
//last file should have 1 record (2 lines = 1 record + header)
mff = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_SUCCESS).get(11);
in = new ByteArrayInputStream(mff.toByteArray());
BufferedReader br = new BufferedReader(new InputStreamReader(in));
assertEquals(2, br.lines().count());
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(11), mff.getAttribute("fragment.index"));
assertEquals("12", mff.getAttribute("fragment.count"));
runner.clearTransferState();
}
@Test
public void testMaxRowsPerFlowFileWithMaxFragments() throws ClassNotFoundException, SQLException, InitializationException, IOException {
// load test data to database
final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
Statement stmt = con.createStatement();
InputStream in;
MockFlowFile mff;
try {
stmt.execute("drop table TEST_QUERY_DB_TABLE");
} catch (final SQLException sqle) {
// Ignore this error; it is most likely "table does not exist" because Derby does not support DROP TABLE IF EXISTS [DERBY-4842]
}
stmt.execute("create table TEST_QUERY_DB_TABLE (id integer not null, name varchar(100), scale float, created_on timestamp, bignum bigint default 0)");
int rowCount = 0;
//create larger row set
for (int batch = 0; batch < 100; batch++) {
stmt.execute("insert into TEST_QUERY_DB_TABLE (id, name, scale, created_on) VALUES (" + rowCount + ", 'Joe Smith', 1.0, '1962-09-23 03:23:34.234')");
rowCount++;
}
runner.setIncomingConnection(false);
runner.setProperty(SelectHive_1_1QL.HIVEQL_SELECT_QUERY, "SELECT * FROM TEST_QUERY_DB_TABLE");
runner.setProperty(SelectHive_1_1QL.MAX_ROWS_PER_FLOW_FILE, "9");
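// Max Fragments caps the output: only the first three 9-row FlowFiles are emitted even though more rows remain in the table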
Integer maxFragments = 3;
runner.setProperty(SelectHive_1_1QL.MAX_FRAGMENTS, maxFragments.toString());
runner.run();
runner.assertAllFlowFilesTransferred(SelectHive_1_1QL.REL_SUCCESS, maxFragments);
for (int i = 0; i < maxFragments; i++) {
mff = runner.getFlowFilesForRelationship(SelectHive_1_1QL.REL_SUCCESS).get(i);
in = new ByteArrayInputStream(mff.toByteArray());
assertEquals(9, getNumberOfRecordsFromStream(in));
mff.assertAttributeExists("fragment.identifier");
assertEquals(Integer.toString(i), mff.getAttribute("fragment.index"));
assertEquals(maxFragments.toString(), mff.getAttribute("fragment.count"));
}
runner.clearTransferState();
}
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
GenericRecord record = null;
long recordsFromStream = 0;
while (dataFileReader.hasNext()) {
// Reuse record object by passing it to next(). This saves us from
// allocating and garbage collecting many objects for files with
// many items.
record = dataFileReader.next(record);
recordsFromStream += 1;
}
return recordsFromStream;
}
}
/**
* Simple implementation only for SelectHive_1_1QL processor testing.
*/
private class DBCPServiceSimpleImpl extends AbstractControllerService implements Hive_1_1DBCPService {
@Override
public String getIdentifier() {
return "dbcp";
}
@Override
public Connection getConnection() throws ProcessException {
try {
Class.forName("org.apache.derby.jdbc.EmbeddedDriver");
return DriverManager.getConnection("jdbc:derby:" + DB_LOCATION + ";create=true");
} catch (final Exception e) {
throw new ProcessException("getConnection failed", e);
}
}
@Override
public String getConnectionURL() {
return "jdbc:derby:" + DB_LOCATION + ";create=true";
}
}
}

View File

@ -1,444 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.hive;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nifi.controller.AbstractControllerService;
import org.apache.nifi.dbcp.DBCPService;
import org.apache.nifi.dbcp.hive.Hive_1_1DBCPService;
import org.apache.nifi.logging.ComponentLog;
import org.apache.nifi.processor.exception.ProcessException;
import org.apache.nifi.reporting.InitializationException;
import org.apache.nifi.schema.access.SchemaNotFoundException;
import org.apache.nifi.serialization.RecordReader;
import org.apache.nifi.serialization.SimpleRecordSchema;
import org.apache.nifi.serialization.record.MockRecordParser;
import org.apache.nifi.serialization.record.RecordField;
import org.apache.nifi.serialization.record.RecordFieldType;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledOnOs;
import org.junit.jupiter.api.condition.OS;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.stubbing.Answer;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiFunction;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@DisabledOnOs(OS.WINDOWS)
public class TestUpdateHive_1_1Table {
private static final String TEST_CONF_PATH = "src/test/resources/core-site.xml";
private static final String TARGET_HIVE = "target/hive";
private static final String[] SHOW_TABLES_COLUMN_NAMES = new String[]{"tab_name"};
private static final String[][] SHOW_TABLES_RESULTSET = new String[][]{
new String[]{"messages"},
new String[]{"users"},
};
private static final String[] DESC_MESSAGES_TABLE_COLUMN_NAMES = new String[]{"id", "msg"};
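// Mimics "DESC FORMATTED messages" output, including the partition columns and the detailed-information section carrying the table location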
private static final String[][] DESC_MESSAGES_TABLE_RESULTSET = new String[][]{
new String[]{"# col_name", "data_type", "comment"},
new String[]{"id", "int", ""},
new String[]{"msg", "string", ""},
new String[]{"", null, null},
new String[]{"# Partition Information", null, null},
new String[]{"# col_name", "data_type", "comment"},
new String[]{"continent", "string", ""},
new String[]{"country", "string", ""},
new String[]{"", null, null},
new String[]{"# Detailed Table Information", null, null},
new String[]{"Location:", "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/messages", null}
};
private static final String[] DESC_USERS_TABLE_COLUMN_NAMES = new String[]{"name", "favorite_number", "favorite_color", "scale"};
private static final String[][] DESC_USERS_TABLE_RESULTSET = new String[][]{
new String[]{"name", "string", ""},
new String[]{"favorite_number", "int", ""},
new String[]{"favorite_color", "string", ""},
new String[]{"scale", "double", ""},
new String[]{"", null, null},
new String[]{"# Detailed Table Information", null, null},
new String[]{"Location:", "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/users", null}
};
private static final String[][] DESC_EXTERNAL_USERS_TABLE_RESULTSET = new String[][]{
new String[]{"name", "string", ""},
new String[]{"favorite_number", "int", ""},
new String[]{"favorite_color", "string", ""},
new String[]{"scale", "double", ""},
new String[]{"", null, null},
new String[]{"# Detailed Table Information", null, null},
new String[]{"Location:", "hdfs://mycluster:8020/path/to/users", null}
};
private static final String[] DESC_NEW_TABLE_COLUMN_NAMES = DESC_USERS_TABLE_COLUMN_NAMES;
private static final String[][] DESC_NEW_TABLE_RESULTSET = new String[][]{
new String[]{"", null, null},
new String[]{"name", "string", ""},
new String[]{"favorite_number", "int", ""},
new String[]{"favorite_color", "string", ""},
new String[]{"scale", "double", ""},
new String[]{"", null, null},
new String[]{"# Detailed Table Information", null, null},
new String[]{"Location:", "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/_newTable", null}
};
private TestRunner runner;
private UpdateHive_1_1Table processor;
@BeforeEach
public void setUp() {
Configuration testConf = new Configuration();
testConf.addResource(new Path(TEST_CONF_PATH));
// Delete any temp files from previous tests
try {
FileUtils.deleteDirectory(new File(TARGET_HIVE));
} catch (IOException ioe) {
// Do nothing, directory may not have existed
}
processor = new UpdateHive_1_1Table();
}
private void configure(final UpdateHive_1_1Table processor, final int numUsers) throws InitializationException {
configure(processor, numUsers, false, -1);
}
private void configure(final UpdateHive_1_1Table processor, final int numUsers, boolean failOnCreateReader, int failAfter) throws InitializationException {
configure(processor, numUsers, failOnCreateReader, failAfter, null);
}
private void configure(final UpdateHive_1_1Table processor, final int numUsers, final boolean failOnCreateReader, final int failAfter,
final BiFunction<Integer, MockRecordParser, Void> recordGenerator) throws InitializationException {
runner = TestRunners.newTestRunner(processor);
MockRecordParser readerFactory = new MockRecordParser() {
@Override
public RecordReader createRecordReader(Map<String, String> variables, InputStream in, long inputLength, ComponentLog logger) throws IOException, SchemaNotFoundException {
if (failOnCreateReader) {
throw new SchemaNotFoundException("test");
}
return super.createRecordReader(variables, in, inputLength, logger);
}
};
List<RecordField> fields = Arrays.asList(
new RecordField("name", RecordFieldType.STRING.getDataType()),
new RecordField("favorite_number", RecordFieldType.INT.getDataType()),
new RecordField("favorite_color", RecordFieldType.STRING.getDataType()),
new RecordField("scale", RecordFieldType.DOUBLE.getDataType())
);
final SimpleRecordSchema recordSchema = new SimpleRecordSchema(fields);
for (final RecordField recordField : recordSchema.getFields()) {
readerFactory.addSchemaField(recordField.getFieldName(), recordField.getDataType().getFieldType(), recordField.isNullable());
}
if (recordGenerator == null) {
for (int i = 0; i < numUsers; i++) {
readerFactory.addRecord("name" + i, i, "blue" + i, i * 10.0);
}
} else {
recordGenerator.apply(numUsers, readerFactory);
}
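// failAfter(-1), passed by the tests that do not exercise read failures, effectively disables the mock reader's simulated failure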
readerFactory.failAfter(failAfter);
runner.addControllerService("mock-reader-factory", readerFactory);
runner.enableControllerService(readerFactory);
runner.setProperty(UpdateHive_1_1Table.RECORD_READER, "mock-reader-factory");
}
@Test
public void testSetup(@TempDir java.nio.file.Path tempDir) throws Exception {
configure(processor, 0);
runner.assertNotValid();
final File dbDir = tempDir.resolve("db").toFile();
final DBCPService service = new MockHiveConnectionPool(dbDir.getAbsolutePath());
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHive_1_1Table.HIVE_DBCP_SERVICE, "dbcp");
runner.assertNotValid();
runner.setProperty(UpdateHive_1_1Table.TABLE_NAME, "users");
runner.assertValid();
runner.run();
}
@Test
public void testNoStatementsExecuted() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHive_1_1Table.TABLE_NAME, "users");
final MockHiveConnectionPool service = new MockHiveConnectionPool("test");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHive_1_1Table.HIVE_DBCP_SERVICE, "dbcp");
runner.setProperty(UpdateHive_1_1Table.PARTITION_CLAUSE, "continent, country");
final Map<String, String> attrs = new HashMap<>();
attrs.put("continent", "Asia");
attrs.put("country", "China");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHive_1_1Table.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHive_1_1Table.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_TABLE, "users");
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/users");
assertTrue(service.getExecutedStatements().isEmpty());
}
@Test
public void testCreateManagedTable() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHive_1_1Table.TABLE_NAME, "${table.name}");
runner.setProperty(UpdateHive_1_1Table.CREATE_TABLE, UpdateHive_1_1Table.CREATE_IF_NOT_EXISTS);
runner.setProperty(UpdateHive_1_1Table.TABLE_STORAGE_FORMAT, UpdateHive_1_1Table.PARQUET);
final MockHiveConnectionPool service = new MockHiveConnectionPool("_newTable");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHive_1_1Table.HIVE_DBCP_SERVICE, "dbcp");
Map<String, String> attrs = new HashMap<>();
attrs.put("db.name", "default");
attrs.put("table.name", "_newTable");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHive_1_1Table.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHive_1_1Table.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_TABLE, "_newTable");
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/_newTable");
List<String> statements = service.getExecutedStatements();
assertEquals(1, statements.size());
assertEquals("CREATE TABLE IF NOT EXISTS `_newTable` (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE) STORED AS PARQUET",
statements.get(0));
}
@Test
public void testCreateManagedTableWithPartition() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHive_1_1Table.TABLE_NAME, "${table.name}");
runner.setProperty(UpdateHive_1_1Table.CREATE_TABLE, UpdateHive_1_1Table.CREATE_IF_NOT_EXISTS);
runner.setProperty(UpdateHive_1_1Table.PARTITION_CLAUSE, "age int");
runner.setProperty(UpdateHive_1_1Table.TABLE_STORAGE_FORMAT, UpdateHive_1_1Table.PARQUET);
final MockHiveConnectionPool service = new MockHiveConnectionPool("_newTable");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHive_1_1Table.HIVE_DBCP_SERVICE, "dbcp");
Map<String, String> attrs = new HashMap<>();
attrs.put("db.name", "default");
attrs.put("table.name", "_newTable");
attrs.put("age", "23");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHive_1_1Table.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHive_1_1Table.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_TABLE, "_newTable");
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/_newTable");
List<String> statements = service.getExecutedStatements();
assertEquals(1, statements.size());
assertEquals("CREATE TABLE IF NOT EXISTS `_newTable` (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE) PARTITIONED BY (`age` int) STORED AS PARQUET",
statements.get(0));
}
@Test
public void testCreateExternalTable() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHive_1_1Table.TABLE_NAME, "${table.name}");
runner.setProperty(UpdateHive_1_1Table.CREATE_TABLE, UpdateHive_1_1Table.CREATE_IF_NOT_EXISTS);
runner.setProperty(UpdateHive_1_1Table.TABLE_MANAGEMENT_STRATEGY, UpdateHive_1_1Table.EXTERNAL_TABLE);
runner.setProperty(UpdateHive_1_1Table.TABLE_STORAGE_FORMAT, UpdateHive_1_1Table.PARQUET);
final MockHiveConnectionPool service = new MockHiveConnectionPool("ext_users");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHive_1_1Table.HIVE_DBCP_SERVICE, "dbcp");
runner.assertNotValid(); // Needs location specified
runner.setProperty(UpdateHive_1_1Table.EXTERNAL_TABLE_LOCATION, "/path/to/users");
runner.assertValid();
Map<String, String> attrs = new HashMap<>();
attrs.put("db.name", "default");
attrs.put("table.name", "ext_users");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHive_1_1Table.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHive_1_1Table.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_TABLE, "ext_users");
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/path/to/users");
List<String> statements = service.getExecutedStatements();
assertEquals(1, statements.size());
assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS `ext_users` (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE) STORED AS PARQUET "
+ "LOCATION '/path/to/users'",
statements.get(0));
}
@Test
public void testAddColumnsAndPartition() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHive_1_1Table.TABLE_NAME, "messages");
final MockHiveConnectionPool service = new MockHiveConnectionPool("test");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHive_1_1Table.HIVE_DBCP_SERVICE, "dbcp");
runner.setProperty(UpdateHive_1_1Table.PARTITION_CLAUSE, "continent, country");
final Map<String, String> attrs = new HashMap<>();
attrs.put("continent", "Asia");
attrs.put("country", "China");
runner.enqueue(new byte[0], attrs);
runner.run();
runner.assertTransferCount(UpdateHive_1_1Table.REL_SUCCESS, 1);
final MockFlowFile flowFile = runner.getFlowFilesForRelationship(UpdateHive_1_1Table.REL_SUCCESS).get(0);
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_TABLE, "messages");
flowFile.assertAttributeEquals(UpdateHive_1_1Table.ATTR_OUTPUT_PATH, "hdfs://mycluster:8020/warehouse/tablespace/managed/hive/messages/continent=Asia/country=China");
List<String> statements = service.getExecutedStatements();
assertEquals(2, statements.size());
// All columns from users table/data should be added to the table, and a new partition should be added
assertEquals("ALTER TABLE `messages` ADD COLUMNS (`name` STRING, `favorite_number` INT, `favorite_color` STRING, `scale` DOUBLE)",
statements.get(0));
assertEquals("ALTER TABLE `messages` ADD IF NOT EXISTS PARTITION (`continent`='Asia', `country`='China')",
statements.get(1));
}
@Test
public void testMissingPartitionValues() throws Exception {
configure(processor, 1);
runner.setProperty(UpdateHive_1_1Table.TABLE_NAME, "messages");
final DBCPService service = new MockHiveConnectionPool("test");
runner.addControllerService("dbcp", service);
runner.enableControllerService(service);
runner.setProperty(UpdateHive_1_1Table.HIVE_DBCP_SERVICE, "dbcp");
runner.enqueue(new byte[0]);
runner.run();
runner.assertTransferCount(UpdateHive_1_1Table.REL_SUCCESS, 0);
runner.assertTransferCount(UpdateHive_1_1Table.REL_FAILURE, 1);
}
/**
* Simple implementation only for testing purposes
*/
private static class MockHiveConnectionPool extends AbstractControllerService implements Hive_1_1DBCPService {
private final String dbLocation;
private final List<String> executedStatements = new ArrayList<>();
MockHiveConnectionPool(final String dbLocation) {
this.dbLocation = dbLocation;
}
@Override
public String getIdentifier() {
return "dbcp";
}
@Override
public Connection getConnection() throws ProcessException {
try {
Connection conn = mock(Connection.class);
Statement s = mock(Statement.class);
when(conn.createStatement()).thenReturn(s);
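// Return canned result sets for the metadata queries (SHOW TABLES, DESC FORMATTED) issued by the processor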
when(s.executeQuery(anyString())).thenAnswer((Answer<ResultSet>) invocation -> {
final String query = (String) invocation.getArguments()[0];
if ("SHOW TABLES".equals(query)) {
return new MockResultSet(SHOW_TABLES_COLUMN_NAMES, SHOW_TABLES_RESULTSET).createResultSet();
} else if ("DESC FORMATTED `messages`".equals(query)) {
return new MockResultSet(DESC_MESSAGES_TABLE_COLUMN_NAMES, DESC_MESSAGES_TABLE_RESULTSET).createResultSet();
} else if ("DESC FORMATTED `users`".equals(query)) {
return new MockResultSet(DESC_USERS_TABLE_COLUMN_NAMES, DESC_USERS_TABLE_RESULTSET).createResultSet();
} else if ("DESC FORMATTED `ext_users`".equals(query)) {
return new MockResultSet(DESC_USERS_TABLE_COLUMN_NAMES, DESC_EXTERNAL_USERS_TABLE_RESULTSET).createResultSet();
} else if ("DESC FORMATTED `_newTable`".equals(query)) {
return new MockResultSet(DESC_NEW_TABLE_COLUMN_NAMES, DESC_NEW_TABLE_RESULTSET).createResultSet();
} else {
return new MockResultSet(new String[]{}, new String[][]{new String[]{}}).createResultSet();
}
});
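// Record every statement passed to execute() so tests can assert on the generated DDL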
when(s.execute(anyString())).thenAnswer((Answer<Boolean>) invocation -> {
executedStatements.add((String) invocation.getArguments()[0]);
return false;
});
return conn;
} catch (final Exception e) {
throw new ProcessException("getConnection failed", e);
}
}
@Override
public String getConnectionURL() {
return "jdbc:fake:" + dbLocation;
}
List<String> getExecutedStatements() {
return executedStatements;
}
}
private static class MockResultSet {
String[] colNames;
String[][] data;
int currentRow;
MockResultSet(String[] colNames, String[][] data) {
this.colNames = colNames;
this.data = data;
currentRow = 0;
}
ResultSet createResultSet() throws SQLException {
ResultSet rs = mock(ResultSet.class);
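// next() advances currentRow; getString() serves values from the current row using 1-based column indexing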
when(rs.next()).thenAnswer((Answer<Boolean>) invocation -> (data != null) && (++currentRow <= data.length));
when(rs.getString(anyInt())).thenAnswer((Answer<String>) invocation -> {
final int index = (int) invocation.getArguments()[0];
if (index < 1) {
throw new SQLException("Columns start with index 1");
}
if (currentRow > data.length) {
throw new SQLException("This result set is already closed");
}
return data[currentRow - 1][index - 1];
});
return rs;
}
}
}

View File

@ -1,38 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{
"namespace" : "org.apache.nifi",
"name" : "outer_record",
"type" : "record",
"fields" : [ {
"name" : "records",
"type" : {
"type" : "array",
"items" : {
"type" : "record",
"name" : "inner_record",
"fields" : [ {
"name" : "name",
"type" : "string"
}, {
"name" : "age",
"type" : "int"
} ]
}
}
} ]
}

View File

@ -1,30 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
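<!-- Test resource: core-site configuration with Kerberos authentication and authorization enabled -->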
<property>
<name>fs.default.name</name>
<value>hdfs://hive</value>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>kerberos</value>
</property>
<property>
<name>hadoop.security.authorization</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,22 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://hive</value>
</property>
</configuration>

View File

@ -1,30 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
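<!-- Test resource: hive-site configuration with Kerberos authentication enabled for HiveServer2 -->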
<property>
<name>fs.default.name</name>
<value>hdfs://hive</value>
</property>
<property>
<name>hive.server2.authentication</name>
<value>KERBEROS</value>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>kerberos</value>
</property>
</configuration>

View File

@ -1,22 +0,0 @@
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<configuration>
<property>
<name>fs.default.name</name>
<value>file:///</value>
</property>
</configuration>

View File

@ -1,10 +0,0 @@
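# Minimal Kerberos client configuration for the EXAMPLE.COM test realm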
[libdefaults]
default_realm = EXAMPLE.COM
dns_lookup_kdc = false
dns_lookup_realm = false
[realms]
EXAMPLE.COM = {
kdc = kerberos.example.com
admin_server = kerberos.example.com
}

View File

@ -1,26 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
{"namespace": "example.avro",
"type": "record",
"name": "User",
"fields": [
{"name": "name", "type": "string"},
{"name": "favorite_number", "type": ["int", "null"]},
{"name": "favorite_color", "type": ["string", "null"]},
{"name": "scale", "type": ["double", "null"]}
]
}

View File

@ -29,10 +29,6 @@
<modules>
<module>nifi-hive-services-api</module>
<module>nifi-hive-services-api-nar</module>
<module>nifi-hive-processors</module>
<module>nifi-hive-nar</module>
<module>nifi-hive_1_1-processors</module>
<module>nifi-hive_1_1-nar</module>
<module>nifi-hive3-processors</module>
<module>nifi-hive3-nar</module>
<module>nifi-hive-test-utils</module>
@ -104,20 +100,10 @@
<artifactId>ant</artifactId>
<version>1.10.12</version>
</dependency>
<!-- Override Xerces 2.9.1 in Hive 1.1 and 1.2 -->
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
<version>2.12.2</version>
</dependency>
</dependencies>
</dependencyManagement>
<properties>
<hive11.version>1.1.1</hive11.version>
<hive11.hadoop.version>2.6.2</hive11.hadoop.version>
<hive12.version>1.2.2</hive12.version>
<hive12.hadoop.version>2.6.2</hive12.hadoop.version>
<hive3.version>3.1.3</hive3.version>
<hive.version>${hive3.version}</hive.version>
<avatica.version>1.22.0</avatica.version>