NIFI-9764: Atlas reporting task sends 'unknown' hive_table when table name is not available

Signed-off-by: Pierre Villard <pierre.villard.fr@gmail.com>

This closes #5839.
Peter Turcsanyi 2022-03-04 21:42:34 +01:00 committed by Pierre Villard
parent df00cc6cb5
commit 92202a5b95
2 changed files with 21 additions and 17 deletions

Changed file: Hive2JDBC.java

@@ -26,6 +26,7 @@ import org.apache.nifi.util.Tuple;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import java.util.Collections;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -61,6 +62,8 @@ public class Hive2JDBC extends AbstractHiveAnalyzer {
     private static final String URI_PATTERN_STR = "jdbc:hive2://([^/]+)/?(.*)$";
     private static final Pattern URI_PATTERN = Pattern.compile(URI_PATTERN_STR);
 
+    private static final String UNKNOWN_TABLE = "unknown";
+
     @Override
     public DataSetRefs analyze(AnalysisContext context, ProvenanceEventRecord event) {
@@ -72,7 +75,7 @@ public class Hive2JDBC extends AbstractHiveAnalyzer {
         final Matcher uriMatcher = URI_PATTERN.matcher(transitUri);
         if (!uriMatcher.matches()) {
-            logger.warn("Unexpected transit URI: {}", new Object[]{transitUri});
+            logger.warn("Unexpected transit URI: {}", transitUri);
             return null;
         }
@@ -89,13 +92,20 @@ public class Hive2JDBC extends AbstractHiveAnalyzer {
             connectedDatabaseName = "default";
         }
 
-        final Set<Tuple<String, String>> inputTables = parseTableNames(connectedDatabaseName, event.getAttribute(ATTR_INPUT_TABLES));
-        final Set<Tuple<String, String>> outputTables = parseTableNames(connectedDatabaseName, event.getAttribute(ATTR_OUTPUT_TABLES));
+        Set<Tuple<String, String>> inputTables = parseTableNames(connectedDatabaseName, event.getAttribute(ATTR_INPUT_TABLES));
+        Set<Tuple<String, String>> outputTables = parseTableNames(connectedDatabaseName, event.getAttribute(ATTR_OUTPUT_TABLES));
 
         if (inputTables.isEmpty() && outputTables.isEmpty()) {
-            // If input/output tables are unknown, create database level lineage.
+            // If input/output tables are unknown, create hive_table entity with name 'unknown' (hive_db is not a DataSet entity and therefore it cannot be used in the lineage).
             // Handle case insensitivity of database and table names in Hive: send names uniformly in lower case
-            return getDatabaseRef(event.getComponentId(), event.getEventType(), namespace, connectedDatabaseName.toLowerCase());
+            final ProvenanceEventType eventType = event.getEventType();
+            if (eventType == ProvenanceEventType.RECEIVE || eventType == ProvenanceEventType.FETCH) {
+                logger.warn("Input table name is missing, defaults to '{}'. Transit URI: {}", UNKNOWN_TABLE, transitUri);
+                inputTables = Collections.singleton(new Tuple<>(connectedDatabaseName.toLowerCase(), UNKNOWN_TABLE));
+            } else if (eventType == ProvenanceEventType.SEND) {
+                logger.warn("Output table name is missing, defaults to '{}'. Transit URI: {}", UNKNOWN_TABLE, transitUri);
+                outputTables = Collections.singleton(new Tuple<>(connectedDatabaseName.toLowerCase(), UNKNOWN_TABLE));
+            }
         }
 
         final DataSetRefs refs = new DataSetRefs(event.getComponentId());
@@ -104,13 +114,6 @@ public class Hive2JDBC extends AbstractHiveAnalyzer {
         return refs;
     }
 
-    private DataSetRefs getDatabaseRef(String componentId, ProvenanceEventType eventType,
-                                       String namespace, String databaseName) {
-        final Referenceable ref = createDatabaseRef(namespace, databaseName);
-        return singleDataSetRef(componentId, eventType, ref);
-    }
-
     private void addRefs(DataSetRefs refs, boolean isInput, String namespace,
                          Set<Tuple<String, String>> tableNames) {
         tableNames.forEach(tableName -> {
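
For readers who only want the gist of the new fallback, here is a minimal, self-contained sketch using plain JDK types (the class and method names are invented for illustration; the real logic is in Hive2JDBC.analyze() above and uses NiFi's Tuple, DataSetRefs and Referenceable types). The qualified-name format is taken from the test expectation "database_a.unknown@namespace1".

import java.util.AbstractMap;
import java.util.Collections;
import java.util.Map;
import java.util.Set;

// Illustrative sketch only: what gets reported when the provenance event carries
// no ATTR_INPUT_TABLES / ATTR_OUTPUT_TABLES values. A Map.Entry<database, table>
// stands in for NiFi's Tuple.
public class UnknownTableFallbackSketch {

    private static final String UNKNOWN_TABLE = "unknown";

    static Set<Map.Entry<String, String>> fallbackTables(String connectedDatabaseName) {
        // Hive identifiers are case-insensitive, so the database name is lower-cased
        // before it is sent to Atlas, as in the diff above.
        return Collections.<Map.Entry<String, String>>singleton(
                new AbstractMap.SimpleImmutableEntry<>(connectedDatabaseName.toLowerCase(), UNKNOWN_TABLE));
    }

    // hive_table qualified names follow "<database>.<table>@<namespace>".
    static String qualifiedName(String namespace, Map.Entry<String, String> table) {
        return table.getKey() + "." + table.getValue() + "@" + namespace;
    }

    public static void main(String[] args) {
        for (Map.Entry<String, String> table : fallbackTables("database_A")) {
            System.out.println(qualifiedName("namespace1", table)); // database_a.unknown@namespace1
        }
    }
}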

Changed file: TestHive2JDBC.java

@@ -45,10 +45,11 @@ public class TestHive2JDBC {
     /**
      * If a provenance event does not have table name attributes,
-     * then a database lineage should be created.
+     * then a table lineage is created with table name 'unknown'.
+     * Database lineage cannot be sent to Atlas because hive_db is not a DataSet entity.
      */
     @Test
-    public void testDatabaseLineage() {
+    public void testUnknownTableLineage() {
         final String processorName = "PutHiveQL";
         final String transitUri = "jdbc:hive2://0.example.com:10000/database_A";
         final ProvenanceEventRecord record = Mockito.mock(ProvenanceEventRecord.class);
@@ -69,9 +70,9 @@ public class TestHive2JDBC {
         assertEquals(0, refs.getInputs().size());
         assertEquals(1, refs.getOutputs().size());
         Referenceable ref = refs.getOutputs().iterator().next();
-        assertEquals("hive_db", ref.getTypeName());
-        assertEquals("database_a", ref.get(ATTR_NAME));
-        assertEquals("database_a@namespace1", ref.get(ATTR_QUALIFIED_NAME));
+        assertEquals("hive_table", ref.getTypeName());
+        assertEquals("unknown", ref.get(ATTR_NAME));
+        assertEquals("database_a.unknown@namespace1", ref.get(ATTR_QUALIFIED_NAME));
     }
 
     /**
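
The test above exercises the SEND case (PutHiveQL), where the 'unknown' table lands on the output side. The sketch below restates the event-type mapping introduced in the analyzer diff, assuming the standard org.apache.nifi.provenance.ProvenanceEventType enum; the Side enum and helper method are illustrative only, not part of the reporting task.

import org.apache.nifi.provenance.ProvenanceEventType;

// Illustrative restatement of the event-type handling added in Hive2JDBC.analyze():
// which side of the lineage receives the 'unknown' hive_table.
public class UnknownTableSideSketch {

    enum Side { INPUT, OUTPUT, NONE }

    static Side unknownTableSide(ProvenanceEventType eventType) {
        switch (eventType) {
            case RECEIVE:
            case FETCH:
                return Side.INPUT;   // data read from Hive -> the missing table is an input
            case SEND:
                return Side.OUTPUT;  // data written to Hive -> the missing table is an output
            default:
                return Side.NONE;    // other event types leave both table sets empty
        }
    }
}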