mirror of https://github.com/apache/nifi.git
NIFI-748 Fixed logic around handling partial query results from provenance repository
- Ensured that failures derived form correlating Document to its actual provenance event do fail the entire query and produce partial results with warning messages - Refactored DocsReader.read() operation. - Added test to validate two conditions where the such failures could occur
This commit is contained in:
parent
36d00a60f5
commit
a4d93c62c8
|
@ -1784,6 +1784,7 @@ public class PersistentProvenanceRepository implements ProvenanceEventRepository
|
|||
try {
|
||||
Thread.sleep(100L);
|
||||
} catch (final InterruptedException ie) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,15 +17,14 @@
|
|||
package org.apache.nifi.provenance.lucene;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
@ -47,9 +46,6 @@ import org.slf4j.LoggerFactory;
|
|||
public class DocsReader {
|
||||
private final Logger logger = LoggerFactory.getLogger(DocsReader.class);
|
||||
|
||||
public DocsReader(final List<File> storageDirectories) {
|
||||
}
|
||||
|
||||
public Set<ProvenanceEventRecord> read(final TopDocs topDocs, final IndexReader indexReader, final Collection<Path> allProvenanceLogFiles,
|
||||
final AtomicInteger retrievalCount, final int maxResults, final int maxAttributeChars) throws IOException {
|
||||
if (retrievalCount.get() >= maxResults) {
|
||||
|
@ -100,101 +96,61 @@ public class DocsReader {
|
|||
}
|
||||
}
|
||||
|
||||
if ( record == null ) {
|
||||
throw new IOException("Failed to find Provenance Event " + d);
|
||||
} else {
|
||||
return record;
|
||||
}
|
||||
if (record == null) {
|
||||
logger.warn("Failed to read Provenance Event for '" + d + "'. The event file may be missing or corrupted");
|
||||
}
|
||||
|
||||
return record;
|
||||
}
|
||||
|
||||
public Set<ProvenanceEventRecord> read(final List<Document> docs, final Collection<Path> allProvenanceLogFiles,
|
||||
final AtomicInteger retrievalCount, final int maxResults, final int maxAttributeChars) throws IOException {
|
||||
if (retrievalCount.get() >= maxResults) {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
LuceneUtil.sortDocsForRetrieval(docs);
|
||||
|
||||
RecordReader reader = null;
|
||||
String lastStorageFilename = null;
|
||||
final Set<ProvenanceEventRecord> matchingRecords = new LinkedHashSet<>();
|
||||
|
||||
final long start = System.nanoTime();
|
||||
|
||||
Set<ProvenanceEventRecord> matchingRecords = new LinkedHashSet<>();
|
||||
if (retrievalCount.get() >= maxResults) {
|
||||
return matchingRecords;
|
||||
}
|
||||
|
||||
Map<String, List<Document>> byStorageNameDocGroups = LuceneUtil.groupDocsByStorageFileName(docs);
|
||||
|
||||
int eventsReadThisFile = 0;
|
||||
int logFileCount = 0;
|
||||
|
||||
final Set<String> storageFilesToSkip = new HashSet<>();
|
||||
int eventsReadThisFile = 0;
|
||||
|
||||
try {
|
||||
for (final Document d : docs) {
|
||||
final String storageFilename = d.getField(FieldNames.STORAGE_FILENAME).stringValue();
|
||||
if ( storageFilesToSkip.contains(storageFilename) ) {
|
||||
continue;
|
||||
}
|
||||
|
||||
try {
|
||||
if (reader != null && storageFilename.equals(lastStorageFilename)) {
|
||||
matchingRecords.add(getRecord(d, reader));
|
||||
for (String storageFileName : byStorageNameDocGroups.keySet()) {
|
||||
File provenanceEventFile = LuceneUtil.getProvenanceLogFile(storageFileName, allProvenanceLogFiles);
|
||||
if (provenanceEventFile != null) {
|
||||
try (RecordReader reader = RecordReaders.newRecordReader(provenanceEventFile, allProvenanceLogFiles,
|
||||
maxAttributeChars)) {
|
||||
for (Document document : byStorageNameDocGroups.get(storageFileName)) {
|
||||
ProvenanceEventRecord eRec = this.getRecord(document, reader);
|
||||
if (eRec != null) {
|
||||
matchingRecords.add(eRec);
|
||||
eventsReadThisFile++;
|
||||
|
||||
if ( retrievalCount.incrementAndGet() >= maxResults ) {
|
||||
if (retrievalCount.incrementAndGet() >= maxResults) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.warn("Failed while trying to read Provenance Events. The event file '"
|
||||
+ provenanceEventFile.getAbsolutePath() +
|
||||
"' may be missing or corrupted.", e);
|
||||
}
|
||||
} else {
|
||||
logger.debug("Opening log file {}", storageFilename);
|
||||
|
||||
logFileCount++;
|
||||
if (reader != null) {
|
||||
reader.close();
|
||||
}
|
||||
|
||||
final List<File> potentialFiles = LuceneUtil.getProvenanceLogFiles(storageFilename, allProvenanceLogFiles);
|
||||
if (potentialFiles.isEmpty()) {
|
||||
logger.warn("Could not find Provenance Log File with basename {} in the "
|
||||
+ "Provenance Repository; assuming file has expired and continuing without it", storageFilename);
|
||||
storageFilesToSkip.add(storageFilename);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (potentialFiles.size() > 1) {
|
||||
throw new FileNotFoundException("Found multiple Provenance Log Files with basename " +
|
||||
storageFilename + " in the Provenance Repository");
|
||||
}
|
||||
|
||||
for (final File file : potentialFiles) {
|
||||
try {
|
||||
if (reader != null) {
|
||||
logger.debug("Read {} records from previous file", eventsReadThisFile);
|
||||
}
|
||||
|
||||
reader = RecordReaders.newRecordReader(file, allProvenanceLogFiles, maxAttributeChars);
|
||||
matchingRecords.add(getRecord(d, reader));
|
||||
eventsReadThisFile = 1;
|
||||
|
||||
if ( retrievalCount.incrementAndGet() >= maxResults ) {
|
||||
break;
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
throw new IOException("Failed to retrieve record " + d + " from Provenance File " + file + " due to " + e, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
lastStorageFilename = storageFilename;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
if (reader != null) {
|
||||
reader.close();
|
||||
logger.warn("Could not find Provenance Log File with "
|
||||
+ "basename {} in the Provenance Repository; assuming "
|
||||
+ "file has expired and continuing without it", storageFileName);
|
||||
}
|
||||
}
|
||||
|
||||
logger.debug("Read {} records from previous file", eventsReadThisFile);
|
||||
final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
|
||||
logger.debug("Took {} ms to read {} events from {} prov log files", millis, matchingRecords.size(), logFileCount);
|
||||
logger.debug("Took {} ms to read {} events from {} prov log files", millis, matchingRecords.size(),
|
||||
logFileCount);
|
||||
|
||||
return matchingRecords;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -89,7 +89,7 @@ public class IndexSearch {
|
|||
return sqr;
|
||||
}
|
||||
|
||||
final DocsReader docsReader = new DocsReader(repository.getConfiguration().getStorageDirectories());
|
||||
final DocsReader docsReader = new DocsReader();
|
||||
matchingRecords = docsReader.read(topDocs, searcher.getIndexReader(), repository.getAllLogFiles(), retrievedCount,
|
||||
provenanceQuery.getMaxResults(), maxAttributeChars);
|
||||
|
||||
|
|
|
@ -93,7 +93,7 @@ public class LineageQuery {
|
|||
final TopDocs uuidQueryTopDocs = searcher.search(query, MAX_QUERY_RESULTS);
|
||||
final long searchEnd = System.nanoTime();
|
||||
|
||||
final DocsReader docsReader = new DocsReader(repo.getConfiguration().getStorageDirectories());
|
||||
final DocsReader docsReader = new DocsReader();
|
||||
final Set<ProvenanceEventRecord> recs = docsReader.read(uuidQueryTopDocs, searcher.getIndexReader(), repo.getAllLogFiles(),
|
||||
new AtomicInteger(0), Integer.MAX_VALUE, maxAttributeChars);
|
||||
|
||||
|
|
|
@ -22,7 +22,9 @@ import java.util.ArrayList;
|
|||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.nifi.processor.DataUnit;
|
||||
import org.apache.nifi.provenance.SearchableFields;
|
||||
|
@ -160,4 +162,25 @@ public class LuceneUtil {
|
|||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Will group documents based on the {@link FieldNames#STORAGE_FILENAME}.
|
||||
*
|
||||
* @param documents
|
||||
* list of {@link Document}s
|
||||
* @return a {@link Map} of document groups with
|
||||
* {@link FieldNames#STORAGE_FILENAME} as key and {@link List} of
|
||||
* {@link Document}s as value.
|
||||
*/
|
||||
public static Map<String, List<Document>> groupDocsByStorageFileName(final List<Document> documents) {
|
||||
Map<String, List<Document>> documentGroups = new HashMap<>();
|
||||
for (Document document : documents) {
|
||||
String fileName = document.get(FieldNames.STORAGE_FILENAME);
|
||||
if (!documentGroups.containsKey(fileName)) {
|
||||
documentGroups.put(fileName, new ArrayList<Document>());
|
||||
}
|
||||
documentGroups.get(fileName).add(document);
|
||||
}
|
||||
return documentGroups;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,9 +20,11 @@ import static org.apache.nifi.provenance.TestUtil.createFlowFile;
|
|||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.mock;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileFilter;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
@ -36,6 +38,7 @@ import java.util.concurrent.TimeUnit;
|
|||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.core.SimpleAnalyzer;
|
||||
|
@ -48,6 +51,7 @@ import org.apache.lucene.search.ScoreDoc;
|
|||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.nifi.events.EventReporter;
|
||||
import org.apache.nifi.flowfile.FlowFile;
|
||||
import org.apache.nifi.provenance.lineage.EventNode;
|
||||
import org.apache.nifi.provenance.lineage.Lineage;
|
||||
import org.apache.nifi.provenance.lineage.LineageEdge;
|
||||
|
@ -869,6 +873,72 @@ public class TestPersistentProvenanceRepository {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Here the event file is simply corrupted by virtue of not having any event
|
||||
* records while having correct headers
|
||||
*/
|
||||
@Test
|
||||
public void testWithWithEventFileMissingRecord() throws Exception {
|
||||
File eventFile = this.prepCorruptedEventFileTests();
|
||||
|
||||
final Query query = new Query(UUID.randomUUID().toString());
|
||||
query.addSearchTerm(SearchTerms.newSearchTerm(SearchableFields.ComponentID, "foo-*"));
|
||||
query.setMaxResults(100);
|
||||
|
||||
DataOutputStream in = new DataOutputStream(new GZIPOutputStream(new FileOutputStream(eventFile)));
|
||||
in.writeUTF("BlahBlah");
|
||||
in.writeInt(4);
|
||||
in.close();
|
||||
assertTrue(eventFile.exists());
|
||||
final QueryResult result = repo.queryEvents(query);
|
||||
assertEquals(10, result.getMatchingEvents().size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Here the event file is simply corrupted by virtue of being empty (0
|
||||
* bytes)
|
||||
*/
|
||||
@Test
|
||||
public void testWithWithEventFileCorrupted() throws Exception {
|
||||
File eventFile = this.prepCorruptedEventFileTests();
|
||||
|
||||
final Query query = new Query(UUID.randomUUID().toString());
|
||||
query.addSearchTerm(SearchTerms.newSearchTerm(SearchableFields.ComponentID, "foo-*"));
|
||||
query.setMaxResults(100);
|
||||
DataOutputStream in = new DataOutputStream(new GZIPOutputStream(new FileOutputStream(eventFile)));
|
||||
in.close();
|
||||
final QueryResult result = repo.queryEvents(query);
|
||||
assertEquals(10, result.getMatchingEvents().size());
|
||||
}
|
||||
|
||||
private File prepCorruptedEventFileTests() throws Exception {
|
||||
RepositoryConfiguration config = createConfiguration();
|
||||
config.setMaxStorageCapacity(1024L * 1024L);
|
||||
config.setMaxEventFileLife(500, TimeUnit.MILLISECONDS);
|
||||
config.setMaxEventFileCapacity(1024L * 1024L);
|
||||
config.setSearchableFields(new ArrayList<>(SearchableFields.getStandardFields()));
|
||||
config.setDesiredIndexSize(10);
|
||||
|
||||
repo = new PersistentProvenanceRepository(config, DEFAULT_ROLLOVER_MILLIS);
|
||||
repo.initialize(getEventReporter());
|
||||
|
||||
String uuid = UUID.randomUUID().toString();
|
||||
for (int i = 0; i < 20; i++) {
|
||||
ProvenanceEventRecord record = repo.eventBuilder().fromFlowFile(mock(FlowFile.class))
|
||||
.setEventType(ProvenanceEventType.CREATE).setComponentId("foo-" + i).setComponentType("myComponent")
|
||||
.setFlowFileUUID(uuid).build();
|
||||
repo.registerEvent(record);
|
||||
if (i == 9) {
|
||||
repo.waitForRollover();
|
||||
Thread.sleep(2000L);
|
||||
}
|
||||
}
|
||||
repo.waitForRollover();
|
||||
File eventFile = new File(config.getStorageDirectories().get(0), "10.prov.gz");
|
||||
assertTrue(eventFile.delete());
|
||||
return eventFile;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndexDirectoryRemoved() throws InterruptedException, IOException, ParseException {
|
||||
final RepositoryConfiguration config = createConfiguration();
|
||||
|
|
Loading…
Reference in New Issue