From 0dd45398cf5602012f77710224971e68b0b4a12b Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Sun, 31 Oct 2010 12:45:07 +0000
Subject: [PATCH] SOLR-2200: improve DIH perf for large delta-import updates

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1029325 13f79535-47bb-0310-9956-ffa450edef68
---
 solr/CHANGES.txt                              |  3 ++-
 .../solr/handler/dataimport/DocBuilder.java   | 22 ++++++++-----------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 270a66ded7b..4c69968943b 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -337,7 +337,8 @@ Optimizations
   improvement is 5%, but can be much greater (up to 10x faster) when
   facet.offset is very large (deep paging). (yonik)
 
-
+* SOLR-2200: Improve the performance of DataImportHandler for large delta-import
+  updates. (Mark Waddle via rmuir)
 
 Bug Fixes
 ----------------------
diff --git a/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java b/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java
index a6ecb0331e6..a2eff653082 100644
--- a/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java
+++ b/solr/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/DocBuilder.java
@@ -841,7 +841,7 @@ public class DocBuilder {
     }
 
     // identifying the modified rows for this entity
-    Set<Map<String, Object>> deltaSet = new HashSet<Map<String, Object>>();
+    Map<String, Map<String, Object>> deltaSet = new HashMap<String, Map<String, Object>>();
     LOG.info("Running ModifiedRowKey() for Entity: " + entity.name);
     //get the modified rows in this entity
     while (true) {
@@ -850,7 +850,7 @@
       if (row == null)
         break;
 
-      deltaSet.add(row);
+      deltaSet.put(row.get(entity.getPk()).toString(), row);
       importStatistics.rowsCount.incrementAndGet();
       // check for abort
       if (stop.get())
@@ -858,33 +858,29 @@
     }
     //get the deleted rows for this entity
     Set<Map<String, Object>> deletedSet = new HashSet<Map<String, Object>>();
-    Set<Map<String, Object>> deltaRemoveSet = new HashSet<Map<String, Object>>();
     while (true) {
       Map<String, Object> row = entityProcessor.nextDeletedRowKey();
       if (row == null)
         break;
 
-      //Check to see if this delete is in the current delta set
-      for (Map<String, Object> modifiedRow : deltaSet) {
-        if (modifiedRow.get(entity.getPk()).equals(row.get(entity.getPk()))) {
-          deltaRemoveSet.add(modifiedRow);
-        }
+      deletedSet.add(row);
+
+      // Remove deleted rows from the delta rows
+      String deletedRowPk = row.get(entity.getPk()).toString();
+      if (deltaSet.containsKey(deletedRowPk)) {
+        deltaSet.remove(deletedRowPk);
       }
-      deletedSet.add(row);
       importStatistics.rowsCount.incrementAndGet();
       // check for abort
       if (stop.get())
         return new HashSet();
     }
-
-    //asymmetric Set difference
-    deltaSet.removeAll(deltaRemoveSet);
 
     LOG.info("Completed ModifiedRowKey for Entity: " + entity.name + " rows obtained : " + deltaSet.size());
     LOG.info("Completed DeletedRowKey for Entity: " + entity.name + " rows obtained : " + deletedSet.size());
 
-    myModifiedPks.addAll(deltaSet);
+    myModifiedPks.addAll(deltaSet.values());
 
     Set<Map<String, Object>> parentKeyList = new HashSet<Map<String, Object>>();
     //all that we have captured is useless (in a sub-entity) if no rows in the parent is modified because of these
     //propogate up the changes in the chain