SOLR-14270 export command to have an option to write to a zip file (#1266)

This commit is contained in:
Noble Paul 2020-02-21 13:41:50 +11:00 committed by GitHub
parent 9302eee1e0
commit 9f3f7244ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 59 additions and 16 deletions

View File

@ -96,6 +96,8 @@ Improvements
* SOLR-14194: Highlighting now works when the uniqueKey field is not stored but has docValues. And the original
highlighter can now highlight text fields from docValues. (Andrzej Wislowski, David Smiley)
* SOLR-14270: export command to have an option to write to a zip file (noble)
Optimizations
---------------------

View File

@ -41,11 +41,13 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.zip.GZIPOutputStream;
import com.google.common.collect.ImmutableSet;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.lucene.util.SuppressForbidden;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
@ -185,13 +187,24 @@ public class ExportTool extends SolrCLI.ToolBase {
info.exportDocs();
}
interface DocsSink {
default void start() throws IOException {
static abstract class DocsSink {
Info info;
OutputStream fos;
abstract void start() throws IOException ;
@SuppressForbidden(reason = "Command line tool prints out to console")
void accept(SolrDocument document) throws IOException {
long count = info.docsWritten.incrementAndGet();
if (count % 100000 == 0) {
System.out.println("\nDOCS: " + count);
}
}
void accept(SolrDocument document) throws IOException, InterruptedException;
default void end() throws IOException {
void end() throws IOException {
}
}
@ -228,13 +241,10 @@ public class ExportTool extends SolrCLI.ToolBase {
.create("fields")
};
static class JsonSink implements DocsSink {
private final Info info;
static class JsonSink extends DocsSink {
private CharArr charArr = new CharArr(1024 * 2);
JSONWriter jsonWriter = new JSONWriter(charArr, -1);
private Writer writer;
private OutputStream fos;
public AtomicLong docs = new AtomicLong();
public JsonSink(Info info) {
this.info = info;
@ -243,6 +253,7 @@ public class ExportTool extends SolrCLI.ToolBase {
@Override
public void start() throws IOException {
fos = new FileOutputStream(info.out);
if(info.out.endsWith(".json.gz") || info.out.endsWith(".json.")) fos = new GZIPOutputStream(fos);
if (info.bufferSize > 0) {
fos = new BufferedOutputStream(fos, info.bufferSize);
}
@ -259,7 +270,6 @@ public class ExportTool extends SolrCLI.ToolBase {
@Override
public synchronized void accept(SolrDocument doc) throws IOException {
docs.incrementAndGet();
charArr.reset();
Map m = new LinkedHashMap(doc.size());
doc.forEach((s, field) -> {
@ -274,13 +284,12 @@ public class ExportTool extends SolrCLI.ToolBase {
jsonWriter.write(m);
writer.write(charArr.getArray(), charArr.getStart(), charArr.getEnd());
writer.append('\n');
super.accept(doc);
}
}
private static class JavabinSink implements DocsSink {
private final Info info;
static class JavabinSink extends DocsSink {
JavaBinCodec codec;
OutputStream fos;
public JavabinSink(Info info) {
this.info = info;
@ -289,6 +298,7 @@ public class ExportTool extends SolrCLI.ToolBase {
@Override
public void start() throws IOException {
fos = new FileOutputStream(info.out);
if(info.out.endsWith(".json.gz") || info.out.endsWith(".json.")) fos = new GZIPOutputStream(fos);
if (info.bufferSize > 0) {
fos = new BufferedOutputStream(fos, info.bufferSize);
}
@ -330,6 +340,7 @@ public class ExportTool extends SolrCLI.ToolBase {
codec.writeTag(SOLRINPUTDOC, sz);
codec.writeFloat(1f); // document boost
doc.forEach(bic);
super.accept(doc);
}
}
@ -339,13 +350,17 @@ public class ExportTool extends SolrCLI.ToolBase {
SolrDocument EOFDOC = new SolrDocument();
volatile boolean failed = false;
Map<String, CoreHandler> corehandlers = new HashMap();
private long startTime ;
@SuppressForbidden(reason = "Need to print out time")
public MultiThreadedRunner(String url) {
super(url);
startTime= System.currentTimeMillis();
}
@Override
@SuppressForbidden(reason = "Need to print out time")
void exportDocs() throws Exception {
sink = getSink();
fetchUniqueKey();
@ -362,7 +377,7 @@ public class ExportTool extends SolrCLI.ToolBase {
addConsumer(consumerlatch);
addProducers(m);
if (output != null) {
output.println("NO of shards : " + corehandlers.size());
output.println("NO: of shards : " + corehandlers.size());
}
CountDownLatch producerLatch = new CountDownLatch(corehandlers.size());
corehandlers.forEach((s, coreHandler) -> producerThreadpool.submit(() -> {
@ -390,6 +405,8 @@ public class ExportTool extends SolrCLI.ToolBase {
//ignore
}
}
System.out.println("\nTotal Docs exported: "+ (docsWritten.get() -1)+
". Time taken: "+( (System.currentTimeMillis() - startTime)/1000) + "secs");
}
}
@ -418,7 +435,6 @@ public class ExportTool extends SolrCLI.ToolBase {
try {
if (docsWritten.get() > limit) continue;
sink.accept(doc);
docsWritten.incrementAndGet();
} catch (Exception e) {
if (output != null) output.println("Failed to write to file " + e.getMessage());
failed = true;

View File

@ -186,7 +186,7 @@ public class TestExportTool extends SolrCloudTestCase {
info.fields = "id,desc_s";
info.setLimit("-1");
info.exportDocs();
long actual = ((ExportTool.JsonSink) info.sink).docs.get();
long actual = ((ExportTool.JsonSink) info.sink).info.docsWritten.get();
assertTrue("docs written :" + actual + "docs produced : " + info.docsWritten.get(), actual >= docCount);
assertJsonDocsCount(info, docCount);
} finally {

View File

@ -154,3 +154,28 @@ Unlike the CLUSTERPROP command on the <<cluster-node-management.adoc#clusterprop
----
./server/scripts/cloud-scripts/zkcli.sh -zkhost 127.0.0.1:2181 -cmd clusterprop -name urlScheme -val https
----
=== Export data from a collection to a file
This command downloads documents from all shards in parallel and writes them to a single file. The supported formats are `jsonl` and `javabin`.
Arguments are:
`-url` :: (Required parameter) URL of the collection
`-out` :: (Optional) Name of the file to write to. The default file name is `<collection-name>.json`. If the file name ends with `.json.gz`, the output is gzip-compressed JSON.
`-format` :: (Optional) Supported values are `json` and `javabin`
`-limit` :: (Optional) Number of docs to export. By default the entire collection is exported
`-fields` :: (Optional) Fields to be exported. By default, all fields are exported
Example 1: Export all documents in a collection `gettingstarted` into a file called `gettingstarted.json`
[source,bash]
----
bin/solr export -url http://localhost:8983/solr/gettingstarted
----
Example 2: Export 1M docs of collection `gettingstarted` into a file called `1MDocs.json.gz` as gzip-compressed JSON
[source,bash]
----
bin/solr export -url http://localhost:8983/solr/gettingstarted -out 1MDocs.json.gz
----