mirror of https://github.com/apache/lucene.git
SOLR-14270 export command to have an option to write to a zip file (#1266)
parent 9302eee1e0
commit 9f3f7244ac
@@ -96,6 +96,8 @@ Improvements

* SOLR-14194: Highlighting now works when the uniqueKey field is not stored but has docValues. And the original
  highlighter can now highlight text fields from docValues. (Andrzej Wislowski, David Smiley)

* SOLR-14270: export command to have an option to write to a zip file (noble)

Optimizations
---------------------

@@ -41,11 +41,13 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.zip.GZIPOutputStream;

import com.google.common.collect.ImmutableSet;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.lucene.util.SuppressForbidden;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;

@@ -185,13 +187,24 @@ public class ExportTool extends SolrCLI.ToolBase {
    info.exportDocs();
  }

  interface DocsSink {
    default void start() throws IOException {
  static abstract class DocsSink {
    Info info;
    OutputStream fos;

    abstract void start() throws IOException ;

    @SuppressForbidden(reason = "Command line tool prints out to console")
    void accept(SolrDocument document) throws IOException {
      long count = info.docsWritten.incrementAndGet();

      if (count % 100000 == 0) {
        System.out.println("\nDOCS: " + count);
      }

    }

    void accept(SolrDocument document) throws IOException, InterruptedException;

    default void end() throws IOException {
    void end() throws IOException {
    }
  }

@@ -228,13 +241,10 @@ public class ExportTool extends SolrCLI.ToolBase {
        .create("fields")
  };

  static class JsonSink implements DocsSink {
    private final Info info;
  static class JsonSink extends DocsSink {
    private CharArr charArr = new CharArr(1024 * 2);
    JSONWriter jsonWriter = new JSONWriter(charArr, -1);
    private Writer writer;
    private OutputStream fos;
    public AtomicLong docs = new AtomicLong();

    public JsonSink(Info info) {
      this.info = info;

@@ -243,6 +253,7 @@ public class ExportTool extends SolrCLI.ToolBase {
    @Override
    public void start() throws IOException {
      fos = new FileOutputStream(info.out);
      if(info.out.endsWith(".json.gz") || info.out.endsWith(".json.")) fos = new GZIPOutputStream(fos);
      if (info.bufferSize > 0) {
        fos = new BufferedOutputStream(fos, info.bufferSize);
      }

@@ -259,7 +270,6 @@ public class ExportTool extends SolrCLI.ToolBase {

    @Override
    public synchronized void accept(SolrDocument doc) throws IOException {
      docs.incrementAndGet();
      charArr.reset();
      Map m = new LinkedHashMap(doc.size());
      doc.forEach((s, field) -> {

@@ -274,13 +284,12 @@ public class ExportTool extends SolrCLI.ToolBase {
      jsonWriter.write(m);
      writer.write(charArr.getArray(), charArr.getStart(), charArr.getEnd());
      writer.append('\n');
      super.accept(doc);
    }
  }

  private static class JavabinSink implements DocsSink {
    private final Info info;
  static class JavabinSink extends DocsSink {
    JavaBinCodec codec;
    OutputStream fos;

    public JavabinSink(Info info) {
      this.info = info;

@@ -289,6 +298,7 @@ public class ExportTool extends SolrCLI.ToolBase {
    @Override
    public void start() throws IOException {
      fos = new FileOutputStream(info.out);
      if(info.out.endsWith(".json.gz") || info.out.endsWith(".json.")) fos = new GZIPOutputStream(fos);
      if (info.bufferSize > 0) {
        fos = new BufferedOutputStream(fos, info.bufferSize);
      }

@@ -330,6 +340,7 @@ public class ExportTool extends SolrCLI.ToolBase {
      codec.writeTag(SOLRINPUTDOC, sz);
      codec.writeFloat(1f); // document boost
      doc.forEach(bic);
      super.accept(doc);
    }
  }

@@ -339,13 +350,17 @@ public class ExportTool extends SolrCLI.ToolBase {
    SolrDocument EOFDOC = new SolrDocument();
    volatile boolean failed = false;
    Map<String, CoreHandler> corehandlers = new HashMap();
    private long startTime ;

    @SuppressForbidden(reason = "Need to print out time")
    public MultiThreadedRunner(String url) {
      super(url);
      startTime= System.currentTimeMillis();
    }

    @Override
    @SuppressForbidden(reason = "Need to print out time")
    void exportDocs() throws Exception {
      sink = getSink();
      fetchUniqueKey();

@@ -362,7 +377,7 @@ public class ExportTool extends SolrCLI.ToolBase {
      addConsumer(consumerlatch);
      addProducers(m);
      if (output != null) {
        output.println("NO of shards : " + corehandlers.size());
        output.println("NO: of shards : " + corehandlers.size());
      }
      CountDownLatch producerLatch = new CountDownLatch(corehandlers.size());
      corehandlers.forEach((s, coreHandler) -> producerThreadpool.submit(() -> {

@@ -390,6 +405,8 @@ public class ExportTool extends SolrCLI.ToolBase {
          //ignore
        }
      }
      System.out.println("\nTotal Docs exported: "+ (docsWritten.get() -1)+
          ". Time taken: "+( (System.currentTimeMillis() - startTime)/1000) + "secs");
    }
  }

@@ -418,7 +435,6 @@ public class ExportTool extends SolrCLI.ToolBase {
        try {
          if (docsWritten.get() > limit) continue;
          sink.accept(doc);
          docsWritten.incrementAndGet();
        } catch (Exception e) {
          if (output != null) output.println("Failed to write to file " + e.getMessage());
          failed = true;

@@ -186,7 +186,7 @@ public class TestExportTool extends SolrCloudTestCase {
      info.fields = "id,desc_s";
      info.setLimit("-1");
      info.exportDocs();
      long actual = ((ExportTool.JsonSink) info.sink).docs.get();
      long actual = ((ExportTool.JsonSink) info.sink).info.docsWritten.get();
      assertTrue("docs written :" + actual + "docs produced : " + info.docsWritten.get(), actual >= docCount);
      assertJsonDocsCount(info, docCount);
    } finally {

@@ -154,3 +154,28 @@ Unlike the CLUSTERPROP command on the <<cluster-node-management.adoc#clusterprop
----
./server/scripts/cloud-scripts/zkcli.sh -zkhost 127.0.0.1:2181 -cmd clusterprop -name urlScheme -val https
----

=== Export data from a collection to a file

This command downloads documents from all shards in parallel and writes them to a single file. The supported formats are `jsonl` and `javabin`.

Arguments are:

`-url` :: (Required parameter) URL of the collection.
`-out` :: (Optional) Name of the file to write to. The default file name is `<collection-name>.json`. If the file name ends with `.json.gz`, the output is written as gzipped JSON.
`-format` :: (Optional) Supported values are `json` and `javabin`.
`-limit` :: (Optional) Number of docs to export. By default, the entire collection is exported.
`-fields` :: (Optional) Fields to be exported. By default, all fields are exported.

Example 1: Export all documents of the collection `gettingstarted` into a file called `gettingstarted.json`.
[source,bash]
----
bin/solr export -url http://localhost:8983/solr/gettingstarted
----

Example 2: Export 1M docs of the collection `gettingstarted` into a gzipped JSON file called `1MDocs.json.gz`.
[source,bash]
----
bin/solr export -url http://localhost:8983/solr/gettingstarted -out 1MDocs.json.gz -limit 1000000
----
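
Example 3 (an illustrative sketch, not part of the original change): the optional arguments documented above can be combined to export a subset of documents in `javabin` format. The collection URL, limit, and field names below are assumptions for demonstration only.
[source,bash]
----
# Hypothetical invocation: -format selects javabin output, -limit caps the
# number of exported docs, and -fields takes a comma-separated field list.
bin/solr export -url http://localhost:8983/solr/gettingstarted -format javabin -limit 10000 -fields id,desc_s
----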