From 8c2c276c6c47624a94eb7068361bd39775d89ae1 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Tue, 6 Feb 2024 22:08:09 +0100 Subject: [PATCH] Modify getEnWikiRandomLines to fetch and decompress the zstd resource #13083 --- gradle/datasets/external-datasets.gradle | 29 ++++++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/gradle/datasets/external-datasets.gradle b/gradle/datasets/external-datasets.gradle index 4df47bec41b..2d6ae8e13d5 100644 --- a/gradle/datasets/external-datasets.gradle +++ b/gradle/datasets/external-datasets.gradle @@ -1,5 +1,7 @@ import org.apache.lucene.gradle.datasets.ExtractReuters +import java.nio.file.Files + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -17,8 +19,25 @@ import org.apache.lucene.gradle.datasets.ExtractReuters * limitations under the License. */ -// TODO: not sure whether this should live in benchmarks, but for now -// let it be. +buildscript { + repositories { + mavenCentral() + } + + dependencies { + classpath "com.github.luben:zstd-jni:1.5.5-11" + } +} + +def unzstd(java.nio.file.Path src, java.nio.file.Path dst) { + try (InputStream is = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(Files.newInputStream(src))); + OutputStream os = new BufferedOutputStream(Files.newOutputStream(dst))) { + is.transferTo(os) + } +} + + +// TODO: not sure whether this should live in benchmarks, but for now let it be. configure(project(":lucene:benchmark")) { apply plugin: "java" apply plugin: "de.undercouch.download" @@ -51,8 +70,8 @@ configure(project(":lucene:benchmark")) { task getEnWikiRandomLines(type: Download) { ext { name = "enwiki.random.lines.txt" - src = "https://home.apache.org/~mikemccand/${name}.bz2" - intermediate = file("${dataDir}/${name}.bz2") + src = "https://home.apache.org/~mikemccand/${name}.zst" + intermediate = file("${dataDir}/${name}.zst") dst = file("${dataDir}/${name}") } @@ -65,7 +84,7 @@ configure(project(":lucene:benchmark")) { doLast { logger.lifecycle("Decompressing ${ext.name}...") - ant.bunzip2(src: ext.intermediate, dest: ext.dst) + unzstd(ext.intermediate.toPath(), ext.dst.toPath()) } }