lucene/gradle/datasets/external-datasets.gradle

191 lines
5.0 KiB
Groovy

import java.nio.file.Files
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Adds the zstd dependency to the classpath of this build script itself, which is what
// makes com.github.luben.zstd.ZstdInputStream resolvable in unzstd().
buildscript {
  repositories {
    mavenCentral()
  }

  dependencies {
    classpath(deps.zstd)
  }
}
/**
 * Decompresses the zstd-compressed file {@code src} into the file {@code dst}.
 *
 * Every stream is declared as a separate resource so that the raw file stream is
 * closed even when constructing the zstd decoder on top of it throws; with the
 * streams nested in a single expression the file handle would leak in that case.
 *
 * @param src path of the zstd-compressed input file
 * @param dst path the decompressed bytes are written to
 */
def unzstd(java.nio.file.Path src, java.nio.file.Path dst) {
  try (InputStream rawIn = new BufferedInputStream(Files.newInputStream(src));
       InputStream zstdIn = new com.github.luben.zstd.ZstdInputStream(rawIn);
       OutputStream out = new BufferedOutputStream(Files.newOutputStream(dst))) {
    zstdIn.transferTo(out)
  }
}
// TODO: not sure whether this should live in benchmarks, but for now let it be.
configure(project(":lucene:benchmark")) {
apply plugin: "java"
apply plugin: deps.plugins.undercouch.download.get().pluginId
ext {
dataDir = file("work")
}
// Downloads enwiki-20070527-pages-articles.xml.bz2 into dataDir and unpacks it with bunzip2.
task getEnWiki(type: Download) {
  // Extra properties of this task; name, src and dst are also read by the
  // shared dataset-task setup at the end of this configure block.
  ext {
    name = "enwiki-20070527-pages-articles.xml"
    src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
    intermediate = file("${dataDir}/${name}.bz2")
    dst = file("${dataDir}/${name}")
  }

  // Download options: the compressed archive is fetched to the intermediate file.
  overwrite(false)
  compress(false)
  src(ext.src)
  dest(ext.intermediate)

  // The decompressed file is registered as an output of the task.
  outputs.file(ext.dst)

  doLast {
    logger.lifecycle("Decompressing ${ext.name}...")
    ant.bunzip2(src: ext.intermediate, dest: ext.dst)
  }
}
// Downloads enwiki.random.lines.txt.zst into dataDir and unpacks it with unzstd().
task getEnWikiRandomLines(type: Download) {
  // Extra properties of this task; name, src and dst are also read by the
  // shared dataset-task setup at the end of this configure block.
  ext {
    name = "enwiki.random.lines.txt"
    src = "https://home.apache.org/~mikemccand/${name}.zst"
    intermediate = file("${dataDir}/${name}.zst")
    dst = file("${dataDir}/${name}")
  }

  // Download options: the compressed archive is fetched to the intermediate file.
  overwrite(false)
  compress(false)
  src(ext.src)
  dest(ext.intermediate)

  // The decompressed file is registered as an output of the task.
  outputs.file(ext.dst)

  doLast {
    logger.lifecycle("Decompressing ${ext.name}...")
    unzstd(ext.intermediate.toPath(), ext.dst.toPath())
  }
}
// Downloads the randomized GeoNames "allCountries" archive into dataDir and unpacks it with bunzip2.
task getGeoNames(type: Download) {
  // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
  // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
  // and then compress with: bzip2 -9 -k file_random.txt

  // Extra properties of this task; name, src and dst are also read by the
  // shared dataset-task setup at the end of this configure block.
  ext {
    name = "geonames_20130921_randomOrder_allCountries.txt"
    src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
    intermediate = file("${dataDir}/${name}.bz2")
    dst = file("${dataDir}/${name}")
  }

  // Download options: the compressed archive is fetched to the intermediate file.
  overwrite(false)
  compress(false)
  src(ext.src)
  dest(ext.intermediate)

  // The decompressed file is registered as an output of the task.
  outputs.file(ext.dst)

  doLast {
    logger.lifecycle("Decompressing ${ext.name}...")
    ant.bunzip2(src: ext.intermediate, dest: ext.dst)
  }
}
// Downloads the top-100k Wikipedia word lists archive into dataDir and syncs its
// contents into a directory named after the data set.
task getTop100kWikiWordFiles(type: Download) {
  // Extra properties of this task; name, src and dst are also read by the
  // shared dataset-task setup at the end of this configure block.
  // NOTE: the remote file is a .tar.bz2, but it is stored locally with a plain .bz2 suffix.
  ext {
    name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
    src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
    intermediate = file("${dataDir}/${name}.bz2")
    dst = file("${dataDir}/${name}")
  }

  // Download options: the compressed archive is fetched to the intermediate file.
  overwrite(false)
  compress(false)
  src(ext.src)
  dest(ext.intermediate)

  // The extracted directory is registered as an output of the task.
  outputs.dir(ext.dst)

  doLast {
    def archive = ext.intermediate
    def targetDir = ext.dst

    logger.lifecycle("Decompressing ${ext.name}...")
    project.sync {
      // tarTree will decompress the downloaded archive on the fly.
      from(tarTree(archive))
      into(targetDir)
    }
  }
}
// Downloads the reuters21578 archive into dataDir, untars it into the task's temporary
// directory and then runs buildinfra.extractReuters to populate "reuters-out".
task getReuters(type: Download) {
  // Extra properties of this task; name, src and dst are also read by the
  // shared dataset-task setup at the end of this configure block.
  ext {
    name = "reuters21578"
    src = "https://kdd.ics.uci.edu/databases/${name}/${name}.tar.gz"
    intermediate = file("${dataDir}/${name}.tar.gz")
    dst = file("${dataDir}/reuters-out")
  }

  // Download options: the compressed archive is fetched to the intermediate file.
  overwrite(false)
  compress(false)
  src(ext.src)
  dest(ext.intermediate)

  // The extraction directory is registered as an output of the task.
  outputs.dir(ext.dst)

  doLast {
    def archive = ext.intermediate
    def outputDir = ext.dst
    def untarPath = file("$temporaryDir/reuters-untar")

    // Step 1: unpack the tarball (minus top-level *.txt entries) into a scratch directory.
    logger.lifecycle("Decompressing ${ext.name}...")
    project.sync {
      from(tarTree(archive)) {
        exclude('*.txt')
      }
      into(untarPath)
    }

    // Step 2: clear any previous result, then extract from the scratch directory.
    logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
    outputDir.deleteDir()
    buildinfra.extractReuters(untarPath.toString(), outputDir.toString())
  }
}
// Aggregate task; the individual data set tasks are attached to it as dependencies
// further down in this configure block.
task downloadDatasets() {
  group = "Data set download"
  description = "Download all data sets."
}
// Shared setup for every data set task: assign group and description, hook the task
// into downloadDatasets, and log source and destination right before it runs.
def datasetTasks = [
  getEnWiki,
  getGeoNames,
  getTop100kWikiWordFiles,
  getReuters,
  getEnWikiRandomLines
]
datasetTasks.each { datasetTask ->
  datasetTask.group = "Data set download"
  datasetTask.description = "Download the ${datasetTask.ext.name} data set."
  downloadDatasets.dependsOn(datasetTask)

  // The message is built inside doFirst, i.e. when the task actually executes.
  datasetTask.doFirst {
    logger.lifecycle("Downloading data set ${datasetTask.ext.name} from ${datasetTask.ext.src} to ${datasetTask.ext.dst}...")
  }
}
}