import java.nio.file.Files

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// The zstd library has to be on the build script's own classpath because
// unzstd() below instantiates ZstdInputStream directly from this script.
buildscript {
  repositories {
    mavenCentral()
  }

  dependencies {
    classpath deps.zstd
  }
}

/**
 * Decompresses a zstd-compressed file.
 *
 * @param src path of the compressed input file
 * @param dst path of the decompressed output file; it is opened with
 *            Files.newOutputStream using that method's default options
 */
def unzstd(java.nio.file.Path src, java.nio.file.Path dst) {
  // Both streams are declared as try-with-resources so they are closed
  // whether or not the copy succeeds.
  try (InputStream compressedIn = new com.github.luben.zstd.ZstdInputStream(
           new BufferedInputStream(Files.newInputStream(src)));
       OutputStream plainOut = new BufferedOutputStream(Files.newOutputStream(dst))) {
    compressedIn.transferTo(plainOut)
  }
}

// TODO: not sure whether this should live in benchmarks, but for now let it be.
configure(project(":lucene:benchmark")) {
  apply plugin: "java"
  apply plugin: deps.plugins.undercouch.download.get().pluginId

  ext {
    // Directory that holds both the downloaded archives and the unpacked data sets.
    dataDir = file("work")
  }

  // Every data set task below follows the same shape:
  //   ext.name         - base name of the data set (also used in log messages)
  //   ext.src          - URL the archive is downloaded from
  //   ext.intermediate - local file the archive is downloaded to
  //   ext.dst          - decompressed/extracted result, declared as the task output
  // The Download action fetches ext.src into ext.intermediate and the doLast
  // action then unpacks ext.intermediate into ext.dst.

  // English Wikipedia dump (bzip2-compressed XML).
  task getEnWiki(type: Download) {
    ext {
      name = "enwiki-20070527-pages-articles.xml"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  // Random lines from English Wikipedia (zstd-compressed text).
  task getEnWikiRandomLines(type: Download) {
    ext {
      name = "enwiki.random.lines.txt"
      src = "https://home.apache.org/~mikemccand/${name}.zst"
      intermediate = file("${dataDir}/${name}.zst")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      // Uses the script-level unzstd() helper, which works on java.nio paths.
      unzstd(ext.intermediate.toPath(), ext.dst.toPath())
    }
  }

  // GeoNames "all countries" data in random order (bzip2-compressed text).
  task getGeoNames(type: Download) {
    // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
    //       and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
    //       and then compress with: bzip2 -9 -k file_random.txt
    ext {
      name = "geonames_20130921_randomOrder_allCountries.txt"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  // Top 100k Wikipedia word files (bzip2-compressed tar archive, unpacked into a directory).
  task getTop100kWikiWordFiles(type: Download) {
    ext {
      name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
      src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
      // NOTE(review): the remote file is a .tar.bz2 but is stored locally with a
      // plain .bz2 suffix; the name is kept as-is so existing downloads stay valid.
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        // ext.intermediate is the downloaded archive defined in the ext block
        // above; tarTree decompresses it on the fly.
        from tarTree(ext.intermediate)
        // The destination must be its own statement: sync requires it.
        into ext.dst
      }
    }
  }

  // Reuters-21578 collection (gzip-compressed tar archive, post-processed into ext.dst).
  task getReuters(type: Download) {
    ext {
      name = "reuters21578"
      src = "https://kdd.ics.uci.edu/databases/${name}/${name}.tar.gz"
      intermediate = file("${dataDir}/${name}.tar.gz")
      dst = file("${dataDir}/reuters-out")
    }

    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      // The archive is first unpacked into the task's temporary directory,
      // then converted into the final output directory.
      def untarPath = file("$temporaryDir/reuters-untar")

      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        // ext.intermediate is used here for consistency with the other tasks.
        from(tarTree(ext.intermediate)) {
          exclude '*.txt'
        }
        into untarPath
      }

      logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
      // Start from an empty output directory before extraction.
      ext.dst.deleteDir()
      // NOTE(review): buildinfra.extractReuters is defined outside this script;
      // presumably it converts the untarred files into ext.dst - confirm.
      buildinfra.extractReuters(untarPath.toString(), ext.dst.toString())
    }
  }

  // Aggregate task: depends on every individual data set task (wired up below).
  task downloadDatasets() {
    group "Data set download"
    description "Download all data sets."
  }

  // Apply the shared group/description, hook each task into downloadDatasets
  // and log the source and destination before each task's actions run.
  [
    getEnWiki,
    getGeoNames,
    getTop100kWikiWordFiles,
    getReuters,
    getEnWikiRandomLines
  ].each { task ->
    task.group "Data set download"
    task.description "Download the ${task.ext.name} data set."

    downloadDatasets.dependsOn(task)

    task.doFirst {
      logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
    }
  }
}