LUCENE-9854: Clean up utilities to download and extract test/benchmark data sets. (#27)

This commit is contained in:
Dawid Weiss 2021-03-22 12:22:39 +01:00 committed by GitHub
parent a5996dbecd
commit 246c4beb22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 216 additions and 114 deletions

View File

@ -23,7 +23,7 @@ plugins {
id "com.palantir.consistent-versions" version "1.14.0"
id "org.owasp.dependencycheck" version "5.3.0"
id 'de.thetaphi.forbiddenapis' version '3.1' apply false
id "de.undercouch.download" version "4.0.2" apply false
id "de.undercouch.download" version "4.1.1" apply false
id "net.ltgt.errorprone" version "1.2.1" apply false
id 'com.diffplug.spotless' version "5.8.2" apply false
}
@ -156,6 +156,8 @@ apply from: file('gradle/generation/nori.gradle')
apply from: file('gradle/generation/icu.gradle')
apply from: file('gradle/generation/javacc.gradle')
apply from: file('gradle/datasets/external-datasets.gradle')
// Shared configuration of subprojects containing native code.
apply from: file('gradle/native/disable-native.gradle')

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.utils;
package org.apache.lucene.gradle.datasets;
import java.io.BufferedReader;
import java.io.BufferedWriter;
@ -27,10 +27,10 @@ import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.util.IOUtils;
/**
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
* Split the Reuters SGML documents into Simple Text files containing:
* Title, Date, Dateline, Body
*/
public class ExtractReuters {
private Path reutersDir;
@ -39,13 +39,16 @@ public class ExtractReuters {
public ExtractReuters(Path reutersDir, Path outputDir) throws IOException {
this.reutersDir = reutersDir;
this.outputDir = outputDir;
System.out.println("Deleting all files in " + outputDir);
IOUtils.rm(outputDir);
}
public void extract() throws IOException {
long count = 0;
Files.createDirectories(outputDir);
if (Files.list(outputDir).count() > 0) {
throw new IOException("The output directory must be empty: " + outputDir);
}
try (DirectoryStream<Path> stream = Files.newDirectoryStream(reutersDir, "*.sgm")) {
for (Path sgmFile : stream) {
extractFile(sgmFile);
@ -53,7 +56,7 @@ public class ExtractReuters {
}
}
if (count == 0) {
System.err.println("No .sgm files in " + reutersDir);
throw new IOException("No .sgm files in " + reutersDir);
}
}
@ -65,7 +68,7 @@ public class ExtractReuters {
private static String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
/** Override if you wish to change what is extracted */
protected void extractFile(Path sgmFile) {
protected void extractFile(Path sgmFile) throws IOException {
try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1)) {
StringBuilder buffer = new StringBuilder(1024);
StringBuilder outBuffer = new StringBuilder(1024);
@ -105,8 +108,6 @@ public class ExtractReuters {
buffer.setLength(0);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@ -135,6 +136,8 @@ public class ExtractReuters {
System.err.println(
"Usage: "
+ msg
+ " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
+ " :: java -cp <...> "
+ ExtractReuters.class.getName()
+ " <Path to Reuters SGM files> <Output Path>");
}
}

View File

@ -0,0 +1,174 @@
import org.apache.lucene.gradle.datasets.ExtractReuters
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// TODO: not sure whether this should live in benchmarks, but for now
// let it be.
configure(project(":lucene:benchmark")) {
  apply plugin: "java"
  apply plugin: "de.undercouch.download"

  ext {
    // All data sets are downloaded and expanded under this directory
    // (excluded from source-pattern validation and git; see .gitignore).
    dataDir = file("data")
  }

  // Each task below downloads a compressed archive to 'ext.intermediate'
  // and decompresses/extracts it to 'ext.dst', which is the task's declared
  // output. 'overwrite false' makes repeated invocations cheap no-ops.

  task getEnWiki(type: Download) {
    ext {
      name = "enwiki-20070527-pages-articles.xml"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    // Don't let the download plugin transparently decompress; we keep the
    // compressed artifact and decompress explicitly below.
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  task getEnWikiRandomLines(type: Download) {
    ext {
      name = "enwiki.random.lines.txt"
      src = "https://home.apache.org/~mikemccand/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  task getGeoNames(type: Download) {
    // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
    //       and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
    //       and then compress with: bzip2 -9 -k file_random.txt
    ext {
      name = "geonames_20130921_randomOrder_allCountries.txt"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  task getTop100kWikiWordFiles(type: Download) {
    ext {
      name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
      src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
      // Keep the full '.tar.bz2' extension on the intermediate file: Gradle's
      // tarTree() detects the compression format from the file extension, and
      // a bare '.bz2' would not be recognized as a compressed tar archive.
      intermediate = file("${dataDir}/${name}.tar.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        from tarTree(ext.intermediate) // Will decompress on the fly.
        into ext.dst
      }
    }
  }

  task getReuters(type: Download) {
    ext {
      name = "reuters21578"
      // note: there is no HTTPS url and we don't care because this is merely test/perf data
      src = "http://www.daviddlewis.com/resources/testcollections/reuters21578/${name}.tar.gz"
      intermediate = file("${dataDir}/${name}.tar.gz")
      dst = file("${dataDir}/${name}")
    }

    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      def untarPath = file("$temporaryDir/reuters-untar")

      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        from(tarTree(ext.intermediate)) {
          exclude '*.txt'
        }
        into untarPath
      }

      // Split the raw SGML into per-article text files under ext.dst.
      logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
      ext.dst.deleteDir()
      ExtractReuters.main(untarPath.toString(), ext.dst.toString())
    }
  }

  // Aggregate task that pulls in every data set.
  task downloadDatasets() {
    group "Data set download"
    description "Download all data sets."
  }

  [
      getEnWiki,
      getGeoNames,
      getTop100kWikiWordFiles,
      getReuters,
      getEnWikiRandomLines
  ].each { task ->
    task.group "Data set download"
    task.description "Download the ${task.ext.name} data set."

    downloadDatasets.dependsOn(task)

    task.doFirst {
      logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
    }
  }
}

View File

@ -96,7 +96,7 @@ allprojects {
RandomPicks.randomFrom(new Random(projectSeedLong), ["US-ASCII", "ISO-8859-1", "UTF-8"])
},
description: "Sets the default file.encoding on test JVM.", buildOnly: true],
// test data
// Test data file used.
[propName: 'tests.linedocsfile', value: 'europarl.lines.txt.gz', description: "Test data file path."],
// miscellaneous; some of them very weird.
[propName: 'tests.LUCENE_VERSION', value: baseVersion, description: "Base Lucene version."],

View File

@ -90,8 +90,7 @@ subprojects {
configure(project(':lucene:benchmark')) {
project.tasks.withType(ValidateSourcePatternsTask) {
sourceFiles.exclude 'temp/**'
sourceFiles.exclude 'work/**'
sourceFiles.exclude 'data/**'
}
}

View File

@ -155,3 +155,12 @@ Using these additional options will make the results more sparse, so it may be u
to increase the top-N count:
gradlew -p lucene/core test -Ptests.profile=true -Ptests.profile.count=100
External data sets
------------------
Some tests may require external (and large) data sets. To see relevant tasks
that download and extract these data files automatically, run the following:
gradlew tasks --group "Data set download"

View File

@ -1,2 +1 @@
/temp
/work
/data

View File

@ -17,7 +17,6 @@
plugins {
id "java"
id "de.undercouch.download"
}
description = 'System for benchmarking Lucene'
@ -44,9 +43,6 @@ dependencies {
testImplementation project(':lucene:test-framework')
}
def tempDir = file("temp")
def workDir = file("work")
task run(type: JavaExec) {
description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
main 'org.apache.lucene.benchmark.byTask.Benchmark'
@ -67,92 +63,3 @@ task run(type: JavaExec) {
suspend = true
}
}
/* Old "collation" Ant target:
gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
*/
/* Old "shingle" Ant target:
gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
*/
// The remaining tasks just get / extract / prepare data
task getEnWiki(type: Download) {
def finalName = "enwiki-20070527-pages-articles.xml"
src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
dest file("$tempDir/" + finalName + ".bz2")
overwrite false
compress false
doLast {
ant.bunzip2(src: dest, dest: tempDir)
}
outputs.file file("$tempDir/$finalName")
}
task getGeoNames(type: Download) {
// note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
// and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
// and then compress with: bzip2 -9 -k file_random.txt
def finalName = "geonames_20130921_randomOrder_allCountries.txt"
src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
dest file("$tempDir/" + finalName + ".bz2")
overwrite false
compress false
doLast {
ant.bunzip2(src: dest, dest: tempDir) // will chop off .bz2
}
outputs.file file("$tempDir/$finalName")
}
task getTop100kWikiWordFiles(type: Download) {
src "https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"
dest file("$tempDir/${src.file.split('/').last()}")
overwrite false
compress false
def finalPath = file("$workDir/top100k-out")
doLast {
project.sync {
from tarTree(dest) // defined above. Will decompress on the fly
into finalPath
}
}
outputs.dir finalPath
}
task getReuters(type: Download) {
// note: there is no HTTPS url and we don't care because this is merely test/perf data
src "http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"
dest file("$tempDir/${src.file.split('/').last()}")
overwrite false
compress false
def untarPath = file("$workDir/reuters")
def finalPath = file("$workDir/reuters-out")
dependsOn sourceSets.main.runtimeClasspath
doLast {
project.sync {
from(tarTree(dest)) { // defined above. Will decompress on the fly
exclude '*.txt'
}
into untarPath
}
println "Extracting reuters to $finalPath"
finalPath.deleteDir() // necessary
// TODO consider porting ExtractReuters to groovy?
project.javaexec {
main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
classpath = sourceSets.main.runtimeClasspath
maxHeapSize = '1G'
args = [untarPath, finalPath]
}
}
outputs.dir finalPath
}

View File

@ -387,10 +387,17 @@ public abstract class LuceneTestCase extends Assert {
public static final boolean TEST_ASSERTS_ENABLED = systemPropertyAsBoolean("tests.asserts", true);
/** TODO: javadoc? */
/**
* The default (embedded resource) lines file.
*
* @see #TEST_LINE_DOCS_FILE
*/
public static final String DEFAULT_LINE_DOCS_FILE = "europarl.lines.txt.gz";
/** TODO: javadoc? */
/**
* Random sample from enwiki used in tests. See {@code help/tests.txt}. gradle task downloading
* this data set: {@code gradlew getEnWikiRandomLines}.
*/
public static final String JENKINS_LARGE_LINE_DOCS_FILE = "enwiki.random.lines.txt";
/** Gets the codec to run tests with. */
@ -407,7 +414,7 @@ public abstract class LuceneTestCase extends Assert {
/** Gets the directory to run tests with */
public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
/** the line file used by LineFileDocs */
/** The line file used in tests (by {@link LineFileDocs}). */
public static final String TEST_LINE_DOCS_FILE =
System.getProperty("tests.linedocsfile", DEFAULT_LINE_DOCS_FILE);

View File

@ -151,8 +151,10 @@ public final class RunListenerPrintReproduceInfo extends RunListener {
}
if (TEST_LINE_DOCS_FILE.endsWith(JENKINS_LARGE_LINE_DOCS_FILE)) {
System.err.println(
"NOTE: download the large Jenkins line-docs file by running "
+ "'ant get-jenkins-line-docs' in the lucene directory.");
"NOTE: large line-docs file was used in this run. You have to download "
+ "it manually ('gradlew getEnWikiRandomLines') and use -P"
+ TEST_LINE_DOCS_FILE
+ "=... property to point to it.");
}
final StringBuilder b = new StringBuilder();