mirror of https://github.com/apache/lucene.git

LUCENE-9854: Clean up utilities to download and extract test/benchmark data sets. (#27)

commit 246c4beb22
parent a5996dbecd
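
Note: this commit moves the data-set download logic out of lucene/benchmark/build.gradle into a shared script, gradle/datasets/external-datasets.gradle, and relocates ExtractReuters to the org.apache.lucene.gradle.datasets package. As a rough sketch of the resulting workflow (task names taken from the diff below):

    gradlew tasks --group "Data set download"
    gradlew getReuters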
@@ -23,7 +23,7 @@ plugins {
   id "com.palantir.consistent-versions" version "1.14.0"
   id "org.owasp.dependencycheck" version "5.3.0"
   id 'de.thetaphi.forbiddenapis' version '3.1' apply false
-  id "de.undercouch.download" version "4.0.2" apply false
+  id "de.undercouch.download" version "4.1.1" apply false
   id "net.ltgt.errorprone" version "1.2.1" apply false
   id 'com.diffplug.spotless' version "5.8.2" apply false
 }
@@ -156,6 +156,8 @@ apply from: file('gradle/generation/nori.gradle')
 apply from: file('gradle/generation/icu.gradle')
 apply from: file('gradle/generation/javacc.gradle')
 
+apply from: file('gradle/datasets/external-datasets.gradle')
+
 // Shared configuration of subprojects containing native code.
 apply from: file('gradle/native/disable-native.gradle')
 
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.benchmark.utils;
+package org.apache.lucene.gradle.datasets;
 
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
@@ -27,10 +27,10 @@ import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-import org.apache.lucene.util.IOUtils;
 
 /**
- * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
+ * Split the Reuters SGML documents into Simple Text files containing:
+ * Title, Date, Dateline, Body
  */
 public class ExtractReuters {
   private Path reutersDir;
@@ -39,13 +39,16 @@ public class ExtractReuters {
   public ExtractReuters(Path reutersDir, Path outputDir) throws IOException {
     this.reutersDir = reutersDir;
     this.outputDir = outputDir;
-    System.out.println("Deleting all files in " + outputDir);
-    IOUtils.rm(outputDir);
   }
 
   public void extract() throws IOException {
     long count = 0;
     Files.createDirectories(outputDir);
+
+    if (Files.list(outputDir).count() > 0) {
+      throw new IOException("The output directory must be empty: " + outputDir);
+    }
+
     try (DirectoryStream<Path> stream = Files.newDirectoryStream(reutersDir, "*.sgm")) {
       for (Path sgmFile : stream) {
         extractFile(sgmFile);
@@ -53,7 +56,7 @@ public class ExtractReuters {
       }
     }
     if (count == 0) {
-      System.err.println("No .sgm files in " + reutersDir);
+      throw new IOException("No .sgm files in " + reutersDir);
     }
   }
 
@@ -65,7 +68,7 @@ public class ExtractReuters {
   private static String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
 
   /** Override if you wish to change what is extracted */
-  protected void extractFile(Path sgmFile) {
+  protected void extractFile(Path sgmFile) throws IOException {
     try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1)) {
       StringBuilder buffer = new StringBuilder(1024);
       StringBuilder outBuffer = new StringBuilder(1024);
@@ -105,8 +108,6 @@ public class ExtractReuters {
           buffer.setLength(0);
         }
       }
-    } catch (IOException e) {
-      throw new RuntimeException(e);
-    }
+    }
   }
 
@@ -135,6 +136,8 @@ public class ExtractReuters {
     System.err.println(
         "Usage: "
             + msg
-            + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
+            + " :: java -cp <...> "
+            + ExtractReuters.class.getName()
+            + " <Path to Reuters SGM files> <Output Path>");
   }
 }
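
For reference, the relocated class can also be driven directly from Java. A minimal, hypothetical driver (the paths are placeholders; the constructor and extract() signatures come from the hunks above):

    import java.nio.file.Path;
    import java.nio.file.Paths;
    import org.apache.lucene.gradle.datasets.ExtractReuters;

    public class ExtractReutersDemo {
      public static void main(String[] args) throws Exception {
        Path sgmDir = Paths.get("/tmp/reuters21578");  // directory containing the *.sgm files (placeholder)
        Path outDir = Paths.get("/tmp/reuters-out");   // must be empty or absent, or extract() throws IOException
        new ExtractReuters(sgmDir, outDir).extract();  // writes one plain-text file per article
      }
    }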

@@ -0,0 +1,174 @@
+import org.apache.lucene.gradle.datasets.ExtractReuters
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// TODO: not sure whether this should live in benchmarks, but for now
+// let it be.
+configure(project(":lucene:benchmark")) {
+  apply plugin: "java"
+  apply plugin: "de.undercouch.download"
+
+  ext {
+    dataDir = file("data")
+  }
+
+  task getEnWiki(type: Download) {
+    ext {
+      name = "enwiki-20070527-pages-articles.xml"
+      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
+      intermediate = file("${dataDir}/${name}.bz2")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.file ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      logger.lifecycle("Decompressing ${ext.name}...")
+      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+    }
+  }
+
+  task getEnWikiRandomLines(type: Download) {
+    ext {
+      name = "enwiki.random.lines.txt"
+      src = "https://home.apache.org/~mikemccand/${name}.bz2"
+      intermediate = file("${dataDir}/${name}.bz2")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.file ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      logger.lifecycle("Decompressing ${ext.name}...")
+      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+    }
+  }
+
+  task getGeoNames(type: Download) {
+    // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
+    // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
+    // and then compress with: bzip2 -9 -k file_random.txt
+    ext {
+      name = "geonames_20130921_randomOrder_allCountries.txt"
+      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
+      intermediate = file("${dataDir}/${name}.bz2")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.file ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      logger.lifecycle("Decompressing ${ext.name}...")
+      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
+    }
+  }
+
+  task getTop100kWikiWordFiles(type: Download) {
+    ext {
+      name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
+      src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
+      intermediate = file("${dataDir}/${name}.bz2")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.dir ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      logger.lifecycle("Decompressing ${ext.name}...")
+      project.sync {
+        from tarTree(ext.intermediate) // defined above. Will decompress on the fly
+        into ext.dst
+      }
+    }
+  }
+
+  task getReuters(type: Download) {
+    ext {
+      name = "reuters21578"
+      // note: there is no HTTPS url and we don't care because this is merely test/perf data
+      src = "http://www.daviddlewis.com/resources/testcollections/reuters21578/${name}.tar.gz"
+      intermediate = file("${dataDir}/${name}.tar.gz")
+      dst = file("${dataDir}/${name}")
+    }
+
+    outputs.dir ext.dst
+
+    src ext.src
+    dest ext.intermediate
+    overwrite false
+    compress false
+
+    doLast {
+      def untarPath = file("$temporaryDir/reuters-untar")
+
+      logger.lifecycle("Decompressing ${ext.name}...")
+      project.sync {
+        from(tarTree(intermediate)) {
+          exclude '*.txt'
+        }
+        into untarPath
+      }
+
+      logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
+      ext.dst.deleteDir()
+      ExtractReuters.main(untarPath.toString(), ext.dst.toString())
+    }
+  }
+
+  task downloadDatasets() {
+    group "Data set download"
+    description "Download all data sets."
+  }
+
+  [
+      getEnWiki,
+      getGeoNames,
+      getTop100kWikiWordFiles,
+      getReuters,
+      getEnWikiRandomLines
+  ].each { task ->
+    task.group "Data set download"
+    task.description "Download the ${task.ext.name} data set."
+
+    downloadDatasets.dependsOn(task)
+
+    task.doFirst {
+      logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
+    }
+  }
+}
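
To exercise the new script, something like the following should work (a sketch: downloadDatasets aggregates the five tasks above, and dataDir resolves to lucene/benchmark/data):

    gradlew downloadDatasets              # fetch and extract every data set
    gradlew getEnWikiRandomLines          # or fetch a single data set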
@@ -96,7 +96,7 @@ allprojects {
         RandomPicks.randomFrom(new Random(projectSeedLong), ["US-ASCII", "ISO-8859-1", "UTF-8"])
       },
      description: "Sets the default file.encoding on test JVM.", buildOnly: true],
-    // test data
+    // Test data file used.
     [propName: 'tests.linedocsfile', value: 'europarl.lines.txt.gz', description: "Test data file path."],
     // miscellaneous; some of them very weird.
     [propName: 'tests.LUCENE_VERSION', value: baseVersion, description: "Base Lucene version."],
@@ -90,8 +90,7 @@ subprojects {
 
 configure(project(':lucene:benchmark')) {
   project.tasks.withType(ValidateSourcePatternsTask) {
-    sourceFiles.exclude 'temp/**'
-    sourceFiles.exclude 'work/**'
+    sourceFiles.exclude 'data/**'
   }
 }
 
@@ -155,3 +155,12 @@ Using these additional options will make the results more sparse, so it may be u
 to increase the top-N count:
 
     gradlew -p lucene/core test -Ptests.profile=true -Ptests.profile.count=100
+
+
+External data sets
+------------------
+
+Some tests may require external (and large) data sets. To see relevant tasks
+that download and extract these data files automatically, run the following:
+
+    gradlew tasks --group "Data set download"
@@ -1,2 +1 @@
-/temp
-/work
+/data

@@ -17,7 +17,6 @@
 
 plugins {
   id "java"
-  id "de.undercouch.download"
 }
 
 description = 'System for benchmarking Lucene'
@@ -44,9 +43,6 @@ dependencies {
   testImplementation project(':lucene:test-framework')
 }
 
-def tempDir = file("temp")
-def workDir = file("work")
-
 task run(type: JavaExec) {
   description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
   main 'org.apache.lucene.benchmark.byTask.Benchmark'
@@ -67,92 +63,3 @@ task run(type: JavaExec) {
     suspend = true
   }
 }
-
-/* Old "collation" Ant target:
-  gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
-  perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
-*/
-
-/* Old "shingle" Ant target:
-  gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
-  perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
-*/
-
-// The remaining tasks just get / extract / prepare data
-
-task getEnWiki(type: Download) {
-  def finalName = "enwiki-20070527-pages-articles.xml"
-  src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
-  dest file("$tempDir/" + finalName + ".bz2")
-  overwrite false
-  compress false
-
-  doLast {
-    ant.bunzip2(src: dest, dest: tempDir)
-  }
-  outputs.file file("$tempDir/$finalName")
-}
-
-task getGeoNames(type: Download) {
-  // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
-  // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
-  // and then compress with: bzip2 -9 -k file_random.txt
-  def finalName = "geonames_20130921_randomOrder_allCountries.txt"
-  src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
-  dest file("$tempDir/" + finalName + ".bz2")
-  overwrite false
-  compress false
-
-  doLast {
-    ant.bunzip2(src: dest, dest: tempDir) // will chop off .bz2
-  }
-  outputs.file file("$tempDir/$finalName")
-}
-
-task getTop100kWikiWordFiles(type: Download) {
-  src "https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"
-  dest file("$tempDir/${src.file.split('/').last()}")
-  overwrite false
-  compress false
-
-  def finalPath = file("$workDir/top100k-out")
-
-  doLast {
-    project.sync {
-      from tarTree(dest) // defined above. Will decompress on the fly
-      into finalPath
-    }
-  }
-  outputs.dir finalPath
-}
-
-task getReuters(type: Download) {
-  // note: there is no HTTPS url and we don't care because this is merely test/perf data
-  src "http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"
-  dest file("$tempDir/${src.file.split('/').last()}")
-  overwrite false
-  compress false
-
-  def untarPath = file("$workDir/reuters")
-  def finalPath = file("$workDir/reuters-out")
-  dependsOn sourceSets.main.runtimeClasspath
-
-  doLast {
-    project.sync {
-      from(tarTree(dest)) { // defined above. Will decompress on the fly
-        exclude '*.txt'
-      }
-      into untarPath
-    }
-    println "Extracting reuters to $finalPath"
-    finalPath.deleteDir() // necessary
-    // TODO consider porting ExtractReuters to groovy?
-    project.javaexec {
-      main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
-      classpath = sourceSets.main.runtimeClasspath
-      maxHeapSize = '1G'
-      args = [untarPath, finalPath]
-    }
-  }
-  outputs.dir finalPath
-}
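
The removed comments above documented the old two-step collation/shingle flow; under this commit the equivalent would be roughly (a sketch based on the retained run task and the new download tasks):

    gradlew getTop100kWikiWordFiles
    gradlew run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt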
|
@@ -387,10 +387,17 @@ public abstract class LuceneTestCase extends Assert {
 
   public static final boolean TEST_ASSERTS_ENABLED = systemPropertyAsBoolean("tests.asserts", true);
 
-  /** TODO: javadoc? */
+  /**
+   * The default (embedded resource) lines file.
+   *
+   * @see #TEST_LINE_DOCS_FILE
+   */
   public static final String DEFAULT_LINE_DOCS_FILE = "europarl.lines.txt.gz";
 
-  /** TODO: javadoc? */
+  /**
+   * Random sample from enwiki used in tests. See {@code help/tests.txt}. gradle task downloading
+   * this data set: {@code gradlew getEnWikiRandomLines}.
+   */
   public static final String JENKINS_LARGE_LINE_DOCS_FILE = "enwiki.random.lines.txt";
 
   /** Gets the codec to run tests with. */
@@ -407,7 +414,7 @@ public abstract class LuceneTestCase extends Assert {
   /** Gets the directory to run tests with */
   public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
 
-  /** the line file used by LineFileDocs */
+  /** The line file used in tests (by {@link LineFileDocs}). */
   public static final String TEST_LINE_DOCS_FILE =
       System.getProperty("tests.linedocsfile", DEFAULT_LINE_DOCS_FILE);
 
@@ -151,8 +151,10 @@ public final class RunListenerPrintReproduceInfo extends RunListener {
     }
     if (TEST_LINE_DOCS_FILE.endsWith(JENKINS_LARGE_LINE_DOCS_FILE)) {
       System.err.println(
-          "NOTE: download the large Jenkins line-docs file by running "
-              + "'ant get-jenkins-line-docs' in the lucene directory.");
+          "NOTE: large line-docs file was used in this run. You have to download "
+              + "it manually ('gradlew getEnWikiRandomLines') and use -P"
+              + TEST_LINE_DOCS_FILE
+              + "=... property to point to it.");
     }
 
     final StringBuilder b = new StringBuilder();
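
Following the reworded note, pointing a test run at the large line-docs file would look roughly like this (a sketch; the file location follows from the dataDir definition in gradle/datasets/external-datasets.gradle above):

    gradlew getEnWikiRandomLines
    gradlew -p lucene/core test -Ptests.linedocsfile=lucene/benchmark/data/enwiki.random.lines.txt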