diff --git a/build.gradle b/build.gradle index 5c0bacac4e0..2789d03cc34 100644 --- a/build.gradle +++ b/build.gradle @@ -23,7 +23,7 @@ plugins { id "com.palantir.consistent-versions" version "1.14.0" id "org.owasp.dependencycheck" version "5.3.0" id 'de.thetaphi.forbiddenapis' version '3.1' apply false - id "de.undercouch.download" version "4.0.2" apply false + id "de.undercouch.download" version "4.1.1" apply false id "net.ltgt.errorprone" version "1.2.1" apply false id 'com.diffplug.spotless' version "5.8.2" apply false } @@ -156,6 +156,8 @@ apply from: file('gradle/generation/nori.gradle') apply from: file('gradle/generation/icu.gradle') apply from: file('gradle/generation/javacc.gradle') +apply from: file('gradle/datasets/external-datasets.gradle') + // Shared configuration of subprojects containing native code. apply from: file('gradle/native/disable-native.gradle') diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java similarity index 90% rename from lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java rename to buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java index 4e3003d525b..b8d6735c908 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java +++ b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.benchmark.utils; +package org.apache.lucene.gradle.datasets; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -27,10 +27,10 @@ import java.nio.file.Paths; import java.nio.file.StandardCopyOption; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.lucene.util.IOUtils; /** - * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body + * Split the Reuters SGML documents into Simple Text files containing: + * Title, Date, Dateline, Body */ public class ExtractReuters { private Path reutersDir; @@ -39,13 +39,16 @@ public class ExtractReuters { public ExtractReuters(Path reutersDir, Path outputDir) throws IOException { this.reutersDir = reutersDir; this.outputDir = outputDir; - System.out.println("Deleting all files in " + outputDir); - IOUtils.rm(outputDir); } public void extract() throws IOException { long count = 0; Files.createDirectories(outputDir); + + try (DirectoryStream<Path> preexisting = Files.newDirectoryStream(outputDir)) { + if (preexisting.iterator().hasNext()) { throw new IOException("The output directory must be empty: " + outputDir); } + } + try (DirectoryStream stream = Files.newDirectoryStream(reutersDir, "*.sgm")) { for (Path sgmFile : stream) { extractFile(sgmFile); @@ -53,7 +56,7 @@ public class ExtractReuters { } } if (count == 0) { - System.err.println("No .sgm files in " + reutersDir); + throw new IOException("No .sgm files in " + reutersDir); } } @@ -65,7 +68,7 @@ private static String[] META_CHARS_SERIALIZATIONS = {"&", "<", ">", """, "'"}; /** Override if you wish to change what is extracted */ - protected void extractFile(Path sgmFile) { + protected void extractFile(Path sgmFile) throws IOException { try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1)) { StringBuilder buffer = new StringBuilder(1024); StringBuilder outBuffer = new StringBuilder(1024); @@ -105,8 +108,6 @@ buffer.setLength(0); } } - } catch 
(IOException e) { - throw new RuntimeException(e); } } @@ -135,6 +136,8 @@ public class ExtractReuters { System.err.println( "Usage: " + msg - + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters "); + + " :: java -cp <...> " + + ExtractReuters.class.getName() + + " "); } } diff --git a/gradle/datasets/external-datasets.gradle b/gradle/datasets/external-datasets.gradle new file mode 100644 index 00000000000..f466909dac9 --- /dev/null +++ b/gradle/datasets/external-datasets.gradle @@ -0,0 +1,174 @@ +import org.apache.lucene.gradle.datasets.ExtractReuters + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO: not sure whether this should live in benchmarks, but for now +// let it be. 
+configure(project(":lucene:benchmark")) { + apply plugin: "java" + apply plugin: "de.undercouch.download" + + ext { + dataDir = file("data") + } + + task getEnWiki(type: Download) { + ext { + name = "enwiki-20070527-pages-articles.xml" + src = "https://home.apache.org/~dsmiley/data/${name}.bz2" + intermediate = file("${dataDir}/${name}.bz2") + dst = file("${dataDir}/${name}") + } + + outputs.file ext.dst + + src ext.src + dest ext.intermediate + overwrite false + compress false + + doLast { + logger.lifecycle("Decompressing ${ext.name}...") + ant.bunzip2(src: ext.intermediate, dest: ext.dst) + } + } + + task getEnWikiRandomLines(type: Download) { + ext { + name = "enwiki.random.lines.txt" + src = "https://home.apache.org/~mikemccand/${name}.bz2" + intermediate = file("${dataDir}/${name}.bz2") + dst = file("${dataDir}/${name}") + } + + outputs.file ext.dst + + src ext.src + dest ext.intermediate + overwrite false + compress false + + doLast { + logger.lifecycle("Decompressing ${ext.name}...") + ant.bunzip2(src: ext.intermediate, dest: ext.dst) + } + } + + task getGeoNames(type: Download) { + // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip + // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt + // and then compress with: bzip2 -9 -k file_random.txt + ext { + name = "geonames_20130921_randomOrder_allCountries.txt" + src = "https://home.apache.org/~dsmiley/data/${name}.bz2" + intermediate = file("${dataDir}/${name}.bz2") + dst = file("${dataDir}/${name}") + } + + outputs.file ext.dst + + src ext.src + dest ext.intermediate + overwrite false + compress false + + doLast { + logger.lifecycle("Decompressing ${ext.name}...") + ant.bunzip2(src: ext.intermediate, dest: ext.dst) + } + } + + task getTop100kWikiWordFiles(type: Download) { + ext { + name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11" + src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2" + intermediate = file("${dataDir}/${name}.bz2") + 
dst = file("${dataDir}/${name}") + } + + outputs.dir ext.dst + + src ext.src + dest ext.intermediate + overwrite false + compress false + + doLast { + logger.lifecycle("Decompressing ${ext.name}...") + project.sync { + from tarTree(ext.intermediate) // defined above. Will decompress on the fly + into ext.dst + } + } + } + + task getReuters(type: Download) { + ext { + name = "reuters21578" + // note: there is no HTTPS url and we don't care because this is merely test/perf data + src = "http://www.daviddlewis.com/resources/testcollections/reuters21578/${name}.tar.gz" + intermediate = file("${dataDir}/${name}.tar.gz") + dst = file("${dataDir}/${name}") + } + + outputs.dir ext.dst + + src ext.src + dest ext.intermediate + overwrite false + compress false + + doLast { + def untarPath = file("$temporaryDir/reuters-untar") + + logger.lifecycle("Decompressing ${ext.name}...") + project.sync { + from(tarTree(intermediate)) { + exclude '*.txt' + } + into untarPath + } + + logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...") + ext.dst.deleteDir() + ExtractReuters.main(untarPath.toString(), ext.dst.toString()) + } + } + + task downloadDatasets() { + group "Data set download" + description "Download all data sets." + } + + [ + getEnWiki, + getGeoNames, + getTop100kWikiWordFiles, + getReuters, + getEnWikiRandomLines + ].each { task -> + task.group "Data set download" + task.description "Download the ${task.ext.name} data set." 
+ + downloadDatasets.dependsOn(task) + + task.doFirst { + logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...") + } + } +} \ No newline at end of file diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index 370b6920dad..fc6c991875d 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -96,7 +96,7 @@ allprojects { RandomPicks.randomFrom(new Random(projectSeedLong), ["US-ASCII", "ISO-8859-1", "UTF-8"]) }, description: "Sets the default file.encoding on test JVM.", buildOnly: true], - // test data + // Test data file used. [propName: 'tests.linedocsfile', value: 'europarl.lines.txt.gz', description: "Test data file path."], // miscellaneous; some of them very weird. [propName: 'tests.LUCENE_VERSION', value: baseVersion, description: "Base Lucene version."], diff --git a/gradle/validation/validate-source-patterns.gradle b/gradle/validation/validate-source-patterns.gradle index d82f8b349b7..e9939a1991f 100644 --- a/gradle/validation/validate-source-patterns.gradle +++ b/gradle/validation/validate-source-patterns.gradle @@ -90,8 +90,7 @@ subprojects { configure(project(':lucene:benchmark')) { project.tasks.withType(ValidateSourcePatternsTask) { - sourceFiles.exclude 'temp/**' - sourceFiles.exclude 'work/**' + sourceFiles.exclude 'data/**' } } diff --git a/help/tests.txt b/help/tests.txt index 5054c0ee02d..b4a14449c77 100644 --- a/help/tests.txt +++ b/help/tests.txt @@ -155,3 +155,12 @@ Using these additional options will make the results more sparse, so it may be u to increase the top-N count: gradlew -p lucene/core test -Ptests.profile=true -Ptests.profile.count=100 + + +External data sets +------------------ + +Some tests may require external (and large) data sets. 
To see relevant tasks +that download and extract these data files automatically, run the following: + +gradlew tasks --group "Data set download" diff --git a/lucene/benchmark/.gitignore b/lucene/benchmark/.gitignore index a20524a7293..249cda967c1 100644 --- a/lucene/benchmark/.gitignore +++ b/lucene/benchmark/.gitignore @@ -1,2 +1 @@ -/temp -/work \ No newline at end of file +/data \ No newline at end of file diff --git a/lucene/benchmark/build.gradle b/lucene/benchmark/build.gradle index 73f1dd72833..9271b003710 100644 --- a/lucene/benchmark/build.gradle +++ b/lucene/benchmark/build.gradle @@ -17,7 +17,6 @@ plugins { id "java" - id "de.undercouch.download" } description = 'System for benchmarking Lucene' @@ -44,9 +43,6 @@ dependencies { testImplementation project(':lucene:test-framework') } -def tempDir = file("temp") -def workDir = file("work") - task run(type: JavaExec) { description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)" main 'org.apache.lucene.benchmark.byTask.Benchmark' @@ -67,92 +63,3 @@ task run(type: JavaExec) { suspend = true } } - -/* Old "collation" Ant target: -gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt -perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt - */ - -/* Old "shingle" Ant target: -gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt -perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt - */ - -// The remaining tasks just get / extract / prepare data - -task getEnWiki(type: Download) { - def finalName = "enwiki-20070527-pages-articles.xml" - src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2" - dest file("$tempDir/" + finalName + ".bz2") - overwrite false - compress false - - doLast { - ant.bunzip2(src: dest, dest: tempDir) - } - outputs.file file("$tempDir/$finalName") -} - -task getGeoNames(type: Download) { - // note: 
latest data is at: https://download.geonames.org/export/dump/allCountries.zip - // and then randomize with: gsort -R -S 1500M file.txt > file_random.txt - // and then compress with: bzip2 -9 -k file_random.txt - def finalName = "geonames_20130921_randomOrder_allCountries.txt" - src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2" - dest file("$tempDir/" + finalName + ".bz2") - overwrite false - compress false - - doLast { - ant.bunzip2(src: dest, dest: tempDir) // will chop off .bz2 - } - outputs.file file("$tempDir/$finalName") -} - -task getTop100kWikiWordFiles(type: Download) { - src "https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2" - dest file("$tempDir/${src.file.split('/').last()}") - overwrite false - compress false - - def finalPath = file("$workDir/top100k-out") - - doLast { - project.sync { - from tarTree(dest) // defined above. Will decompress on the fly - into finalPath - } - } - outputs.dir finalPath -} - -task getReuters(type: Download) { - // note: there is no HTTPS url and we don't care because this is merely test/perf data - src "http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz" - dest file("$tempDir/${src.file.split('/').last()}") - overwrite false - compress false - - def untarPath = file("$workDir/reuters") - def finalPath = file("$workDir/reuters-out") - dependsOn sourceSets.main.runtimeClasspath - - doLast { - project.sync { - from(tarTree(dest)) { // defined above. Will decompress on the fly - exclude '*.txt' - } - into untarPath - } - println "Extracting reuters to $finalPath" - finalPath.deleteDir() // necessary - // TODO consider porting ExtractReuters to groovy? 
- project.javaexec { - main = 'org.apache.lucene.benchmark.utils.ExtractReuters' - classpath = sourceSets.main.runtimeClasspath - maxHeapSize = '1G' - args = [untarPath, finalPath] - } - } - outputs.dir finalPath -} diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java index f05b62f4561..08d18bc704c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/LuceneTestCase.java @@ -387,10 +387,17 @@ public abstract class LuceneTestCase extends Assert { public static final boolean TEST_ASSERTS_ENABLED = systemPropertyAsBoolean("tests.asserts", true); - /** TODO: javadoc? */ + /** + * The default (embedded resource) lines file. + * + * @see #TEST_LINE_DOCS_FILE + */ public static final String DEFAULT_LINE_DOCS_FILE = "europarl.lines.txt.gz"; - /** TODO: javadoc? */ + /** + * Random sample from enwiki used in tests. See {@code help/tests.txt}. gradle task downloading + * this data set: {@code gradlew getEnWikiRandomLines}. + */ public static final String JENKINS_LARGE_LINE_DOCS_FILE = "enwiki.random.lines.txt"; /** Gets the codec to run tests with. */ @@ -407,7 +414,7 @@ public abstract class LuceneTestCase extends Assert { /** Gets the directory to run tests with */ public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random"); - /** the line file used by LineFileDocs */ + /** The line file used in tests (by {@link LineFileDocs}). 
*/ public static final String TEST_LINE_DOCS_FILE = System.getProperty("tests.linedocsfile", DEFAULT_LINE_DOCS_FILE); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java b/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java index e088a7130b7..0f9664b9538 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/RunListenerPrintReproduceInfo.java @@ -151,8 +151,10 @@ public final class RunListenerPrintReproduceInfo extends RunListener { } if (TEST_LINE_DOCS_FILE.endsWith(JENKINS_LARGE_LINE_DOCS_FILE)) { System.err.println( - "NOTE: download the large Jenkins line-docs file by running " - + "'ant get-jenkins-line-docs' in the lucene directory."); + "NOTE: large line-docs file was used in this run. You have to download " + + "it manually ('gradlew getEnWikiRandomLines') and use -P" + + "tests.linedocsfile" + + "=... property to point to it."); final StringBuilder b = new StringBuilder();