LUCENE-9854: Clean up utilities to download and extract test/benchmark data sets. (#27)

This commit is contained in:
Dawid Weiss 2021-03-22 12:22:39 +01:00 committed by GitHub
parent a5996dbecd
commit 246c4beb22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 216 additions and 114 deletions

View File

@ -23,7 +23,7 @@ plugins {
id "com.palantir.consistent-versions" version "1.14.0"
id "org.owasp.dependencycheck" version "5.3.0"
id 'de.thetaphi.forbiddenapis' version '3.1' apply false
id "de.undercouch.download" version "4.0.2" apply false
id "de.undercouch.download" version "4.1.1" apply false
id "net.ltgt.errorprone" version "1.2.1" apply false
id 'com.diffplug.spotless' version "5.8.2" apply false
}
@ -156,6 +156,8 @@ apply from: file('gradle/generation/nori.gradle')
apply from: file('gradle/generation/icu.gradle')
apply from: file('gradle/generation/javacc.gradle')
apply from: file('gradle/datasets/external-datasets.gradle')
// Shared configuration of subprojects containing native code.
apply from: file('gradle/native/disable-native.gradle')

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.utils;
package org.apache.lucene.gradle.datasets;
import java.io.BufferedReader;
import java.io.BufferedWriter;
@ -27,10 +27,10 @@ import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.util.IOUtils;
/**
* Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body
* Split the Reuters SGML documents into Simple Text files containing:
* Title, Date, Dateline, Body
*/
public class ExtractReuters {
private Path reutersDir;
@ -39,13 +39,16 @@ public class ExtractReuters {
public ExtractReuters(Path reutersDir, Path outputDir) throws IOException {
this.reutersDir = reutersDir;
this.outputDir = outputDir;
System.out.println("Deleting all files in " + outputDir);
IOUtils.rm(outputDir);
}
public void extract() throws IOException {
long count = 0;
Files.createDirectories(outputDir);
if (Files.list(outputDir).count() > 0) {
throw new IOException("The output directory must be empty: " + outputDir);
}
try (DirectoryStream<Path> stream = Files.newDirectoryStream(reutersDir, "*.sgm")) {
for (Path sgmFile : stream) {
extractFile(sgmFile);
@ -53,7 +56,7 @@ public class ExtractReuters {
}
}
if (count == 0) {
System.err.println("No .sgm files in " + reutersDir);
throw new IOException("No .sgm files in " + reutersDir);
}
}
@ -65,7 +68,7 @@ public class ExtractReuters {
private static String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};
/** Override if you wish to change what is extracted */
protected void extractFile(Path sgmFile) {
protected void extractFile(Path sgmFile) throws IOException {
try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1)) {
StringBuilder buffer = new StringBuilder(1024);
StringBuilder outBuffer = new StringBuilder(1024);
@ -105,8 +108,6 @@ public class ExtractReuters {
buffer.setLength(0);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@ -135,6 +136,8 @@ public class ExtractReuters {
System.err.println(
"Usage: "
+ msg
+ " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
+ " :: java -cp <...> "
+ ExtractReuters.class.getName()
+ " <Path to Reuters SGM files> <Output Path>");
}
}

View File

@ -0,0 +1,174 @@
import org.apache.lucene.gradle.datasets.ExtractReuters
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// TODO: not sure whether this should live in benchmarks, but for now
// let it be.
configure(project(":lucene:benchmark")) {
  apply plugin: "java"
  apply plugin: "de.undercouch.download"

  ext {
    // All data sets are downloaded and expanded under this directory
    // (excluded from source-pattern validation and git; see .gitignore).
    dataDir = file("data")
  }

  // Each task below downloads a compressed archive to 'ext.intermediate'
  // and decompresses/extracts it to 'ext.dst', which is the task's declared
  // output. 'overwrite false' makes repeated invocations cheap no-ops.

  task getEnWiki(type: Download) {
    ext {
      name = "enwiki-20070527-pages-articles.xml"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    // Don't let the download plugin transparently decompress; we keep the
    // compressed artifact and decompress explicitly below.
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  task getEnWikiRandomLines(type: Download) {
    ext {
      name = "enwiki.random.lines.txt"
      src = "https://home.apache.org/~mikemccand/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  task getGeoNames(type: Download) {
    // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
    //       and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
    //       and then compress with: bzip2 -9 -k file_random.txt
    ext {
      name = "geonames_20130921_randomOrder_allCountries.txt"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  task getTop100kWikiWordFiles(type: Download) {
    ext {
      name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
      src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
      // Keep the full '.tar.bz2' extension on the intermediate file: Gradle's
      // tarTree() detects the compression format from the file extension, and
      // a bare '.bz2' would not be recognized as a compressed tar archive.
      intermediate = file("${dataDir}/${name}.tar.bz2")
      dst = file("${dataDir}/${name}")
    }

    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        from tarTree(ext.intermediate) // Will decompress on the fly.
        into ext.dst
      }
    }
  }

  task getReuters(type: Download) {
    ext {
      name = "reuters21578"
      // note: there is no HTTPS url and we don't care because this is merely test/perf data
      src = "http://www.daviddlewis.com/resources/testcollections/reuters21578/${name}.tar.gz"
      intermediate = file("${dataDir}/${name}.tar.gz")
      dst = file("${dataDir}/${name}")
    }

    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      def untarPath = file("$temporaryDir/reuters-untar")

      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        from(tarTree(ext.intermediate)) {
          exclude '*.txt'
        }
        into untarPath
      }

      // Split the raw SGML into per-article text files under ext.dst.
      logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
      ext.dst.deleteDir()
      ExtractReuters.main(untarPath.toString(), ext.dst.toString())
    }
  }

  // Aggregate task that pulls in every data set.
  task downloadDatasets() {
    group "Data set download"
    description "Download all data sets."
  }

  [
      getEnWiki,
      getGeoNames,
      getTop100kWikiWordFiles,
      getReuters,
      getEnWikiRandomLines
  ].each { task ->
    task.group "Data set download"
    task.description "Download the ${task.ext.name} data set."

    downloadDatasets.dependsOn(task)

    task.doFirst {
      logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
    }
  }
}

View File

@ -96,7 +96,7 @@ allprojects {
RandomPicks.randomFrom(new Random(projectSeedLong), ["US-ASCII", "ISO-8859-1", "UTF-8"])
},
description: "Sets the default file.encoding on test JVM.", buildOnly: true],
// test data
// Test data file used.
[propName: 'tests.linedocsfile', value: 'europarl.lines.txt.gz', description: "Test data file path."],
// miscellaneous; some of them very weird.
[propName: 'tests.LUCENE_VERSION', value: baseVersion, description: "Base Lucene version."],

View File

@ -90,8 +90,7 @@ subprojects {
configure(project(':lucene:benchmark')) {
project.tasks.withType(ValidateSourcePatternsTask) {
sourceFiles.exclude 'temp/**'
sourceFiles.exclude 'work/**'
sourceFiles.exclude 'data/**'
}
}

View File

@ -155,3 +155,12 @@ Using these additional options will make the results more sparse, so it may be u
to increase the top-N count:
gradlew -p lucene/core test -Ptests.profile=true -Ptests.profile.count=100
External data sets
------------------
Some tests may require external (and large) data sets. To see relevant tasks
that download and extract these data files automatically, run the following:
gradlew tasks --group "Data set download"

View File

@ -1,2 +1 @@
/temp
/work
/data

View File

@ -17,7 +17,6 @@
plugins {
id "java"
id "de.undercouch.download"
}
description = 'System for benchmarking Lucene'
@ -44,9 +43,6 @@ dependencies {
testImplementation project(':lucene:test-framework')
}
def tempDir = file("temp")
def workDir = file("work")
task run(type: JavaExec) {
description "Run a perf test (optional: -PtaskAlg=conf/your-algorithm-file -PmaxHeapSize=1G)"
main 'org.apache.lucene.benchmark.byTask.Benchmark'
@ -67,92 +63,3 @@ task run(type: JavaExec) {
suspend = true
}
}
/* Old "collation" Ant target:
gradle getTop100kWikiWordFiles run -PtaskAlg=conf/collation.alg -PstandardOutput=work/collation.benchmark.output.txt
perl -CSD scripts/collation.bm2jira.pl work/collation.benchmark.output.txt
*/
/* Old "shingle" Ant target:
gradle getReuters run -PtaskAlg=conf/shingle.alg -PstandardOutput=work/shingle.benchmark.output.txt
perl -CSD scripts/shingle.bm2jira.pl work/shingle.benchmark.output.txt
*/
// The remaining tasks just get / extract / prepare data
task getEnWiki(type: Download) {
def finalName = "enwiki-20070527-pages-articles.xml"
src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
dest file("$tempDir/" + finalName + ".bz2")
overwrite false
compress false
doLast {
ant.bunzip2(src: dest, dest: tempDir)
}
outputs.file file("$tempDir/$finalName")
}
task getGeoNames(type: Download) {
// note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
// and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
// and then compress with: bzip2 -9 -k file_random.txt
def finalName = "geonames_20130921_randomOrder_allCountries.txt"
src "https://home.apache.org/~dsmiley/data/" + finalName + ".bz2"
dest file("$tempDir/" + finalName + ".bz2")
overwrite false
compress false
doLast {
ant.bunzip2(src: dest, dest: tempDir) // will chop off .bz2
}
outputs.file file("$tempDir/$finalName")
}
task getTop100kWikiWordFiles(type: Download) {
src "https://home.apache.org/~rmuir/wikipedia/top.100k.words.de.en.fr.uk.wikipedia.2009-11.tar.bz2"
dest file("$tempDir/${src.file.split('/').last()}")
overwrite false
compress false
def finalPath = file("$workDir/top100k-out")
doLast {
project.sync {
from tarTree(dest) // defined above. Will decompress on the fly
into finalPath
}
}
outputs.dir finalPath
}
task getReuters(type: Download) {
// note: there is no HTTPS url and we don't care because this is merely test/perf data
src "http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz"
dest file("$tempDir/${src.file.split('/').last()}")
overwrite false
compress false
def untarPath = file("$workDir/reuters")
def finalPath = file("$workDir/reuters-out")
dependsOn sourceSets.main.runtimeClasspath
doLast {
project.sync {
from(tarTree(dest)) { // defined above. Will decompress on the fly
exclude '*.txt'
}
into untarPath
}
println "Extracting reuters to $finalPath"
finalPath.deleteDir() // necessary
// TODO consider porting ExtractReuters to groovy?
project.javaexec {
main = 'org.apache.lucene.benchmark.utils.ExtractReuters'
classpath = sourceSets.main.runtimeClasspath
maxHeapSize = '1G'
args = [untarPath, finalPath]
}
}
outputs.dir finalPath
}

View File

@ -387,10 +387,17 @@ public abstract class LuceneTestCase extends Assert {
public static final boolean TEST_ASSERTS_ENABLED = systemPropertyAsBoolean("tests.asserts", true);
/** TODO: javadoc? */
/**
* The default (embedded resource) lines file.
*
* @see #TEST_LINE_DOCS_FILE
*/
public static final String DEFAULT_LINE_DOCS_FILE = "europarl.lines.txt.gz";
/** TODO: javadoc? */
/**
* Random sample from enwiki used in tests. See {@code help/tests.txt}. gradle task downloading
* this data set: {@code gradlew getEnWikiRandomLines}.
*/
public static final String JENKINS_LARGE_LINE_DOCS_FILE = "enwiki.random.lines.txt";
/** Gets the codec to run tests with. */
@ -407,7 +414,7 @@ public abstract class LuceneTestCase extends Assert {
/** Gets the directory to run tests with */
public static final String TEST_DIRECTORY = System.getProperty("tests.directory", "random");
/** the line file used by LineFileDocs */
/** The line file used in tests (by {@link LineFileDocs}). */
public static final String TEST_LINE_DOCS_FILE =
System.getProperty("tests.linedocsfile", DEFAULT_LINE_DOCS_FILE);

View File

@ -151,8 +151,10 @@ public final class RunListenerPrintReproduceInfo extends RunListener {
}
if (TEST_LINE_DOCS_FILE.endsWith(JENKINS_LARGE_LINE_DOCS_FILE)) {
System.err.println(
"NOTE: download the large Jenkins line-docs file by running "
+ "'ant get-jenkins-line-docs' in the lucene directory.");
"NOTE: large line-docs file was used in this run. You have to download "
+ "it manually ('gradlew getEnWikiRandomLines') and use -P"
+ TEST_LINE_DOCS_FILE
+ "=... property to point to it.");
}
final StringBuilder b = new StringBuilder();