// lucene/gradle/datasets/external-datasets.gradle

import java.nio.file.Files

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Puts the zstd library on this script's own classpath; the unzstd helper in
// this file instantiates com.github.luben.zstd.ZstdInputStream directly.
buildscript {
  repositories { mavenCentral() }
  dependencies { classpath(deps.zstd) }
}

/**
 * Decompresses a zstd-compressed file into another file.
 *
 * @param src path of the zstd-compressed input file
 * @param dst path of the file that receives the decompressed bytes; it is
 *            opened with the default options of Files.newOutputStream
 */
def unzstd(java.nio.file.Path src, java.nio.file.Path dst) {
  // The raw file stream is declared as its own resource so that its handle is
  // released even when constructing the zstd wrapper around it fails.
  try (InputStream fileIn = Files.newInputStream(src);
       InputStream zstdIn = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(fileIn));
       OutputStream out = new BufferedOutputStream(Files.newOutputStream(dst))) {
    zstdIn.transferTo(out)
  }
}


// TODO: not sure whether this should live in benchmarks, but for now let it be.
configure(project(":lucene:benchmark")) {
  // Plugins needed by the dataset tasks below; the download plugin id is taken
  // from the shared `deps` catalog (presumably it supplies the Download task type).
  apply(plugin: "java")
  apply(plugin: deps.plugins.undercouch.download.get().pluginId)

  // Directory, resolved against this project, that holds downloaded archives
  // and the extracted data sets.
  ext.dataDir = file("work")

  // Downloads the bzip2-compressed enwiki dump into dataDir and unpacks it there.
  task getEnWiki(type: Download) {
    // These extra properties are also read by the shared dataset-task
    // configuration at the end of this file (name, src, dst).
    ext.name = "enwiki-20070527-pages-articles.xml"
    ext.src = "https://home.apache.org/~dsmiley/data/${ext.name}.bz2"
    ext.intermediate = file("${dataDir}/${ext.name}.bz2")
    ext.dst = file("${dataDir}/${ext.name}")

    // Download settings: fetch into the intermediate archive and never
    // overwrite an archive that is already on disk.
    src(ext.src)
    dest(ext.intermediate)
    compress(false)
    overwrite(false)

    // The unpacked file, not the archive, is the declared task output.
    outputs.file(ext.dst)

    doLast {
      // Runs after the download action: expand the archive next to it.
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(dest: ext.dst, src: ext.intermediate)
    }
  }

  // Downloads the zstd-compressed enwiki random-lines file and unpacks it into dataDir.
  task getEnWikiRandomLines(type: Download) {
    // These extra properties are also read by the shared dataset-task
    // configuration at the end of this file (name, src, dst).
    ext.name = "enwiki.random.lines.txt"
    ext.src = "https://home.apache.org/~mikemccand/${ext.name}.zst"
    ext.intermediate = file("${dataDir}/${ext.name}.zst")
    ext.dst = file("${dataDir}/${ext.name}")

    // Download settings: fetch into the intermediate archive and never
    // overwrite an archive that is already on disk.
    src(ext.src)
    dest(ext.intermediate)
    compress(false)
    overwrite(false)

    // The unpacked file, not the archive, is the declared task output.
    outputs.file(ext.dst)

    doLast {
      // Runs after the download action; ant is not used here, the script-level
      // unzstd helper does the decompression.
      logger.lifecycle("Decompressing ${ext.name}...")
      def archivePath = ext.intermediate.toPath()
      def targetPath = ext.dst.toPath()
      unzstd(archivePath, targetPath)
    }
  }

  // Downloads the bzip2-compressed, randomly ordered GeoNames dump and unpacks it into dataDir.
  //
  // How to refresh the data set:
  //   - latest data is at: https://download.geonames.org/export/dump/allCountries.zip
  //   - randomize with:    gsort -R -S 1500M file.txt > file_random.txt
  //   - compress with:     bzip2 -9 -k file_random.txt
  task getGeoNames(type: Download) {
    // These extra properties are also read by the shared dataset-task
    // configuration at the end of this file (name, src, dst).
    ext.name = "geonames_20130921_randomOrder_allCountries.txt"
    ext.src = "https://home.apache.org/~dsmiley/data/${ext.name}.bz2"
    ext.intermediate = file("${dataDir}/${ext.name}.bz2")
    ext.dst = file("${dataDir}/${ext.name}")

    // Download settings: fetch into the intermediate archive and never
    // overwrite an archive that is already on disk.
    src(ext.src)
    dest(ext.intermediate)
    compress(false)
    overwrite(false)

    // The unpacked file, not the archive, is the declared task output.
    outputs.file(ext.dst)

    doLast {
      // Runs after the download action: expand the archive next to it.
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(dest: ext.dst, src: ext.intermediate)
    }
  }

  // Downloads the top-100k Wikipedia word lists (a tar.bz2 archive) and
  // extracts its contents into a directory under dataDir.
  task getTop100kWikiWordFiles(type: Download) {
    // These extra properties are also read by the shared dataset-task
    // configuration at the end of this file (name, src, dst).
    ext.name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
    ext.src = "https://home.apache.org/~rmuir/wikipedia/${ext.name}.tar.bz2"
    // NOTE: the local copy is stored with a plain ".bz2" suffix although the
    // remote file is a ".tar.bz2".
    ext.intermediate = file("${dataDir}/${ext.name}.bz2")
    ext.dst = file("${dataDir}/${ext.name}")

    // Download settings: fetch into the intermediate archive and never
    // overwrite an archive that is already on disk.
    src(ext.src)
    dest(ext.intermediate)
    compress(false)
    overwrite(false)

    // The extraction directory, not the archive, is the declared task output.
    outputs.dir(ext.dst)

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        // tarTree reads the downloaded archive and decompresses it on the fly.
        into(ext.dst)
        from(tarTree(ext.intermediate))
      }
    }
  }

  // Downloads the Reuters-21578 tar.gz archive, untars it into the task's
  // temporary directory and then converts it into dataDir/reuters-out via
  // buildinfra.extractReuters.
  task getReuters(type: Download) {
    // These extra properties are also read by the shared dataset-task
    // configuration at the end of this file (name, src, dst).
    ext {
      name = "reuters21578"
      src = "https://kdd.ics.uci.edu/databases/${name}/${name}.tar.gz"
      intermediate = file("${dataDir}/${name}.tar.gz")
      dst = file("${dataDir}/reuters-out")
    }

    // The extraction directory, not the archive, is the declared task output.
    outputs.dir ext.dst

    // Download settings: fetch into the intermediate archive and never
    // overwrite an archive that is already on disk.
    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      // Scratch location for the raw archive contents.
      def untarPath = file("$temporaryDir/reuters-untar")

      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        // Read the archive through the task's extra property explicitly, the
        // same way the other dataset tasks do, instead of relying on an
        // implicit dynamic lookup of a bare `intermediate`.
        from(tarTree(ext.intermediate)) {
          // Top-level *.txt entries of the archive are not copied.
          exclude '*.txt'
        }
        into untarPath
      }

      logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
      // Start from an empty output directory before extracting.
      ext.dst.deleteDir()
      buildinfra.extractReuters(untarPath.toString(), ext.dst.toString())
    }
  }

  // Aggregate task; the individual dataset tasks are attached to it as
  // dependencies further down in this file.
  task downloadDatasets() {
    description = "Download all data sets."
    group = "Data set download"
  }

  // Shared setup for every dataset task: assign group and description, log the
  // download source/target before the task's own actions run, and hook the
  // task into the downloadDatasets aggregate.
  def datasetTasks = [
      getEnWiki,
      getGeoNames,
      getTop100kWikiWordFiles,
      getReuters,
      getEnWikiRandomLines
  ]
  datasetTasks.each { datasetTask ->
    datasetTask.group = "Data set download"
    datasetTask.description = "Download the ${datasetTask.ext.name} data set."

    // doFirst places the log line ahead of the download action.
    datasetTask.doFirst {
      logger.lifecycle("Downloading data set ${datasetTask.ext.name} from ${datasetTask.ext.src} to ${datasetTask.ext.dst}...")
    }

    downloadDatasets.dependsOn(datasetTask)
  }
}