lucene/gradle/datasets/external-datasets.gradle

import org.apache.lucene.gradle.datasets.ExtractReuters
import java.nio.file.Files

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
buildscript {
  repositories {
    mavenCentral()
  }

  dependencies {
    classpath "com.github.luben:zstd-jni:1.5.5-11"
  }
}
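
// Decompress a Zstandard (.zst) file into a plain file, using the zstd-jni library
// pulled onto the buildscript classpath above.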
def unzstd(java.nio.file.Path src, java.nio.file.Path dst) {
  try (InputStream is = new com.github.luben.zstd.ZstdInputStream(new BufferedInputStream(Files.newInputStream(src)));
       OutputStream os = new BufferedOutputStream(Files.newOutputStream(dst))) {
    is.transferTo(os)
  }
}
// TODO: not sure whether this should live in benchmarks, but for now let it be.
configure(project(":lucene:benchmark")) {
  apply plugin: "java"
  apply plugin: "de.undercouch.download"

  ext {
    dataDir = file("work")
  }
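
  // Each task below downloads a compressed data set into an "intermediate" file under
  // dataDir and then decompresses it into "dst"; 'overwrite false' lets repeated runs
  // skip downloads that are already present.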

  task getEnWiki(type: Download) {
    ext {
      name = "enwiki-20070527-pages-articles.xml"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }
    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  task getEnWikiRandomLines(type: Download) {
    ext {
      name = "enwiki.random.lines.txt"
      src = "https://home.apache.org/~mikemccand/${name}.zst"
      intermediate = file("${dataDir}/${name}.zst")
      dst = file("${dataDir}/${name}")
    }
    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      unzstd(ext.intermediate.toPath(), ext.dst.toPath())
    }
  }

  task getGeoNames(type: Download) {
    // note: latest data is at: https://download.geonames.org/export/dump/allCountries.zip
    //       and then randomize with: gsort -R -S 1500M file.txt > file_random.txt
    //       and then compress with: bzip2 -9 -k file_random.txt
    ext {
      name = "geonames_20130921_randomOrder_allCountries.txt"
      src = "https://home.apache.org/~dsmiley/data/${name}.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }
    outputs.file ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      ant.bunzip2(src: ext.intermediate, dest: ext.dst)
    }
  }

  task getTop100kWikiWordFiles(type: Download) {
    ext {
      name = "top.100k.words.de.en.fr.uk.wikipedia.2009-11"
      src = "https://home.apache.org/~rmuir/wikipedia/${name}.tar.bz2"
      intermediate = file("${dataDir}/${name}.bz2")
      dst = file("${dataDir}/${name}")
    }
    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        from tarTree(ext.intermediate) // defined above. Will decompress on the fly
        into ext.dst
      }
    }
  }

  task getReuters(type: Download) {
    ext {
      name = "reuters21578"
      src = "https://kdd.ics.uci.edu/databases/${name}/${name}.tar.gz"
      intermediate = file("${dataDir}/${name}.tar.gz")
      dst = file("${dataDir}/reuters-out")
    }
    outputs.dir ext.dst

    src ext.src
    dest ext.intermediate
    overwrite false
    compress false

    doLast {
      def untarPath = file("$temporaryDir/reuters-untar")

      logger.lifecycle("Decompressing ${ext.name}...")
      project.sync {
        from(tarTree(intermediate)) {
          exclude '*.txt'
        }
        into untarPath
      }

      logger.lifecycle("Extracting ${ext.name} into ${ext.dst}...")
      ext.dst.deleteDir()
      ExtractReuters.main(untarPath.toString(), ext.dst.toString())
    }
  }

  task downloadDatasets() {
    group "Data set download"
    description "Download all data sets."
  }

  [
    getEnWiki,
    getGeoNames,
    getTop100kWikiWordFiles,
    getReuters,
    getEnWikiRandomLines
  ].each { task ->
    task.group "Data set download"
    task.description "Download the ${task.ext.name} data set."
    downloadDatasets.dependsOn(task)

    task.doFirst {
      logger.lifecycle("Downloading data set ${task.ext.name} from ${task.ext.src} to ${task.ext.dst}...")
    }
  }
}
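
// Typical invocation from the repository root, for example (exact wrapper command depends
// on your platform; task names are those defined above):
//
//   ./gradlew :lucene:benchmark:downloadDatasets   # fetch all data sets into lucene/benchmark/work
//   ./gradlew :lucene:benchmark:getEnWiki          # fetch a single data set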