LUCENE-9220: regenerate all stemmers/stopwords/test data from snowball 2.0 (#1262)
Previous situation:
* The snowball base classes (Among, SnowballProgram, etc) had accumulated local performance-related changes. There was a task that would also "patch" generated classes (e.g. GermanStemmer) after-the-fact.
* Snowball classes had many "non-changes" from the original, such as removal of tabs, addition of javadocs, license headers, etc.
* Snowball test data (inputs and expected stems) was incorporated into lucene testing, but this was maintained manually. Also files had become large, making the test too slow (Nightly).
* Snowball stopwords lists from their website were manually maintained. In some cases encoding fixes were manually applied.
* Some generated stemmers (such as Estonian and Armenian) exist in lucene, but have no corresponding `.sbl` file in snowball sources at all.
Besides this mess, the snowball project is "moving along": acquiring new languages, adding non-BSD-licensed test data, huge test data, and other complexity. So it is time to automate the integration better.
New situation:
* Lucene has a `gradle snowball` regeneration task. It works on Linux or Mac only. It checks out their repos, applies the `snowball.patch` in our repository, compiles snowball stemmers, regenerates all java code, applies any adjustments so that our build is happy.
* Test data is automatically regenerated from the commit hash of the snowball test data repository. Not all languages are tested from their data: only where the license is simple BSD. Test data is also (deterministically) sampled, so that we don't have huge files. We just want to make sure our integration works.
* Randomized tests are still set to test every language with generated fake words. The regeneration task ensures all languages get tested (it writes a simple text file list of them).
* Stopword files are automatically regenerated from the commit hash of the snowball website repository.
* The regeneration procedure is idempotent. This way when stuff does change, you know exactly what happened. For example if test data changes to a different license, you may see a git deletion. Or if a new language/stopwords/test data gets added, you will see git additions.
2020-02-17 12:38:01 -05:00
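The deterministic sampling mentioned above can be sketched as follows. This is a standalone illustration, not part of the script; the file names are made up. `shuf` is seeded from a reproducible byte stream (an AES-256-CTR keystream with a fixed key), so repeated runs select the same rows:

```shell
# Feed shuf a reproducible "random" source: an AES-256-CTR keystream
# derived from a fixed key, so the same input always yields the same sample.
seed="200001010000"
keystream() {
  openssl enc -aes-256-ctr -k "${seed}" -nosalt -iv 0 -md md5 < /dev/zero 2>/dev/null
}
printf '%s\n' one two three four five six seven eight > words.txt
shuf -n 3 --random-source=<(keystream) words.txt > sample1.txt
shuf -n 3 --random-source=<(keystream) words.txt > sample2.txt
cmp -s sample1.txt sample2.txt && echo "deterministic"
```

Because the keystream depends only on the fixed key, the sampled subset changes only when the input data itself changes.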
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# remove this script when problems are fixed
SRCDIR=$1
WWWSRCDIR=$2
TESTSRCDIR=$3
PROJECTDIR=$4
DESTDIR="${PROJECTDIR}/src/java/org/tartarus/snowball"
WWWDSTDIR="${PROJECTDIR}/src/resources/org/apache/lucene/analysis/snowball"
TESTDSTDIR="${PROJECTDIR}/src/test/org/apache/lucene/analysis/snowball"

trap 'echo "usage: ./snowball.sh <snowball> <snowball-website> <snowball-data> <analysis-common>" && exit 2' ERR
test $# -eq 4

trap 'echo "*** BUILD FAILED ***" $BASH_SOURCE:$LINENO: error: "$BASH_COMMAND" returned $?' ERR
set -eEuo pipefail

# reformats file indentation to kill the crazy space/tabs mix.
# prevents early blindness!
function reformat_java() {
  # convert tabs to 8 spaces, then reduce indent from 4 spaces to 2 spaces
  target=$1
  tmpfile=$(mktemp)
  cat ${target} | perl -p -e 's/\t/        /g' | perl -p -e 's/    /  /g' > ${tmpfile}
  mv ${tmpfile} ${target}
}

# generate stuff with the existing makefile; plain 'make' will try to do crazy stuff with e.g. python
# and likely fail, so only ask for our specific target.
(cd ${SRCDIR} && chmod a+x libstemmer/mkalgorithms.pl && make dist_libstemmer_java)

for file in "SnowballStemmer.java" "Among.java" "SnowballProgram.java"; do
  # add license header to files since they have none, otherwise rat will flip the fuck out
  echo "/*" > ${DESTDIR}/${file}
  cat ${SRCDIR}/COPYING >> ${DESTDIR}/${file}
  echo "*/" >> ${DESTDIR}/${file}
  cat ${SRCDIR}/java/org/tartarus/snowball/${file} >> ${DESTDIR}/${file}
  reformat_java ${DESTDIR}/${file}
done

rm ${DESTDIR}/ext/*Stemmer.java
rm -f ${TESTDSTDIR}/languages.txt
for file in ${SRCDIR}/java/org/tartarus/snowball/ext/*.java; do
  # title-case the classes (fooStemmer -> FooStemmer) so they obey normal java conventions
  base=$(basename $file)
  oldclazz="${base%.*}"
  # one-off
  if [ "${oldclazz}" == "kraaij_pohlmannStemmer" ]; then
    newclazz="KpStemmer"
  else
    newclazz=${oldclazz^}
  fi
  echo ${newclazz} | sed -e 's/Stemmer//' >> ${TESTDSTDIR}/languages.txt
  cat $file | sed "s/${oldclazz}/${newclazz}/g" > ${DESTDIR}/ext/${newclazz}.java
  reformat_java ${DESTDIR}/ext/${newclazz}.java
done

# regenerate test data
rm -f ${TESTDSTDIR}/test_languages.txt
rm -f ${TESTDSTDIR}/*.zip
for file in ${TESTSRCDIR}/*; do
  # look for input (voc.txt) and expected output (output.txt) without any special licenses (COPYING)
  if [ -f "${file}/voc.txt" ] && [ -f "${file}/output.txt" ] && [ ! -f "${file}/COPYING" ]; then
    language=$(basename ${file})
    if [ "${language}" == "kraaij_pohlmann" ]; then
      language="kp"
    fi
    # make the .zip reproducible if data hasn't changed.
    arbitrary_timestamp="200001010000"
    # some test files are huge, randomly sample up to this amount
    row_limit="2000"
    tmpdir=$(mktemp -d)
    myrandom="openssl enc -aes-256-ctr -k ${arbitrary_timestamp} -nosalt -iv 0 -md md5"
    for data in "voc.txt" "output.txt"; do
      shuf -n ${row_limit} --random-source=<(${myrandom} < /dev/zero 2>/dev/null) ${file}/${data} > ${tmpdir}/${data} \
        && touch -t ${arbitrary_timestamp} ${tmpdir}/${data}
    done
    # explicitly set permissions in case someone has a crazy umask (otherwise zip will differ)
    chmod 644 ${tmpdir}/voc.txt ${tmpdir}/output.txt
    zip --quiet --junk-paths -X -9 ${TESTDSTDIR}/${language}.zip ${tmpdir}/voc.txt ${tmpdir}/output.txt
    echo "${language}" >> ${TESTDSTDIR}/test_languages.txt
    rm -r ${tmpdir}
  fi
done

# regenerate stopwords data
rm -f ${WWWDSTDIR}/*_stop.txt
for file in ${WWWSRCDIR}/algorithms/*/stop.txt; do
  language=$(basename $(dirname ${file}))
  cat > ${WWWDSTDIR}/${language}_stop.txt << EOF
 | From https://snowballstem.org/algorithms/${language}/stop.txt
 | This file is distributed under the BSD License.
 | See https://snowballstem.org/license.html
 | Also see https://opensource.org/licenses/bsd-license.html
 | - Encoding was converted to UTF-8.
 | - This notice was added.
 |
 | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
EOF
  case "$language" in
    danish)
      # clear up some slight mojibake on the website. TODO: fix this file!
      cat $file | sed 's/Ã¥/å/g' | sed 's/Ã¦/æ/g' >> ${WWWDSTDIR}/${language}_stop.txt
      ;;
    *)
      # try to confirm it's really UTF-8
      iconv -f UTF-8 -t UTF-8 $file >> ${WWWDSTDIR}/${language}_stop.txt
      ;;
  esac
done