lucene/gradle/generation/snowball/snowball.sh

#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Regenerates the snowball sources, the language list and the stopword files
# of the target project from two local snowball checkouts.
#
# Arguments:
#   $1 - snowball checkout (its Makefile, COPYING and java/ tree are used)
#   $2 - snowball website checkout (its algorithms/*/stop.txt files are used)
#   $3 - project directory that receives the generated files
#
# Exit status: 2 on wrong argument count; non-zero on any failing command.
#
# remove this script when problems are fixed

# Validate the argument count up front, before $1..$3 are read, and report
# misuse on stderr.
if [[ $# -ne 3 ]]; then
  echo "usage: ./snowball.sh <snowball> <snowball-website> <analysis-common>" >&2
  exit 2
fi

SRCDIR=$1
WWWSRCDIR=$2
PROJECTDIR=$3
DESTDIR="${PROJECTDIR}/src/java/org/tartarus/snowball"
WWWDSTDIR="${PROJECTDIR}/src/resources/org/apache/lucene/analysis/snowball"
TESTDSTDIR="${PROJECTDIR}/src/test/org/apache/lucene/analysis/snowball"

# Report the failing command and its location on stderr. The trap body is
# single-quoted so that $LINENO, $BASH_COMMAND and $? are expanded when the
# trap fires, not when it is installed.
trap 'echo "*** BUILD FAILED *** ${BASH_SOURCE}:${LINENO}: error: ${BASH_COMMAND} returned $?" >&2' ERR
# -e: stop at the first failing command; -E: let functions and subshells
# inherit the ERR trap; -u: unset variables are errors; pipefail: a pipeline
# fails if any stage fails.
set -eEuo pipefail

# Generate the Java stemmers with the existing makefile. A bare 'make' would
# also try to build other things (e.g. python) and likely fail, so only the
# target we need is requested. mkalgorithms.pl is made executable first.
# The subshell keeps the directory change from leaking into the rest of the
# script; SRCDIR is quoted so paths containing spaces work.
(
  cd "${SRCDIR}" \
    && chmod a+x libstemmer/mkalgorithms.pl \
    && make dist_libstemmer_java
)

# Copy the snowball base classes into DESTDIR, prefixed with the contents of
# snowball's COPYING file wrapped in a /* ... */ comment. The upstream files
# have no license header, which the rat check would otherwise reject.
for file in "SnowballStemmer.java" "Among.java" "SnowballProgram.java"; do
  # One redirect for the whole group: the target is truncated once and then
  # receives header and source in order.
  {
    echo "/*"
    cat "${SRCDIR}/COPYING"
    echo "*/"
    cat "${SRCDIR}/java/org/tartarus/snowball/${file}"
  } > "${DESTDIR}/${file}"
done

# Remove the previously generated stemmers and the language list before
# regenerating them. -f keeps a re-run working when a prior run already
# deleted the stemmers (a plain rm would abort the script under set -e).
rm -f -- "${DESTDIR}/ext/"*Stemmer.java
rm -f -- "${TESTDSTDIR}/languages.txt"

# Copy every generated stemmer into DESTDIR/ext under a title-cased class name
# and append the language name (class name minus "Stemmer") to languages.txt.
for file in "${SRCDIR}/java/org/tartarus/snowball/ext/"*.java; do
  # An unmatched glob is handed to the loop literally; fail with a clear
  # message instead of processing a bogus "*" class.
  if [[ ! -e "${file}" ]]; then
    echo "no generated stemmers found in ${SRCDIR}/java/org/tartarus/snowball/ext" >&2
    exit 1
  fi

  # title-case the classes (fooStemmer -> FooStemmer) so they obey normal java conventions
  base=${file##*/}
  oldclazz=${base%.*}
  # one-off
  if [[ "${oldclazz}" == "kraaij_pohlmannStemmer" ]]; then
    newclazz="KpStemmer"
  else
    # ${var^} upper-cases the first character (bash 4+ expansion)
    newclazz=${oldclazz^}
  fi

  # drop the first "Stemmer" from the class name to get the language entry
  printf '%s\n' "${newclazz/Stemmer/}" >> "${TESTDSTDIR}/languages.txt"
  # rename every occurrence of the old class name inside the source as well
  sed "s/${oldclazz}/${newclazz}/g" "${file}" > "${DESTDIR}/ext/${newclazz}.java"
done

# regenerate stopwords data: one <language>_stop.txt per algorithms/<language>/stop.txt
rm -f -- "${WWWDSTDIR}/"*_stop.txt
for file in "${WWWSRCDIR}/algorithms/"*/stop.txt; do
  # An unmatched glob is handed to the loop literally; fail with a clear
  # message instead of creating a file named "*_stop.txt".
  if [[ ! -e "${file}" ]]; then
    echo "no stop.txt files found under ${WWWSRCDIR}/algorithms" >&2
    exit 1
  fi

  # the language is the name of the directory that contains stop.txt
  language=$(basename "$(dirname "${file}")")
  target="${WWWDSTDIR}/${language}_stop.txt"

  # Write the notice header. The heredoc delimiter is unquoted on purpose so
  # ${language} is expanded; the body must stay unindented because its leading
  # " | " is part of the emitted text.
  cat > "${target}" << EOF
 | From https://snowballstem.org/algorithms/${language}/stop.txt
 | This file is distributed under the BSD License.
 | See https://snowballstem.org/license.html
 | Also see https://opensource.org/licenses/bsd-license.html
 |  - Encoding was converted to UTF-8.
 |  - This notice was added.
 |
 | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
EOF

  case "${language}" in
    danish)
      # clear up some slight mojibake on the website. TODO: fix this file!
      # (both substitutions run in a single sed, in the original order)
      sed -e 's/Ã¥/å/g' -e 's/Ã¦/æ/g' "${file}" >> "${target}"
      ;;
    *)
      # try to confirm it's really UTF-8: iconv fails on invalid input, which
      # aborts the script
      iconv -f UTF-8 -t UTF-8 "${file}" >> "${target}"
      ;;
  esac
done
LUCENE-9220: regenerate all stemmers/stopwords/test data from snowball 2.0 (#1262) Previous situation: * The snowball base classes (Among, SnowballProgram, etc) had accumulated local performance-related changes. There was a task that would also "patch" generated classes (e.g. GermanStemmer) after-the-fact. * Snowball classes had many "non-changes" from the original such as removal of tabs addition of javadocs, license headers, etc. * Snowball test data (inputs and expected stems) was incorporated into lucene testing, but this was maintained manually. Also files had become large, making the test too slow (Nightly). * Snowball stopwords lists from their website were manually maintained. In some cases encoding fixes were manually applied. * Some generated stemmers (such as Estonian and Armenian) exist in lucene, but have no corresponding `.sbl` file in snowball sources at all. Besides this mess, snowball project is "moving along" and acquiring new languages, adding non-BSD-licensed test data, huge test data, and other complexity. So it is time to automate the integration better. New situation: * Lucene has a `gradle snowball` regeneration task. It works on Linux or Mac only. It checks out their repos, applies the `snowball.patch` in our repository, compiles snowball stemmers, regenerates all java code, applies any adjustments so that our build is happy. * Tests data is automatically regenerated from the commit hash of the snowball test data repository. Not all languages are tested from their data: only where the license is simple BSD. Test data is also (deterministically) sampled, so that we don't have huge files. We just want to make sure our integration works. * Randomized tests are still set to test every language with generated fake words. The regeneration task ensures all languages get tested (it writes a simple text file list of them). * Stopword files are automatically regenerated from the commit hash of the snowball website repository. 
* The regeneration procedure is idempotent. This way when stuff does change, you know exactly what happened. For example if test data changes to a different license, you may see a git deletion. Or if a new language/stopwords/test data gets added, you will see git additions. 2020-02-17 12:38:01 -05:00			`#!/usr/bin/env bash`

			`# Licensed to the Apache Software Foundation (ASF) under one or more`
			`# contributor license agreements. See the NOTICE file distributed with`
			`# this work for additional information regarding copyright ownership.`
			`# The ASF licenses this file to You under the Apache License, Version 2.0`
			`# (the "License"); you may not use this file except in compliance with`
			`# the License. You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`

			`# remove this script when problems are fixed`
			`SRCDIR=$1`
			`WWWSRCDIR=$2`
upgrade snowball to 34f3612e5e8c (round two) (#13227) * upgrade snowball to 34f3612e5e8c (round two) * disable forbidden-apis on snowball code (thanks @uschindler) 2024-03-27 17:51:48 -04:00			`PROJECTDIR=$3`
LUCENE-9220: regenerate all stemmers/stopwords/test data from snowball 2.0 (#1262) Previous situation: * The snowball base classes (Among, SnowballProgram, etc) had accumulated local performance-related changes. There was a task that would also "patch" generated classes (e.g. GermanStemmer) after-the-fact. * Snowball classes had many "non-changes" from the original such as removal of tabs addition of javadocs, license headers, etc. * Snowball test data (inputs and expected stems) was incorporated into lucene testing, but this was maintained manually. Also files had become large, making the test too slow (Nightly). * Snowball stopwords lists from their website were manually maintained. In some cases encoding fixes were manually applied. * Some generated stemmers (such as Estonian and Armenian) exist in lucene, but have no corresponding `.sbl` file in snowball sources at all. Besides this mess, snowball project is "moving along" and acquiring new languages, adding non-BSD-licensed test data, huge test data, and other complexity. So it is time to automate the integration better. New situation: * Lucene has a `gradle snowball` regeneration task. It works on Linux or Mac only. It checks out their repos, applies the `snowball.patch` in our repository, compiles snowball stemmers, regenerates all java code, applies any adjustments so that our build is happy. * Tests data is automatically regenerated from the commit hash of the snowball test data repository. Not all languages are tested from their data: only where the license is simple BSD. Test data is also (deterministically) sampled, so that we don't have huge files. We just want to make sure our integration works. * Randomized tests are still set to test every language with generated fake words. The regeneration task ensures all languages get tested (it writes a simple text file list of them). * Stopword files are automatically regenerated from the commit hash of the snowball website repository. 
* The regeneration procedure is idempotent. This way when stuff does change, you know exactly what happened. For example if test data changes to a different license, you may see a git deletion. Or if a new language/stopwords/test data gets added, you will see git additions. 2020-02-17 12:38:01 -05:00			`DESTDIR="${PROJECTDIR}/src/java/org/tartarus/snowball"`
			`WWWDSTDIR="${PROJECTDIR}/src/resources/org/apache/lucene/analysis/snowball"`
			`TESTDSTDIR="${PROJECTDIR}/src/test/org/apache/lucene/analysis/snowball"`

upgrade snowball to 34f3612e5e8c (round two) (#13227) * upgrade snowball to 34f3612e5e8c (round two) * disable forbidden-apis on snowball code (thanks @uschindler) 2024-03-27 17:51:48 -04:00			`trap 'echo "usage: ./snowball.sh <snowball> <snowball-website> <analysis-common>" && exit 2' ERR`
			`test $# -eq 3`
LUCENE-9220: regenerate all stemmers/stopwords/test data from snowball 2.0 (#1262) Previous situation: * The snowball base classes (Among, SnowballProgram, etc) had accumulated local performance-related changes. There was a task that would also "patch" generated classes (e.g. GermanStemmer) after-the-fact. * Snowball classes had many "non-changes" from the original such as removal of tabs addition of javadocs, license headers, etc. * Snowball test data (inputs and expected stems) was incorporated into lucene testing, but this was maintained manually. Also files had become large, making the test too slow (Nightly). * Snowball stopwords lists from their website were manually maintained. In some cases encoding fixes were manually applied. * Some generated stemmers (such as Estonian and Armenian) exist in lucene, but have no corresponding `.sbl` file in snowball sources at all. Besides this mess, snowball project is "moving along" and acquiring new languages, adding non-BSD-licensed test data, huge test data, and other complexity. So it is time to automate the integration better. New situation: * Lucene has a `gradle snowball` regeneration task. It works on Linux or Mac only. It checks out their repos, applies the `snowball.patch` in our repository, compiles snowball stemmers, regenerates all java code, applies any adjustments so that our build is happy. * Tests data is automatically regenerated from the commit hash of the snowball test data repository. Not all languages are tested from their data: only where the license is simple BSD. Test data is also (deterministically) sampled, so that we don't have huge files. We just want to make sure our integration works. * Randomized tests are still set to test every language with generated fake words. The regeneration task ensures all languages get tested (it writes a simple text file list of them). * Stopword files are automatically regenerated from the commit hash of the snowball website repository. 
* The regeneration procedure is idempotent. This way when stuff does change, you know exactly what happened. For example if test data changes to a different license, you may see a git deletion. Or if a new language/stopwords/test data gets added, you will see git additions. 2020-02-17 12:38:01 -05:00
			`trap 'echo "*** BUILD FAILED ***" $BASH_SOURCE:$LINENO: error: "$BASH_COMMAND" returned $?' ERR`
			`set -eEuo pipefail`

			`# generate stuff with existing makefile, just 'make' will try to do crazy stuff with e.g. python`
			`# and likely fail. so only ask for our specific target.`
			`(cd ${SRCDIR} && chmod a+x libstemmer/mkalgorithms.pl && make dist_libstemmer_java)`

			`for file in "SnowballStemmer.java" "Among.java" "SnowballProgram.java"; do`
			`# add license header to files since they have none, otherwise rat will flip the fuck out`
			`echo "/*" > ${DESTDIR}/${file}`
			`cat ${SRCDIR}/COPYING >> ${DESTDIR}/${file}`
			`echo "*/" >> ${DESTDIR}/${file}`
			`cat ${SRCDIR}/java/org/tartarus/snowball/${file} >> ${DESTDIR}/${file}`
			`done`

			`rm ${DESTDIR}/ext/*Stemmer.java`
			`rm -f ${TESTDSTDIR}/languages.txt`
			`for file in ${SRCDIR}/java/org/tartarus/snowball/ext/*.java; do`
			`# title-case the classes (fooStemmer -> FooStemmer) so they obey normal java conventions`
			`base=$(basename $file)`
			`oldclazz="${base%.*}"`
			`# one-off`
			`if [ "${oldclazz}" == "kraaij_pohlmannStemmer" ]; then`
			`newclazz="KpStemmer"`
			`else`
			`newclazz=${oldclazz^}`
			`fi`
			`echo ${newclazz} \| sed -e 's/Stemmer//' >> ${TESTDSTDIR}/languages.txt`
			`cat $file \| sed "s/${oldclazz}/${newclazz}/g" > ${DESTDIR}/ext/${newclazz}.java`
			`done`

			`# regenerate stopwords data`
			`rm -f ${WWWDSTDIR}/*_stop.txt`
			`for file in ${WWWSRCDIR}/algorithms/*/stop.txt; do`
			`language=$(basename $(dirname ${file}))`
			`cat > ${WWWDSTDIR}/${language}_stop.txt << EOF`
			`\| From https://snowballstem.org/algorithms/${language}/stop.txt`
			`\| This file is distributed under the BSD License.`
			`\| See https://snowballstem.org/license.html`
			`\| Also see https://opensource.org/licenses/bsd-license.html`
			`\| - Encoding was converted to UTF-8.`
			`\| - This notice was added.`
			`\|`
			`\| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"`
			`EOF`
			`case "$language" in`
			`danish)`
			`# clear up some slight mojibake on the website. TODO: fix this file!`
			`cat $file \| sed 's/Ã¥/å/g' \| sed 's/Ã¦/æ/g' >> ${WWWDSTDIR}/${language}_stop.txt`
			`;;`
			`*)`
			`# try to confirm its really UTF-8`
			`iconv -f UTF-8 -t UTF-8 $file >> ${WWWDSTDIR}/${language}_stop.txt`
			`;;`
			`esac`
			`done`