/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
2020-01-15 03:55:41 -05:00
|
|
|
import groovy.xml.NamespaceBuilder
|
|
|
|
|
|
|
|
// Declare a root-level configuration holding the Apache Rat tool and resolve
// its artifact; the version comes from the centrally-managed scriptDepVersions map.
configure(rootProject) {
  configurations {
    // Classpath used to load the Rat Ant tasks (see RatTask.generateXmlReport).
    ratDeps
  }

  dependencies {
    ratDeps "org.apache.rat:apache-rat:${scriptDepVersions['apache-rat']}"
  }
}
// Register a 'rat' task on every project so license checks can be run
// per-module (`gradlew :module:rat`) or across the whole build.
allprojects {
  task("rat", type: RatTask) {
    group = 'Verification'
    description = 'Runs Apache Rat checks.'
  }
}
// Root project: widen the default include set to cover build infrastructure
// sources that live outside any module's source sets.
configure(rootProject) {
  rat {
    includes += [
      "buildSrc/**/*.java",
      "gradle/**/*.gradle",
      "lucene/tools/forbiddenApis/**",
      "lucene/tools/prettify/**",
    ]
    excludes += [
      // Unclear if this needs ASF header, depends on how much was copied from ElasticSearch
      "**/ErrorReportingTestListener.java"
    ]
  }
}
// analysis:common ships third-party dictionary/test fixtures (Hunspell .aff/.dic
// and friends, HTML test inputs) that carry no ASF headers — skip them.
configure(project(":lucene:analysis:common")) {
  rat {
    srcExcludes += [
      "**/*.aff",
      "**/*.dic",
      "**/*.wrong",
      "**/*.good",
      "**/*.sug",
      "**/charfilter/*.htm*",
      "**/*LuceneResourcesWikiPage.html"
    ]
  }
}
// kuromoji: exclude a Japanese-text test fixture from source checks.
configure(project(":lucene:analysis:kuromoji")) {
  rat {
    srcExcludes += [
      // whether rat detects this as binary or not is platform dependent?!
      "**/bocchan.utf-8"
    ]
  }
}
// opennlp: exclude model training data (plain text, no license headers).
configure(project(":lucene:analysis:opennlp")) {
  rat {
    excludes += [
      "src/tools/test-model-data/*.txt",
    ]
  }
}
// highlighter: exclude a plain-text test corpus file.
configure(project(":lucene:highlighter")) {
  rat {
    srcExcludes += [
      "**/CambridgeMA.utf8"
    ]
  }
}
// suggest: exclude word-list test fixtures.
configure(project(":lucene:suggest")) {
  rat {
    srcExcludes += [
      "**/Top50KWiki.utf8",
      "**/stop-snowball.txt"
    ]
  }
}
// solr:core: exclude an HTML test fixture.
configure(project(":solr:core")) {
  rat {
    srcExcludes += [
      "**/htmlStripReaderTest.html"
    ]
  }
}
// solr:webapp: scan the whole module tree (it has no Java source sets), but
// note that `includes` is REPLACED here (=), not appended (+=), on purpose.
configure(project(":solr:webapp")) {
  rat {
    includes = [ "**" ]
    excludes += [
      "web/img/**",
      "*.iml",
      "build.gradle",
      "build/**",
    ]
  }
}
// Structure inspired by existing task from Apache Kafka, heavily modified since then.
/**
 * Runs the Apache Rat license-audit tool over a project's files and fails the
 * build if any file carries an unknown/unapproved license.
 *
 * The task generates an XML report via the Rat Ant tasks (loaded from the
 * root project's {@code ratDeps} configuration), then parses that report and
 * throws a {@link GradleException} listing every file whose license was not
 * approved.
 */
class RatTask extends DefaultTask {
  // Patterns checked relative to the project directory (in addition to source sets).
  @Input
  List<String> includes = [
    "*.gradle",
    "*.xml",
    "src/tools/**"
  ]

  // Patterns removed from the project-directory fileset above.
  @Input
  List<String> excludes = []

  // Patterns removed from the Java source-set filesets (main/test/tools).
  @Input
  List<String> srcExcludes = [
    "**/TODO",
    "**/*.txt",
    "**/*.md",
    "**/*.iml",
    "build/**"
  ]

  // Rat's raw XML output; parsed afterwards by printUnknownFiles().
  @OutputFile
  def xmlReport = new File(new File(project.buildDir, 'rat'), 'rat-report.xml')

  /** Runs the Rat Ant task and writes the XML report for this project. */
  def generateXmlReport() {
    def uri = 'antlib:org.apache.rat.anttasks'
    def ratClasspath = project.rootProject.configurations.ratDeps.asPath
    ant.taskdef(resource: 'org/apache/rat/anttasks/antlib.xml', uri: uri, classpath: ratClasspath)

    def rat = NamespaceBuilder.newInstance(ant, uri)
    rat.report(format: 'xml', reportFile: xmlReport, addDefaultLicenseMatchers: true) {
      // Ad-hoc files directly under the project directory.
      ant.fileset(dir: "${project.projectDir}") {
        includes.each { pattern -> ant.include(name: pattern) }
        excludes.each { pattern -> ant.exclude(name: pattern) }
      }

      // For Java projects, also scan the declared source sets.
      if (project.plugins.findPlugin(JavaPlugin)) {
        def checkSets = [
          project.sourceSets.main.java.srcDirs,
          project.sourceSets.test.java.srcDirs,
        ]

        // Some modules declare an extra 'tools' source set; include it when present.
        project.sourceSets.matching { it.name == 'tools' }.all {
          checkSets += project.sourceSets.tools.java.srcDirs
        }

        checkSets.flatten().each { srcLocation ->
          // erroronmissingdir: a source set may declare dirs that don't exist on disk.
          ant.fileset(dir: srcLocation, erroronmissingdir: false) {
            srcExcludes.each { pattern -> ant.exclude(name: pattern) }
          }
        }

        // Only check META-INF under resources (other resources are data files).
        [
          project.sourceSets.main.resources.srcDirs
        ].flatten().each { srcLocation ->
          ant.fileset(dir: srcLocation, erroronmissingdir: false) {
            ant.include(name: "META-INF/**")
          }
        }
      }

      // The license rules below were manually copied from lucene/common-build.xml, there is currently no mechanism to sync them

      // BSD 4-clause stuff (is disallowed below)
      substringMatcher(licenseFamilyCategory: "BSD4 ", licenseFamilyName: "Original BSD License (with advertising clause)") {
        pattern(substring: "All advertising materials")
      }

      // BSD-like stuff
      substringMatcher(licenseFamilyCategory: "BSD ", licenseFamilyName: "Modified BSD License") {
        // brics automaton
        pattern(substring: "Copyright (c) 2001-2009 Anders Moeller")
        // snowball
        pattern(substring: "Copyright (c) 2001, Dr Martin Porter")
        // UMASS kstem
        pattern(substring: "THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS")
        // Egothor
        pattern(substring: "Egothor Software License version 1.00")
        // JaSpell
        pattern(substring: "Copyright (c) 2005 Bruno Martins")
        // d3.js
        pattern(substring: "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS")
        // highlight.js
        pattern(substring: "THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS")
      }

      // MIT-like
      // NOTE(review): family name here is "Modified BSD License", not an MIT family —
      // looks intentional (it funnels into the approved BSD family below), but confirm.
      substringMatcher(licenseFamilyCategory: "MIT ", licenseFamilyName:"Modified BSD License") {
        // ICU license
        pattern(substring: "Permission is hereby granted, free of charge, to any person obtaining a copy")
      }

      // Apache
      substringMatcher(licenseFamilyCategory: "AL ", licenseFamilyName: "Apache") {
        pattern(substring: "Licensed to the Apache Software Foundation (ASF) under")
        // this is the old - school one under some files
        pattern(substring: 'Licensed under the Apache License, Version 2.0 (the "License")')
      }

      // Machine-generated files get a pass.
      substringMatcher(licenseFamilyCategory: "GEN ", licenseFamilyName: "Generated") {
        // svg files generated by gnuplot
        pattern(substring: "Produced by GNUPLOT")
        // snowball stemmers generated by snowball compiler
        pattern(substring: "Generated by Snowball")
        // parsers generated by antlr
        pattern(substring: "ANTLR GENERATED CODE")
      }

      approvedLicense(familyName: "Apache")
      approvedLicense(familyName: "The MIT License")
      approvedLicense(familyName: "Modified BSD License")
      approvedLicense(familyName: "Generated")
    }
  }

  /**
   * Parses the XML report and throws a GradleException listing every file
   * whose license approval is "false". No-op when everything is approved.
   */
  def printUnknownFiles() {
    def ratXml = new XmlParser().parse(xmlReport)
    def errors = []
    ratXml.resource.each { resource ->
      if (resource.'license-approval'.@name[0] == "false") {
        errors << "Unknown license: ${resource.@name}"
      }
    }
    if (errors) {
      throw new GradleException("Found " + errors.size() + " file(s) with errors:\n" +
          errors.collect{ msg -> " - ${msg}" }.join("\n"))
    }
  }

  /** Task entry point: generate the report, then fail on unknown licenses. */
  @TaskAction
  def rat() {
    // Guard against Rat (or Ant) mutating the JVM's default encoding as a side effect.
    def origEncoding = System.getProperty("file.encoding")
    try {
      generateXmlReport()
      printUnknownFiles()
    } finally {
      if (System.getProperty("file.encoding") != origEncoding) {
        throw new GradleException("Insane: rat changed file.encoding to ${System.getProperty('file.encoding')}?")
      }
    }
  }
}