// lucene/gradle/validation/rat-sources.gradle

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import groovy.xml.NamespaceBuilder
// Declare a root-level configuration that resolves the Apache RAT tool once;
// every per-project 'rat' task reads its classpath from here (see RatTask below).
configure(rootProject) {
  configurations {
    ratDeps
  }

  dependencies {
    // Version is centralized in scriptDepVersions rather than hard-coded here.
    ratDeps "org.apache.rat:apache-rat:${scriptDepVersions['apache-rat']}"
  }
}
// Register a 'rat' verification task on every project. Per-project include/exclude
// tweaks are applied in the configure(...) sections below.
allprojects {
  task("rat", type: RatTask) {
    group = 'Verification'
    description = 'Runs Apache Rat checks.'
  }
}
// Root project: scan build infrastructure sources in addition to the defaults.
configure(rootProject) {
  rat {
    includes += [
      "buildSrc/**/*.java",
      "gradle/**/*.gradle",
      "lucene/tools/forbiddenApis/**",
      "lucene/tools/prettify/**",
    ]

    excludes += [
      // Unclear if this needs ASF header, depends on how much was copied from ElasticSearch
      "**/ErrorReportingTestListener.java"
    ]
  }
}
// analysis-common: skip Hunspell dictionaries/test fixtures and HTML test data,
// which are data files that carry no license headers.
configure(project(":lucene:analysis:common")) {
  rat {
    srcExcludes += [
      "**/*.aff",
      "**/*.dic",
      "**/*.wrong",
      "**/*.good",
      "**/*.sug",
      "**/charfilter/*.htm*",
      "**/*LuceneResourcesWikiPage.html"
    ]
  }
}
// kuromoji: exclude Japanese test text that RAT may misclassify.
configure(project(":lucene:analysis:kuromoji")) {
  rat {
    srcExcludes += [
      // whether rat detects this as binary or not is platform dependent?!
      "**/bocchan.utf-8"
    ]
  }
}
// opennlp: test-model training data has no license headers.
configure(project(":lucene:analysis:opennlp")) {
  rat {
    excludes += [
      "src/tools/test-model-data/*.txt",
    ]
  }
}
// highlighter: exclude plain-text test corpus.
configure(project(":lucene:highlighter")) {
  rat {
    srcExcludes += [
      "**/CambridgeMA.utf8"
    ]
  }
}
// suggest: exclude word-list test fixtures.
configure(project(":lucene:suggest")) {
  rat {
    srcExcludes += [
      "**/Top50KWiki.utf8",
      "**/stop-snowball.txt"
    ]
  }
}
// solr-core: exclude HTML test fixture.
configure(project(":solr:core")) {
  rat {
    srcExcludes += [
      "**/htmlStripReaderTest.html"
    ]
  }
}
// solr-webapp: not a Java project, so widen the scan to the whole project tree
// and carve out images, IDE metadata and build outputs instead.
configure(project(":solr:webapp")) {
  rat {
    excludes += [
      "web/img/**",
      "*.iml",
      "build.gradle",
      "build/**",
    ]
    // Replace (not append to) the default include patterns: scan everything.
    includes = [ "**" ]
  }
}
// Structure inspired by existing task from Apache Kafka, heavily modified since then.
/**
 * Gradle task that runs Apache RAT (Release Audit Tool) license checks via its
 * Ant task, writes an XML report under build/rat, and fails the build if any
 * scanned file carries an unapproved (or no) license.
 */
class RatTask extends DefaultTask {
  /** Project-relative Ant patterns scanned in addition to source sets. */
  @Input
  List<String> includes = [
    "*.gradle",
    "*.xml",
    "src/tools/**"
  ]

  /** Patterns removed from the project-directory fileset above. */
  @Input
  List<String> excludes = []

  /** Patterns removed from the Java source-set filesets. */
  @Input
  List<String> srcExcludes = [
    "**/TODO",
    "**/*.txt",
    "**/*.md",
    "**/*.iml",
    "build/**"
  ]

  @OutputFile
  def xmlReport = new File(new File(project.buildDir, 'rat'), 'rat-report.xml')

  /** Runs the RAT Ant task over the configured filesets and writes xmlReport. */
  def generateXmlReport() {
    def uri = 'antlib:org.apache.rat.anttasks'
    // Resolve the RAT tool from the root project's shared 'ratDeps' configuration.
    def ratClasspath = project.rootProject.configurations.ratDeps.asPath
    ant.taskdef(resource: 'org/apache/rat/anttasks/antlib.xml', uri: uri, classpath: ratClasspath)

    def rat = NamespaceBuilder.newInstance(ant, uri)
    rat.report(format: 'xml', reportFile: xmlReport, addDefaultLicenseMatchers: true) {
      // Loose files in the project directory (build scripts, xml, tools).
      ant.fileset(dir: "${project.projectDir}") {
        includes.each { pattern -> ant.include(name: pattern) }
        excludes.each { pattern -> ant.exclude(name: pattern) }
      }

      // For Java projects, also scan main/test (and, when present, tools)
      // source roots, plus META-INF resources.
      if (project.plugins.findPlugin(JavaPlugin)) {
        def checkSets = [
          project.sourceSets.main.java.srcDirs,
          project.sourceSets.test.java.srcDirs,
        ]

        // 'tools' source set exists only in some projects; add it lazily.
        project.sourceSets.matching { it.name == 'tools' }.all {
          checkSets += project.sourceSets.tools.java.srcDirs
        }

        checkSets.flatten().each { srcLocation ->
          ant.fileset(dir: srcLocation, erroronmissingdir: false) {
            srcExcludes.each { pattern -> ant.exclude(name: pattern) }
          }
        }

        [
          project.sourceSets.main.resources.srcDirs
        ].flatten().each { srcLocation ->
          ant.fileset(dir: srcLocation, erroronmissingdir: false) {
            ant.include(name: "META-INF/**")
          }
        }
      }

      // The license rules below were manually copied from lucene/common-build.xml, there is currently no mechanism to sync them

      // BSD 4-clause stuff (is disallowed below)
      substringMatcher(licenseFamilyCategory: "BSD4 ", licenseFamilyName: "Original BSD License (with advertising clause)") {
        pattern(substring: "All advertising materials")
      }

      // BSD-like stuff
      substringMatcher(licenseFamilyCategory: "BSD ", licenseFamilyName: "Modified BSD License") {
        // brics automaton
        pattern(substring: "Copyright (c) 2001-2009 Anders Moeller")
        // snowball
        pattern(substring: "Copyright (c) 2001, Dr Martin Porter")
        // UMASS kstem
        pattern(substring: "THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS")
        // Egothor
        pattern(substring: "Egothor Software License version 1.00")
        // JaSpell
        pattern(substring: "Copyright (c) 2005 Bruno Martins")
        // d3.js
        pattern(substring: "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS")
        // highlight.js
        pattern(substring: "THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS")
      }

      // MIT-like
      // NOTE(review): familyName here is "Modified BSD License" although the
      // category is "MIT " and "The MIT License" is separately approved below.
      // Harmless (both families are approved) but looks like a copy/paste slip
      // from common-build.xml — confirm before "fixing".
      substringMatcher(licenseFamilyCategory: "MIT ", licenseFamilyName:"Modified BSD License") {
        // ICU license
        pattern(substring: "Permission is hereby granted, free of charge, to any person obtaining a copy")
      }

      // Apache
      substringMatcher(licenseFamilyCategory: "AL ", licenseFamilyName: "Apache") {
        pattern(substring: "Licensed to the Apache Software Foundation (ASF) under")
        // this is the old - school one under some files
        pattern(substring: 'Licensed under the Apache License, Version 2.0 (the "License")')
      }

      // Machine-generated files whose headers identify the generator.
      substringMatcher(licenseFamilyCategory: "GEN ", licenseFamilyName: "Generated") {
        // svg files generated by gnuplot
        pattern(substring: "Produced by GNUPLOT")
        // snowball stemmers generated by snowball compiler
        pattern(substring: "Generated by Snowball")
        // parsers generated by antlr
        pattern(substring: "ANTLR GENERATED CODE")
      }

      approvedLicense(familyName: "Apache")
      approvedLicense(familyName: "The MIT License")
      approvedLicense(familyName: "Modified BSD License")
      approvedLicense(familyName: "Generated")
    }
  }

  /** Parses the XML report and fails with a list of files whose license was not approved. */
  def printUnknownFiles() {
    def ratXml = new XmlParser().parse(xmlReport)
    def errors = []
    ratXml.resource.each { resource ->
      if (resource.'license-approval'.@name[0] == "false") {
        errors << "Unknown license: ${resource.@name}"
      }
    }
    if (errors) {
      throw new GradleException("Found " + errors.size() + " file(s) with errors:\n" +
          errors.collect{ msg -> "  - ${msg}" }.join("\n"))
    }
  }

  @TaskAction
  def rat() {
    // Guard against RAT (or Ant) mutating the JVM-wide default encoding,
    // which would silently affect everything else in this build.
    def origEncoding = System.getProperty("file.encoding")
    try {
      generateXmlReport()
      printUnknownFiles()
    } finally {
      if (System.getProperty("file.encoding") != origEncoding) {
        throw new GradleException("Insane: rat changed file.encoding to ${System.getProperty('file.encoding')}?")
      }
    }
  }
}