Prevent the common zero-width code points and detect invalid UTF-8 encoding in our sources and selected resource files (#12937)

* Simple patch to prevent the common zero-width code points in our source and some types of resource files

* Validate correct UTF-8 input and fix buggy CSS file (ISO-8859-x encoded)

* add a bit of context

* Add CHANGES.txt
This commit is contained in:
Uwe Schindler 2023-12-13 17:27:05 +01:00 committed by GitHub
parent 6c5dcc1795
commit 16d0b822b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 43 additions and 18 deletions

View File

@ -15,6 +15,10 @@
* limitations under the License.
*/
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import org.apache.rat.Defaults
import org.apache.rat.document.impl.FileDocument
import org.apache.rat.api.MetaData
@ -144,8 +148,8 @@ class ValidateSourcePatternsTask extends DefaultTask {
(~$/\$$Id\b/$) : 'svn keyword',
(~$/\$$Header\b/$) : 'svn keyword',
(~$/\$$Source\b/$) : 'svn keyword',
(~$/^\uFEFF/$) : 'UTF-8 byte order mark',
(~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary'
(~$/[\u200B\uFEFF]/$) : 'UTF-8 byte order mark or other zero-width codepoints',
(~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary',
]
// Python and others merrily use var declarations; this is a problem _only_ in Java (at least for 8x), where we're forbidding var declarations
@ -198,15 +202,29 @@ class ValidateSourcePatternsTask extends DefaultTask {
ProgressLogger progress = progressLoggerFactory.newOperation(this.class)
progress.start(this.name, this.name)
def validatingDecoder = StandardCharsets.UTF_8.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT)
sourceFiles.each { f ->
try {
progress.progress("Scanning ${f.name}")
logger.debug('Scanning source file: {}', f);
def text = f.getText('UTF-8');
String text
try {
validatingDecoder.reset()
text = f.withInputStream {
in -> new InputStreamReader(in, validatingDecoder).getText()
}
} catch (CharacterCodingException e) {
reportViolation(f, "incorrect UTF-8 encoding [${e}]")
return // we can't proceed for this file
}
invalidPatterns.each { pattern, name ->
if (pattern.matcher(text).find()) {
reportViolation(f, name);
def matcher = pattern.matcher(text);
if (matcher.find()) {
reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
}
}
def javadocsMatcher = javadocsPattern.matcher(text);
@ -230,9 +248,10 @@ class ValidateSourcePatternsTask extends DefaultTask {
}
checkLicenseHeaderPrecedes(f, 'package', packagePattern, javaCommentPattern, text, ratDocument);
invalidJavaOnlyPatterns.each { pattern,name ->
if (pattern.matcher(text).find()) {
reportViolation(f, name);
invalidJavaOnlyPatterns.each { pattern, name ->
def matcher = pattern.matcher(text);
if (matcher.find()) {
reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
}
}
}

View File

@ -198,6 +198,12 @@ Bug Fixes
* GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to
fail with certain random seeds. (Greg Miller)
Build
---------------------
* GITHUB#12931, GITHUB#12936, GITHUB#12937: Improve source file validation to detect incorrect
UTF-8 sequences and forbid U+200B; enable errorprone DisableUnicodeInCode check. (Robert Muir, Uwe Schindler)
Other
---------------------

View File

@ -2,5 +2,5 @@
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "971a80479af08c54fd2fab9a75bb2321904cb3ef",
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "8d7cd1a935443deda6cad73b91f1a45c1c714535"
}

View File

@ -334,7 +334,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
//
// WB3c ZWJ × (Extended_Pictographic | EmojiNRK)
// WB3c ZWJ × (Extended_Pictographic | EmojiNRK)
//
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}

View File

@ -1,5 +1,5 @@
{
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "2db77654bcef8add1523d9e6260a3f72a3c58ed1"
}

View File

@ -200,7 +200,7 @@ ComplexContextEx = \p{LB:Complex_Context}
//
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
//
// WB3c ZWJ × (Extended_Pictographic | EmojiNRK)
// WB3c ZWJ × (Extended_Pictographic | EmojiNRK)
//
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}

View File

@ -1,16 +1,16 @@
/* This code is based on the one originally provided by
Geir Landrö in his dTree 2.05 package. You can get it
Geir Landrö in his dTree 2.05 package. You can get it
at : www.destroydrop.com/javascript/tree/.
Therefore, the DTDDoc team considers that this code is
Copyright (c) 2002-2003 Geir Landrö. Since the original
Copyright (c) 2002-2003 Geir Landrö. Since the original
author didn't clearly forbid copies of this part, we
assume we're not doing anything wrong in providing it
to you, in a modified or non-modified form.
*/
/*
Geir Landrö : Original version, for dTree.
Geir Landrö : Original version, for dTree.
Michael Koehrsen (10/2004) : Original modification to
allow DTDDoc to use this.

View File

@ -1,16 +1,16 @@
/* This CSS is based on the one originally provided by
Geir Landrö in his dTree 2.05 package. You can get it
Geir Landrö in his dTree 2.05 package. You can get it
at : www.destroydrop.com/javascript/tree/.
Therefore, the DTDDoc team considers that this code is
Copyright (c) 2002-2003 Geir Landrö. Since the original
Copyright (c) 2002-2003 Geir Landrö. Since the original
author didn't clearly forbid copies of this part, we
assume we're not doing anything wrong in providing it
to you, in a modified or non-modified form.
*/
/*
Geir Landrö : Original version, for dTree.
Geir Landrö : Original version, for dTree.
Stefan Champailler (10/2004) : Style changes here and
there.