mirror of https://github.com/apache/lucene.git
Prevent the common zero-width code points and detect invalid UTF-8 encoding in our sources and selected resource files (#12937)
* Simple patch to prevent the common zero-width code points in our source and some types of resource files
* Validate correct UTF-8 input and fix buggy CSS file (ISO-8859-x encoded)
* Add a bit of context
* Add CHANGES.txt
This commit is contained in:
parent
6c5dcc1795
commit
16d0b822b3
|
@ -15,6 +15,10 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.nio.charset.CodingErrorAction;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.rat.Defaults
|
||||
import org.apache.rat.document.impl.FileDocument
|
||||
import org.apache.rat.api.MetaData
|
||||
|
@ -144,8 +148,8 @@ class ValidateSourcePatternsTask extends DefaultTask {
|
|||
(~$/\$$Id\b/$) : 'svn keyword',
|
||||
(~$/\$$Header\b/$) : 'svn keyword',
|
||||
(~$/\$$Source\b/$) : 'svn keyword',
|
||||
(~$/^\uFEFF/$) : 'UTF-8 byte order mark',
|
||||
(~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary'
|
||||
(~$/[\u200B\uFEFF]/$) : 'UTF-8 byte order mark or other zero-width codepoints',
|
||||
(~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary',
|
||||
]
|
||||
|
||||
// Python and others merrily use var declarations, this is a problem _only_ in Java at least for 8x where we're forbidding var declarations
|
||||
|
@ -198,15 +202,29 @@ class ValidateSourcePatternsTask extends DefaultTask {
|
|||
ProgressLogger progress = progressLoggerFactory.newOperation(this.class)
|
||||
progress.start(this.name, this.name)
|
||||
|
||||
def validatingDecoder = StandardCharsets.UTF_8.newDecoder()
|
||||
.onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT)
|
||||
|
||||
sourceFiles.each { f ->
|
||||
try {
|
||||
progress.progress("Scanning ${f.name}")
|
||||
logger.debug('Scanning source file: {}', f);
|
||||
|
||||
def text = f.getText('UTF-8');
|
||||
String text
|
||||
try {
|
||||
validatingDecoder.reset()
|
||||
text = f.withInputStream {
|
||||
in -> new InputStreamReader(in, validatingDecoder).getText()
|
||||
}
|
||||
} catch (CharacterCodingException e) {
|
||||
reportViolation(f, "incorrect UTF-8 encoding [${e}]")
|
||||
return // we can't proceed for this file
|
||||
}
|
||||
|
||||
invalidPatterns.each { pattern, name ->
|
||||
if (pattern.matcher(text).find()) {
|
||||
reportViolation(f, name);
|
||||
def matcher = pattern.matcher(text);
|
||||
if (matcher.find()) {
|
||||
reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
|
||||
}
|
||||
}
|
||||
def javadocsMatcher = javadocsPattern.matcher(text);
|
||||
|
@ -230,9 +248,10 @@ class ValidateSourcePatternsTask extends DefaultTask {
|
|||
}
|
||||
checkLicenseHeaderPrecedes(f, 'package', packagePattern, javaCommentPattern, text, ratDocument);
|
||||
|
||||
invalidJavaOnlyPatterns.each { pattern,name ->
|
||||
if (pattern.matcher(text).find()) {
|
||||
reportViolation(f, name);
|
||||
invalidJavaOnlyPatterns.each { pattern, name ->
|
||||
def matcher = pattern.matcher(text);
|
||||
if (matcher.find()) {
|
||||
reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -198,6 +198,12 @@ Bug Fixes
|
|||
* GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to
|
||||
fail with certain random seeds. (Greg Miller)
|
||||
|
||||
Build
|
||||
---------------------
|
||||
|
||||
* GITHUB#12931, GITHUB#12936, GITHUB#12937: Improve source file validation to detect incorrect
|
||||
UTF-8 sequences and forbid U+200B; enable errorprone DisableUnicodeInCode check. (Robert Muir, Uwe Schindler)
|
||||
|
||||
Other
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -2,5 +2,5 @@
|
|||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "971a80479af08c54fd2fab9a75bb2321904cb3ef",
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
|
||||
"lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "8d7cd1a935443deda6cad73b91f1a45c1c714535"
|
||||
}
|
|
@ -334,7 +334,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
|||
//
|
||||
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
|
||||
//
|
||||
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
|
||||
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
|
||||
//
|
||||
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
|
||||
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
|
||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
|
||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
|
||||
"lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "2db77654bcef8add1523d9e6260a3f72a3c58ed1"
|
||||
}
|
|
@ -200,7 +200,7 @@ ComplexContextEx = \p{LB:Complex_Context}
|
|||
//
|
||||
// In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
|
||||
//
|
||||
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
|
||||
// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
|
||||
//
|
||||
{EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} )
|
||||
| {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx}
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
/* This code is based on the one originally provided by
|
||||
Geir Landrö in his dTree 2.05 package. You can get it
|
||||
Geir Landrö in his dTree 2.05 package. You can get it
|
||||
at : www.destroydrop.com/javascript/tree/.
|
||||
|
||||
Therefore, the DTDDoc team considers that this code is
|
||||
Copyright (c) 2002-2003 Geir Landrö. Since the original
|
||||
Copyright (c) 2002-2003 Geir Landrö. Since the original
|
||||
author didn't clearly forbid copies of this part, we
|
||||
assume we're not doing anything wrong in providing it
|
||||
to you, in a modified or non-modified form.
|
||||
*/
|
||||
|
||||
/*
|
||||
Geir Landrö : Original version, for dTree.
|
||||
Geir Landrö : Original version, for dTree.
|
||||
|
||||
Michael Koehrsen (10/2004) : Original modification to
|
||||
allow DTDDoc to use this.
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
/* This CSS is based on the one originally provided by
|
||||
Geir Landrö in his dTree 2.05 package. You can get it
|
||||
Geir Landrö in his dTree 2.05 package. You can get it
|
||||
at : www.destroydrop.com/javascript/tree/.
|
||||
|
||||
Therefore, the DTDDoc team considers that this code is
|
||||
Copyright (c) 2002-2003 Geir Landrö. Since the original
|
||||
Copyright (c) 2002-2003 Geir Landrö. Since the original
|
||||
author didn't clearly forbid copies of this part, we
|
||||
assume we're not doing anything wrong in providing it
|
||||
to you, in a modified or non-modified form.
|
||||
*/
|
||||
|
||||
/*
|
||||
Geir Landrö : Original version, for dTree.
|
||||
Geir Landrö : Original version, for dTree.
|
||||
|
||||
Stefan Champailler (10/2004) : Style changes here and
|
||||
there.
|
||||
|
|
Loading…
Reference in New Issue