diff --git a/gradle/validation/validate-source-patterns.gradle b/gradle/validation/validate-source-patterns.gradle index b1121ccbb50..908169f61af 100644 --- a/gradle/validation/validate-source-patterns.gradle +++ b/gradle/validation/validate-source-patterns.gradle @@ -15,6 +15,10 @@ * limitations under the License. */ +import java.nio.charset.CharacterCodingException; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; + import org.apache.rat.Defaults import org.apache.rat.document.impl.FileDocument import org.apache.rat.api.MetaData @@ -144,8 +148,8 @@ class ValidateSourcePatternsTask extends DefaultTask { (~$/\$$Id\b/$) : 'svn keyword', (~$/\$$Header\b/$) : 'svn keyword', (~$/\$$Source\b/$) : 'svn keyword', - (~$/^\uFEFF/$) : 'UTF-8 byte order mark', - (~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary' + (~$/[\u200B\uFEFF]/$) : 'UTF-8 byte order mark or other zero-width codepoints', + (~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary', ] // Python and others merrily use var declarations, this is a problem _only_ in Java at least for 8x where we're forbidding var declarations @@ -198,15 +202,29 @@ class ValidateSourcePatternsTask extends DefaultTask { ProgressLogger progress = progressLoggerFactory.newOperation(this.class) progress.start(this.name, this.name) + def validatingDecoder = StandardCharsets.UTF_8.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT) + sourceFiles.each { f -> try { progress.progress("Scanning ${f.name}") logger.debug('Scanning source file: {}', f); - def text = f.getText('UTF-8'); + String text + try { + validatingDecoder.reset() + text = f.withInputStream { + in -> new InputStreamReader(in, validatingDecoder).getText() + } + } catch (CharacterCodingException e) { + reportViolation(f, "incorrect UTF-8 encoding [${e}]") + return // we can't proceed for this file + } + invalidPatterns.each { pattern, name -> - if (pattern.matcher(text).find()) { - reportViolation(f, name); + def matcher = pattern.matcher(text); + if (matcher.find()) { + reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end())); } } def javadocsMatcher = javadocsPattern.matcher(text); @@ -230,9 +248,10 @@ class ValidateSourcePatternsTask extends DefaultTask { } checkLicenseHeaderPrecedes(f, 'package', packagePattern, javaCommentPattern, text, ratDocument); - invalidJavaOnlyPatterns.each { pattern,name -> - if (pattern.matcher(text).find()) { - reportViolation(f, name); + invalidJavaOnlyPatterns.each { pattern, name -> + def matcher = pattern.matcher(text); + if (matcher.find()) { + reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end())); } } } diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index b7b370132b7..77c084e8f0a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -198,6 +198,12 @@ Bug Fixes * GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to fail with certain random seeds. (Greg Miller) +Build +--------------------- + +* GITHUB#12931, GITHUB#12936, GITHUB#12937: Improve source file validation to detect incorrect + UTF-8 sequences and forbid U+200B; enable errorprone DisableUnicodeInCode check. (Robert Muir, Uwe Schindler) + Other --------------------- diff --git a/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json b/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json index 105bea85c3a..b7843e73bc2 100644 --- a/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json +++ b/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json @@ -2,5 +2,5 @@ "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e", "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f", "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "971a80479af08c54fd2fab9a75bb2321904cb3ef", - "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf" + "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "8d7cd1a935443deda6cad73b91f1a45c1c714535" } \ No newline at end of file diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex index ecc7a8ddaa8..e3a24e33432 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex @@ -334,7 +334,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost}) // // In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"): // - // WB3c′ ZWJ × ​(Extended_Pictographic | EmojiNRK) + // WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK) // {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} ) | {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx} diff --git a/lucene/core/src/generated/checksums/generateStandardTokenizer.json b/lucene/core/src/generated/checksums/generateStandardTokenizer.json index 181172e86c5..a443c47a3f6 100644 --- a/lucene/core/src/generated/checksums/generateStandardTokenizer.json +++ b/lucene/core/src/generated/checksums/generateStandardTokenizer.json @@ -1,5 +1,5 @@ { "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e", "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe", - "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99" + "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "2db77654bcef8add1523d9e6260a3f72a3c58ed1" } \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex index dfc5ddd517f..712295ace96 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex +++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex @@ -200,7 +200,7 @@ ComplexContextEx = \p{LB:Complex_Context} // // In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"): // -// WB3c′ ZWJ × ​(Extended_Pictographic | EmojiNRK) +// WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK) // {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} ) | {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx} diff --git a/lucene/queryparser/docs/xml/cctree.js b/lucene/queryparser/docs/xml/cctree.js index bf080b0c0a2..589c88b6209 100644 --- a/lucene/queryparser/docs/xml/cctree.js +++ b/lucene/queryparser/docs/xml/cctree.js @@ -1,16 +1,16 @@ /* This code is based on the one originally provided by - Geir Landrö in his dTree 2.05 package. You can get it + Geir Landrö in his dTree 2.05 package. You can get it at : www.destroydrop.com/javascript/tree/. Therefore, the DTDDoc team considers that this code is - Copyright (c) 2002-2003 Geir Landrö. Since the original + Copyright (c) 2002-2003 Geir Landrö. Since the original author didn't clearly forbids copies of this part, we assume we're not doing anything wrong in porviding it to you, in a modified or non-modified form. */ /* - Geir Landrö : Orignal version, for dTree. + Geir Landrö : Orignal version, for dTree. Michael Koehrsen (10/2004) : Original modification to allow DTDDoc to use this. diff --git a/lucene/queryparser/docs/xml/dtreeStyle.css b/lucene/queryparser/docs/xml/dtreeStyle.css index f21c66e4258..1b1235906c9 100644 --- a/lucene/queryparser/docs/xml/dtreeStyle.css +++ b/lucene/queryparser/docs/xml/dtreeStyle.css @@ -1,16 +1,16 @@ /* This CSS is based on the one originally provided by - Geir Landrö in his dTree 2.05 package. You can get it + Geir Landrö in his dTree 2.05 package. You can get it at : www.destroydrop.com/javascript/tree/. Therefore, the DTDDoc team considers that this code is - Copyright (c) 2002-2003 Geir Landrö. Since the original + Copyright (c) 2002-2003 Geir Landrö. Since the original author didn't clearly forbids copies of this part, we assume we're not doing anything wrong in porviding it to you, in a modified or non-modified form. */ /* - Geir Landrö : Orignal version, for dTree. + Geir Landrö : Orignal version, for dTree. Stefan Champailler (10/2004) : Style changes here and there.