Prevent the common zero-width code points and detect invalid UTF-8 encoding in our sources and selected resource files (#12937)

* Simple patch to prevent the common zero-width code points in our source and some types of resource files
* Validate correct UTF-8 input and fix buggy CSS file (ISO-8859-x encoded)
* Add a bit of context
* Add CHANGES.txt
2025-02-06 10:08:58 +00:00 · 2023-12-13 17:27:05 +01:00 · 2023-12-13 17:27:05 +01:00 · 16d0b822b3
commit 16d0b822b3
parent 6c5dcc1795
8 changed files with 43 additions and 18 deletions
--- a/gradle/validation/validate-source-patterns.gradle
+++ b/gradle/validation/validate-source-patterns.gradle
@@ -15,6 +15,10 @@
 * limitations under the License.
 */

+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+
 import org.apache.rat.Defaults
 import org.apache.rat.document.impl.FileDocument
 import org.apache.rat.api.MetaData
@@ -144,8 +148,8 @@ class ValidateSourcePatternsTask extends DefaultTask {
      (~$/\$$Id\b/$) : 'svn keyword',
      (~$/\$$Header\b/$) : 'svn keyword',
      (~$/\$$Source\b/$) : 'svn keyword',
-      (~$/^\uFEFF/$) : 'UTF-8 byte order mark',
-      (~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary'
+      (~$/[\u200B\uFEFF]/$) : 'UTF-8 byte order mark or other zero-width codepoints',
+      (~$/import java\.lang\.\w+;/$) : 'java.lang import is unnecessary',
    ]

    // Python and others merrily use var declarations, this is a problem _only_ in Java at least for 8x where we're forbidding var declarations
@@ -198,15 +202,29 @@ class ValidateSourcePatternsTask extends DefaultTask {
    ProgressLogger progress = progressLoggerFactory.newOperation(this.class)
    progress.start(this.name, this.name)

+    def validatingDecoder = StandardCharsets.UTF_8.newDecoder()
+      .onMalformedInput(CodingErrorAction.REPORT).onUnmappableCharacter(CodingErrorAction.REPORT)
+
    sourceFiles.each { f ->
      try {
        progress.progress("Scanning ${f.name}")
        logger.debug('Scanning source file: {}', f);

-        def text = f.getText('UTF-8');
+        String text
+        try {
+          validatingDecoder.reset()
+          text = f.withInputStream {
+            in -> new InputStreamReader(in, validatingDecoder).getText()
+          }
+        } catch (CharacterCodingException e) {
+          reportViolation(f, "incorrect UTF-8 encoding [${e}]")
+          return // we can't proceed for this file
+        }
+
        invalidPatterns.each { pattern, name ->
-          if (pattern.matcher(text).find()) {
-            reportViolation(f, name);
+          def matcher = pattern.matcher(text);
+          if (matcher.find()) {
+            reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
          }
        }
        def javadocsMatcher = javadocsPattern.matcher(text);
@@ -230,9 +248,10 @@ class ValidateSourcePatternsTask extends DefaultTask {
          }
          checkLicenseHeaderPrecedes(f, 'package', packagePattern, javaCommentPattern, text, ratDocument);

-          invalidJavaOnlyPatterns.each { pattern,name ->
-            if (pattern.matcher(text).find()) {
-              reportViolation(f, name);
+          invalidJavaOnlyPatterns.each { pattern, name ->
+            def matcher = pattern.matcher(text);
+            if (matcher.find()) {
+              reportViolation(f, String.format(Locale.ROOT, '%s [start=%d, end=%d]', name, matcher.start(), matcher.end()));
            }
          }
        }
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -198,6 +198,12 @@ Bug Fixes
 * GITHUB#12920: Address bug in TestDrillSideways#testCollectionTerminated that could occasionally cause the test to
  fail with certain random seeds. (Greg Miller)

+Build
+---------------------
+
+* GITHUB#12931, GITHUB#12936, GITHUB#12937: Improve source file validation to detect incorrect
+  UTF-8 sequences and forbid U+200B; enable errorprone DisableUnicodeInCode check.  (Robert Muir, Uwe Schindler)
+
 Other
 ---------------------

--- a/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json
+++ b/lucene/analysis/common/src/generated/checksums/generateUAX29URLEmailTokenizer.json
@@ -2,5 +2,5 @@
    "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/ASCIITLD.jflex": "346eb94735de07f845269e8f042ed0a25f09bc7f",
    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.java": "971a80479af08c54fd2fab9a75bb2321904cb3ef",
-    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "00a4d8cccb5e8d05aba986d48ef1acedd88a15cf"
+    "lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex": "8d7cd1a935443deda6cad73b91f1a45c1c714535"
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/email/UAX29URLEmailTokenizerImpl.jflex
@@ -334,7 +334,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
  //
  //     In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
  //
-  //         WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
+  //         WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
  //
    {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} ) 
  | {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx} 
--- a/lucene/core/src/generated/checksums/generateStandardTokenizer.json
+++ b/lucene/core/src/generated/checksums/generateStandardTokenizer.json
@@ -1,5 +1,5 @@
 {
    "gradle/generation/jflex/skeleton.disable.buffer.expansion.txt": "6e43a3a64a9b5eb82ec5b4bc21f95aff5a2a061e",
    "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java": "de1fac09ea1550250db3a044b757617f8d62cffe",
-    "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "36abbde28c6b47e1f8e90351e839ec8b2da73b99"
+    "lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex": "2db77654bcef8add1523d9e6260a3f72a3c58ed1"
 }
--- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
+++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex
@@ -200,7 +200,7 @@ ComplexContextEx    = \p{LB:Complex_Context}
 //
 //     In particular, the above docs recommend a modified UAX#29 WB3c rule (covered by TR#51's "emoji_zwj_sequence"):
 //
-//         WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
+//         WB3c′ ZWJ × (Extended_Pictographic | EmojiNRK)
 //
  {EmojiCharOrPresSeqOrModSeq} ( ( \p{WB:ZWJ} {EmojiCharOrPresSeqOrModSeq} )* | {TagSpec}+ {TagTerm} ) 
 | {KeyCapBaseCharEx} {EmojiPresentationSelector}? {KeyCapEx} 
--- a/lucene/queryparser/docs/xml/cctree.js
+++ b/lucene/queryparser/docs/xml/cctree.js
@@ -1,16 +1,16 @@
 /* This code is based on the one originally provided by
-   Geir Landrö in his dTree 2.05 package. You can get it
+   Geir Landrö in his dTree 2.05 package. You can get it
   at : www.destroydrop.com/javascript/tree/.
   
   Therefore, the DTDDoc team considers that this code is 
-   Copyright (c) 2002-2003 Geir Landrö. Since the original
+   Copyright (c) 2002-2003 Geir Landrö. Since the original
   author didn't clearly forbids copies of this part, we
   assume we're not doing anything wrong in porviding it
   to you, in a modified or non-modified form.
 */

 /*   
-   Geir Landrö : Orignal version, for dTree.
+   Geir Landrö : Orignal version, for dTree.
   
   Michael Koehrsen (10/2004) : Original modification to
      allow DTDDoc to use this.
--- a/lucene/queryparser/docs/xml/dtreeStyle.css
+++ b/lucene/queryparser/docs/xml/dtreeStyle.css
@@ -1,16 +1,16 @@
 /* This CSS is based on the one originally provided by
-   Geir Landrö in his dTree 2.05 package. You can get it
+   Geir Landrö in his dTree 2.05 package. You can get it
   at : www.destroydrop.com/javascript/tree/.
   
   Therefore, the DTDDoc team considers that this code is 
-   Copyright (c) 2002-2003 Geir Landrö. Since the original
+   Copyright (c) 2002-2003 Geir Landrö. Since the original
   author didn't clearly forbids copies of this part, we
   assume we're not doing anything wrong in porviding it
   to you, in a modified or non-modified form.
 */

 /*   
-   Geir Landrö : Orignal version, for dTree.
+   Geir Landrö : Orignal version, for dTree.
   
   Stefan Champailler (10/2004) : Style changes here and
      there.