From 3f6410b738557f42292d7499e69f6df4456413ef Mon Sep 17 00:00:00 2001
From: Dawid Weiss
Date: Sat, 19 Nov 2022 23:40:45 +0100
Subject: [PATCH] Implement source code regeneration for test-framework perl scripts (#11952)

---
 build.gradle                                   |  1 +
 gradle/generation/regenerate.gradle            |  1 +
 gradle/generation/unicode-test-classes.gradle  | 76 +++++++++++++++++++
 gradle/globals.gradle                          |  2 +
 .../email/TestUAX29URLEmailTokenizer.java      |  6 +-
 .../standard/TestStandardAnalyzer.java         |  6 +-
 .../generateEmojiTokenizationTest.json         |  5 ++
 .../generateJavaUnicodeWordBreakTest.json      |  5 ++
 .../EmojiTokenizationTestUnicode_12_1.java     | 25 +++---
 .../standard/WordBreakTestUnicode_12_1_0.java  |  9 +--
 .../standard/generateEmojiTokenizationTest.pl  | 26 +++----
 .../generateJavaUnicodeWordBreakTest.pl        |  8 +-
 12 files changed, 122 insertions(+), 48 deletions(-)
 create mode 100644 gradle/generation/unicode-test-classes.gradle
 create mode 100644 lucene/test-framework/src/generated/checksums/generateEmojiTokenizationTest.json
 create mode 100644 lucene/test-framework/src/generated/checksums/generateJavaUnicodeWordBreakTest.json

diff --git a/build.gradle b/build.gradle
index eb2cc45f62c..24e8b580583 100644
--- a/build.gradle
+++ b/build.gradle
@@ -156,6 +156,7 @@ apply from: file('gradle/generation/icu.gradle')
 apply from: file('gradle/generation/javacc.gradle')
 apply from: file('gradle/generation/forUtil.gradle')
 apply from: file('gradle/generation/antlr.gradle')
+apply from: file('gradle/generation/unicode-test-classes.gradle')
 
 apply from: file('gradle/datasets/external-datasets.gradle')
 
diff --git a/gradle/generation/regenerate.gradle b/gradle/generation/regenerate.gradle
index f13cf9d9936..da438e5e14e 100644
--- a/gradle/generation/regenerate.gradle
+++ b/gradle/generation/regenerate.gradle
@@ -92,6 +92,7 @@ configure([
     project(":lucene:core"),
     project(":lucene:queryparser"),
     project(":lucene:expressions"),
+    project(":lucene:test-framework"),
 ]) {
   task regenerate() {
     description "Rerun any code or static data generation tasks."
diff --git a/gradle/generation/unicode-test-classes.gradle b/gradle/generation/unicode-test-classes.gradle
new file mode 100644
index 00000000000..cd2086edeb1
--- /dev/null
+++ b/gradle/generation/unicode-test-classes.gradle
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Regenerates test classes from unicode data.
+
+configure(project(":lucene:test-framework")) {
+  task generateEmojiTokenizationTestInternal() {
+    def unicodeVersion = "12.1"
+
+    def genDir = file("src/java/org/apache/lucene/tests/analysis/standard")
+    def genScript = file("${genDir}/generateEmojiTokenizationTest.pl")
+    def genOutput = file("${genDir}/EmojiTokenizationTestUnicode_${unicodeVersion.replace('.', '_')}.java")
+
+    description "Regenerate ${genOutput}"
+    group "generation"
+
+    inputs.file genScript
+    inputs.property "unicodeVersion", unicodeVersion
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("perl")
+        args = [
+          genScript,
+          "-v", unicodeVersion
+        ]
+      }
+    }
+  }
+  regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiTokenizationTestInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
+
+  task generateJavaUnicodeWordBreakTestInternal() {
+    def unicodeVersion = "12.1.0"
+
+    def genDir = file("src/java/org/apache/lucene/tests/analysis/standard")
+    def genScript = file("${genDir}/generateJavaUnicodeWordBreakTest.pl")
+    def genOutput = file("${genDir}/WordBreakTestUnicode_${unicodeVersion.replace('.', '_')}.java")
+
+    description "Regenerate ${genOutput}"
+    group "generation"
+
+    inputs.file genScript
+    inputs.property "unicodeVersion", unicodeVersion
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("perl")
+        args = [
+          genScript,
+          "-v", unicodeVersion
+        ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateJavaUnicodeWordBreakTestInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
+}
diff --git a/gradle/globals.gradle b/gradle/globals.gradle
index 60e5101437e..662b58d4205 100644
--- a/gradle/globals.gradle
+++ b/gradle/globals.gradle
@@ -113,6 +113,8 @@ allprojects {
           "code: ${result.exitValue}, " +
           "output at: ${outputFile} (and logged above).")
       }
+    } else {
+      logger.info(new String(outputFile.bytes))
     }
 
     return result
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TestUAX29URLEmailTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TestUAX29URLEmailTokenizer.java
index c72c9073c66..7d709e62e51 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TestUAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TestUAX29URLEmailTokenizer.java
@@ -678,8 +678,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeWordBreaks() throws Exception {
-    WordBreakTestUnicode_12_1_0 wordBreakTest = new WordBreakTestUnicode_12_1_0();
-    wordBreakTest.test(a);
+    WordBreakTestUnicode_12_1_0.test(a);
   }
 
   public void testSupplementary() throws Exception {
@@ -852,8 +851,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeEmojiTests() throws Exception {
-    EmojiTokenizationTestUnicode_12_1 emojiTest = new EmojiTokenizationTestUnicode_12_1();
-    emojiTest.test(a);
+    EmojiTokenizationTestUnicode_12_1.test(a);
   }
 
   /** blast some random strings through the analyzer */
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 655ba8bf9b0..8fbe0359698 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -463,8 +463,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeWordBreaks() throws Exception {
-    WordBreakTestUnicode_12_1_0 wordBreakTest = new WordBreakTestUnicode_12_1_0();
-    wordBreakTest.test(a);
+    WordBreakTestUnicode_12_1_0.test(a);
   }
 
   public void testSupplementary() throws Exception {
@@ -637,8 +636,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeEmojiTests() throws Exception {
-    EmojiTokenizationTestUnicode_12_1 emojiTest = new EmojiTokenizationTestUnicode_12_1();
-    emojiTest.test(a);
+    EmojiTokenizationTestUnicode_12_1.test(a);
   }
 
   /** blast some random strings through the analyzer */
diff --git a/lucene/test-framework/src/generated/checksums/generateEmojiTokenizationTest.json b/lucene/test-framework/src/generated/checksums/generateEmojiTokenizationTest.json
new file mode 100644
index 00000000000..77785ed9882
--- /dev/null
+++ b/lucene/test-framework/src/generated/checksums/generateEmojiTokenizationTest.json
@@ -0,0 +1,5 @@
+{
+  "lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java": "22e03ada47168b0986220c57260cfaf8e6e12e16",
+  "lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl": "a21d8aea5d2c30fb47b2bf9b24e20ddf605de46d",
+  "property:unicodeVersion": "12.1"
+}
\ No newline at end of file
diff --git a/lucene/test-framework/src/generated/checksums/generateJavaUnicodeWordBreakTest.json b/lucene/test-framework/src/generated/checksums/generateJavaUnicodeWordBreakTest.json
new file mode 100644
index 00000000000..72bfd5f8c4a
--- /dev/null
+++ b/lucene/test-framework/src/generated/checksums/generateJavaUnicodeWordBreakTest.json
@@ -0,0 +1,5 @@
+{
+  "lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java": "--",
+  "lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl": "1dd7148f4514976503a2be2e00be75c20ce784fb",
+  "property:unicodeVersion": "12.1.0"
+}
\ No newline at end of file
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java
index 3e291100914..0276063e6c7 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java
@@ -19,30 +19,27 @@ package org.apache.lucene.tests.analysis.standard;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
 
 /**
- * This class was automatically generated by generateEmojiTokenizationTest.pl from:
- * http://www.unicode.org/Public/emoji/12.1/emoji-test.txt
- *
- * <p>emoji-test.txt contains emoji char sequences, which are represented as tokenization tests in
- * this class.
+ * This class was automatically generated by generateEmojiTokenizationTest.pl. from:
+ * http://www.unicode.org/Public/emoji/12.1/emoji-test.txt
  */
-@Ignore
-public class EmojiTokenizationTestUnicode_12_1 extends BaseTokenStreamTestCase {
+public final class EmojiTokenizationTestUnicode_12_1 {
 
-  public void test(Analyzer analyzer) throws Exception {
-    for (int i = 0; i < tests.length; i += 2) {
-      String test = tests[i + 1];
+  public static void test(Analyzer analyzer) throws Exception {
+    for (int i = 0; i < TESTS.length; i += 2) {
+      String test = TESTS[i + 1];
       try {
-        assertAnalyzesTo(analyzer, test, new String[] {test}, new String[] {"<EMOJI>"});
+        BaseTokenStreamTestCase.assertAnalyzesTo(
+            analyzer, test, new String[] {test}, new String[] {"<EMOJI>"});
       } catch (Throwable t) {
-        throw new Exception("Failed to tokenize \"" + tests[i] + "\":", t);
+        throw new Exception("Failed to tokenize \"" + TESTS[i] + "\":", t);
       }
     }
   }
 
-  private String[] tests =
+  private static String[] TESTS =
       new String[] {
         "1F600 ; fully-qualified # 😀 E2.0 grinning face",
         "\uD83D\uDE00",
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java
index 238301c7f3c..c94c6737a12 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java
@@ -17,9 +17,9 @@
 package org.apache.lucene.tests.analysis.standard;
 
+import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
+
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
 
 /**
  * This class was automatically generated by generateJavaUnicodeWordBreakTest.pl from:
@@ -38,10 +38,9 @@ import org.junit.Ignore;
  * Hebrew_Letter} \p{WordBreak = Katakana} \p{WordBreak = Numeric} \p{Extended_Pictographic} (From
  * http://www.unicode.org/Public/emoji/12.1/emoji-data.txt)
  */
-@Ignore
-public class WordBreakTestUnicode_12_1_0 extends BaseTokenStreamTestCase {
+public final class WordBreakTestUnicode_12_1_0 {
 
-  public void test(Analyzer analyzer) throws Exception {
+  public static void test(Analyzer analyzer) throws Exception {
     // ÷ 0001 ÷ 0001 ÷  #  ÷ [0.2] <START OF HEADING> (Other) ÷ [999.0] <START OF HEADING> (Other) ÷
     // [0.3]
     assertAnalyzesTo(analyzer, "\u0001\u0001", new String[] {});
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl
index 95c504f2fcc..5a0d8e06c10 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl
@@ -56,32 +56,26 @@ my $header =<<"__HEADER__";
 package org.apache.lucene.tests.analysis.standard;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 
 /**
- * This class was automatically generated by ${script_name}
- * from: ${url}
- *
- * emoji-test.txt contains emoji char sequences, which are represented as
- * tokenization tests in this class.
- *
+ * This class was automatically generated by ${script_name}.
+ * from: ${url}
  */
-\@Ignore
-public class ${class_name} extends BaseTokenStreamTestCase {
+public final class ${class_name} {
 
-  public void test(Analyzer analyzer) throws Exception {
-    for (int i = 0 ; i < tests.length ; i += 2) {
-      String test = tests[i + 1];
+  public static void test(Analyzer analyzer) throws Exception {
+    for (int i = 0 ; i < TESTS.length ; i += 2) {
+      String test = TESTS[i + 1];
       try {
-        assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
+        BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
       } catch (Throwable t) {
-        throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);
+        throw new Exception("Failed to tokenize \\"" + TESTS[i] + "\\":", t);
      }
    }
  }
 
-  private String[] tests = new String[] {
+  private static String[] TESTS = new String[] {
 __HEADER__
 
 my @tests = split /\r?\n/, get_URL_content($url);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl
index 07f7afe86ff..16e59c1fc6e 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl
@@ -62,8 +62,7 @@ my $header =<<"__HEADER__";
 package org.apache.lucene.tests.analysis.standard;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
+import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
 
 /**
  * This class was automatically generated by ${script_name}
@@ -85,10 +84,9 @@ import org.junit.Ignore;
  * \\p{WordBreak = Numeric}
  * \\p{Extended_Pictographic} (From $emoji_url)
  */
-\@Ignore
-public class ${class_name} extends BaseTokenStreamTestCase {
+public final class ${class_name} {
 
-  public void test(Analyzer analyzer) throws Exception {
+  public static void test(Analyzer analyzer) throws Exception {
 
__HEADER__
 
my $codepoints = [];
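
A regeneration sketch for reference, not part of the patch itself: the per-module regenerate task and the perl invocation below are taken from the changes above, while the individual checksum-wrapped task names are an assumption (wrapWithPersistentChecksums() is expected to register each task under its *Internal name minus the suffix, as with the other generators in this build). Both perl scripts fetch their source data from unicode.org, so regeneration needs network access.

  # Re-run all generators registered for :lucene:test-framework (includes the two new tasks):
  ./gradlew :lucene:test-framework:regenerate

  # Assumed names of the individual checksum-wrapped tasks:
  ./gradlew :lucene:test-framework:generateEmojiTokenizationTest
  ./gradlew :lucene:test-framework:generateJavaUnicodeWordBreakTest

  # Equivalent direct invocation, mirroring the workingDir/args passed to quietExec above:
  cd lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard
  perl generateEmojiTokenizationTest.pl -v 12.1
  perl generateJavaUnicodeWordBreakTest.pl -v 12.1.0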