Implement source code regeneration for test-framework perl scripts (#11952)

Dawid Weiss 2022-11-19 23:40:45 +01:00 committed by GitHub
parent e78210b7f0
commit 3f6410b738
12 changed files with 122 additions and 48 deletions

View File

@@ -156,6 +156,7 @@ apply from: file('gradle/generation/icu.gradle')
apply from: file('gradle/generation/javacc.gradle')
apply from: file('gradle/generation/forUtil.gradle')
apply from: file('gradle/generation/antlr.gradle')
+ apply from: file('gradle/generation/unicode-test-classes.gradle')
apply from: file('gradle/datasets/external-datasets.gradle')

View File

@@ -92,6 +92,7 @@ configure([
project(":lucene:core"),
project(":lucene:queryparser"),
project(":lucene:expressions"),
project(":lucene:test-framework"),
]) {
task regenerate() {
description "Rerun any code or static data generation tasks."

View File

@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Regenerates test classes from unicode data.
configure(project(":lucene:test-framework")) {
task generateEmojiTokenizationTestInternal() {
def unicodeVersion = "12.1"
def genDir = file("src/java/org/apache/lucene/tests/analysis/standard")
def genScript = file("${genDir}/generateEmojiTokenizationTest.pl")
def genOutput = file("${genDir}/EmojiTokenizationTestUnicode_${unicodeVersion.replace('.', '_')}.java")
description "Regenerate ${genOutput}"
group "generation"
inputs.file genScript
inputs.property "unicodeVersion", unicodeVersion
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("perl")
args = [
genScript,
"-v", unicodeVersion
]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiTokenizationTestInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
task generateJavaUnicodeWordBreakTestInternal() {
def unicodeVersion = "12.1"
def genDir = file("src/java/org/apache/lucene/tests/analysis/standard")
def genScript = file("${genDir}/generateJavaUnicodeWordBreakTest.pl")
def genOutput = file("${genDir}/WordBreakTestUnicode_${unicodeVersion.replace('.', '_')}.java")
description "Regenerate ${genOutput}"
group "generation"
inputs.file genScript
inputs.property "unicodeVersion", unicodeVersion
outputs.file genOutput
doLast {
quietExec {
workingDir genDir
executable project.externalTool("perl")
args = [
genScript,
"-v", unicodeVersion
]
}
}
}
regenerate.dependsOn wrapWithPersistentChecksums(generateJavaUnicodeWordBreakTestInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
}
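Usage sketch (not spelled out in this commit; the project/task path is inferred from the regenerate umbrella task wired up above):

  gradlew :lucene:test-framework:regenerate

This would rerun both Perl generators for the test-framework project. The wrapWithPersistentChecksums(...) wrapper presumably skips the work when the recorded script/output checksums and the unicodeVersion property (see the *.json files added below) are unchanged.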

View File

@@ -113,6 +113,8 @@ allprojects {
"code: ${result.exitValue}, " +
"output at: ${outputFile} (and logged above).")
}
+ } else {
+   logger.info(new String(outputFile.bytes))
}
return result

View File

@@ -678,8 +678,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
- WordBreakTestUnicode_12_1_0 wordBreakTest = new WordBreakTestUnicode_12_1_0();
- wordBreakTest.test(a);
+ WordBreakTestUnicode_12_1_0.test(a);
}
public void testSupplementary() throws Exception {
@@ -852,8 +851,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
public void testUnicodeEmojiTests() throws Exception {
- EmojiTokenizationTestUnicode_12_1 emojiTest = new EmojiTokenizationTestUnicode_12_1();
- emojiTest.test(a);
+ EmojiTokenizationTestUnicode_12_1.test(a);
}
/** blast some random strings through the analyzer */

View File

@@ -463,8 +463,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
- WordBreakTestUnicode_12_1_0 wordBreakTest = new WordBreakTestUnicode_12_1_0();
- wordBreakTest.test(a);
+ WordBreakTestUnicode_12_1_0.test(a);
}
public void testSupplementary() throws Exception {
@@ -637,8 +636,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testUnicodeEmojiTests() throws Exception {
- EmojiTokenizationTestUnicode_12_1 emojiTest = new EmojiTokenizationTestUnicode_12_1();
- emojiTest.test(a);
+ EmojiTokenizationTestUnicode_12_1.test(a);
}
/** blast some random strings through the analyzer */

View File

@@ -0,0 +1,5 @@
{
"lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java": "22e03ada47168b0986220c57260cfaf8e6e12e16",
"lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl": "a21d8aea5d2c30fb47b2bf9b24e20ddf605de46d",
"property:unicodeVersion": "12.1"
}

View File

@@ -0,0 +1,5 @@
{
"lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1.java": "--",
"lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl": "1dd7148f4514976503a2be2e00be75c20ce784fb",
"property:unicodeVersion": "12.1"
}

View File

@@ -19,30 +19,27 @@ package org.apache.lucene.tests.analysis.standard;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
- import org.junit.Ignore;
/**
- * This class was automatically generated by generateEmojiTokenizationTest.pl from:
- * http://www.unicode.org/Public/emoji/12.1/emoji-test.txt
- *
- * <p>emoji-test.txt contains emoji char sequences, which are represented as tokenization tests in
- * this class.
+ * This class was automatically generated by generateEmojiTokenizationTest.pl. from: <a
+ * href="http://www.unicode.org/Public/emoji/12.1/emoji-test.txt"><code>
+ * http://www.unicode.org/Public/emoji/12.1/emoji-test.txt</code></a>
*/
- @Ignore
- public class EmojiTokenizationTestUnicode_12_1 extends BaseTokenStreamTestCase {
+ public final class EmojiTokenizationTestUnicode_12_1 {
- public void test(Analyzer analyzer) throws Exception {
- for (int i = 0; i < tests.length; i += 2) {
- String test = tests[i + 1];
+ public static void test(Analyzer analyzer) throws Exception {
+ for (int i = 0; i < TESTS.length; i += 2) {
+ String test = TESTS[i + 1];
try {
- assertAnalyzesTo(analyzer, test, new String[] {test}, new String[] {"<EMOJI>"});
+ BaseTokenStreamTestCase.assertAnalyzesTo(
+     analyzer, test, new String[] {test}, new String[] {"<EMOJI>"});
} catch (Throwable t) {
- throw new Exception("Failed to tokenize \"" + tests[i] + "\":", t);
+ throw new Exception("Failed to tokenize \"" + TESTS[i] + "\":", t);
}
}
}
- private String[] tests =
+ private static String[] TESTS =
new String[] {
"1F600 ; fully-qualified # 😀 E2.0 grinning face",
"\uD83D\uDE00",

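For illustration, here is a minimal caller of the regenerated helper, mirroring the TestStandardAnalyzer and TestUAX29URLEmailTokenizer changes above. This is a hypothetical sketch, not code from the commit: the class name is made up, and the choice of StandardAnalyzer assumes its tokenizer emits <EMOJI> token types, which is what the generated assertions check.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.tests.analysis.standard.EmojiTokenizationTestUnicode_12_1;

// Hypothetical test class: the generated class is now a plain static utility,
// so callers bring their own test scaffolding and Analyzer.
public class TestEmojiTokenizationSketch extends BaseTokenStreamTestCase {
  public void testEmojiSequences() throws Exception {
    // StandardAnalyzer is only an example; any analyzer whose tokenizer
    // produces <EMOJI> token types should satisfy the generated assertions.
    try (Analyzer analyzer = new StandardAnalyzer()) {
      EmojiTokenizationTestUnicode_12_1.test(analyzer);
    }
  }
}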
View File

@@ -17,9 +17,9 @@
package org.apache.lucene.tests.analysis.standard;
+ import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
- import org.junit.Ignore;
/**
* This class was automatically generated by generateJavaUnicodeWordBreakTest.pl from:
@@ -38,10 +38,9 @@ import org.junit.Ignore;
* Hebrew_Letter} \p{WordBreak = Katakana} \p{WordBreak = Numeric} \p{Extended_Pictographic} (From
* http://www.unicode.org/Public/emoji/12.1/emoji-data.txt)
*/
- @Ignore
- public class WordBreakTestUnicode_12_1_0 extends BaseTokenStreamTestCase {
+ public final class WordBreakTestUnicode_12_1_0 {
- public void test(Analyzer analyzer) throws Exception {
+ public static void test(Analyzer analyzer) throws Exception {
// ÷ 0001 ÷ 0001 ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [999.0] <START OF HEADING> (Other) ÷
// [0.3]
assertAnalyzesTo(analyzer, "\u0001\u0001", new String[] {});

View File

@@ -56,32 +56,26 @@ my $header =<<"__HEADER__";
package org.apache.lucene.tests.analysis.standard;
import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.BaseTokenStreamTestCase;
- import org.junit.Ignore;
+ import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
/**
- * This class was automatically generated by ${script_name}
- * from: ${url}
- *
- * emoji-test.txt contains emoji char sequences, which are represented as
- * tokenization tests in this class.
- *
+ * This class was automatically generated by ${script_name}.
+ * from: <a href="${url}"><code>${url}</code></a>
*/
- \@Ignore
- public class ${class_name} extends BaseTokenStreamTestCase {
+ public final class ${class_name} {
- public void test(Analyzer analyzer) throws Exception {
- for (int i = 0 ; i < tests.length ; i += 2) {
- String test = tests[i + 1];
+ public static void test(Analyzer analyzer) throws Exception {
+ for (int i = 0 ; i < TESTS.length ; i += 2) {
+ String test = TESTS[i + 1];
try {
- assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
+ BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
} catch (Throwable t) {
- throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);
+ throw new Exception("Failed to tokenize \\"" + TESTS[i] + "\\":", t);
}
}
}
- private String[] tests = new String[] {
+ private static String[] TESTS = new String[] {
__HEADER__
my @tests = split /\r?\n/, get_URL_content($url);

View File

@@ -62,8 +62,7 @@ my $header =<<"__HEADER__";
package org.apache.lucene.tests.analysis.standard;
import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.BaseTokenStreamTestCase;
- import org.junit.Ignore;
+ import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
/**
* This class was automatically generated by ${script_name}
@@ -85,10 +84,9 @@ import org.junit.Ignore;
* \\p{WordBreak = Numeric}
* \\p{Extended_Pictographic} (From $emoji_url)
*/
- \@Ignore
- public class ${class_name} extends BaseTokenStreamTestCase {
+ public final class ${class_name} {
- public void test(Analyzer analyzer) throws Exception {
+ public static void test(Analyzer analyzer) throws Exception {
__HEADER__
my $codepoints = [];