From 3f6410b738557f42292d7499e69f6df4456413ef Mon Sep 17 00:00:00 2001
From: Dawid Weiss
Date: Sat, 19 Nov 2022 23:40:45 +0100
Subject: [PATCH] Implement source code regeneration for test-framework perl scripts (#11952)

---
 build.gradle                                   |  1 +
 gradle/generation/regenerate.gradle            |  1 +
 gradle/generation/unicode-test-classes.gradle  | 76 +++++++++++++++++++
 gradle/globals.gradle                          |  2 +
 .../email/TestUAX29URLEmailTokenizer.java      |  6 +-
 .../standard/TestStandardAnalyzer.java         |  6 +-
 .../generateEmojiTokenizationTest.json         |  5 ++
 .../generateJavaUnicodeWordBreakTest.json      |  5 ++
 .../EmojiTokenizationTestUnicode_12_1.java     | 25 +++---
 .../standard/WordBreakTestUnicode_12_1_0.java  |  9 +--
 .../standard/generateEmojiTokenizationTest.pl  | 26 +++----
 .../generateJavaUnicodeWordBreakTest.pl        |  8 +-
 12 files changed, 122 insertions(+), 48 deletions(-)
 create mode 100644 gradle/generation/unicode-test-classes.gradle
 create mode 100644 lucene/test-framework/src/generated/checksums/generateEmojiTokenizationTest.json
 create mode 100644 lucene/test-framework/src/generated/checksums/generateJavaUnicodeWordBreakTest.json

diff --git a/build.gradle b/build.gradle
index eb2cc45f62c..24e8b580583 100644
--- a/build.gradle
+++ b/build.gradle
@@ -156,6 +156,7 @@ apply from: file('gradle/generation/icu.gradle')
 apply from: file('gradle/generation/javacc.gradle')
 apply from: file('gradle/generation/forUtil.gradle')
 apply from: file('gradle/generation/antlr.gradle')
+apply from: file('gradle/generation/unicode-test-classes.gradle')
 
 apply from: file('gradle/datasets/external-datasets.gradle')
 
diff --git a/gradle/generation/regenerate.gradle b/gradle/generation/regenerate.gradle
index f13cf9d9936..da438e5e14e 100644
--- a/gradle/generation/regenerate.gradle
+++ b/gradle/generation/regenerate.gradle
@@ -92,6 +92,7 @@ configure([
     project(":lucene:core"),
     project(":lucene:queryparser"),
     project(":lucene:expressions"),
+    project(":lucene:test-framework"),
 ]) {
   task regenerate() {
     description "Rerun any code or static data generation tasks."
diff --git a/gradle/generation/unicode-test-classes.gradle b/gradle/generation/unicode-test-classes.gradle
new file mode 100644
index 00000000000..cd2086edeb1
--- /dev/null
+++ b/gradle/generation/unicode-test-classes.gradle
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Regenerates test classes from unicode data.
+
+configure(project(":lucene:test-framework")) {
+  task generateEmojiTokenizationTestInternal() {
+    def unicodeVersion = "12.1"
+
+    def genDir = file("src/java/org/apache/lucene/tests/analysis/standard")
+    def genScript = file("${genDir}/generateEmojiTokenizationTest.pl")
+    def genOutput = file("${genDir}/EmojiTokenizationTestUnicode_${unicodeVersion.replace('.', '_')}.java")
+
+    description "Regenerate ${genOutput}"
+    group "generation"
+
+    inputs.file genScript
+    inputs.property "unicodeVersion", unicodeVersion
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("perl")
+        args = [
+          genScript,
+          "-v", unicodeVersion
+        ]
+      }
+    }
+  }
+  regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiTokenizationTestInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
+
+  task generateJavaUnicodeWordBreakTestInternal() {
+    def unicodeVersion = "12.1.0"
+
+    def genDir = file("src/java/org/apache/lucene/tests/analysis/standard")
+    def genScript = file("${genDir}/generateJavaUnicodeWordBreakTest.pl")
+    def genOutput = file("${genDir}/WordBreakTestUnicode_${unicodeVersion.replace('.', '_')}.java")
+
+    description "Regenerate ${genOutput}"
+    group "generation"
+
+    inputs.file genScript
+    inputs.property "unicodeVersion", unicodeVersion
+    outputs.file genOutput
+
+    doLast {
+      quietExec {
+        workingDir genDir
+        executable project.externalTool("perl")
+        args = [
+          genScript,
+          "-v", unicodeVersion
+        ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateJavaUnicodeWordBreakTestInternal, [ andThenTasks: ["spotlessJava", "spotlessJavaApply"] ])
+}
diff --git a/gradle/globals.gradle b/gradle/globals.gradle
index 60e5101437e..662b58d4205 100644
--- a/gradle/globals.gradle
+++ b/gradle/globals.gradle
@@ -113,6 +113,8 @@ allprojects {
           "code: ${result.exitValue}, " +
           "output at: ${outputFile} (and logged above).")
       }
+    } else {
+      logger.info(new String(outputFile.bytes))
     }
 
     return result
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TestUAX29URLEmailTokenizer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TestUAX29URLEmailTokenizer.java
index c72c9073c66..7d709e62e51 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TestUAX29URLEmailTokenizer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/email/TestUAX29URLEmailTokenizer.java
@@ -678,8 +678,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeWordBreaks() throws Exception {
-    WordBreakTestUnicode_12_1_0 wordBreakTest = new WordBreakTestUnicode_12_1_0();
-    wordBreakTest.test(a);
+    WordBreakTestUnicode_12_1_0.test(a);
   }
 
   public void testSupplementary() throws Exception {
@@ -852,8 +851,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeEmojiTests() throws Exception {
-    EmojiTokenizationTestUnicode_12_1 emojiTest = new EmojiTokenizationTestUnicode_12_1();
-    emojiTest.test(a);
+    EmojiTokenizationTestUnicode_12_1.test(a);
   }
 
   /** blast some random strings through the analyzer */
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
index 655ba8bf9b0..8fbe0359698 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/standard/TestStandardAnalyzer.java
@@ -463,8 +463,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeWordBreaks() throws Exception {
-    WordBreakTestUnicode_12_1_0 wordBreakTest = new WordBreakTestUnicode_12_1_0();
-    wordBreakTest.test(a);
+    WordBreakTestUnicode_12_1_0.test(a);
   }
 
   public void testSupplementary() throws Exception {
@@ -637,8 +636,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testUnicodeEmojiTests() throws Exception {
-    EmojiTokenizationTestUnicode_12_1 emojiTest = new EmojiTokenizationTestUnicode_12_1();
-    emojiTest.test(a);
+    EmojiTokenizationTestUnicode_12_1.test(a);
   }
 
   /** blast some random strings through the analyzer */
diff --git a/lucene/test-framework/src/generated/checksums/generateEmojiTokenizationTest.json b/lucene/test-framework/src/generated/checksums/generateEmojiTokenizationTest.json
new file mode 100644
index 00000000000..77785ed9882
--- /dev/null
+++ b/lucene/test-framework/src/generated/checksums/generateEmojiTokenizationTest.json
@@ -0,0 +1,5 @@
+{
+  "lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java": "22e03ada47168b0986220c57260cfaf8e6e12e16",
+  "lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl": "a21d8aea5d2c30fb47b2bf9b24e20ddf605de46d",
+  "property:unicodeVersion": "12.1"
+}
\ No newline at end of file
diff --git a/lucene/test-framework/src/generated/checksums/generateJavaUnicodeWordBreakTest.json b/lucene/test-framework/src/generated/checksums/generateJavaUnicodeWordBreakTest.json
new file mode 100644
index 00000000000..72bfd5f8c4a
--- /dev/null
+++ b/lucene/test-framework/src/generated/checksums/generateJavaUnicodeWordBreakTest.json
@@ -0,0 +1,5 @@
+{
+  "lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java": "--",
+  "lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl": "1dd7148f4514976503a2be2e00be75c20ce784fb",
+  "property:unicodeVersion": "12.1.0"
+}
\ No newline at end of file
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java
index 3e291100914..0276063e6c7 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/EmojiTokenizationTestUnicode_12_1.java
@@ -19,30 +19,27 @@ package org.apache.lucene.tests.analysis.standard;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
 
 /**
- * This class was automatically generated by generateEmojiTokenizationTest.pl from:
- * http://www.unicode.org/Public/emoji/12.1/emoji-test.txt
- *
- * <p>emoji-test.txt contains emoji char sequences, which are represented as tokenization tests in
- * this class.
+ * This class was automatically generated by generateEmojiTokenizationTest.pl. from:
+ * http://www.unicode.org/Public/emoji/12.1/emoji-test.txt
  */
-@Ignore
-public class EmojiTokenizationTestUnicode_12_1 extends BaseTokenStreamTestCase {
+public final class EmojiTokenizationTestUnicode_12_1 {
 
-  public void test(Analyzer analyzer) throws Exception {
-    for (int i = 0; i < tests.length; i += 2) {
-      String test = tests[i + 1];
+  public static void test(Analyzer analyzer) throws Exception {
+    for (int i = 0; i < TESTS.length; i += 2) {
+      String test = TESTS[i + 1];
       try {
-        assertAnalyzesTo(analyzer, test, new String[] {test}, new String[] {"<EMOJI>"});
+        BaseTokenStreamTestCase.assertAnalyzesTo(
+            analyzer, test, new String[] {test}, new String[] {"<EMOJI>"});
       } catch (Throwable t) {
-        throw new Exception("Failed to tokenize \"" + tests[i] + "\":", t);
+        throw new Exception("Failed to tokenize \"" + TESTS[i] + "\":", t);
       }
     }
   }
 
-  private String[] tests =
+  private static String[] TESTS =
       new String[] {
         "1F600 ; fully-qualified # 😀 E2.0 grinning face",
         "\uD83D\uDE00",
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java
index 238301c7f3c..c94c6737a12 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/WordBreakTestUnicode_12_1_0.java
@@ -17,9 +17,9 @@
 package org.apache.lucene.tests.analysis.standard;
 
+import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
+
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
 
 /**
  * This class was automatically generated by generateJavaUnicodeWordBreakTest.pl from:
@@ -38,10 +38,9 @@ import org.junit.Ignore;
  * Hebrew_Letter} \p{WordBreak = Katakana} \p{WordBreak = Numeric} \p{Extended_Pictographic} (From
  * http://www.unicode.org/Public/emoji/12.1/emoji-data.txt)
  */
-@Ignore
-public class WordBreakTestUnicode_12_1_0 extends BaseTokenStreamTestCase {
+public final class WordBreakTestUnicode_12_1_0 {
 
-  public void test(Analyzer analyzer) throws Exception {
+  public static void test(Analyzer analyzer) throws Exception {
     // ÷ 0001 ÷ 0001 ÷  #  ÷ [0.2] <START OF HEADING> (Other) ÷ [999.0] <START OF HEADING> (Other) ÷
     // [0.3]
     assertAnalyzesTo(analyzer, "\u0001\u0001", new String[] {});
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl
index 95c504f2fcc..5a0d8e06c10 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateEmojiTokenizationTest.pl
@@ -56,32 +56,26 @@ my $header =<<"__HEADER__";
 package org.apache.lucene.tests.analysis.standard;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
+import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 
 /**
- * This class was automatically generated by ${script_name}
- * from: ${url}
- *
- * emoji-test.txt contains emoji char sequences, which are represented as
- * tokenization tests in this class.
- *
+ * This class was automatically generated by ${script_name}.
+ * from: ${url}
  */
-\@Ignore
-public class ${class_name} extends BaseTokenStreamTestCase {
+public final class ${class_name} {
 
-  public void test(Analyzer analyzer) throws Exception {
-    for (int i = 0 ; i < tests.length ; i += 2) {
-      String test = tests[i + 1];
+  public static void test(Analyzer analyzer) throws Exception {
+    for (int i = 0 ; i < TESTS.length ; i += 2) {
+      String test = TESTS[i + 1];
       try {
-        assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
+        BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, test, new String[] { test }, new String[] { "<EMOJI>" });
       } catch (Throwable t) {
-        throw new Exception("Failed to tokenize \\"" + tests[i] + "\\":", t);
+        throw new Exception("Failed to tokenize \\"" + TESTS[i] + "\\":", t);
      }
    }
  }
 
-  private String[] tests = new String[] {
+  private static String[] TESTS = new String[] {
 __HEADER__
 
 my @tests = split /\r?\n/, get_URL_content($url);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl
index 07f7afe86ff..16e59c1fc6e 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard/generateJavaUnicodeWordBreakTest.pl
@@ -62,8 +62,7 @@ my $header =<<"__HEADER__";
 package org.apache.lucene.tests.analysis.standard;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.junit.Ignore;
+import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;
 
 /**
  * This class was automatically generated by ${script_name}
@@ -85,10 +84,9 @@ import org.junit.Ignore;
  * \\p{WordBreak = Numeric}
  * \\p{Extended_Pictographic} (From $emoji_url)
  */
-\@Ignore
-public class ${class_name} extends BaseTokenStreamTestCase {
+public final class ${class_name} {
 
-  public void test(Analyzer analyzer) throws Exception {
+  public static void test(Analyzer analyzer) throws Exception {
 
__HEADER__
 
my $codepoints = [];
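
A regeneration sketch for reference, not part of the patch itself: the per-module regenerate task and the perl invocation below are taken from the changes above, while the individual checksum-wrapped task names are an assumption (wrapWithPersistentChecksums() is expected to register each task under its *Internal name minus the suffix, as with the other generators in this build). Both perl scripts fetch their source data from unicode.org, so regeneration needs network access.

  # Re-run all generators registered for :lucene:test-framework (includes the two new tasks):
  ./gradlew :lucene:test-framework:regenerate

  # Assumed names of the individual checksum-wrapped tasks:
  ./gradlew :lucene:test-framework:generateEmojiTokenizationTest
  ./gradlew :lucene:test-framework:generateJavaUnicodeWordBreakTest

  # Equivalent direct invocation, mirroring the workingDir/args passed to quietExec above:
  cd lucene/test-framework/src/java/org/apache/lucene/tests/analysis/standard
  perl generateEmojiTokenizationTest.pl -v 12.1
  perl generateJavaUnicodeWordBreakTest.pl -v 12.1.0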