LUCENE-9914: Modernize Emoji regeneration scripts (#78)

2025-03-07 00:39:21 +00:00 · 2021-04-12 20:16:43 +02:00 · 2021-04-12 20:16:43 +02:00 · f91700a713
commit f91700a713
parent e7de06eb51
12 changed files with 312 additions and 287 deletions
--- a/build.gradle
+++ b/build.gradle
@ -89,6 +89,15 @@ ext {
  }
 }

+// Standalone Groovy runtime. It is put on the classpath of generation scripts
+// that are launched via javaexec (see gradle/generation/icu.gradle).
+configurations {
+  groovy
+}
+
+dependencies {
+  // Use a newer groovy that doesn't have illegal reflective accesses.
+  groovy "org.codehaus.groovy:groovy-all:3.0.7"
+}
+
 apply from: file('buildSrc/scriptDepVersions.gradle')

 // Include smaller chunks configuring dedicated build areas.
@ -142,7 +151,6 @@ apply from: file('gradle/generation/kuromoji.gradle')
 apply from: file('gradle/generation/nori.gradle')
 apply from: file('gradle/generation/icu.gradle')
 apply from: file('gradle/generation/javacc.gradle')
-apply from: file('gradle/generation/unicode-data.gradle')

 apply from: file('gradle/datasets/external-datasets.gradle')

--- a/buildSrc/build.gradle
+++ b/buildSrc/build.gradle
@ -30,6 +30,5 @@ dependencies {
  implementation localGroovy()

  implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
-  implementation "com.ibm.icu:icu4j:${scriptDepVersions['icu']}"
 }

--- a/buildSrc/scriptDepVersions.gradle
+++ b/buildSrc/scriptDepVersions.gradle
@ -8,7 +8,6 @@ ext {
      "commons-codec": "1.13",
      "ecj": "3.25.0",
      "flexmark": "0.61.24",
-      "icu": "68.2",
      "javacc": "7.0.4",
      "jflex": "1.7.0",
      "jgit": "5.9.0.202009080501-r",
--- a/gradle/generation/icu.gradle
+++ b/gradle/generation/icu.gradle
@ -1,5 +1,4 @@
 import org.apache.tools.ant.taskdefs.condition.Os
-
 import java.nio.file.Files

 /*
@ -19,12 +18,37 @@ import java.nio.file.Files
 * limitations under the License.
 */

-/* Regenerates ICU data files.
+// Location of this build script's companion resources; the Generate*.groovy
+// scripts used by the tasks below are resolved against it.
+// NOTE(review): presumably gradle/generation/icu/ -- scriptResources is defined
+// elsewhere, confirm there.
+def resources = scriptResources(buildscript)
+
+/*
+ * Regenerates ICU-related data files.
+ *
+ * This build file contains regeneration code utilizing both icu4j and icu4c.
 *
 * The icu4c version must match exactly the icu4j version in version.props:
 * The one on your system is probably different. This script will attempt to
 * download and compile a matching icu4c version automatically.
 */
+
+// Configure different icu4j dependencies. Each generation task in this file
+// picks the ICU release it needs: generateUnicodeProps uses icu_68 and
+// generateEmojiProperties uses icu_62.
+configure(rootProject) {
+  configurations {
+    icu_62
+    icu_68
+  }
+
+  dependencies {
+    icu_62 "com.ibm.icu:icu4j:62.2"
+    icu_68 "com.ibm.icu:icu4j:68.2"
+  }
+
+  // Exclude ICU config from palantir's version unification (otherwise both
+  // configurations would presumably be forced onto a single icu4j version).
+  versionRecommendations {
+    excludeConfigurations "icu_68", "icu_62"
+  }
+}
+
+
 configure(project(":lucene:analysis:icu")) {
  def utr30DataDir = file("src/data/utr30")

@ -208,3 +232,63 @@ configure(project(":lucene:analysis:icu")) {
    }
  }
 }
+
+
+// Regenerates UnicodeProps.java by running GenerateUnicodeProps.groovy through
+// javaexec with icu4j (icu_68) and the standalone groovy runtime on the classpath.
+configure(project(":lucene:analysis:common")) {
+  task generateUnicodeProps() {
+    def icuConfig = rootProject.configurations.icu_68
+    def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")
+
+    description "Regenerate ${outputFile} (with ${icuConfig.name})"
+    group "generation"
+
+    dependsOn icuConfig
+    outputs.file outputFile
+
+    doFirst {
+      // The generator runs in its own Java process, so ICU does not have to be
+      // on the build script classpath.
+      project.javaexec {
+        main "groovy.lang.GroovyShell"
+        classpath icuConfig, rootProject.configurations.groovy
+
+        // Script file followed by the script's own arguments; the script reads
+        // the output path as its first argument.
+        args = [
+            "--encoding", "UTF-8",
+            file("${resources}/GenerateUnicodeProps.groovy"),
+            outputFile
+        ]
+      }
+    }
+  }
+
+  // Hook the task into 'regenerate'. NOTE(review): spotlessApply is requested
+  // as a follow-up, presumably to reformat the generated Java source -- confirm
+  // against wrapWithPersistentChecksums.
+  regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
+}
+
+
+// Regenerates UnicodeEmojiProperties.jflex by running GenerateEmojiProperties.groovy
+// through javaexec with icu4j (icu_62) and the standalone groovy runtime on the classpath.
+configure(project(":lucene:core")) {
+  task generateEmojiProperties() {
+    def icuConfig = rootProject.configurations.icu_62
+    def outputFile = file("src/data/jflex/UnicodeEmojiProperties.jflex")
+
+    description "Regenerate ${outputFile} (with ${icuConfig.name})"
+    group "generation"
+
+    dependsOn icuConfig
+    outputs.file outputFile
+
+    doFirst {
+      // The generator runs in its own Java process, so ICU does not have to be
+      // on the build script classpath.
+      project.javaexec {
+        main "groovy.lang.GroovyShell"
+        classpath icuConfig, rootProject.configurations.groovy
+
+        // Script file followed by the script's own arguments; the script reads
+        // the output path as its first argument.
+        args = [
+            "--encoding", "UTF-8",
+            file("${resources}/GenerateEmojiProperties.groovy"),
+            outputFile
+        ]
+      }
+    }
+  }
+
+  // Hook the task into 'regenerate' (no follow-up formatting task is requested here).
+  regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiProperties)
+}
--- a/gradle/generation/icu/GenerateEmojiProperties.groovy
+++ b/gradle/generation/icu/GenerateEmojiProperties.groovy
@ -0,0 +1,48 @@
+import com.ibm.icu.text.UnicodeSet;
+import com.ibm.icu.lang.UCharacter
+import com.ibm.icu.util.VersionInfo
+import java.nio.file.*
+
+// Path of the jflex include file to (re)write; passed as the first script argument.
+def outputFile = Paths.get(args[0])
+
+// Renders a single code point as a jflex code point escape with upper-case hex digits.
+def escape = { int cp -> "\\u{" + Integer.toHexString(cp).toUpperCase(Locale.ROOT) + "}" }
+
+// One "Name = [...]" definition line per property; the ranges come from ICU's UnicodeSet.
+def propertyNames = [ "Emoji", "Emoji_Modifier", "Emoji_Modifier_Base", "Extended_Pictographic" ]
+def definitions = propertyNames.collect { propertyName ->
+  def members = new UnicodeSet("[:" + propertyName + ":]").ranges().collect { span ->
+    // Single code points are emitted once, real ranges as "first-last".
+    def first = escape(span.codepoint)
+    span.codepoint == span.codepointEnd ? first : first + "-" + escape(span.codepointEnd)
+  }
+  propertyName + " = [" + members.join("") + "]\n"
+}.join("")
+
+// Versions recorded in the generated header.
+def unicodeVersion = UCharacter.getUnicodeVersion().toString()
+def icuVersion = VersionInfo.ICU_VERSION.toString()
+
+def code = """
+// DO NOT EDIT THIS FILE! Use "gradlew generateEmojiProperties" to recreate.
+// The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+${definitions}
+"""
+// Written untrimmed: the output starts with the blank line that opens the template.
+outputFile.setText(code, "UTF-8")
--- a/gradle/generation/icu/GenerateUnicodeProps.groovy
+++ b/gradle/generation/icu/GenerateUnicodeProps.groovy
@ -0,0 +1,74 @@
+import com.ibm.icu.lang.UCharacter
+import com.ibm.icu.util.VersionInfo
+import java.nio.file.*
+
+// Path of the UnicodeProps.java file to (re)write; passed as the first script argument.
+def outputFile = Paths.get(args[0])
+
+// Versions recorded in the generated source.
+def unicodeVersion = UCharacter.getUnicodeVersion().toString()
+def icuVersion = VersionInfo.ICU_VERSION.toString()
+
+// All code points for which ICU's isUWhiteSpace is true, in ascending order,
+// rendered as comma-separated hex int literals for the createBits(...) call.
+def whitespaceCodePoints = (UCharacter.MIN_CODE_POINT..UCharacter.MAX_CODE_POINT).findAll { cp ->
+  UCharacter.isUWhiteSpace(cp)
+}
+def whitespace = whitespaceCodePoints.collect { cp -> String.format(Locale.ROOT, "0x%04X", cp) }.join(", ")
+
+def code = """
+// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.util;
+
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SparseFixedBitSet;
+
+/**
+ * This file contains unicode properties used by various {@link CharTokenizer}s.
+ * The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
+ */
+public final class UnicodeProps {
+  private UnicodeProps() {}
+  
+  /** Unicode version that was used to generate this file: {@value} */
+  public static final String UNICODE_VERSION = "${unicodeVersion}";
+  
+  /** Bitset with Unicode WHITESPACE code points. */
+  public static final Bits WHITESPACE = createBits(${whitespace});
+
+  private static Bits createBits(final int... codepoints) {
+    final int len = codepoints[codepoints.length - 1] + 1;
+    final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
+    for (int i : codepoints) bitset.set(i);
+    return new Bits() {
+      @Override
+      public boolean get(int index) {
+        return index < len && bitset.get(index);
+      }
+      
+      @Override
+      public int length() {
+        return ${String.format(Locale.ROOT, "0x%04X", UCharacter.MAX_CODE_POINT)} + 1;
+      }
+    };
+  }
+}
+"""
+// Leading/trailing blank lines of the template are stripped before writing.
+outputFile.setText(code.trim(), "UTF-8")
--- a/gradle/generation/javacc.gradle
+++ b/gradle/generation/javacc.gradle
@ -170,7 +170,7 @@ configure(project(":lucene:queryparser")) {
  }

  task javaccParserFlexible(type: JavaCCTask) {
-    description "Regenerate Flexible query parser from queryparser/flexible/standard/parser/StandardSyntaxParser.jj"
+    description "Regenerate flexible query parser from queryparser/flexible/standard/parser/StandardSyntaxParser.jj"
    group "generation"

    javaccFile = file('src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.jj')
--- a/gradle/generation/unicode-data.gradle
+++ b/gradle/generation/unicode-data.gradle
@ -1,110 +0,0 @@
-import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.util.VersionInfo;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Regenerates UnicodeProps.java
-configure(project(":lucene:analysis:common")) {
-  task generateUnicodeProps() {
-    def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")
-
-    def icuVersion = VersionInfo.ICU_VERSION.toString()
-    def unicodeVersion = UCharacter.getUnicodeVersion().toString()
-
-    inputs.property("icu-version", icuVersion)
-    inputs.property("unicode-version", unicodeVersion)
-    outputs.file outputFile
-
-    doFirst {
-      def icuLockDepVersion = getVersion("com.ibm.icu", "icu4j")
-      def icuScriptDep = scriptDepVersions['icu']
-      if (icuLockDepVersion != icuScriptDep) {
-        throw new GradleException("ICU version in build script dependency ${icuScriptDep} and in" +
-            " project dependency ${icuLockDepVersion} must match.")
-      }
-
-      List<String> chars = []
-      for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
-        if (UCharacter.isUWhiteSpace(c)) {
-          chars.add(String.format(Locale.ROOT, "0x%04X", c))
-        }
-      }
-      def whitespace = chars.join(", ")
-
-      def code = """
-// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.util;
-
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.SparseFixedBitSet;
-
-/**
- * This file contains unicode properties used by various {@link CharTokenizer}s.
- * The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
- */
-public final class UnicodeProps {
-  private UnicodeProps() {}
-  
-  /** Unicode version that was used to generate this file: {@value} */
-  public static final String UNICODE_VERSION = "${unicodeVersion}";
-  
-  /** Bitset with Unicode WHITESPACE code points. */
-  public static final Bits WHITESPACE = createBits(${whitespace});
-
-  private static Bits createBits(final int... codepoints) {
-    final int len = codepoints[codepoints.length - 1] + 1;
-    final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
-    for (int i : codepoints) bitset.set(i);
-    return new Bits() {
-      @Override
-      public boolean get(int index) {
-        return index < len && bitset.get(index);
-      }
-      
-      @Override
-      public int length() {
-        return ${String.format(Locale.ROOT, "0x%04X", UCharacter.MAX_CODE_POINT)} + 1;
-      }
-    };
-  }
-}
-"""
-      outputFile.setText(code.trim(), "UTF-8")
-    }
-  }
-
-  regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
-}
--- a/gradle/generation/unicode-props.gradle
+++ b/gradle/generation/unicode-props.gradle
@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// NOTE(review): this file repeats the ICU configurations and both generation
+// tasks that this same change adds to gradle/generation/icu.gradle -- confirm
+// whether it is a leftover that should be removed.
+// NOTE(review): `resources` is derived from this build script, while the
+// Generate*.groovy scripts of this change live under gradle/generation/icu/ --
+// confirm that scriptResources(buildscript) resolves to that directory here.
+def resources = scriptResources(buildscript)
+
+// Separate configurations for the two icu4j releases used by the tasks below.
+configure(rootProject) {
+  configurations {
+    icu_62
+    icu_68
+  }
+
+  dependencies {
+    icu_62 "com.ibm.icu:icu4j:62.2"
+    icu_68 "com.ibm.icu:icu4j:68.2"
+  }
+
+  // Exclude ICU config from palantir's version unification.
+  versionRecommendations {
+    excludeConfigurations "icu_68", "icu_62"
+  }
+}
+
+// Regenerates UnicodeProps.java by running GenerateUnicodeProps.groovy through
+// javaexec with icu4j (icu_68) and the standalone groovy runtime on the classpath.
+configure(project(":lucene:analysis:common")) {
+  task generateUnicodeProps() {
+    def icuConfig = rootProject.configurations.icu_68
+    def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")
+
+    // Same task metadata as the twin definition in gradle/generation/icu.gradle,
+    // so the task is listed under the "generation" group.
+    description "Regenerate ${outputFile} (with ${icuConfig.name})"
+    group "generation"
+
+    dependsOn icuConfig
+    outputs.file outputFile
+
+    doFirst {
+      // The generator runs in its own Java process, so ICU does not have to be
+      // on the build script classpath.
+      project.javaexec {
+        main "groovy.lang.GroovyShell"
+        classpath icuConfig, rootProject.configurations.groovy
+
+        // Script file followed by the script's own arguments; the script reads
+        // the output path as its first argument.
+        args = [
+            "--encoding", "UTF-8",
+            file("${resources}/GenerateUnicodeProps.groovy"),
+            outputFile
+        ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
+}
+
+// Regenerates UnicodeEmojiProperties.jflex by running GenerateEmojiProperties.groovy
+// through javaexec with icu4j (icu_62) and the standalone groovy runtime on the classpath.
+configure(project(":lucene:core")) {
+  task generateEmojiProperties() {
+    def icuConfig = rootProject.configurations.icu_62
+    def outputFile = file("src/data/jflex/UnicodeEmojiProperties.jflex")
+
+    // Same task metadata as the twin definition in gradle/generation/icu.gradle,
+    // so the task is listed under the "generation" group.
+    description "Regenerate ${outputFile} (with ${icuConfig.name})"
+    group "generation"
+
+    dependsOn icuConfig
+    outputs.file outputFile
+
+    doFirst {
+      // The generator runs in its own Java process, so ICU does not have to be
+      // on the build script classpath. (A leftover debug print of the resolved
+      // ICU files was dropped from here.)
+      project.javaexec {
+        main "groovy.lang.GroovyShell"
+        classpath icuConfig, rootProject.configurations.groovy
+
+        // Script file followed by the script's own arguments; the script reads
+        // the output path as its first argument.
+        args = [
+            "--encoding", "UTF-8",
+            file("${resources}/GenerateEmojiProperties.groovy"),
+            outputFile
+        ]
+      }
+    }
+  }
+
+  regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiProperties)
+}
--- a/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex
+++ b/lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex
@ -1,3 +1,7 @@
+
+// DO NOT EDIT THIS FILE! Use "gradlew generateEmojiProperties" to recreate.
+// The data was generated using ICU4J v62.2.0.0, unicode version: 11.0.0.0.
+
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
@ -15,9 +19,6 @@
 * limitations under the License.
 */

-// This file was automatically generated by getUnicodeEmojiProperties.pl
-// from: http://unicode.org/Public/emoji/11.0/emoji-data.txt 
-
 Emoji = [\u{23}\u{2A}\u{30}-\u{39}\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2604}\u{260E}\u{2611}\u{2614}-\u{2615}\u{2618}\u{261D}\u{2620}\u{2622}-\u{2623}\u{2626}\u{262A}\u{262E}-\u{262F}\u{2638}-\u{263A}\u{2640}\u{2642}\u{2648}-\u{2653}\u{265F}-\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267E}-\u{267F}\u{2692}-\u{2697}\u{2699}\u{269B}-\u{269C}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26B0}-\u{26B1}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26C8}\u{26CE}-\u{26CF}\u{26D1}\u{26D3}-\u{26D4}\u{26E9}-\u{26EA}\u{26F0}-\u{26F5}\u{26F7}-\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270D}\u{270F}\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E6}-\u{1F1FF}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}-\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F7}-\u{1F4FD}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}-\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}-\u{1F596}\u{1F5A4}-\u{1F5A5}\u{1F5A8}\u{1F5B1}-\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6E0}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6EC}\u{1F6F0}\u{1F6F3}-\u{1F6F9}\u{1F910}-\u{1F93A}\u{1F93C}-\u{1F93E}\u{1F940}-\u{1F945}\u{1F947}-\u{1F970}\u{1F973}-\u{1F976}\u{1F97A}\u{1F97C}-\u{1F9A2}\u{1F9B0}-\u{1F9B9}\u{1F9C0}-\u{1F9C2}\u{1F9D0}-\u{1F9FF}]
 Emoji_Modifier = [\u{1F3FB}-\u{1F3FF}]
 Emoji_Modifier_Base = [\u{261D}\u{26F9}\u{270A}-\u{270D}\u{1F385}\u{1F3C2}-\u{1F3C4}\u{1F3C7}\u{1F3CA}-\u{1F3CC}\u{1F442}-\u{1F443}\u{1F446}-\u{1F450}\u{1F466}-\u{1F469}\u{1F46E}\u{1F470}-\u{1F478}\u{1F47C}\u{1F481}-\u{1F483}\u{1F485}-\u{1F487}\u{1F4AA}\u{1F574}-\u{1F575}\u{1F57A}\u{1F590}\u{1F595}-\u{1F596}\u{1F645}-\u{1F647}\u{1F64B}-\u{1F64F}\u{1F6A3}\u{1F6B4}-\u{1F6B6}\u{1F6C0}\u{1F6CC}\u{1F918}-\u{1F91C}\u{1F91E}-\u{1F91F}\u{1F926}\u{1F930}-\u{1F939}\u{1F93D}-\u{1F93E}\u{1F9B5}-\u{1F9B6}\u{1F9B8}-\u{1F9B9}\u{1F9D1}-\u{1F9DD}]
--- a/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl
+++ b/lucene/core/src/data/jflex/getUnicodeEmojiProperties.pl
@ -1,168 +0,0 @@
-#!/usr/bin/perl
-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-use warnings;
-use strict;
-use File::Spec;
-use Getopt::Long;
-use LWP::UserAgent;
-
-my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
-
-my $version = '';
-unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+/) {
-    print STDERR "Usage: $script_name -v <version>\n";
-    print STDERR "\tversion must be of the form X.Y, e.g. 9.0\n"
-        if ($version);
-    exit 1;
-}
-my $emoji_data_url = "http://unicode.org/Public/emoji/$version/emoji-data.txt";
-my $output_filename = "UnicodeEmojiProperties.jflex";
-my $header =<<"__HEADER__";
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// This file was automatically generated by ${script_name}
-// from: ${emoji_data_url} 
-
-__HEADER__
-
-my $property_ranges = {};
-my $wanted_properties = { 'Emoji' => 1, 'Emoji_Modifier' => 1, 'Emoji_Modifier_Base' => 1, 'Extended_Pictographic' => 1 };
-
-parse_emoji_data_file($emoji_data_url, $property_ranges, $wanted_properties);
-
-my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
-output_jflex_include_file($output_path, $property_ranges);
-
-
-# sub parse_emoji_data_file
-#
-# Downloads and parses the emoji_data.txt file, extracting code point ranges
-# assigned to property values with age not younger than the passed-in version,
-# except for the Extended_Pictographic property, for which all code point ranges
-# are extracted, regardless of age.
-#
-# Parameters:
-#
-#  - Emoji data file URL
-#  - Reference to hash of properties mapped to an array of alternating (start,end) code point ranges
-#  - Reference to hash of wanted property names
-#
-sub parse_emoji_data_file {
-    my $url = shift;
-    my $prop_ranges = shift;
-    my $wanted_props = shift;
-    my $content = get_URL_content($url);
-    print STDERR "Parsing '$url'...";
-    my @lines = split /\r?\n/, $content;
-    for (@lines) {
-        ## 231A..231B    ; Emoji_Presentation   #  1.1  [2] (⌚..⌛)    watch..hourglass done
-        ## 1F9C0         ; Emoji_Presentation   #  8.0  [1] (🧀)       cheese wedge
-        ## 1FA00..1FA5F  ; Extended_Pictographic#   NA [96] (🨀️..🩟️)    <reserved-1FA00>..<reserved-1FA5F>
-        if (my ($start,$end,$prop) = /^([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?\s*;\s*([^\s#]+)/) {
-            next unless defined($wanted_props->{$prop});  # Skip unless we want ranges for this property
-            
-            if (not defined($prop_ranges->{$prop})) {
-                $prop_ranges->{$prop} = [];
-            }
-            $end = $start unless defined($end);
-            my $start_dec = hex $start;
-            my $end_dec = hex $end;
-            my $ranges = $prop_ranges->{$prop};
-            if (scalar(@$ranges) == 0 || $start_dec > $ranges->[-1] + 1) { # Can't merge range with previous range
-                # print STDERR "Adding new range ($start, $end)\n";
-                push @$ranges, $start_dec, $end_dec;
-            } else {
-                # printf STDERR "Merging range (%s, %s) with previous range (%X, %X)\n", $start, $end, $ranges->[-2], $ranges->[-1];
-                $ranges->[-1] = $end_dec;
-            }
-        } else {
-            # print STDERR "Skipping line (no data): $_\n";
-        }
-    }
-    print STDERR "done.\n";
-}
-
-# sub get_URL_content
-#
-# Retrieves and returns the content of the given URL.
-#
-# Parameter:
-#
-#  - URL to get content for
-#
-sub get_URL_content {
-    my $url = shift;
-    print STDERR "Retrieving '$url'...";
-    my $user_agent = LWP::UserAgent->new;
-    my $request = HTTP::Request->new(GET => $url);
-    my $response = $user_agent->request($request);
-    unless ($response->is_success) {
-        print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
-        exit 1;
-    }
-    print STDERR "done.\n";
-    return $response->content;
-}
-
-
-# sub output_jflex_include_file
-#
-# Parameters:
-#
-#  - Output path
-#  - Reference to hash mapping properties to an array of alternating (start,end) codepoint ranges
-#     
-sub output_jflex_include_file {
-    my $path = shift;
-    my $prop_ranges = shift;
-    open OUT, ">$path"
-        || die "Error opening '$path' for writing: $!";
-
-    print STDERR "Writing '$path'...";
-
-    print OUT $header;
-
-    for my $prop (sort keys %$prop_ranges) {
-        my $ranges = $prop_ranges->{$prop};
-        print OUT "$prop = [";
-        for (my $index = 0 ; $index < scalar(@$ranges) ; $index += 2) {
-            printf OUT "\\u{%X}", $ranges->[$index];
-            printf OUT "-\\u{%X}", $ranges->[$index + 1] if ($ranges->[$index + 1] > $ranges->[$index]);
-        }
-        print OUT "]\n";
-    }
-
-    print OUT "\n";
-    close OUT;
-    print STDERR "done.\n";
-}
--- a/lucene/core/src/generated/checksums/generateEmojiProperties.json
+++ b/lucene/core/src/generated/checksums/generateEmojiProperties.json
@ -0,0 +1,3 @@
+{
+    "lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex": "7491dd535debc6e9e9ce367c4d3a7217e466dcae"
+}