LUCENE-9914: Modernize Emoji regeneration scripts (#78)

This commit is contained in:
Dawid Weiss 2021-04-12 20:16:43 +02:00 committed by GitHub
parent e7de06eb51
commit f91700a713
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 312 additions and 287 deletions

View File

@ -89,6 +89,15 @@ ext {
}
}
// Dedicated "groovy" configuration: it carries a standalone Groovy runtime that
// generation tasks add to the classpath of the scripts they launch via javaexec.
configurations {
groovy
}
dependencies {
// Use a newer groovy that doesn't have illegal reflective accesses.
groovy "org.codehaus.groovy:groovy-all:3.0.7"
}
apply from: file('buildSrc/scriptDepVersions.gradle')
// Include smaller chunks configuring dedicated build areas.
@ -142,7 +151,6 @@ apply from: file('gradle/generation/kuromoji.gradle')
apply from: file('gradle/generation/nori.gradle')
apply from: file('gradle/generation/icu.gradle')
apply from: file('gradle/generation/javacc.gradle')
apply from: file('gradle/generation/unicode-data.gradle')
apply from: file('gradle/datasets/external-datasets.gradle')

View File

@ -30,6 +30,5 @@ dependencies {
implementation localGroovy()
implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
implementation "com.ibm.icu:icu4j:${scriptDepVersions['icu']}"
}

View File

@ -8,7 +8,6 @@ ext {
"commons-codec": "1.13",
"ecj": "3.25.0",
"flexmark": "0.61.24",
"icu": "68.2",
"javacc": "7.0.4",
"jflex": "1.7.0",
"jgit": "5.9.0.202009080501-r",

View File

@ -1,5 +1,4 @@
import org.apache.tools.ant.taskdefs.condition.Os
import java.nio.file.Files
/*
@ -19,12 +18,37 @@ import java.nio.file.Files
* limitations under the License.
*/
/* Regenerates ICU data files.
def resources = scriptResources(buildscript)
/*
* Regenerates ICU-related data files.
*
* This build file contains regeneration code utilizing both icu4j and icu4c.
*
* The icu4c version must exactly match the icu4j version in version.props;
* the one on your system is probably different. This script will attempt to
* download and compile a matching icu4c version automatically.
*/
// Configure different icu4j dependencies.
configure(rootProject) {
  // Every generator is pinned to one specific ICU4J release. Give each release
  // its own configuration so that they can be resolved independently.
  [
    icu_62: "com.ibm.icu:icu4j:62.2",
    icu_68: "com.ibm.icu:icu4j:68.2"
  ].each { configName, coordinates ->
    configurations.create(configName)
    dependencies.add(configName, coordinates)
  }

  // Exclude ICU config from palantir's version unification.
  versionRecommendations {
    excludeConfigurations "icu_68", "icu_62"
  }
}
configure(project(":lucene:analysis:icu")) {
def utr30DataDir = file("src/data/utr30")
@ -208,3 +232,63 @@ configure(project(":lucene:analysis:icu")) {
}
}
}
// Regenerates UnicodeProps.java
configure(project(":lucene:analysis:common")) {
  task generateUnicodeProps() {
    // Generated Java source and the ICU4J release its data is taken from.
    def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")
    def icuConfig = rootProject.configurations.icu_68

    group "generation"
    description "Regenerate ${outputFile} (with ${icuConfig.name})"

    outputs.file outputFile
    dependsOn icuConfig

    doFirst {
      // The generator is a plain Groovy script, run in a forked JVM with only
      // the pinned ICU4J and the standalone Groovy runtime on its classpath.
      def generatorScript = file("${resources}/GenerateUnicodeProps.groovy")

      project.javaexec {
        classpath icuConfig, rootProject.configurations.groovy
        main "groovy.lang.GroovyShell"
        args = ["--encoding", "UTF-8", generatorScript, outputFile]
      }
    }
  }

  regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
}
// UnicodeEmojiProperties.jflex
configure(project(":lucene:core")) {
  task generateEmojiProperties() {
    // Generated JFlex include file and the ICU4J release its data is taken from.
    def outputFile = file("src/data/jflex/UnicodeEmojiProperties.jflex")
    def icuConfig = rootProject.configurations.icu_62

    group "generation"
    description "Regenerate ${outputFile} (with ${icuConfig.name})"

    outputs.file outputFile
    dependsOn icuConfig

    doFirst {
      // The generator is a plain Groovy script, run in a forked JVM with only
      // the pinned ICU4J and the standalone Groovy runtime on its classpath.
      def generatorScript = file("${resources}/GenerateEmojiProperties.groovy")

      project.javaexec {
        classpath icuConfig, rootProject.configurations.groovy
        main "groovy.lang.GroovyShell"
        args = ["--encoding", "UTF-8", generatorScript, outputFile]
      }
    }
  }

  regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiProperties)
}

View File

@ -0,0 +1,48 @@
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.lang.UCharacter
import com.ibm.icu.util.VersionInfo
import java.nio.file.*
// The destination file is the first script argument.
def outputFile = Paths.get(args[0])

// Renders one code point as a braced, upper-case hexadecimal escape.
def escape = { int codepoint ->
  "\\u{" + Integer.toHexString(codepoint).toUpperCase(Locale.ROOT) + "}"
}

// One "<Property> = [<ranges>]" line per emoji property, in this fixed order.
def definitions = [ "Emoji", "Emoji_Modifier", "Emoji_Modifier_Base", "Extended_Pictographic" ].collect { setName ->
  def members = new UnicodeSet("[:" + setName + ":]").ranges().collect { range ->
    // Single code points are written alone, everything else as "start-end".
    if (range.codepoint == range.codepointEnd) {
      return escape(range.codepoint)
    }
    return escape(range.codepoint) + "-" + escape(range.codepointEnd)
  }
  setName + " = [" + members.join("") + "]\n"
}.join("")

def unicodeVersion = UCharacter.getUnicodeVersion().toString()
def icuVersion = VersionInfo.ICU_VERSION.toString()

def code = """
// DO NOT EDIT THIS FILE! Use "gradlew generateEmojiProperties" to recreate.
// The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
${definitions}
"""
outputFile.setText(code, "UTF-8")

View File

@ -0,0 +1,74 @@
import com.ibm.icu.lang.UCharacter
import com.ibm.icu.util.VersionInfo
import java.nio.file.*
// The destination file is the first script argument.
def outputFile = Paths.get(args[0])

// Both versions are taken from the ICU4J jar on the classpath and are
// recorded in the generated source.
def unicodeVersion = UCharacter.getUnicodeVersion().toString()
def icuVersion = VersionInfo.ICU_VERSION.toString()

// Scan the whole code point range and keep the code points ICU reports as
// white space, formatted as Java hex literals in ascending order.
def whitespace = (UCharacter.MIN_CODE_POINT..UCharacter.MAX_CODE_POINT)
    .findAll { int codepoint -> UCharacter.isUWhiteSpace(codepoint) }
    .collect { int codepoint -> String.format(Locale.ROOT, "0x%04X", codepoint) }
    .join(", ")

def code = """
// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.util;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SparseFixedBitSet;
/**
* This file contains unicode properties used by various {@link CharTokenizer}s.
* The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
*/
public final class UnicodeProps {
private UnicodeProps() {}
/** Unicode version that was used to generate this file: {@value} */
public static final String UNICODE_VERSION = "${unicodeVersion}";
/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE = createBits(${whitespace});
private static Bits createBits(final int... codepoints) {
final int len = codepoints[codepoints.length - 1] + 1;
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
for (int i : codepoints) bitset.set(i);
return new Bits() {
@Override
public boolean get(int index) {
return index < len && bitset.get(index);
}
@Override
public int length() {
return ${String.format(Locale.ROOT, "0x%04X", UCharacter.MAX_CODE_POINT)} + 1;
}
};
}
}
"""
outputFile.setText(code.trim(), "UTF-8")

View File

@ -170,7 +170,7 @@ configure(project(":lucene:queryparser")) {
}
task javaccParserFlexible(type: JavaCCTask) {
description "Regenerate Flexible query parser from queryparser/flexible/standard/parser/StandardSyntaxParser.jj"
description "Regenerate flexible query parser from queryparser/flexible/standard/parser/StandardSyntaxParser.jj"
group "generation"
javaccFile = file('src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.jj')

View File

@ -1,110 +0,0 @@
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.VersionInfo;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Regenerates UnicodeProps.java
// Registers the generateUnicodeProps task on :lucene:analysis:common and hooks
// it into the "regenerate" task through wrapWithPersistentChecksums.
configure(project(":lucene:analysis:common")) {
task generateUnicodeProps() {
def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")
// Both versions are read when the task is configured, from the ICU4J classes
// imported at the top of this build script.
def icuVersion = VersionInfo.ICU_VERSION.toString()
def unicodeVersion = UCharacter.getUnicodeVersion().toString()
// Register the versions as task inputs so a different ICU/Unicode version
// counts as an input change.
inputs.property("icu-version", icuVersion)
inputs.property("unicode-version", unicodeVersion)
outputs.file outputFile
doFirst {
// Refuse to generate when the ICU version declared for build scripts differs
// from the one returned by getVersion for the icu4j project dependency.
def icuLockDepVersion = getVersion("com.ibm.icu", "icu4j")
def icuScriptDep = scriptDepVersions['icu']
if (icuLockDepVersion != icuScriptDep) {
throw new GradleException("ICU version in build script dependency ${icuScriptDep} and in" +
" project dependency ${icuLockDepVersion} must match.")
}
// Collect every code point ICU reports as white space, as Java hex literals
// in ascending order.
List<String> chars = []
for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
if (UCharacter.isUWhiteSpace(c)) {
chars.add(String.format(Locale.ROOT, "0x%04X", c))
}
}
def whitespace = chars.join(", ")
// Template of the generated Java source; the versions and the white space
// list are substituted into it.
def code = """
// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.util;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SparseFixedBitSet;
/**
* This file contains unicode properties used by various {@link CharTokenizer}s.
* The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
*/
public final class UnicodeProps {
private UnicodeProps() {}
/** Unicode version that was used to generate this file: {@value} */
public static final String UNICODE_VERSION = "${unicodeVersion}";
/** Bitset with Unicode WHITESPACE code points. */
public static final Bits WHITESPACE = createBits(${whitespace});
private static Bits createBits(final int... codepoints) {
final int len = codepoints[codepoints.length - 1] + 1;
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
for (int i : codepoints) bitset.set(i);
return new Bits() {
@Override
public boolean get(int index) {
return index < len && bitset.get(index);
}
@Override
public int length() {
return ${String.format(Locale.ROOT, "0x%04X", UCharacter.MAX_CODE_POINT)} + 1;
}
};
}
}
"""
// The template starts and ends with a newline; trim() drops both before writing.
outputFile.setText(code.trim(), "UTF-8")
}
}
// The generated file is then formatted by the spotlessApply task.
regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
}

View File

@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
def resources = scriptResources(buildscript)
configure(rootProject) {
  // Every generator is pinned to one specific ICU4J release. Give each release
  // its own configuration so that they can be resolved independently.
  [
    icu_62: "com.ibm.icu:icu4j:62.2",
    icu_68: "com.ibm.icu:icu4j:68.2"
  ].each { configName, coordinates ->
    configurations.create(configName)
    dependencies.add(configName, coordinates)
  }

  // Exclude ICU config from palantir's version unification.
  versionRecommendations {
    excludeConfigurations "icu_68", "icu_62"
  }
}
// Regenerates UnicodeProps.java
configure(project(":lucene:analysis:common")) {
  // Regenerates UnicodeProps.java by running GenerateUnicodeProps.groovy in a
  // forked JVM against the ICU4J release pinned by the icu_68 configuration.
  task generateUnicodeProps() {
    def icuConfig = rootProject.configurations.icu_68
    def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")

    // Give the task a description and a group, like other generation tasks.
    description "Regenerate ${outputFile} (with ${icuConfig.name})"
    group "generation"

    dependsOn icuConfig
    outputs.file outputFile

    doFirst {
      project.javaexec {
        main "groovy.lang.GroovyShell"
        // Only the pinned ICU4J and the standalone Groovy runtime are visible
        // to the script.
        classpath icuConfig, rootProject.configurations.groovy
        // Script arguments: the generator script, then the file to write.
        args = [
          "--encoding", "UTF-8",
          file("${resources}/GenerateUnicodeProps.groovy"),
          outputFile
        ]
      }
    }
  }

  // The generated Java source is then formatted by the spotlessApply task.
  regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
}
configure(project(":lucene:core")) {
  // Regenerates UnicodeEmojiProperties.jflex by running
  // GenerateEmojiProperties.groovy in a forked JVM against the ICU4J release
  // pinned by the icu_62 configuration.
  task generateEmojiProperties() {
    def icuConfig = rootProject.configurations.icu_62
    def outputFile = file("src/data/jflex/UnicodeEmojiProperties.jflex")

    // Give the task a description and a group, like other generation tasks.
    description "Regenerate ${outputFile} (with ${icuConfig.name})"
    group "generation"

    dependsOn icuConfig
    outputs.file outputFile

    doFirst {
      // The leftover debug dump of the resolved ICU jars was removed; it only
      // added noise to every regeneration run.
      project.javaexec {
        main "groovy.lang.GroovyShell"
        // Only the pinned ICU4J and the standalone Groovy runtime are visible
        // to the script.
        classpath icuConfig, rootProject.configurations.groovy
        // Script arguments: the generator script, then the file to write.
        args = [
          "--encoding", "UTF-8",
          file("${resources}/GenerateEmojiProperties.groovy"),
          outputFile
        ]
      }
    }
  }

  regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiProperties)
}

View File

@ -1,3 +1,7 @@
// DO NOT EDIT THIS FILE! Use "gradlew generateEmojiProperties" to recreate.
// The data was generated using ICU4J v62.2.0.0, unicode version: 11.0.0.0.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -15,9 +19,6 @@
* limitations under the License.
*/
// This file was automatically generated by getUnicodeEmojiProperties.pl
// from: http://unicode.org/Public/emoji/11.0/emoji-data.txt
Emoji = [\u{23}\u{2A}\u{30}-\u{39}\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2604}\u{260E}\u{2611}\u{2614}-\u{2615}\u{2618}\u{261D}\u{2620}\u{2622}-\u{2623}\u{2626}\u{262A}\u{262E}-\u{262F}\u{2638}-\u{263A}\u{2640}\u{2642}\u{2648}-\u{2653}\u{265F}-\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267E}-\u{267F}\u{2692}-\u{2697}\u{2699}\u{269B}-\u{269C}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26B0}-\u{26B1}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26C8}\u{26CE}-\u{26CF}\u{26D1}\u{26D3}-\u{26D4}\u{26E9}-\u{26EA}\u{26F0}-\u{26F5}\u{26F7}-\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270D}\u{270F}\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E6}-\u{1F1FF}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}-\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F7}-\u{1F4FD}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}-\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}-\u{1F596}\u{1F5A4}-\u{1F5A5}\u{1F5A8}\u{1F5B1}-\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6E0}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6EC}\u{1F6F0}\u{1F6F3}-\u{1F6F9}\u{1F910}-\u{1F93A}\u{1F93C}-\u{1F93E}\u{1F940}-\u{1F945}\u{1F947}-\u{1F970}\u{1F973}-\u{1F976}\u{1F97A}\u{1F97C}-\u{1F9A2}\u{1F9B0}-\u{1F9B9}\u{1F9C0}-\u{1F9C2}\u{1F9D0}-\u{1F9FF}]
Emoji_Modifier = [\u{1F3FB}-\u{1F3FF}]
Emoji_Modifier_Base = [\u{261D}\u{26F9}\u{270A}-\u{270D}\u{1F385}\u{1F3C2}-\u{1F3C4}\u{1F3C7}\u{1F3CA}-\u{1F3CC}\u{1F442}-\u{1F443}\u{1F446}-\u{1F450}\u{1F466}-\u{1F469}\u{1F46E}\u{1F470}-\u{1F478}\u{1F47C}\u{1F481}-\u{1F483}\u{1F485}-\u{1F487}\u{1F4AA}\u{1F574}-\u{1F575}\u{1F57A}\u{1F590}\u{1F595}-\u{1F596}\u{1F645}-\u{1F647}\u{1F64B}-\u{1F64F}\u{1F6A3}\u{1F6B4}-\u{1F6B6}\u{1F6C0}\u{1F6CC}\u{1F918}-\u{1F91C}\u{1F91E}-\u{1F91F}\u{1F926}\u{1F930}-\u{1F939}\u{1F93D}-\u{1F93E}\u{1F9B5}-\u{1F9B6}\u{1F9B8}-\u{1F9B9}\u{1F9D1}-\u{1F9DD}]

View File

@ -1,168 +0,0 @@
#!/usr/bin/perl
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
use warnings;
use strict;
use File::Spec;
use Getopt::Long;
use LWP::UserAgent;
# Split this script's own path: the output file is written next to the script,
# and the bare script name is used in the usage message and the file header.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
my $version = '';
# The version is interpolated into the download URL below, so the WHOLE value
# must have the form X.Y; merely containing such a substring is not enough.
unless (GetOptions("version=s" => \$version) && $version =~ /\A\d+\.\d+\z/) {
    print STDERR "Usage: $script_name -v <version>\n";
    print STDERR "\tversion must be of the form X.Y, e.g. 9.0\n"
        if ($version);
    exit 1;
}
my $emoji_data_url = "http://unicode.org/Public/emoji/$version/emoji-data.txt";
my $output_filename = "UnicodeEmojiProperties.jflex";
# Header written at the top of the generated file: the license text followed by
# a note naming this script and the data URL.
my $header =<<"__HEADER__";
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// This file was automatically generated by ${script_name}
// from: ${emoji_data_url}
__HEADER__
# Property name => flat array of alternating (start, end) code points; filled
# in by parse_emoji_data_file for the wanted properties only.
my $property_ranges = {};
my $wanted_properties = { 'Emoji' => 1, 'Emoji_Modifier' => 1, 'Emoji_Modifier_Base' => 1, 'Extended_Pictographic' => 1 };
parse_emoji_data_file($emoji_data_url, $property_ranges, $wanted_properties);
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
output_jflex_include_file($output_path, $property_ranges);
# sub parse_emoji_data_file
#
# Downloads and parses the emoji_data.txt file, extracting code point ranges
# assigned to property values with age not younger than the passed-in version,
# except for the Extended_Pictographic property, for which all code point ranges
# are extracted, regardless of age.
#
# Parameters:
#
# - Emoji data file URL
# - Reference to hash of properties mapped to an array of alternating (start,end) code point ranges
# - Reference to hash of wanted property names
#
sub parse_emoji_data_file {
    my ($url, $prop_ranges, $wanted_props) = @_;

    my $content = get_URL_content($url);
    print STDERR "Parsing '$url'...";

    LINE:
    for my $line (split /\r?\n/, $content) {
        # Data lines have the shape "START[..END] ; Property # comment", e.g.
        #   231A..231B ; Emoji_Presentation # 1.1 [2] watch..hourglass done
        #   1F9C0      ; Emoji_Presentation # 8.0 [1] cheese wedge
        # Anything else (comments, blank lines) is ignored.
        next LINE
            unless $line =~ /^([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?\s*;\s*([^\s#]+)/;
        my ($start, $end, $prop) = ($1, $2, $3);

        # Only collect ranges for the properties the caller asked for.
        next LINE unless defined($wanted_props->{$prop});

        $prop_ranges->{$prop} = [] unless defined($prop_ranges->{$prop});
        my $ranges = $prop_ranges->{$prop};

        # A line without "..END" describes a single code point.
        my $first = hex $start;
        my $last  = hex(defined($end) ? $end : $start);

        if (scalar(@$ranges) > 0 && $first <= $ranges->[-1] + 1) {
            # Touches or overlaps the previous range: extend that range.
            $ranges->[-1] = $last;
        }
        else {
            # Cannot be merged with the previous range: start a new one.
            push @$ranges, $first, $last;
        }
    }

    print STDERR "done.\n";
}
# sub get_URL_content
#
# Retrieves and returns the content of the given URL.
#
# Parameter:
#
# - URL to get content for
#
sub get_URL_content {
    my ($url) = @_;

    print STDERR "Retrieving '$url'...";

    # Issue a plain GET; any unsuccessful response aborts the whole script.
    my $response = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));
    if (!$response->is_success) {
        print STDERR "Failed to download '$url':\n\t" . $response->status_line . "\n";
        exit 1;
    }

    print STDERR "done.\n";
    return $response->content;
}
# sub output_jflex_include_file
#
# Parameters:
#
# - Output path
# - Reference to hash mapping properties to an array of alternating (start,end) codepoint ranges
#
sub output_jflex_include_file {
    my ($path, $prop_ranges) = @_;

    # Three-argument open with a lexical handle and low-precedence "or".
    # With "||" the die clause bound to the file name string (always true),
    # so a failed open was never reported.
    open(my $out, '>', $path)
        or die "Error opening '$path' for writing: $!";

    print STDERR "Writing '$path'...";
    print {$out} $header;

    # One "Property = [...]" line per property, in sorted property order.
    for my $prop (sort keys %$prop_ranges) {
        my $ranges = $prop_ranges->{$prop};
        print {$out} "$prop = [";
        # $ranges is a flat list of alternating (start, end) code points.
        for (my $index = 0; $index < scalar(@$ranges); $index += 2) {
            printf {$out} "\\u{%X}", $ranges->[$index];
            # Single code points are written without a "-end" part.
            printf {$out} "-\\u{%X}", $ranges->[$index + 1]
                if ($ranges->[$index + 1] > $ranges->[$index]);
        }
        print {$out} "]\n";
    }
    print {$out} "\n";

    # Buffered write errors only surface at close, so check it as well.
    close($out)
        or die "Error closing '$path': $!";
    print STDERR "done.\n";
}

View File

@ -0,0 +1,3 @@
{
"lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex": "7491dd535debc6e9e9ce367c4d3a7217e466dcae"
}