mirror of https://github.com/apache/lucene.git
LUCENE-9914: Modernize Emoji regeneration scripts (#78)
This commit is contained in:
parent
e7de06eb51
commit
f91700a713
10
build.gradle
10
build.gradle
|
@ -89,6 +89,15 @@ ext {
|
|||
}
|
||||
}
|
||||
|
||||
configurations {
|
||||
groovy
|
||||
}
|
||||
|
||||
dependencies {
|
||||
// Use a newer groovy that doesn't have illegal reflective accesses.
|
||||
groovy "org.codehaus.groovy:groovy-all:3.0.7"
|
||||
}
|
||||
|
||||
apply from: file('buildSrc/scriptDepVersions.gradle')
|
||||
|
||||
// Include smaller chunks configuring dedicated build areas.
|
||||
|
@ -142,7 +151,6 @@ apply from: file('gradle/generation/kuromoji.gradle')
|
|||
apply from: file('gradle/generation/nori.gradle')
|
||||
apply from: file('gradle/generation/icu.gradle')
|
||||
apply from: file('gradle/generation/javacc.gradle')
|
||||
apply from: file('gradle/generation/unicode-data.gradle')
|
||||
|
||||
apply from: file('gradle/datasets/external-datasets.gradle')
|
||||
|
||||
|
|
|
@ -30,6 +30,5 @@ dependencies {
|
|||
implementation localGroovy()
|
||||
|
||||
implementation "commons-codec:commons-codec:${scriptDepVersions['commons-codec']}"
|
||||
implementation "com.ibm.icu:icu4j:${scriptDepVersions['icu']}"
|
||||
}
|
||||
|
||||
|
|
|
@ -8,7 +8,6 @@ ext {
|
|||
"commons-codec": "1.13",
|
||||
"ecj": "3.25.0",
|
||||
"flexmark": "0.61.24",
|
||||
"icu": "68.2",
|
||||
"javacc": "7.0.4",
|
||||
"jflex": "1.7.0",
|
||||
"jgit": "5.9.0.202009080501-r",
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import org.apache.tools.ant.taskdefs.condition.Os
|
||||
|
||||
import java.nio.file.Files
|
||||
|
||||
/*
|
||||
|
@ -19,12 +18,37 @@ import java.nio.file.Files
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/* Regenerates ICU data files.
|
||||
def resources = scriptResources(buildscript)
|
||||
|
||||
/*
|
||||
* Regenerates ICU-related data files.
|
||||
*
|
||||
* This build file contains regeneration code utilizing both icu4j and icu4c.
|
||||
*
|
||||
* The icu4c version must match exactly the icu4j version in versions.props.
|
||||
* The one on your system is probably different. This script will attempt to
|
||||
* download and compile a matching icu4c version automatically.
|
||||
*/
|
||||
|
||||
// Configure different icu4j dependencies.
|
||||
configure(rootProject) {
  // One dedicated configuration per ICU4J release, keyed by configuration name.
  def icuReleases = [ icu_62: "62.2", icu_68: "68.2" ]

  icuReleases.each { String configName, String release ->
    configurations.maybeCreate(configName)
    dependencies.add(configName, "com.ibm.icu:icu4j:" + release)
  }

  // Exclude ICU config from palantir's version unification.
  versionRecommendations {
    excludeConfigurations "icu_68", "icu_62"
  }
}
|
||||
|
||||
|
||||
configure(project(":lucene:analysis:icu")) {
|
||||
def utr30DataDir = file("src/data/utr30")
|
||||
|
||||
|
@ -208,3 +232,63 @@ configure(project(":lucene:analysis:icu")) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Regenerates UnicodeProps.java
|
||||
configure(project(":lucene:analysis:common")) {
  task generateUnicodeProps() {
    // Uses the icu_68 configuration declared on the root project.
    def icu = rootProject.configurations.icu_68
    def target = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")

    group = "generation"
    description = "Regenerate ${target} (with ${icu.name})"

    dependsOn icu
    outputs.file target

    doFirst {
      // Run the generator script through GroovyShell with the selected ICU
      // jars on the classpath; the script receives the output file as its
      // only argument.
      def generator = file("${resources}/GenerateUnicodeProps.groovy")
      project.javaexec {
        main = "groovy.lang.GroovyShell"
        classpath icu, rootProject.configurations.groovy
        args "--encoding", "UTF-8", generator, target
      }
    }
  }

  regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
}
|
||||
|
||||
|
||||
// UnicodeEmojiProperties.jflex
|
||||
configure(project(":lucene:core")) {
  task generateEmojiProperties() {
    // Uses the icu_62 configuration declared on the root project.
    def icu = rootProject.configurations.icu_62
    def target = file("src/data/jflex/UnicodeEmojiProperties.jflex")

    group = "generation"
    description = "Regenerate ${target} (with ${icu.name})"

    dependsOn icu
    outputs.file target

    doFirst {
      // Run the generator script through GroovyShell with the selected ICU
      // jars on the classpath; the script receives the output file as its
      // only argument.
      def generator = file("${resources}/GenerateEmojiProperties.groovy")
      project.javaexec {
        main = "groovy.lang.GroovyShell"
        classpath icu, rootProject.configurations.groovy
        args "--encoding", "UTF-8", generator, target
      }
    }
  }

  regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiProperties)
}
|
|
@ -0,0 +1,48 @@
|
|||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.lang.UCharacter
|
||||
import com.ibm.icu.util.VersionInfo
|
||||
import java.nio.file.*
|
||||
|
||||
// First script argument: path of the file to generate.
def outputFile = Paths.get(args[0])

// Formats a code point as an upper-case hexadecimal JFlex escape
// (backslash, "u", then the hex digits in braces).
def escape = { int codePoint ->
  "\\u{" + Integer.toHexString(codePoint).toUpperCase(Locale.ROOT) + "}"
}

// Builds one "<property> = [...]" line per property from the ranges that
// ICU reports for the corresponding Unicode set.
StringBuilder sb = new StringBuilder()
for (String property : [ "Emoji", "Emoji_Modifier", "Emoji_Modifier_Base", "Extended_Pictographic" ]) {
  UnicodeSet members = new UnicodeSet("[:" + property + ":]")
  sb.append(property).append(" = [")
  members.ranges().each { UnicodeSet.EntryRange r ->
    sb.append(escape(r.codepoint))
    // Single code points are written once; real ranges as start-end.
    if (r.codepoint != r.codepointEnd) {
      sb.append("-").append(escape(r.codepointEnd))
    }
  }
  sb.append("]\n")
}

// Versions recorded in the generated file's header.
def icuVersion = VersionInfo.ICU_VERSION.toString()
def unicodeVersion = UCharacter.getUnicodeVersion().toString()
|
||||
|
||||
def code = """
|
||||
// DO NOT EDIT THIS FILE! Use "gradlew generateEmojiProperties" to recreate.
|
||||
// The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
${sb.toString()}
|
||||
"""
|
||||
outputFile.setText(code, "UTF-8")
|
|
@ -0,0 +1,74 @@
|
|||
import com.ibm.icu.lang.UCharacter
|
||||
import com.ibm.icu.util.VersionInfo
|
||||
import java.nio.file.*
|
||||
|
||||
// Versions recorded in the generated class.
def icuVersion = VersionInfo.ICU_VERSION.toString()
def unicodeVersion = UCharacter.getUnicodeVersion().toString()

// First script argument: path of the file to generate.
def outputFile = Paths.get(args[0])

// Comma-separated hex literals of every code point that ICU reports as
// Unicode white space, in ascending order.
def whitespace = (UCharacter.MIN_CODE_POINT..UCharacter.MAX_CODE_POINT)
    .findAll { int c -> UCharacter.isUWhiteSpace(c) }
    .collect { int c -> String.format(Locale.ROOT, "0x%04X", c) }
    .join(", ")
|
||||
|
||||
def code = """
|
||||
// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.SparseFixedBitSet;
|
||||
|
||||
/**
|
||||
* This file contains unicode properties used by various {@link CharTokenizer}s.
|
||||
* The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
|
||||
*/
|
||||
public final class UnicodeProps {
|
||||
private UnicodeProps() {}
|
||||
|
||||
/** Unicode version that was used to generate this file: {@value} */
|
||||
public static final String UNICODE_VERSION = "${unicodeVersion}";
|
||||
|
||||
/** Bitset with Unicode WHITESPACE code points. */
|
||||
public static final Bits WHITESPACE = createBits(${whitespace});
|
||||
|
||||
private static Bits createBits(final int... codepoints) {
|
||||
final int len = codepoints[codepoints.length - 1] + 1;
|
||||
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
|
||||
for (int i : codepoints) bitset.set(i);
|
||||
return new Bits() {
|
||||
@Override
|
||||
public boolean get(int index) {
|
||||
return index < len && bitset.get(index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return ${String.format(Locale.ROOT, "0x%04X", UCharacter.MAX_CODE_POINT)} + 1;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
"""
|
||||
outputFile.setText(code.trim(), "UTF-8")
|
|
@ -170,7 +170,7 @@ configure(project(":lucene:queryparser")) {
|
|||
}
|
||||
|
||||
task javaccParserFlexible(type: JavaCCTask) {
|
||||
description "Regenerate Flexible query parser from queryparser/flexible/standard/parser/StandardSyntaxParser.jj"
|
||||
description "Regenerate flexible query parser from queryparser/flexible/standard/parser/StandardSyntaxParser.jj"
|
||||
group "generation"
|
||||
|
||||
javaccFile = file('src/java/org/apache/lucene/queryparser/flexible/standard/parser/StandardSyntaxParser.jj')
|
||||
|
|
|
@ -1,110 +0,0 @@
|
|||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Regenerates UnicodeProps.java
|
||||
configure(project(":lucene:analysis:common")) {
|
||||
task generateUnicodeProps() {
|
||||
def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")
|
||||
|
||||
def icuVersion = VersionInfo.ICU_VERSION.toString()
|
||||
def unicodeVersion = UCharacter.getUnicodeVersion().toString()
|
||||
|
||||
inputs.property("icu-version", icuVersion)
|
||||
inputs.property("unicode-version", unicodeVersion)
|
||||
outputs.file outputFile
|
||||
|
||||
doFirst {
|
||||
def icuLockDepVersion = getVersion("com.ibm.icu", "icu4j")
|
||||
def icuScriptDep = scriptDepVersions['icu']
|
||||
if (icuLockDepVersion != icuScriptDep) {
|
||||
throw new GradleException("ICU version in build script dependency ${icuScriptDep} and in" +
|
||||
" project dependency ${icuLockDepVersion} must match.")
|
||||
}
|
||||
|
||||
List<String> chars = []
|
||||
for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
|
||||
if (UCharacter.isUWhiteSpace(c)) {
|
||||
chars.add(String.format(Locale.ROOT, "0x%04X", c))
|
||||
}
|
||||
}
|
||||
def whitespace = chars.join(", ")
|
||||
|
||||
def code = """
|
||||
// DO NOT EDIT THIS FILE! Use "gradlew generateUnicodeProps tidy" to recreate.
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.util;
|
||||
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.SparseFixedBitSet;
|
||||
|
||||
/**
|
||||
* This file contains unicode properties used by various {@link CharTokenizer}s.
|
||||
* The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
|
||||
*/
|
||||
public final class UnicodeProps {
|
||||
private UnicodeProps() {}
|
||||
|
||||
/** Unicode version that was used to generate this file: {@value} */
|
||||
public static final String UNICODE_VERSION = "${unicodeVersion}";
|
||||
|
||||
/** Bitset with Unicode WHITESPACE code points. */
|
||||
public static final Bits WHITESPACE = createBits(${whitespace});
|
||||
|
||||
private static Bits createBits(final int... codepoints) {
|
||||
final int len = codepoints[codepoints.length - 1] + 1;
|
||||
final SparseFixedBitSet bitset = new SparseFixedBitSet(len);
|
||||
for (int i : codepoints) bitset.set(i);
|
||||
return new Bits() {
|
||||
@Override
|
||||
public boolean get(int index) {
|
||||
return index < len && bitset.get(index);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int length() {
|
||||
return ${String.format(Locale.ROOT, "0x%04X", UCharacter.MAX_CODE_POINT)} + 1;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
"""
|
||||
outputFile.setText(code.trim(), "UTF-8")
|
||||
}
|
||||
}
|
||||
|
||||
regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
def resources = scriptResources(buildscript)
|
||||
|
||||
configure(rootProject) {
  // One dedicated configuration per ICU4J release, keyed by configuration name.
  def icuReleases = [ icu_62: "62.2", icu_68: "68.2" ]

  icuReleases.each { String configName, String release ->
    configurations.maybeCreate(configName)
    dependencies.add(configName, "com.ibm.icu:icu4j:" + release)
  }

  // Exclude ICU config from palantir's version unification.
  versionRecommendations {
    excludeConfigurations "icu_68", "icu_62"
  }
}
|
||||
|
||||
// Regenerates UnicodeProps.java
|
||||
configure(project(":lucene:analysis:common")) {
  task generateUnicodeProps() {
    // Uses the icu_68 configuration declared on the root project.
    def icuConfig = rootProject.configurations.icu_68
    def outputFile = file("src/java/org/apache/lucene/analysis/util/UnicodeProps.java")

    // Give the task a description and group so that it is listed together
    // with the other generation tasks.
    description "Regenerate ${outputFile} (with ${icuConfig.name})"
    group "generation"

    dependsOn icuConfig
    outputs.file outputFile

    doFirst {
      // Run the generator script through GroovyShell with the selected ICU
      // jars on the classpath; the script receives the output file as its
      // only argument.
      project.javaexec {
        main "groovy.lang.GroovyShell"
        classpath icuConfig, rootProject.configurations.groovy

        args = [
          "--encoding", "UTF-8",
          file("${resources}/GenerateUnicodeProps.groovy"),
          outputFile
        ]
      }
    }
  }

  regenerate.dependsOn wrapWithPersistentChecksums(generateUnicodeProps, [ andThenTasks: "spotlessApply" ])
}
|
||||
|
||||
configure(project(":lucene:core")) {
  task generateEmojiProperties() {
    // Uses the icu_62 configuration declared on the root project.
    def icuConfig = rootProject.configurations.icu_62
    def outputFile = file("src/data/jflex/UnicodeEmojiProperties.jflex")

    // Give the task a description and group so that it is listed together
    // with the other generation tasks.
    description "Regenerate ${outputFile} (with ${icuConfig.name})"
    group "generation"

    dependsOn icuConfig
    outputs.file outputFile

    doFirst {
      // Run the generator script through GroovyShell with the selected ICU
      // jars on the classpath; the script receives the output file as its
      // only argument. (A leftover debug print of the resolved classpath
      // files was removed here.)
      project.javaexec {
        main "groovy.lang.GroovyShell"
        classpath icuConfig, rootProject.configurations.groovy

        args = [
          "--encoding", "UTF-8",
          file("${resources}/GenerateEmojiProperties.groovy"),
          outputFile
        ]
      }
    }
  }

  regenerate.dependsOn wrapWithPersistentChecksums(generateEmojiProperties)
}
|
|
@ -1,3 +1,7 @@
|
|||
|
||||
// DO NOT EDIT THIS FILE! Use "gradlew generateEmojiProperties" to recreate.
|
||||
// The data was generated using ICU4J v62.2.0.0, unicode version: 11.0.0.0.
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
|
@ -15,9 +19,6 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file was automatically generated by getUnicodeEmojiProperties.pl
|
||||
// from: http://unicode.org/Public/emoji/11.0/emoji-data.txt
|
||||
|
||||
Emoji = [\u{23}\u{2A}\u{30}-\u{39}\u{A9}\u{AE}\u{203C}\u{2049}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{2328}\u{23CF}\u{23E9}-\u{23F3}\u{23F8}-\u{23FA}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2604}\u{260E}\u{2611}\u{2614}-\u{2615}\u{2618}\u{261D}\u{2620}\u{2622}-\u{2623}\u{2626}\u{262A}\u{262E}-\u{262F}\u{2638}-\u{263A}\u{2640}\u{2642}\u{2648}-\u{2653}\u{265F}-\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267E}-\u{267F}\u{2692}-\u{2697}\u{2699}\u{269B}-\u{269C}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26B0}-\u{26B1}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26C8}\u{26CE}-\u{26CF}\u{26D1}\u{26D3}-\u{26D4}\u{26E9}-\u{26EA}\u{26F0}-\u{26F5}\u{26F7}-\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270D}\u{270F}\u{2712}\u{2714}\u{2716}\u{271D}\u{2721}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2763}-\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{27BF}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E6}-\u{1F1FF}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F321}\u{1F324}-\u{1F393}\u{1F396}-\u{1F397}\u{1F399}-\u{1F39B}\u{1F39E}-\u{1F3F0}\u{1F3F3}-\u{1F3F5}\u{1F3F7}-\u{1F4FD}\u{1F4FF}-\u{1F53D}\u{1F549}-\u{1F54E}\u{1F550}-\u{1F567}\u{1F56F}-\u{1F570}\u{1F573}-\u{1F57A}\u{1F587}\u{1F58A}-\u{1F58D}\u{1F590}\u{1F595}-\u{1F596}\u{1F5A4}-\u{1F5A5}\u{1F5A8}\u{1F5B1}-\u{1F5B2}\u{1F5BC}\u{1F5C2}-\u{1F5C4}\u{1F5D1}-\u{1F5D3}\u{1F5DC}-\u{1F5DE}\u{1F5E1}\u{1F5E3}\u{1F5E8}\u{1F5EF}\u{1F5F3}\u{1F5FA}-\u{1F64F}\u{1F680}-\u{1F6C5}\u{1F6CB}-\u{1F6D2}\u{1F6E0}-\u{1F6E5}\u{1F6E9}\u{1F6EB}-\u{1F6EC}\u{1F6F0}\u{1F6F3}-\u{1F6F9}\u{1F910}-\u{1F93A}\u{1F93C}-\u{1F93E}\u{1F940}-\u{1F945}\u{1F947}-\u{1F970}\u{1F973}-\u{1F976}\u{1F97A}\u{1F97C}-\u{1F9A2}\u{1F9B0}-\u{1F9B9}\u{1F9C0}-\u{1F9C2}\u{1F9D0}-\u{1F9FF}]
|
||||
Emoji_Modifier = [\u{1F3FB}-\u{1F3FF}]
|
||||
Emoji_Modifier_Base = [\u{261D}\u{26F9}\u{270A}-\u{270D}\u{1F385}\u{1F3C2}-\u{1F3C4}\u{1F3C7}\u{1F3CA}-\u{1F3CC}\u{1F442}-\u{1F443}\u{1F446}-\u{1F450}\u{1F466}-\u{1F469}\u{1F46E}\u{1F470}-\u{1F478}\u{1F47C}\u{1F481}-\u{1F483}\u{1F485}-\u{1F487}\u{1F4AA}\u{1F574}-\u{1F575}\u{1F57A}\u{1F590}\u{1F595}-\u{1F596}\u{1F645}-\u{1F647}\u{1F64B}-\u{1F64F}\u{1F6A3}\u{1F6B4}-\u{1F6B6}\u{1F6C0}\u{1F6CC}\u{1F918}-\u{1F91C}\u{1F91E}-\u{1F91F}\u{1F926}\u{1F930}-\u{1F939}\u{1F93D}-\u{1F93E}\u{1F9B5}-\u{1F9B6}\u{1F9B8}-\u{1F9B9}\u{1F9D1}-\u{1F9DD}]
|
||||
|
|
|
@ -1,168 +0,0 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
use File::Spec;
|
||||
use Getopt::Long;
|
||||
use LWP::UserAgent;
|
||||
|
||||
# Split this script's own path: the output file is written next to the script
# and the bare script name is used in messages.
my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

# The version is interpolated into the download URL below, so the whole
# option value must be of the form X.Y; the pattern is anchored at both ends
# so that a value merely containing such a substring is rejected.
my $version = '';
unless (GetOptions("version=s" => \$version) && $version =~ /\A[0-9]+\.[0-9]+\z/) {
  print STDERR "Usage: $script_name -v <version>\n";
  # Only explain the expected format when a (malformed) version was given.
  print STDERR "\tversion must be of the form X.Y, e.g. 9.0\n"
    if ($version);
  exit 1;
}
my $emoji_data_url = "http://unicode.org/Public/emoji/$version/emoji-data.txt";
my $output_filename = "UnicodeEmojiProperties.jflex";
|
||||
my $header =<<"__HEADER__";
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// This file was automatically generated by ${script_name}
|
||||
// from: ${emoji_data_url}
|
||||
|
||||
__HEADER__
|
||||
|
||||
# Names of the properties whose code point ranges are extracted.
my $wanted_properties = {
  map { ($_ => 1) } qw(Emoji Emoji_Modifier Emoji_Modifier_Base Extended_Pictographic)
};

# Filled by the parser: property name => [start, end, start, end, ...].
my $property_ranges = {};
parse_emoji_data_file($emoji_data_url, $property_ranges, $wanted_properties);

# The include file is written into the directory that contains this script.
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
output_jflex_include_file($output_path, $property_ranges);
|
||||
|
||||
|
||||
# sub parse_emoji_data_file
#
# Downloads the emoji data file and collects, for every wanted property, the
# code point ranges listed for it. A range that starts no later than one past
# the end of the previously collected range of the same property is merged
# into that range; otherwise it is appended as a new range.
#
# Parameters:
#
#  - Emoji data file URL
#  - Reference to hash of properties mapped to an array of alternating (start,end) code point ranges
#  - Reference to hash of wanted property names
#
sub parse_emoji_data_file {
  my ($url, $prop_ranges, $wanted_props) = @_;
  my $content = get_URL_content($url);
  print STDERR "Parsing '$url'...";
  for my $line (split /\r?\n/, $content) {
    ## Data lines look like:
    ##   231A..231B    ; Emoji_Presentation   #  1.1  [2] watch..hourglass done
    ##   1F9C0         ; Emoji_Presentation   #  8.0  [1] cheese wedge
    ## Anything else (comments, blank lines) carries no data and is skipped.
    my ($first, $last, $prop)
      = $line =~ /^([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?\s*;\s*([^\s#]+)/
      or next;

    # Skip unless we want ranges for this property
    next unless defined($wanted_props->{$prop});

    $prop_ranges->{$prop} = [] unless defined($prop_ranges->{$prop});
    my $ranges = $prop_ranges->{$prop};

    # A single code point is a range whose end equals its start.
    my $first_cp = hex($first);
    my $last_cp = defined($last) ? hex($last) : $first_cp;

    if (@$ranges && $first_cp <= $ranges->[-1] + 1) {
      # Merge with the previous range by moving its end.
      $ranges->[-1] = $last_cp;
    } else {
      # Can't merge range with previous range
      push @$ranges, $first_cp, $last_cp;
    }
  }
  print STDERR "done.\n";
}
|
||||
|
||||
# sub get_URL_content
#
# Issues an HTTP GET request for the given URL and returns the content of
# the response. Progress is reported on STDERR. If the request is not
# successful, the response status line is printed and the script exits with
# status 1.
#
# Parameter:
#
#  - URL to get content for
#
sub get_URL_content {
  my ($url) = @_;
  print STDERR "Retrieving '$url'...";
  my $response = LWP::UserAgent->new->request(HTTP::Request->new(GET => $url));
  if (!$response->is_success) {
    print STDERR "Failed to download '$url':\n\t" . $response->status_line . "\n";
    exit 1;
  }
  print STDERR "done.\n";
  return $response->content;
}
|
||||
|
||||
|
||||
# sub output_jflex_include_file
#
# Writes the JFlex include file: the file-level $header, then one
# "<property> = [...]" line per property (properties sorted by name), then an
# empty line. Each range is written as an upper-case hex \u{...} escape, with
# "-\u{...}" appended when the range end is greater than its start.
# Dies if the file cannot be opened for writing or cannot be closed.
#
# Parameters:
#
#  - Output path
#  - Reference to hash mapping properties to an array of alternating (start,end) codepoint ranges
#
sub output_jflex_include_file {
  my $path = shift;
  my $prop_ranges = shift;

  # Three-argument open with a lexical handle, checked with low-precedence
  # "or". The former `open OUT, ">$path" || die ...` bound "||" to the
  # (always true) string, so a failed open was never reported.
  open(my $out, '>', $path)
    or die "Error opening '$path' for writing: $!";

  print STDERR "Writing '$path'...";

  print {$out} $header;

  for my $prop (sort keys %$prop_ranges) {
    my $ranges = $prop_ranges->{$prop};
    print {$out} "$prop = [";
    # The array holds alternating (start, end) values, hence the step of 2.
    for (my $index = 0 ; $index < scalar(@$ranges) ; $index += 2) {
      printf {$out} "\\u{%X}", $ranges->[$index];
      printf {$out} "-\\u{%X}", $ranges->[$index + 1] if ($ranges->[$index + 1] > $ranges->[$index]);
    }
    print {$out} "]\n";
  }

  print {$out} "\n";

  # Buffered write errors only surface when the handle is closed.
  close($out)
    or die "Error closing '$path': $!";
  print STDERR "done.\n";
}
|
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
"lucene/core/src/data/jflex/UnicodeEmojiProperties.jflex": "7491dd535debc6e9e9ce367c4d3a7217e466dcae"
|
||||
}
|
Loading…
Reference in New Issue