Merge pull request #1577 from hapifhir/do-20240319-languages-dat

Loader for rfc5646 languages file
This commit is contained in:
Grahame Grieve 2024-04-04 08:23:24 +11:00 committed by GitHub
commit 003abc13d1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 49661 additions and 0 deletions

View File

@ -0,0 +1,24 @@
package org.hl7.fhir.utilities.i18n.subtag;
import lombok.Getter;
import lombok.Setter;
/**
 * Registry entry created by the loader for records whose type is {@code extlang}.
 * <p>
 * On top of the data kept by {@link Subtag}, it carries these registry fields:
 * <ul>
 *   <li>Preferred-Value</li>
 *   <li>Macrolanguage</li>
 *   <li>Prefix</li>
 * </ul>
 * Each of them is a plain nullable string with a generated getter and setter.
 */
@Getter
@Setter
public class ExtLangSubtag extends Subtag {

  /** Value of the {@code Preferred-Value} field, or {@code null} when never set. */
  private String preferredValue;

  /** Value of the {@code Macrolanguage} field, or {@code null} when never set. */
  private String macrolanguage;

  /** Value of the {@code Prefix} field, or {@code null} when never set. */
  private String prefix;

  /**
   * @param subtag the subtag string identifying this entry
   */
  protected ExtLangSubtag(String subtag) {
    super(subtag);
  }
}

View File

@ -0,0 +1,29 @@
package org.hl7.fhir.utilities.i18n.subtag;
import lombok.Getter;
import lombok.Setter;
/**
 * Registry entry created by the loader for records whose type is {@code language}.
 * <p>
 * On top of the data kept by {@link Subtag}, it carries these registry fields:
 * <ul>
 *   <li>Scope</li>
 *   <li>Preferred-Value</li>
 *   <li>Suppress-Script</li>
 *   <li>Macrolanguage</li>
 * </ul>
 * Each of them is a plain nullable string with a generated getter and setter.
 */
@Getter
@Setter
public class LanguageSubtag extends Subtag {

  /** Value of the {@code Scope} field, or {@code null} when never set. */
  private String scope;

  /** Value of the {@code Preferred-Value} field, or {@code null} when never set. */
  private String preferredValue;

  /** Value of the {@code Suppress-Script} field, or {@code null} when never set. */
  private String suppressScript;

  /** Value of the {@code Macrolanguage} field, or {@code null} when never set. */
  private String macrolanguage;

  /**
   * @param subtag the subtag string identifying this entry
   */
  protected LanguageSubtag(String subtag) {
    super(subtag);
  }
}

View File

@ -0,0 +1,95 @@
package org.hl7.fhir.utilities.i18n.subtag;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/**
 * Holds language, extlang, script, region and variant subtag entries, each kind in its own
 * map keyed by the subtag string.
 * <p>
 * Entries are added through the {@code protected add*} methods (used by the loader in this
 * package) and read through the public {@code get*}, {@code contains*} and {@code get*Keys}
 * methods. The key sets handed out are read-only views: they reflect later additions but
 * cannot be used to remove entries from the registry.
 * <p>
 * The maps are plain {@link HashMap}s and this class performs no synchronization of its own.
 */
public class LanguageSubtagRegistry {

  private final Map<String, LanguageSubtag> languages = new HashMap<>();

  private final Map<String, ExtLangSubtag> extLangs = new HashMap<>();

  private final Map<String, ScriptSubtag> scripts = new HashMap<>();

  private final Map<String, RegionSubtag> regions = new HashMap<>();

  private final Map<String, VariantSubtag> variants = new HashMap<>();

  /**
   * @return a read-only view of the keys of all stored language entries
   */
  public Set<String> getLanguageKeys() {
    // Wrap the live key set so callers cannot delete entries through it.
    return Collections.unmodifiableSet(languages.keySet());
  }

  /**
   * Stores a language entry, replacing any entry already present under the same key.
   *
   * @return the entry previously stored under {@code key}, or {@code null} if there was none
   */
  protected LanguageSubtag addLanguage(String key, LanguageSubtag language) {
    return languages.put(key, language);
  }

  /**
   * @return {@code true} if a language entry is stored under {@code key}
   */
  public boolean containsLanguage(String key) {
    return languages.containsKey(key);
  }

  /**
   * @return the language entry stored under {@code key}, or {@code null} if there is none
   */
  public LanguageSubtag getLanguage(String key) {
    return languages.get(key);
  }

  /**
   * Stores an extlang entry, replacing any entry already present under the same key.
   *
   * @return the entry previously stored under {@code key}, or {@code null} if there was none
   */
  protected ExtLangSubtag addExtLang(String key, ExtLangSubtag extLang) {
    return extLangs.put(key, extLang);
  }

  /**
   * @return a read-only view of the keys of all stored extlang entries
   */
  public Set<String> getExtLangKeys() {
    return Collections.unmodifiableSet(extLangs.keySet());
  }

  /**
   * @return {@code true} if an extlang entry is stored under {@code key}
   */
  public boolean containsExtLang(String key) {
    return extLangs.containsKey(key);
  }

  /**
   * @return the extlang entry stored under {@code key}, or {@code null} if there is none
   */
  public ExtLangSubtag getExtLang(String key) {
    return extLangs.get(key);
  }

  /**
   * Stores a script entry, replacing any entry already present under the same key.
   *
   * @return the entry previously stored under {@code key}, or {@code null} if there was none
   */
  protected ScriptSubtag addScript(String key, ScriptSubtag script) {
    return scripts.put(key, script);
  }

  /**
   * @return a read-only view of the keys of all stored script entries
   */
  public Set<String> getScriptKeys() {
    return Collections.unmodifiableSet(scripts.keySet());
  }

  /**
   * @return {@code true} if a script entry is stored under {@code key}
   */
  public boolean containsScript(String key) {
    return scripts.containsKey(key);
  }

  /**
   * @return the script entry stored under {@code key}, or {@code null} if there is none
   */
  public ScriptSubtag getScript(String key) {
    return scripts.get(key);
  }

  /**
   * Stores a region entry, replacing any entry already present under the same key.
   *
   * @return the entry previously stored under {@code key}, or {@code null} if there was none
   */
  protected RegionSubtag addRegion(String key, RegionSubtag region) {
    return regions.put(key, region);
  }

  /**
   * @return a read-only view of the keys of all stored region entries
   */
  public Set<String> getRegionKeys() {
    return Collections.unmodifiableSet(regions.keySet());
  }

  /**
   * @return {@code true} if a region entry is stored under {@code key}
   */
  public boolean containsRegion(String key) {
    return regions.containsKey(key);
  }

  /**
   * @return the region entry stored under {@code key}, or {@code null} if there is none
   */
  public RegionSubtag getRegion(String key) {
    return regions.get(key);
  }

  /**
   * Stores a variant entry, replacing any entry already present under the same key.
   *
   * @return the entry previously stored under {@code key}, or {@code null} if there was none
   */
  protected VariantSubtag addVariant(String key, VariantSubtag variant) {
    return variants.put(key, variant);
  }

  /**
   * @return a read-only view of the keys of all stored variant entries
   */
  public Set<String> getVariantKeys() {
    return Collections.unmodifiableSet(variants.keySet());
  }

  /**
   * @return {@code true} if a variant entry is stored under {@code key}
   */
  public boolean containsVariant(String key) {
    return variants.containsKey(key);
  }

  /**
   * @return the variant entry stored under {@code key}, or {@code null} if there is none
   */
  public VariantSubtag getVariant(String key) {
    return variants.get(key);
  }
}

View File

@ -0,0 +1,247 @@
package org.hl7.fhir.utilities.i18n.subtag;
import lombok.Getter;
import lombok.With;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.*;
/**
 * Parses a language subtag registry text file and stores its entries in a
 * {@link LanguageSubtagRegistry}.
 * <p>
 * The expected input is line based:
 * <ul>
 *   <li>a line consisting of {@code %%} ends the current record and starts the next one;</li>
 *   <li>a line of the form {@code Field: value} starts a new field;</li>
 *   <li>a line starting with a space continues the value of the field above it; the
 *       continuation is trimmed and joined to the value with a single space.</li>
 * </ul>
 * Everything before the first {@code %%} line is treated as a file header and ignored.
 * Records of type {@code language}, {@code extlang}, {@code script}, {@code region} and
 * {@code variant} are converted to subtag objects; other types are skipped.
 * <p>
 * Loading of each subtag kind can be switched off with the generated {@code withLoad*}
 * methods, which return a new loader writing to the same registry instance.
 */
public class LanguageSubtagRegistryLoader {

  //Fields
  //[Type, Added, Description, Scope, Deprecated, Preferred-Value, Suppress-Script, Comments, Macrolanguage, Subtag, Prefix, Tag]
  public static final String TYPE = "Type";
  public static final String ADDED = "Added";
  public static final String DESCRIPTION = "Description";
  public static final String SCOPE = "Scope";
  public static final String DEPRECATED = "Deprecated";
  public static final String PREFERRED_VALUE = "Preferred-Value";
  public static final String SUPPRESS_SCRIPT = "Suppress-Script";
  public static final String COMMENTS = "Comments";
  public static final String MACROLANGUAGE = "Macrolanguage";
  public static final String SUBTAG = "Subtag";
  public static final String PREFIX = "Prefix";
  //public static final String TAG = "Tag";

  //Types
  // [grandfathered, variant, language, region, script, redundant, extlang]
  public static final String VARIANT = "variant";
  public static final String LANGUAGE = "language";
  public static final String REGION = "region";
  public static final String SCRIPT = "script";
  public static final String EXTLANG = "extlang";
  public static final String REDUNDANT = "redundant";
  public static final String GRANDFATHERED = "grandfathered";

  // The @With methods create their copy through the all-args constructor below, so the
  // declaration order of these fields and the constructor parameter order must stay in step.
  @Getter
  private final LanguageSubtagRegistry registry;

  @With
  private final boolean loadLanguages;

  @With
  private final boolean loadScripts;

  @With
  private final boolean loadExtLangs;

  @With
  private final boolean loadRegions;

  @With
  private final boolean loadVariants;

  /**
   * Tells whether a field may occur several times within one record.
   *
   * @param field the field name; {@code null} yields {@code false}
   * @return {@code true} for {@code Description}, {@code Comments} and {@code Prefix}
   */
  public static boolean isMultiField(String field){
    return DESCRIPTION.equals(field)
      || COMMENTS.equals(field)
      || PREFIX.equals(field);
  }

  /**
   * Raw field values of one registry record, before conversion to a {@link Subtag}.
   */
  public static class Record {
    /** Fields that hold a single value; a later value for the same field replaces the earlier one. */
    final Map<String, String> fields = new HashMap<>();
    /** Fields for which {@link #isMultiField(String)} is true; values are kept in the order added. */
    final Map<String, List<String>> multiFields = new HashMap<>();

    /**
     * Adds a field value, appending it for multi-valued fields and overwriting otherwise.
     */
    public void addField(String field, String value) {
      if (isMultiField(field)) {
        multiFields.computeIfAbsent(field, key -> new ArrayList<>()).add(value);
      } else {
        fields.put(field, value);
      }
    }
  }

  /**
   * Creates a loader that loads every supported subtag kind into {@code registry}.
   */
  public LanguageSubtagRegistryLoader(LanguageSubtagRegistry registry) {
    this(registry, true, true, true, true, true);
  }

  private LanguageSubtagRegistryLoader(LanguageSubtagRegistry registry, boolean loadLanguages, boolean loadScripts, boolean loadExtLangs, boolean loadRegions, boolean loadVariants) {
    this.registry = registry;
    this.loadLanguages = loadLanguages;
    this.loadScripts = loadScripts;
    this.loadExtLangs = loadExtLangs;
    this.loadRegions = loadRegions;
    this.loadVariants = loadVariants;
  }

  /**
   * Loads the classpath resource {@code lang.dat.txt}.
   *
   * @throws IOException if the resource is missing or cannot be read
   */
  public void loadFromDefaultResource() throws IOException {
    loadFromResource("lang.dat.txt");
  }

  /**
   * Loads a registry file found through this class's class loader.
   *
   * @param resourceName resource name as understood by {@link ClassLoader#getResource(String)}
   * @throws IOException if the resource is missing or cannot be read
   */
  public void loadFromResource(String resourceName) throws IOException {
    ClassLoader classLoader = getClass().getClassLoader();
    URL resourceUrl = classLoader.getResource(resourceName);
    if (resourceUrl == null) {
      // An assert is skipped when assertions are disabled and would surface as an NPE instead.
      throw new IOException("Language subtag registry resource not found on classpath: " + resourceName);
    }
    try (InputStream inputStream = resourceUrl.openStream()) {
      load(inputStream);
    }
  }

  /**
   * Reads the stream line by line, assembling records and handing each finished record to
   * {@link #processRecord(Record)}. The stream is not closed here; the caller owns it.
   */
  private void load(InputStream inputStream) throws IOException {
    Scanner scanner = new Scanner(inputStream, "UTF-8");
    // Stays null until the first "%%" line so that the file header is never stored.
    Record record = null;
    String currentField = null;
    String currentValue = null;
    while (scanner.hasNextLine()) {
      String line = scanner.nextLine();
      if (line.trim().isEmpty()) {
        continue;
      }
      if (line.equals("%%")) {
        storePendingField(record, currentField, currentValue);
        record = processRecord(record);
        currentField = null;
        currentValue = null;
      } else if (line.startsWith(" ")) {
        // Continuation of the previous value; ignored when there is no field to continue.
        if (currentField != null) {
          String continuation = line.trim();
          currentValue = currentValue.isEmpty() ? continuation : currentValue + " " + continuation;
        }
      } else {
        storePendingField(record, currentField, currentValue);
        // Split on the first colon only, so values that themselves contain ": " are kept whole.
        int separator = line.indexOf(':');
        if (separator > 0) {
          currentField = line.substring(0, separator).trim();
          currentValue = line.substring(separator + 1).trim();
        } else {
          // Not a field line: drop it and make sure the previous field is not stored twice.
          currentField = null;
          currentValue = null;
        }
      }
    }
    // Scanner swallows read failures; surface them instead of silently loading a partial file.
    IOException readFailure = scanner.ioException();
    if (readFailure != null) {
      throw readFailure;
    }
    storePendingField(record, currentField, currentValue);
    processRecord(record);
  }

  /** Adds the field being assembled to the record, if there is both a record and a field. */
  private static void storePendingField(Record record, String field, String value) {
    if (record != null && field != null && value != null) {
      record.addField(field, value);
    }
  }

  /**
   * Converts a finished record to a subtag and registers it.
   *
   * @param record the finished record; {@code null} and records without a {@code Type}
   *               field are skipped
   * @return a fresh, empty record for the caller to fill next
   */
  protected Record processRecord(Record record) {
    if (record == null) {
      return new Record();
    }
    String typeValue = record.fields.get(TYPE);
    if (typeValue == null) {
      // Empty record (for example after a trailing "%%"); switching on null would throw.
      return new Record();
    }
    assert record.fields.containsKey(ADDED);
    final Subtag subtag;
    switch (typeValue) {
      case LANGUAGE: subtag = processLanguageRecord(record); break;
      case EXTLANG: subtag = processExtLangRecord(record); break;
      case SCRIPT: subtag = processScriptRecord(record); break;
      case REGION: subtag = processRegionRecord(record); break;
      case VARIANT: subtag = processVariantRecord(record); break;
      default: subtag = null;
    }
    assert subtag != null || typeValue.equals(GRANDFATHERED) || typeValue.equals(REDUNDANT);
    if (subtag != null) {
      addSubtag(subtag);
    }
    return new Record();
  }

  /**
   * Stores the subtag in the registry map matching its class, unless loading of that kind
   * has been switched off.
   */
  protected void addSubtag(Subtag subtag) {
    assert subtag.getSubtag() != null;
    if (subtag instanceof LanguageSubtag && loadLanguages) {
      registry.addLanguage(subtag.getSubtag(), (LanguageSubtag) subtag);
    } else if (subtag instanceof ExtLangSubtag && loadExtLangs) {
      registry.addExtLang(subtag.getSubtag(), (ExtLangSubtag) subtag);
    } else if (subtag instanceof ScriptSubtag && loadScripts) {
      registry.addScript(subtag.getSubtag(), (ScriptSubtag) subtag);
    } else if (subtag instanceof RegionSubtag && loadRegions) {
      registry.addRegion(subtag.getSubtag(), (RegionSubtag) subtag);
    } else if (subtag instanceof VariantSubtag && loadVariants) {
      registry.addVariant(subtag.getSubtag(), (VariantSubtag) subtag);
    }
  }

  /** Builds a {@link VariantSubtag} with its preferred value and all of its prefixes. */
  protected Subtag processVariantRecord(Record record) {
    assert record.fields.containsKey(SUBTAG);
    VariantSubtag variant = new VariantSubtag(record.fields.get(SUBTAG));
    addCommonFieldsToSubtag(variant, record);
    variant.setPreferredValue(record.fields.get(PREFERRED_VALUE));
    List<String> prefixes = record.multiFields.get(PREFIX);
    if (prefixes != null) {
      for (String prefix : prefixes) {
        variant.addPrefix(prefix);
      }
    }
    return variant;
  }

  /** Builds a {@link ScriptSubtag}; scripts carry only the common fields. */
  protected Subtag processScriptRecord(Record record) {
    assert record.fields.containsKey(SUBTAG);
    ScriptSubtag script = new ScriptSubtag(record.fields.get(SUBTAG));
    addCommonFieldsToSubtag(script, record);
    return script;
  }

  /** Builds a {@link RegionSubtag} with its preferred value. */
  protected Subtag processRegionRecord(Record record) {
    assert record.fields.containsKey(SUBTAG);
    RegionSubtag region = new RegionSubtag(record.fields.get(SUBTAG));
    addCommonFieldsToSubtag(region, record);
    region.setPreferredValue(record.fields.get(PREFERRED_VALUE));
    return region;
  }

  /**
   * Builds an {@link ExtLangSubtag} with its preferred value, macrolanguage and prefix.
   * Only the first {@code Prefix} value is used; exactly one is expected.
   */
  protected Subtag processExtLangRecord(Record record) {
    assert record.fields.containsKey(SUBTAG);
    ExtLangSubtag extLang = new ExtLangSubtag(record.fields.get(SUBTAG));
    addCommonFieldsToSubtag(extLang, record);
    extLang.setPreferredValue(record.fields.get(PREFERRED_VALUE));
    extLang.setMacrolanguage(record.fields.get(MACROLANGUAGE));
    List<String> prefixes = record.multiFields.get(PREFIX);
    if (prefixes != null) {
      assert prefixes.size() == 1;
      extLang.setPrefix(prefixes.get(0));
    }
    return extLang;
  }

  /** Builds a {@link LanguageSubtag} with scope, preferred value, suppress-script and macrolanguage. */
  protected Subtag processLanguageRecord(Record record) {
    assert record.fields.containsKey(SUBTAG);
    LanguageSubtag language = new LanguageSubtag(record.fields.get(SUBTAG));
    addCommonFieldsToSubtag(language, record);
    language.setScope(record.fields.get(SCOPE));
    language.setPreferredValue(record.fields.get(PREFERRED_VALUE));
    language.setSuppressScript(record.fields.get(SUPPRESS_SCRIPT));
    language.setMacrolanguage(record.fields.get(MACROLANGUAGE));
    return language;
  }

  /** Copies descriptions, comments, and the added and deprecated values onto the subtag. */
  private void addCommonFieldsToSubtag(Subtag subtag, Record record) {
    List<String> descriptions = record.multiFields.get(DESCRIPTION);
    if (descriptions != null) {
      for (String description : descriptions) {
        subtag.addDescription(description);
      }
    }
    List<String> comments = record.multiFields.get(COMMENTS);
    if (comments != null) {
      for (String comment : comments) {
        subtag.addComments(comment);
      }
    }
    subtag.setAdded(record.fields.get(ADDED));
    subtag.setDeprecated(record.fields.get(DEPRECATED));
  }
}

View File

@ -0,0 +1,16 @@
package org.hl7.fhir.utilities.i18n.subtag;
import lombok.Getter;
import lombok.Setter;
/**
 * Registry entry created by the loader for records whose type is {@code region}.
 * <p>
 * On top of the data kept by {@link Subtag}, it carries this registry field:
 * <ul>
 *   <li>Preferred-Value</li>
 * </ul>
 */
@Getter
@Setter
public class RegionSubtag extends Subtag {

  /** Value of the {@code Preferred-Value} field, or {@code null} when never set. */
  private String preferredValue;

  /**
   * @param subtag the subtag string identifying this entry
   */
  protected RegionSubtag(String subtag) {
    super(subtag);
  }
}

View File

@ -0,0 +1,10 @@
package org.hl7.fhir.utilities.i18n.subtag;
/**
 * Registry entry created by the loader for records whose type is {@code script}.
 * <p>
 * It adds no fields of its own; all of its data lives in {@link Subtag}.
 */
public class ScriptSubtag extends Subtag {

  /**
   * @param subtag the subtag string identifying this entry
   */
  protected ScriptSubtag(String subtag) {
    super(subtag);
  }
}

View File

@ -0,0 +1,45 @@
package org.hl7.fhir.utilities.i18n.subtag;
import lombok.Getter;
import lombok.Setter;
import java.util.ArrayList;
import java.util.List;
/**
 * Base class for registry entries: the subtag string itself plus the data every entry
 * kind shares (added and deprecated values, descriptions and comments).
 * <p>
 * Descriptions and comments can only be appended by subclasses and code in this package;
 * readers receive unmodifiable copies.
 */
public abstract class Subtag {

  /** The subtag string, fixed at construction. */
  @Getter
  private final String subtag;

  /** Value of the {@code Added} field, or {@code null} when never set. */
  @Getter @Setter
  private String added;

  /** Value of the {@code Deprecated} field, or {@code null} when never set. */
  @Getter @Setter
  private String deprecated;

  private final List<String> descriptions = new ArrayList<>();

  private final List<String> comments = new ArrayList<>();

  /**
   * @param subtag the subtag string identifying this entry
   */
  protected Subtag(String subtag){
    this.subtag = subtag;
  }

  /**
   * Appends a description.
   *
   * @return the result of {@link List#add(Object)} on the backing list
   */
  protected boolean addDescription(String description) {
    return descriptions.add(description);
  }

  /**
   * @return an unmodifiable copy of the descriptions, in the order they were added
   */
  public List<String> getDescriptions() {
    return List.copyOf(descriptions);
  }

  /**
   * Appends a comment.
   *
   * @return the result of {@link List#add(Object)} on the backing list
   */
  protected boolean addComments(String comment) {
    return comments.add(comment);
  }

  /**
   * @return an unmodifiable copy of the comments, in the order they were added
   */
  public List<String> getComments() {
    return List.copyOf(comments);
  }
}

View File

@ -0,0 +1,30 @@
package org.hl7.fhir.utilities.i18n.subtag;
import lombok.Getter;
import lombok.Setter;
import java.util.ArrayList;
import java.util.List;
/**
 * Registry entry created by the loader for records whose type is {@code variant}.
 * <p>
 * On top of the data kept by {@link Subtag}, it carries these registry fields:
 * <ul>
 *   <li>Preferred-Value</li>
 *   <li>Prefix (may occur several times)</li>
 * </ul>
 */
public class VariantSubtag extends Subtag {

  /** Value of the {@code Preferred-Value} field, or {@code null} when never set. */
  @Getter @Setter
  private String preferredValue;

  private final List<String> prefixes = new ArrayList<>();

  /**
   * @param subtag the subtag string identifying this entry
   */
  protected VariantSubtag(String subtag) {
    super(subtag);
  }

  /**
   * Appends a prefix.
   *
   * @param description the prefix value to append (the parameter name is historical)
   * @return the result of {@link List#add(Object)} on the backing list
   */
  protected boolean addPrefix(String description) {
    return prefixes.add(description);
  }

  /**
   * @return an unmodifiable copy of the prefixes, in the order they were added
   */
  public List<String> getPrefixes() {
    return List.copyOf(prefixes);
  }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,242 @@
package org.hl7.fhir.utilities.i18n.subtag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import java.io.IOException;
import java.util.stream.Stream;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Tests for {@link LanguageSubtagRegistryLoader}: a full load of the default resource
 * checked against selected registry entries, and loads with one subtag kind switched off.
 */
public class LanguageSubtagRegistryLoaderTest {

  /**
   * Loads the default resource with every subtag kind enabled and checks two entries of
   * each kind field by field.
   */
  @Test
  public void defaultLoaderTest() throws IOException {
    /*
      languages.size(): 8259
      extLangs.size(): 253
      scripts.size(): 222
      regions.size(): 305
      variants.size(): 113
     */
    LanguageSubtagRegistry registry = new LanguageSubtagRegistry();
    LanguageSubtagRegistryLoader loader = new LanguageSubtagRegistryLoader(registry);
    loader.loadFromDefaultResource();

    /* Test entries of every subtag type (language, script, variant, extLang, region)
      These should cover both simple, and more complex entries with a larger number
      of fields.
     */

    /*
      Type: language
      Subtag: ppa
      Description: Pao
      Added: 2009-07-29
      Deprecated: 2016-05-30
      Preferred-Value: bfy
     */
    LanguageSubtag ppa = registry.getLanguage("ppa");
    assertEquals("Pao", ppa.getDescriptions().get(0));
    assertEquals("2009-07-29", ppa.getAdded());
    assertEquals("2016-05-30", ppa.getDeprecated());
    assertEquals("bfy", ppa.getPreferredValue());
    assertNull(ppa.getScope());
    assertNull(ppa.getSuppressScript());
    assertNull(ppa.getMacrolanguage());

    /*
      Type: language
      Subtag: ia
      Description: Interlingua (International Auxiliary Language
        Association)
      Added: 2005-10-16
     */
    LanguageSubtag ia = registry.getLanguage("ia");
    assertEquals("Interlingua (International Auxiliary Language Association)", ia.getDescriptions().get(0));

    /*
      Type: script
      Subtag: Cpmn
      Description: Cypro-Minoan
      Added: 2017-08-13
     */
    ScriptSubtag cpmn = registry.getScript("Cpmn");
    assertEquals("Cypro-Minoan", cpmn.getDescriptions().get(0));
    assertEquals("2017-08-13", cpmn.getAdded());
    assertTrue(cpmn.getComments().isEmpty());
    assertNull(cpmn.getDeprecated());

    /*
      Type: script
      Subtag: Lisu
      Description: Lisu
      Description: Fraser
      Added: 2009-03-13
     */
    ScriptSubtag lisu = registry.getScript("Lisu");
    assertEquals(2, lisu.getDescriptions().size());
    assertEquals("Lisu", lisu.getDescriptions().get(0));
    assertEquals("Fraser", lisu.getDescriptions().get(1));
    assertEquals("2009-03-13", lisu.getAdded());

    /*
      Type: variant
      Subtag: tarask
      Description: Belarusian in Taraskievica orthography
      Added: 2007-04-27
      Prefix: be
      Comments: The subtag represents Branislau Taraskievic's Belarusian
        orthography as published in "Bielaruski klasycny pravapis" by Juras
        Buslakou, Vincuk Viacorka, Zmicier Sanko, and Zmicier Sauka (Vilnia-
        Miensk 2005).
     */
    // Continuation lines are joined with a single space, hence "Vilnia- Miensk".
    final String taraskComment = "The subtag represents Branislau Taraskievic's Belarusian orthography as published in \"Bielaruski klasycny pravapis\" by Juras Buslakou, Vincuk Viacorka, Zmicier Sanko, and Zmicier Sauka (Vilnia- Miensk 2005).";
    VariantSubtag tarask = registry.getVariant("tarask");
    assertEquals(1, tarask.getDescriptions().size());
    assertEquals("Belarusian in Taraskievica orthography", tarask.getDescriptions().get(0));
    assertEquals(1, tarask.getPrefixes().size());
    assertEquals("be", tarask.getPrefixes().get(0));
    assertEquals("2007-04-27", tarask.getAdded());
    assertEquals(taraskComment, tarask.getComments().get(0));

    /*
      Type: variant
      Subtag: ao1990
      Description: Portuguese Language Orthographic Agreement of 1990 (Acordo
        Ortográfico da Língua Portuguesa de 1990)
      Added: 2015-05-06
      Prefix: pt
      Prefix: gl
      Comments: Portuguese orthography conventions established in 1990 but
        not brought into effect until 2009
     */
    VariantSubtag ao1990 = registry.getVariant("ao1990");
    assertEquals(1, ao1990.getDescriptions().size());
    assertEquals("Portuguese Language Orthographic Agreement of 1990 (Acordo Ortográfico da Língua Portuguesa de 1990)", ao1990.getDescriptions().get(0));
    assertEquals(2, ao1990.getPrefixes().size());
    assertEquals("pt", ao1990.getPrefixes().get(0));
    assertEquals("gl", ao1990.getPrefixes().get(1));
    assertEquals(1, ao1990.getComments().size());
    assertEquals("Portuguese orthography conventions established in 1990 but not brought into effect until 2009", ao1990.getComments().get(0));

    /*
      Type: extlang
      Subtag: arq
      Description: Algerian Arabic
      Added: 2009-07-29
      Preferred-Value: arq
      Prefix: ar
      Macrolanguage: ar
     */
    ExtLangSubtag arq = registry.getExtLang("arq");
    assertEquals(1, arq.getDescriptions().size());
    assertEquals("Algerian Arabic", arq.getDescriptions().get(0));
    assertEquals("2009-07-29", arq.getAdded());
    assertEquals("arq", arq.getPreferredValue());
    assertEquals("ar", arq.getPrefix());
    assertEquals("ar", arq.getMacrolanguage());

    /*
      Type: extlang
      Subtag: kvk
      Description: Korean Sign Language
      Added: 2009-07-29
      Preferred-Value: kvk
      Prefix: sgn
     */
    ExtLangSubtag kvk = registry.getExtLang("kvk");
    assertEquals(1, kvk.getDescriptions().size());
    assertEquals("Korean Sign Language", kvk.getDescriptions().get(0));
    assertEquals("2009-07-29", kvk.getAdded());
    assertEquals("kvk", kvk.getPreferredValue());
    assertEquals("sgn", kvk.getPrefix());

    /*
      Type: region
      Subtag: YD
      Description: Democratic Yemen
      Added: 2005-10-16
      Deprecated: 1990-08-14
      Preferred-Value: YE
     */
    RegionSubtag yd = registry.getRegion("YD");
    assertEquals(1, yd.getDescriptions().size());
    assertEquals("Democratic Yemen", yd.getDescriptions().get(0));
    assertEquals("2005-10-16", yd.getAdded());
    assertEquals("1990-08-14", yd.getDeprecated());
    assertEquals("YE", yd.getPreferredValue());

    /*
      Type: region
      Subtag: HN
      Description: Honduras
      Added: 2005-10-16
     */
    RegionSubtag hn = registry.getRegion("HN");
    assertEquals(1, hn.getDescriptions().size());
    assertEquals("Honduras", hn.getDescriptions().get(0));
    assertEquals("2005-10-16", hn.getAdded());
  }

  /** With language loading switched off, only the language map stays empty. */
  @Test
  public void testNoLanguagesLoading() throws IOException {
    LanguageSubtagRegistry registry = new LanguageSubtagRegistry();
    LanguageSubtagRegistryLoader loader = new LanguageSubtagRegistryLoader(registry).withLoadLanguages(false);
    loader.loadFromDefaultResource();

    assertTrue(registry.getLanguageKeys().isEmpty());

    assertFalse(registry.getExtLangKeys().isEmpty());
    assertFalse(registry.getRegionKeys().isEmpty());
    assertFalse(registry.getVariantKeys().isEmpty());
    assertFalse(registry.getScriptKeys().isEmpty());
  }

  /** With extlang loading switched off, only the extlang map stays empty. */
  @Test
  public void testNoExtLangsLoading() throws IOException {
    LanguageSubtagRegistry registry = new LanguageSubtagRegistry();
    // The loader must write to the registry inspected below, and must actually be run.
    LanguageSubtagRegistryLoader loader = new LanguageSubtagRegistryLoader(registry).withLoadExtLangs(false);
    loader.loadFromDefaultResource();

    assertTrue(registry.getExtLangKeys().isEmpty());

    assertFalse(registry.getLanguageKeys().isEmpty());
    assertFalse(registry.getRegionKeys().isEmpty());
    assertFalse(registry.getVariantKeys().isEmpty());
    assertFalse(registry.getScriptKeys().isEmpty());
  }

  /**
   * One loader per subtag kind with that kind switched off, followed by the expected
   * emptiness of the language, extlang, region, variant and script key sets (in that order).
   */
  private static Stream<Arguments> provideParamsForPartialLoad() {
    return Stream.of(
      Arguments.of(new LanguageSubtagRegistryLoader(new LanguageSubtagRegistry()).withLoadLanguages(false), true, false, false, false, false),
      Arguments.of(new LanguageSubtagRegistryLoader(new LanguageSubtagRegistry()).withLoadExtLangs(false), false, true, false, false, false),
      Arguments.of(new LanguageSubtagRegistryLoader(new LanguageSubtagRegistry()).withLoadRegions(false), false, false, true, false, false),
      Arguments.of(new LanguageSubtagRegistryLoader(new LanguageSubtagRegistry()).withLoadVariants(false), false, false, false, true, false),
      Arguments.of(new LanguageSubtagRegistryLoader(new LanguageSubtagRegistry()).withLoadScripts(false), false, false, false, false, true)
    );
  }

  /** Runs the given loader and compares the emptiness of each key set with the expectation. */
  @ParameterizedTest
  @MethodSource("provideParamsForPartialLoad")
  public void testPartialLoad(LanguageSubtagRegistryLoader loader,
                              boolean languagesEmpty,
                              boolean extLangsEmpty,
                              boolean regionsEmpty,
                              boolean variantsEmpty,
                              boolean scriptsEmpty) throws IOException {
    loader.loadFromDefaultResource();
    assertEquals(languagesEmpty, loader.getRegistry().getLanguageKeys().isEmpty());
    assertEquals(extLangsEmpty, loader.getRegistry().getExtLangKeys().isEmpty());
    assertEquals(regionsEmpty, loader.getRegistry().getRegionKeys().isEmpty());
    assertEquals(variantsEmpty, loader.getRegistry().getVariantKeys().isEmpty());
    assertEquals(scriptsEmpty, loader.getRegistry().getScriptKeys().isEmpty());
  }
}