mirror of https://github.com/apache/lucene.git
SOLR-4412: LanguageIdentifier lcmap for language field
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1498959 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5f678a0e23
commit
42b3251408
solr
CHANGES.txt
contrib/langid/src
java/org/apache/solr/update/processor
test/org/apache/solr/update/processor
|
@ -250,6 +250,9 @@ Optimizations
|
||||||
|
|
||||||
* SOLR-4955: Admin UI - Show address bar on top for Schema + Config (steffkes)
|
* SOLR-4955: Admin UI - Show address bar on top for Schema + Config (steffkes)
|
||||||
|
|
||||||
|
* SOLR-4412: New parameter langid.lcmap to map detected language code to be placed
|
||||||
|
in "language" field (janhoy)
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -31,6 +31,7 @@ public interface LangIdParams {
|
||||||
String THRESHOLD = LANGUAGE_ID + ".threshold"; // Detection threshold
|
String THRESHOLD = LANGUAGE_ID + ".threshold"; // Detection threshold
|
||||||
String ENFORCE_SCHEMA = LANGUAGE_ID + ".enforceSchema"; // Enforces that output fields exist in schema
|
String ENFORCE_SCHEMA = LANGUAGE_ID + ".enforceSchema"; // Enforces that output fields exist in schema
|
||||||
String LANG_WHITELIST = LANGUAGE_ID + ".whitelist"; // Allowed languages
|
String LANG_WHITELIST = LANGUAGE_ID + ".whitelist"; // Allowed languages
|
||||||
|
String LCMAP = LANGUAGE_ID + ".lcmap"; // Maps detected langcode to other value
|
||||||
String MAP_ENABLE = LANGUAGE_ID + ".map"; // Turns on or off the field mapping
|
String MAP_ENABLE = LANGUAGE_ID + ".map"; // Turns on or off the field mapping
|
||||||
String MAP_FL = LANGUAGE_ID + ".map.fl"; // Field list for mapping
|
String MAP_FL = LANGUAGE_ID + ".map.fl"; // Field list for mapping
|
||||||
String MAP_OVERWRITE = LANGUAGE_ID + ".map.overwrite"; // Whether to overwrite existing fields
|
String MAP_OVERWRITE = LANGUAGE_ID + ".map.overwrite"; // Whether to overwrite existing fields
|
||||||
|
|
|
@ -75,6 +75,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
protected HashSet<String> mapIndividualFieldsSet;
|
protected HashSet<String> mapIndividualFieldsSet;
|
||||||
protected HashSet<String> allMapFieldsSet;
|
protected HashSet<String> allMapFieldsSet;
|
||||||
protected HashMap<String,String> lcMap;
|
protected HashMap<String,String> lcMap;
|
||||||
|
protected HashMap<String,String> mapLcMap;
|
||||||
protected IndexSchema schema;
|
protected IndexSchema schema;
|
||||||
|
|
||||||
// Regex patterns
|
// Regex patterns
|
||||||
|
@ -138,13 +139,26 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
allMapFieldsSet.addAll(mapIndividualFieldsSet);
|
allMapFieldsSet.addAll(mapIndividualFieldsSet);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Language Code mapping
|
// Normalize detected langcode onto normalized langcode
|
||||||
lcMap = new HashMap<String,String>();
|
lcMap = new HashMap<String,String>();
|
||||||
|
if(params.get(LCMAP) != null) {
|
||||||
|
for(String mapping : params.get(LCMAP).split("[, ]")) {
|
||||||
|
String[] keyVal = mapping.split(":");
|
||||||
|
if(keyVal.length == 2) {
|
||||||
|
lcMap.put(keyVal[0], keyVal[1]);
|
||||||
|
} else {
|
||||||
|
log.error("Unsupported format for langid.lcmap: "+mapping+". Skipping this mapping.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Language Code mapping
|
||||||
|
mapLcMap = new HashMap<String,String>();
|
||||||
if(params.get(MAP_LCMAP) != null) {
|
if(params.get(MAP_LCMAP) != null) {
|
||||||
for(String mapping : params.get(MAP_LCMAP).split("[, ]")) {
|
for(String mapping : params.get(MAP_LCMAP).split("[, ]")) {
|
||||||
String[] keyVal = mapping.split(":");
|
String[] keyVal = mapping.split(":");
|
||||||
if(keyVal.length == 2) {
|
if(keyVal.length == 2) {
|
||||||
lcMap.put(keyVal[0], keyVal[1]);
|
mapLcMap.put(keyVal[0], keyVal[1]);
|
||||||
} else {
|
} else {
|
||||||
log.error("Unsupported format for langid.map.lcmap: "+mapping+". Skipping this mapping.");
|
log.error("Unsupported format for langid.map.lcmap: "+mapping+". Skipping this mapping.");
|
||||||
}
|
}
|
||||||
|
@ -322,10 +336,11 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
langStr = fallbackLang;
|
langStr = fallbackLang;
|
||||||
} else {
|
} else {
|
||||||
DetectedLanguage lang = languages.get(0);
|
DetectedLanguage lang = languages.get(0);
|
||||||
if(langWhitelist.isEmpty() || langWhitelist.contains(lang.getLangCode())) {
|
String normalizedLang = normalizeLangCode(lang.getLangCode());
|
||||||
log.debug("Language detected {} with certainty {}", lang.getLangCode(), lang.getCertainty());
|
if(langWhitelist.isEmpty() || langWhitelist.contains(normalizedLang)) {
|
||||||
|
log.debug("Language detected {} with certainty {}", normalizedLang, lang.getCertainty());
|
||||||
if(lang.getCertainty() >= threshold) {
|
if(lang.getCertainty() >= threshold) {
|
||||||
langStr = lang.getLangCode();
|
langStr = normalizedLang;
|
||||||
} else {
|
} else {
|
||||||
log.debug("Detected language below threshold {}, using fallback {}", threshold, fallbackLang);
|
log.debug("Detected language below threshold {}, using fallback {}", threshold, fallbackLang);
|
||||||
langStr = fallbackLang;
|
langStr = fallbackLang;
|
||||||
|
@ -344,6 +359,20 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
return langStr;
|
return langStr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Looks up language code in map (langid.lcmap) and returns mapped value
|
||||||
|
* @param langCode the language code string returned from detector
|
||||||
|
* @return the normalized/mapped language code
|
||||||
|
*/
|
||||||
|
protected String normalizeLangCode(String langCode) {
|
||||||
|
if (lcMap.containsKey(langCode)) {
|
||||||
|
String lc = lcMap.get(langCode);
|
||||||
|
log.debug("Doing langcode normalization mapping from "+langCode+" to "+lc);
|
||||||
|
return lc;
|
||||||
|
}
|
||||||
|
return langCode;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the name of the field to map the current contents into, so that they are properly analyzed. For instance
|
* Returns the name of the field to map the current contents into, so that they are properly analyzed. For instance
|
||||||
* if the currentField is "text" and the code is "en", the new field would by default be "text_en".
|
* if the currentField is "text" and the code is "en", the new field would by default be "text_en".
|
||||||
|
@ -355,7 +384,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
|
||||||
* @return The new schema field name, based on pattern and replace, or null if illegal
|
* @return The new schema field name, based on pattern and replace, or null if illegal
|
||||||
*/
|
*/
|
||||||
protected String getMappedField(String currentField, String language) {
|
protected String getMappedField(String currentField, String language) {
|
||||||
String lc = lcMap.containsKey(language) ? lcMap.get(language) : language;
|
String lc = mapLcMap.containsKey(language) ? mapLcMap.get(language) : language;
|
||||||
String newFieldName = langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc);
|
String newFieldName = langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc);
|
||||||
if(enforceSchema && schema.getFieldOrNull(newFieldName) == null) {
|
if(enforceSchema && schema.getFieldOrNull(newFieldName) == null) {
|
||||||
log.warn("Unsuccessful field name mapping from {} to {}, field does not exist and enforceSchema=true; skipping mapping.", currentField, newFieldName);
|
log.warn("Unsuccessful field name mapping from {} to {}, field does not exist and enforceSchema=true; skipping mapping.", currentField, newFieldName);
|
||||||
|
|
|
@ -115,6 +115,22 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
|
||||||
assertEquals("body_sv_s", liProcessor.getMappedField("text_body_field", "sv"));
|
assertEquals("body_sv_s", liProcessor.getMappedField("text_body_field", "sv"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMapLangcode() throws Exception {
|
||||||
|
parameters = new ModifiableSolrParams();
|
||||||
|
parameters.add("langid.fl", "name");
|
||||||
|
parameters.add("langid.lcmap", "zh_cn:zh zh_tw:zh");
|
||||||
|
parameters.set("langid.enforceSchema", "false");
|
||||||
|
liProcessor = createLangIdProcessor(parameters);
|
||||||
|
|
||||||
|
assertEquals("zh", liProcessor.resolveLanguage("zh_cn", "NA"));
|
||||||
|
assertEquals("zh", liProcessor.resolveLanguage("zh_tw", "NA"));
|
||||||
|
assertEquals("no", liProcessor.resolveLanguage("no", "NA"));
|
||||||
|
List<DetectedLanguage> langs = new ArrayList<DetectedLanguage>();
|
||||||
|
langs.add(new DetectedLanguage("zh_cn", 0.8));
|
||||||
|
assertEquals("zh", liProcessor.resolveLanguage(langs, "NA"));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPreExisting() throws Exception {
|
public void testPreExisting() throws Exception {
|
||||||
SolrInputDocument doc;
|
SolrInputDocument doc;
|
||||||
|
|
Loading…
Reference in New Issue