SOLR-4412: LanguageIdentifier lcmap for language field

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1498959 13f79535-47bb-0310-9956-ffa450edef68
2013-07-02 14:38:47 +00:00 · 2013-07-02 14:38:47 +00:00 · 42b3251408
parent 5f678a0e23
commit 42b3251408
4 changed files with 55 additions and 6 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -250,6 +250,9 @@ Optimizations
 * SOLR-4955: Admin UI - Show address bar on top for Schema + Config (steffkes)
 * SOLR-4412: New parameter langid.lcmap to map the detected language code to the
  value placed in the "language" field (janhoy)
 Other Changes
 ----------------------
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
@ -31,6 +31,7 @@ public interface LangIdParams {
  String THRESHOLD  = LANGUAGE_ID + ".threshold";            // Detection threshold
  String ENFORCE_SCHEMA =  LANGUAGE_ID + ".enforceSchema";   // Enforces that output fields exist in schema
  String LANG_WHITELIST  = LANGUAGE_ID + ".whitelist";       // Allowed languages
  String LCMAP =  LANGUAGE_ID + ".lcmap";                    // Maps detected langcode to other value
  String MAP_ENABLE =  LANGUAGE_ID + ".map";                 // Turns on or off the field mapping
  String MAP_FL =  LANGUAGE_ID + ".map.fl";                  // Field list for mapping
  String MAP_OVERWRITE =  LANGUAGE_ID + ".map.overwrite";    // Whether to overwrite existing fields
--- a/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
+++ b/solr/contrib/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@ -75,6 +75,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
  protected HashSet<String> mapIndividualFieldsSet;
  protected HashSet<String> allMapFieldsSet;
  protected HashMap<String,String> lcMap;
  protected HashMap<String,String> mapLcMap;
  protected IndexSchema schema;
  // Regex patterns
@ -138,13 +139,26 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
        allMapFieldsSet.addAll(mapIndividualFieldsSet);
      }
-      // Language Code mapping
+      // Mapping from detected langcode to normalized langcode (langid.lcmap)
      lcMap = new HashMap<String,String>();
      if(params.get(LCMAP) != null) {
        for(String mapping : params.get(LCMAP).split("[, ]")) {
          String[] keyVal = mapping.split(":");
          if(keyVal.length == 2) {
            lcMap.put(keyVal[0], keyVal[1]);
          } else {
            log.error("Unsupported format for langid.lcmap: "+mapping+". Skipping this mapping.");
          }
        }
      }
      // Language Code mapping
      mapLcMap = new HashMap<String,String>();
      if(params.get(MAP_LCMAP) != null) {
        for(String mapping : params.get(MAP_LCMAP).split("[, ]")) {
          String[] keyVal = mapping.split(":");
          if(keyVal.length == 2) {
-            lcMap.put(keyVal[0], keyVal[1]);
+            mapLcMap.put(keyVal[0], keyVal[1]);
          } else {
            log.error("Unsupported format for langid.map.lcmap: "+mapping+". Skipping this mapping.");
          }
@ -322,10 +336,11 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
      langStr = fallbackLang;
    } else {
      DetectedLanguage lang = languages.get(0);
-      if(langWhitelist.isEmpty() || langWhitelist.contains(lang.getLangCode())) {
+      String normalizedLang = normalizeLangCode(lang.getLangCode());
-        log.debug("Language detected {} with certainty {}", lang.getLangCode(), lang.getCertainty());
+      if(langWhitelist.isEmpty() || langWhitelist.contains(normalizedLang)) {
        log.debug("Language detected {} with certainty {}", normalizedLang, lang.getCertainty());
        if(lang.getCertainty() >= threshold) {
-          langStr = lang.getLangCode();
+          langStr = normalizedLang;
        } else {
          log.debug("Detected language below threshold {}, using fallback {}", threshold, fallbackLang);
          langStr = fallbackLang;
@ -344,6 +359,20 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
    return langStr;
  }
  /**
   * Looks up a detected language code in the langid.lcmap mapping and returns the
   * mapped value. A code that has no entry in the map is returned unchanged.
   *
   * @param langCode the language code string returned from detector
   * @return the normalized/mapped language code, or {@code langCode} itself when the
   *         map holds no entry for it
   */
  protected String normalizeLangCode(String langCode) {
    if (!lcMap.containsKey(langCode)) {
      return langCode;
    }
    String lc = lcMap.get(langCode);
    // Parameterized message: the text is only assembled when debug logging is enabled,
    // and it matches the placeholder style of the other debug statements in this class.
    log.debug("Doing langcode normalization mapping from {} to {}", langCode, lc);
    return lc;
  }
  /**
   * Returns the name of the field to map the current contents into, so that they are properly analyzed.  For instance
   * if the currentField is "text" and the code is "en", the new field would by default be "text_en".
@ -355,7 +384,7 @@ public abstract class LanguageIdentifierUpdateProcessor extends UpdateRequestPro
   * @return The new schema field name, based on pattern and replace, or null if illegal
   */
  protected String getMappedField(String currentField, String language) {
-    String lc = lcMap.containsKey(language) ? lcMap.get(language) : language;
+    String lc = mapLcMap.containsKey(language) ? mapLcMap.get(language) : language;
    String newFieldName = langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc);
    if(enforceSchema && schema.getFieldOrNull(newFieldName) == null) {
      log.warn("Unsuccessful field name mapping from {} to {}, field does not exist and enforceSchema=true; skipping mapping.", currentField, newFieldName);
--- a/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
+++ b/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
@ -115,6 +115,22 @@ public abstract class LanguageIdentifierUpdateProcessorFactoryTestCase extends S
    assertEquals("body_sv_s", liProcessor.getMappedField("text_body_field", "sv"));
  }
  /**
   * Checks that language codes listed in langid.lcmap are resolved to their mapped
   * value, both when given as a plain code string and when given as a list of detected
   * languages, and that a code without an lcmap entry is resolved to itself.
   */
  @Test
  public void testMapLangcode() throws Exception {
    parameters = new ModifiableSolrParams();
    parameters.add("langid.fl", "name");
    parameters.add("langid.lcmap", "zh_cn:zh zh_tw:zh");
    parameters.set("langid.enforceSchema", "false");
    liProcessor = createLangIdProcessor(parameters);

    final String fallback = "NA";

    // Each row is {input code, expected resolved code}; "no" has no lcmap entry.
    final String[][] expectations = {
      {"zh_cn", "zh"},
      {"zh_tw", "zh"},
      {"no", "no"}
    };
    for (String[] row : expectations) {
      assertEquals(row[1], liProcessor.resolveLanguage(row[0], fallback));
    }

    // Same mapping must apply when the input is a list of detected languages.
    List<DetectedLanguage> detected = new ArrayList<DetectedLanguage>();
    detected.add(new DetectedLanguage("zh_cn", 0.8));
    assertEquals("zh", liProcessor.resolveLanguage(detected, fallback));
  }
  @Test
  public void testPreExisting() throws Exception {
    SolrInputDocument doc;