SOLR-5654: Create a synonym filter factory that is (re)configurable, and capable of reporting its configuration, via REST API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1584211 13f79535-47bb-0310-9956-ffa450edef68
2014-04-02 23:09:08 +00:00 · 2014-04-02 23:09:08 +00:00 · a3b46516f7
parent 0c0d026fea
commit a3b46516f7
7 changed files with 545 additions and 2 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilterFactory.java
@ -158,7 +158,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
  /**
   * Load synonyms with the given {@link SynonymMap.Parser} class.
   */
-  private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
+  protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
        .onMalformedInput(CodingErrorAction.REPORT)
        .onUnmappableCharacter(CodingErrorAction.REPORT);
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -170,7 +170,9 @@ New Features
 * SOLR-5829: Allow ExpandComponent to accept query and filter query parameters
  (Joel Bernstein)

-
+* SOLR-5654: Create a synonym filter factory that is (re)configurable, and
+  capable of reporting its configuration, via REST API.
+  (Tim Potter via Steve Rowe)

 Bug Fixes
 ----------------------
--- a/solr/core/src/java/org/apache/solr/rest/schema/analysis/ManagedSynonymFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/rest/schema/analysis/ManagedSynonymFilterFactory.java
@ -0,0 +1,349 @@
+package org.apache.solr.rest.schema.analysis;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.text.ParseException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.util.ResourceLoader;
+import org.apache.lucene.util.CharsRef;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.core.SolrResourceLoader;
+import org.apache.solr.response.SolrQueryResponse;
+import org.apache.solr.rest.BaseSolrResource;
+import org.apache.solr.rest.ManagedResource;
+import org.apache.solr.rest.ManagedResourceStorage.StorageIO;
+import org.restlet.data.Status;
+import org.restlet.resource.ResourceException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * TokenFilterFactory and ManagedResource implementation for 
+ * doing CRUD on synonyms using the REST API.
+ */
+public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
+  
+  public static final Logger log = LoggerFactory.getLogger(ManagedSynonymFilterFactory.class);
+  
+  public static final String SYNONYM_MAPPINGS = "synonymMappings";
+  public static final String IGNORE_CASE_INIT_ARG = "ignoreCase";
+  
+  /**
+   * ManagedResource implementation for synonyms, which are so specialized that
+   * it makes sense to implement this class as an inner class as it has little 
+   * application outside the SynonymFilterFactory use cases.
+   */
+  public static class SynonymManager extends ManagedResource 
+      implements ManagedResource.ChildResourceSupport
+  {
+
+    // TODO: Maybe hold this using a SoftReference / WeakReference to
+    // reduce memory in case the set of synonyms is large and the JVM 
+    // is running low on memory?
+    protected Map<String,Set<String>> synonymMappings;
+    
+    public SynonymManager(String resourceId, SolrResourceLoader loader, StorageIO storageIO)
+        throws SolrException {
+      super(resourceId, loader, storageIO);
+    }
+
+    @SuppressWarnings("unchecked")
+    @Override
+    protected void onManagedDataLoadedFromStorage(NamedList<?> managedInitArgs, Object managedData)
+        throws SolrException
+    {
+      NamedList<Object> initArgs = (NamedList<Object>)managedInitArgs;
+      
+      String format = (String)initArgs.get("format");
+      if (format != null && !"solr".equals(format)) {
+        throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid format "+
+           format+"! Only 'solr' is supported.");
+      }
+      
+      // the default behavior is to not ignore case, 
+      // so if not supplied, then install the default
+      if (initArgs.get(IGNORE_CASE_INIT_ARG) == null) {
+        initArgs.add(IGNORE_CASE_INIT_ARG, Boolean.FALSE);
+      }
+      boolean ignoreCase = getIgnoreCase(managedInitArgs);
+      synonymMappings = new TreeMap<>();
+      if (managedData != null) {
+        Map<String,Object> storedSyns = (Map<String,Object>)managedData;
+        for (String key : storedSyns.keySet()) {
+          // give the nature of our JSON parsing solution, we really have
+          // no guarantees on what is in the file
+          Object mapping = storedSyns.get(key);
+          if (!(mapping instanceof List)) {
+            throw new SolrException(ErrorCode.SERVER_ERROR, 
+                "Invalid synonym file format! Expected a list of synonyms for "+key+
+                " but got "+mapping.getClass().getName());
+          }
+                    
+          // if we're configured to ignoreCase, then we build the mappings with all lower           
+          List<String> vals = (List<String>)storedSyns.get(key);
+          Set<String> sortedVals = new TreeSet<>();
+          if (ignoreCase) {
+            for (String next : vals) {
+              sortedVals.add(applyCaseSetting(ignoreCase, next));
+            }
+          } else {
+            sortedVals.addAll(vals);
+          }
+          
+          synonymMappings.put(applyCaseSetting(ignoreCase, key), sortedVals);
+        }
+      }
+      
+      log.info("Loaded {} synonym mappings for {}", synonymMappings.size(), getResourceId());      
+    }    
+
+    @SuppressWarnings("unchecked")
+    @Override
+    protected Object applyUpdatesToManagedData(Object updates) {
+      if (!(updates instanceof Map)) {
+        throw new ResourceException(Status.CLIENT_ERROR_BAD_REQUEST,
+          "Unsupported data format (" + updates.getClass().getName() + "); expected a JSON object (Map)!");
+      }
+      boolean ignoreCase = getIgnoreCase();      
+      boolean madeChanges = false;
+      Map<String,Object> jsonMap = (Map<String,Object>)updates;
+      for (String term : jsonMap.keySet()) {
+        
+        term = applyCaseSetting(ignoreCase, term);
+        
+        Set<String> output = synonymMappings.get(term); 
+        
+        Object val = jsonMap.get(term);
+        if (val instanceof String) {
+          String strVal = applyCaseSetting(ignoreCase, (String)val);
+          
+          if (output == null) {
+            output = new TreeSet<>();
+            synonymMappings.put(term, output);
+          }
+                    
+          if (output.add(strVal)) {
+            madeChanges = true;
+          }
+        } else if (val instanceof List) {
+          List<String> vals = (List<String>)val;
+          
+          if (output == null) {
+            output = new TreeSet<>();
+            synonymMappings.put(term, output);
+          }
+          
+          for (String nextVal : vals) {
+            if (output.add(applyCaseSetting(ignoreCase, nextVal))) {
+              madeChanges = true;
+            }
+          }          
+          
+        } else {
+          throw new ResourceException(Status.CLIENT_ERROR_BAD_REQUEST, "Unsupported value "+val+
+              " for "+term+"; expected single value or a JSON array!");
+        }
+      }
+          
+      return madeChanges ? synonymMappings : null;
+    }
+    
+    /**
+     * Handles a change in the ignoreCase setting for synonyms, which requires
+     * a full rebuild of the synonymMappings.
+     */
+    @Override
+    protected boolean updateInitArgs(NamedList<?> updatedArgs) {
+      if (updatedArgs == null || updatedArgs.size() == 0) {
+        return false;
+      }
+      boolean currentIgnoreCase = getIgnoreCase(managedInitArgs);
+      boolean updatedIgnoreCase = getIgnoreCase(updatedArgs);
+      if (currentIgnoreCase == true && updatedIgnoreCase == false) {
+        throw new SolrException(ErrorCode.BAD_REQUEST,
+            "Changing a managed word set's ignoreCase arg from true to false is not permitted.");
+      } else if (currentIgnoreCase == false && updatedIgnoreCase == true) {
+        // ignore case policy changed ... rebuild the map
+        Map<String,Set<String>> rebuild = new TreeMap<>();
+        for (String curr : synonymMappings.keySet()) {
+          Set<String> newMappings = new TreeSet<>();
+          for (String next : synonymMappings.get(curr)) {
+            newMappings.add(applyCaseSetting(updatedIgnoreCase, next));
+          }
+          rebuild.put(applyCaseSetting(updatedIgnoreCase, curr), newMappings);
+        }
+        synonymMappings = rebuild;
+      }
+      
+      return super.updateInitArgs(updatedArgs);
+    }
+    
+    protected String applyCaseSetting(boolean ignoreCase, String str) {
+      return (ignoreCase && str != null) ? str.toLowerCase(Locale.ROOT) : str;
+    }
+    
+    public boolean getIgnoreCase() {
+      return getIgnoreCase(managedInitArgs);
+    }
+
+    public boolean getIgnoreCase(NamedList<?> initArgs) {
+      Boolean ignoreCase = initArgs.getBooleanArg(IGNORE_CASE_INIT_ARG);
+      // ignoreCase = false by default
+      return null == ignoreCase ? false : ignoreCase;
+    }
+    
+    @Override
+    public void doGet(BaseSolrResource endpoint, String childId) {
+      SolrQueryResponse response = endpoint.getSolrResponse();
+      if (childId != null) {
+        boolean ignoreCase = getIgnoreCase();
+        String key = applyCaseSetting(ignoreCase, childId);
+        Set<String> output = synonymMappings.get(key);
+        if (output == null) {
+          throw new SolrException(ErrorCode.NOT_FOUND,
+              String.format(Locale.ROOT, "%s not found in %s", key, getResourceId()));
+        }
+        response.add(key, output);
+      } else {
+        response.add(SYNONYM_MAPPINGS, buildMapToStore(synonymMappings));      
+      }
+    }  
+
+    @Override
+    public synchronized void doDeleteChild(BaseSolrResource endpoint, String childId) {
+      boolean ignoreCase = getIgnoreCase();
+      String key = applyCaseSetting(ignoreCase, childId);
+      Set<String> output = synonymMappings.get(key);
+      if (output == null)
+        throw new SolrException(ErrorCode.NOT_FOUND, 
+            String.format(Locale.ROOT, "%s not found in %s", key, getResourceId()));
+      
+      synonymMappings.remove(key);
+      storeManagedData(synonymMappings);
+      log.info("Removed synonym mappings for: {}", key);      
+    }
+  }
+  
+  /**
+   * Custom SynonymMap.Parser implementation that provides synonym
+   * mappings from the managed JSON in this class during SynonymMap
+   * building.
+   */
+  private class ManagedSynonymParser extends SynonymMap.Parser {
+
+    SynonymManager synonymManager;
+    
+    public ManagedSynonymParser(SynonymManager synonymManager, boolean dedup, Analyzer analyzer) {
+      super(dedup, analyzer);
+      this.synonymManager = synonymManager;
+    }
+
+    /**
+     * Add the managed synonyms and their mappings into the SynonymMap builder.
+     */
+    @Override
+    public void parse(Reader in) throws IOException, ParseException {
+      for (String term : synonymManager.synonymMappings.keySet()) {
+        for (String mapping : synonymManager.synonymMappings.get(term)) {
+          add(new CharsRef(term), new CharsRef(mapping), false);
+        }
+      }      
+    }    
+  }
+  
+  protected SynonymFilterFactory delegate;
+          
+  public ManagedSynonymFilterFactory(Map<String,String> args) {
+    super(args);    
+  }
+
+  @Override
+  public String getResourceId() {
+    return "/schema/analysis/synonyms/"+handle;
+  }  
+    
+  protected Class<? extends ManagedResource> getManagedResourceImplClass() {
+    return SynonymManager.class;
+  }
+
+  /**
+   * Called once, during core initialization, to initialize any analysis components
+   * that depend on the data managed by this resource. It is important that the
+   * analysis component is only initialized once during core initialization so that
+   * text analysis is consistent, especially in a distributed environment, as we
+   * don't want one server applying a different set of stop words than other servers.
+   */
+  @SuppressWarnings("unchecked")
+  @Override
+  public void onManagedResourceInitialized(NamedList<?> initArgs, final ManagedResource res) 
+      throws SolrException
+  {    
+    NamedList<Object> args = (NamedList<Object>)initArgs;    
+    args.add("synonyms", getResourceId());
+    args.add("expand", "false");
+    args.add("format", "solr");
+    
+    Map<String,String> filtArgs = new HashMap<>();
+    for (Map.Entry<String,?> entry : args) {
+      filtArgs.put(entry.getKey(), entry.getValue().toString());
+    }
+    // create the actual filter factory that pulls the synonym mappings
+    // from synonymMappings using a custom parser implementation
+    delegate = new SynonymFilterFactory(filtArgs) {
+      @Override
+      protected SynonymMap loadSynonyms
+          (ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer)
+          throws IOException, ParseException {
+
+        ManagedSynonymParser parser =
+            new ManagedSynonymParser((SynonymManager)res, dedup, analyzer);
+        // null is safe here because there's no actual parsing done against a input Reader
+        parser.parse(null);
+        return parser.build(); 
+      }
+    };
+    try {
+      delegate.inform(res.getResourceLoader());
+    } catch (IOException e) {
+      throw new SolrException(ErrorCode.SERVER_ERROR, e);
+    }    
+  }
+    
+  @Override
+  public TokenStream create(TokenStream input) {    
+    if (delegate == null)
+      throw new IllegalStateException(this.getClass().getName()+
+          " not initialized correctly! The SynonymFilterFactory delegate was not initialized.");
+    
+    return delegate.create(input);
+  }
+}
--- a/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/schema-rest.xml
@ -457,6 +457,7 @@
       <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="english" />
+        <filter class="solr.ManagedSynonymFilterFactory" managed="english" />
      </analyzer>
    </fieldtype>

--- a/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymFilterFactory.java
+++ b/solr/core/src/test/org/apache/solr/rest/schema/analysis/TestManagedSynonymFilterFactory.java
@ -0,0 +1,179 @@
+package org.apache.solr.rest.schema.analysis;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.solr.util.RestTestBase;
+import org.eclipse.jetty.servlet.ServletHolder;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.noggit.JSONUtil;
+import org.restlet.ext.servlet.ServerServlet;
+
+public class TestManagedSynonymFilterFactory extends RestTestBase {
+  
+  private static File tmpSolrHome;
+
+  /**
+   * Setup to make the schema mutable
+   */
+  @Before
+  public void before() throws Exception {
+    tmpSolrHome = new File(dataDir + File.separator + TestManagedStopFilterFactory.class.getSimpleName()
+                          + System.currentTimeMillis());
+    FileUtils.copyDirectory(new File(TEST_HOME()), tmpSolrHome.getAbsoluteFile());
+
+    final SortedMap<ServletHolder,String> extraServlets = new TreeMap<>();
+    final ServletHolder solrRestApi = new ServletHolder("SolrSchemaRestApi", ServerServlet.class);
+    solrRestApi.setInitParameter("org.restlet.application", "org.apache.solr.rest.SolrSchemaRestApi");
+    extraServlets.put(solrRestApi, "/schema/*");
+
+    System.setProperty("managed.schema.mutable", "true");
+    System.setProperty("enable.update.log", "false");
+    createJettyAndHarness(tmpSolrHome.getAbsolutePath(), "solrconfig-managed-schema.xml", "schema-rest.xml",
+                          "/solr", true, extraServlets);
+  }
+
+  @After
+  private void after() throws Exception {
+    jetty.stop();
+    jetty = null;
+    FileUtils.deleteDirectory(tmpSolrHome);
+    System.clearProperty("managed.schema.mutable");
+    System.clearProperty("enable.update.log");
+  }
+  
+  @Test
+  public void testManagedSynonyms() throws Exception {
+    // this endpoint depends on at least one field type containing the following
+    // declaration in the schema-rest.xml:
+    // 
+    //   <filter class="solr.ManagedSynonymFilterFactory" managed="english" />
+    //      
+    String endpoint = "/schema/analysis/synonyms/english";
+    
+    assertJQ(endpoint, 
+             "/synonymMappings/initArgs/ignoreCase==false",
+             "/synonymMappings/managedMap=={}");
+      
+    // put a new mapping into the synonyms
+    Map<String,List<String>> syns = new HashMap<>();
+    syns.put("happy", Arrays.asList("glad","cheerful","joyful"));    
+    assertJPut(endpoint, 
+               JSONUtil.toJSON(syns),
+               "/responseHeader/status==0");
+    
+    assertJQ(endpoint, 
+             "/synonymMappings/managedMap/happy==['cheerful','glad','joyful']");
+
+    // request to a specific mapping
+    assertJQ(endpoint+"/happy", 
+             "/happy==['cheerful','glad','joyful']");
+    
+    // does not exist
+    assertJQ(endpoint+"/sad", 
+             "/error/code==404");
+    
+    // verify the user can update the ignoreCase initArg
+    assertJPut(endpoint, 
+               json("{ 'initArgs':{ 'ignoreCase':true } }"), 
+               "responseHeader/status==0");
+
+    assertJQ(endpoint, 
+             "/synonymMappings/initArgs/ignoreCase==true");
+    
+    syns = new HashMap<>();
+    syns.put("sad", Arrays.asList("unhappy"));    
+    syns.put("SAD", Arrays.asList("Unhappy"));    
+    assertJPut(endpoint, 
+               JSONUtil.toJSON(syns),
+               "/responseHeader/status==0");
+    
+    assertJQ(endpoint, 
+             "/synonymMappings/managedMap/sad==['unhappy']");
+    
+    // verify delete works
+    assertJDelete(endpoint+"/sad",
+                  "/responseHeader/status==0");
+    
+    assertJQ(endpoint, 
+        "/synonymMappings/managedMap=={'happy':['cheerful','glad','joyful']}");
+    
+    // should fail with 404 as foo doesn't exist
+    assertJDelete(endpoint+"/foo",
+                  "/error/code==404");
+    
+    // verify that a newly added synonym gets expanded on the query side after core reload
+    
+    String newFieldName = "managed_en_field";
+    // make sure the new field doesn't already exist
+    assertQ("/schema/fields/" + newFieldName + "?indent=on&wt=xml",
+            "count(/response/lst[@name='field']) = 0",
+            "/response/lst[@name='responseHeader']/int[@name='status'] = '404'",
+            "/response/lst[@name='error']/int[@name='code'] = '404'");
+
+    // add the new field
+    assertJPut("/schema/fields/" + newFieldName, json("{'type':'managed_en'}"),
+               "/responseHeader/status==0");
+
+    // make sure the new field exists now
+    assertQ("/schema/fields/" + newFieldName + "?indent=on&wt=xml",
+            "count(/response/lst[@name='field']) = 1",
+            "/response/lst[@name='responseHeader']/int[@name='status'] = '0'");
+
+    assertU(adoc(newFieldName, "I am a happy test today but yesterday I was angry", "id", "5150"));
+    assertU(commit());
+
+    assertQ("/select?q=" + newFieldName + ":angry",
+            "/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
+            "/response/result[@name='response'][@numFound='1']",
+            "/response/result[@name='response']/doc/str[@name='id'][.='5150']");    
+    
+    // add a mapping that will expand a query for "mad" to match docs with "angry"
+    syns = new HashMap<>();
+    syns.put("mad", Arrays.asList("angry"));    
+    assertJPut(endpoint, 
+               JSONUtil.toJSON(syns),
+               "/responseHeader/status==0");
+    
+    assertJQ(endpoint, 
+        "/synonymMappings/managedMap/mad==['angry']");
+
+    // should not match as the synonym mapping between mad and angry does not    
+    // get applied until core reload
+    assertQ("/select?q=" + newFieldName + ":mad",
+        "/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
+        "/response/result[@name='response'][@numFound='0']");    
+    
+    restTestHarness.reload();
+
+    // now query for mad and we should see our test doc
+    assertQ("/select?q=" + newFieldName + ":mad",
+        "/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
+        "/response/result[@name='response'][@numFound='1']",
+        "/response/result[@name='response']/doc/str[@name='id'][.='5150']");    
+  }
+}
--- a/solr/example/solr/collection1/conf/_schema_analysis_synonyms_english.json
+++ b/solr/example/solr/collection1/conf/_schema_analysis_synonyms_english.json
@ -0,0 +1,11 @@
+{
+  "initArgs":{
+    "ignoreCase":true,
+    "format":"solr"
+  },
+  "managedMap":{
+    "GB":["GiB","Gigabyte"],
+    "happy":["glad","joyful"],
+    "TV":["Television"]
+  }
+}
--- a/solr/example/solr/collection1/conf/schema.xml
+++ b/solr/example/solr/collection1/conf/schema.xml
@ -453,6 +453,7 @@
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.ManagedStopFilterFactory" managed="english" />
+        <filter class="solr.ManagedSynonymFilterFactory" managed="english" />
      </analyzer>
    </fieldType>