SOLR-5654: Create a synonym filter factory that is (re)configurable, and capable of reporting its configuration, via REST API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1584211 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-04-02 23:09:08 +00:00
parent 0c0d026fea
commit a3b46516f7
7 changed files with 545 additions and 2 deletions

View File

@ -158,7 +158,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
/**
* Load synonyms with the given {@link SynonymMap.Parser} class.
*/
private SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException, ParseException {
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);

View File

@ -170,7 +170,9 @@ New Features
* SOLR-5829: Allow ExpandComponent to accept query and filter query parameters
(Joel Bernstein)
* SOLR-5654: Create a synonym filter factory that is (re)configurable, and
capable of reporting its configuration, via REST API.
(Tim Potter via Steve Rowe)
Bug Fixes
----------------------

View File

@ -0,0 +1,349 @@
package org.apache.solr.rest.schema.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.util.CharsRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.rest.BaseSolrResource;
import org.apache.solr.rest.ManagedResource;
import org.apache.solr.rest.ManagedResourceStorage.StorageIO;
import org.restlet.data.Status;
import org.restlet.resource.ResourceException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* TokenFilterFactory and ManagedResource implementation for
* doing CRUD on synonyms using the REST API.
*/
public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
public static final Logger log = LoggerFactory.getLogger(ManagedSynonymFilterFactory.class);
public static final String SYNONYM_MAPPINGS = "synonymMappings";
public static final String IGNORE_CASE_INIT_ARG = "ignoreCase";
/**
* ManagedResource implementation for synonyms, which are so specialized that
* it makes sense to implement this class as an inner class as it has little
* application outside the SynonymFilterFactory use cases.
*/
public static class SynonymManager extends ManagedResource
implements ManagedResource.ChildResourceSupport
{
// TODO: Maybe hold this using a SoftReference / WeakReference to
// reduce memory in case the set of synonyms is large and the JVM
// is running low on memory?
protected Map<String,Set<String>> synonymMappings;
public SynonymManager(String resourceId, SolrResourceLoader loader, StorageIO storageIO)
throws SolrException {
super(resourceId, loader, storageIO);
}
@SuppressWarnings("unchecked")
@Override
protected void onManagedDataLoadedFromStorage(NamedList<?> managedInitArgs, Object managedData)
throws SolrException
{
NamedList<Object> initArgs = (NamedList<Object>)managedInitArgs;
String format = (String)initArgs.get("format");
if (format != null && !"solr".equals(format)) {
throw new SolrException(ErrorCode.BAD_REQUEST, "Invalid format "+
format+"! Only 'solr' is supported.");
}
// the default behavior is to not ignore case,
// so if not supplied, then install the default
if (initArgs.get(IGNORE_CASE_INIT_ARG) == null) {
initArgs.add(IGNORE_CASE_INIT_ARG, Boolean.FALSE);
}
boolean ignoreCase = getIgnoreCase(managedInitArgs);
synonymMappings = new TreeMap<>();
if (managedData != null) {
Map<String,Object> storedSyns = (Map<String,Object>)managedData;
for (String key : storedSyns.keySet()) {
// give the nature of our JSON parsing solution, we really have
// no guarantees on what is in the file
Object mapping = storedSyns.get(key);
if (!(mapping instanceof List)) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Invalid synonym file format! Expected a list of synonyms for "+key+
" but got "+mapping.getClass().getName());
}
// if we're configured to ignoreCase, then we build the mappings with all lower
List<String> vals = (List<String>)storedSyns.get(key);
Set<String> sortedVals = new TreeSet<>();
if (ignoreCase) {
for (String next : vals) {
sortedVals.add(applyCaseSetting(ignoreCase, next));
}
} else {
sortedVals.addAll(vals);
}
synonymMappings.put(applyCaseSetting(ignoreCase, key), sortedVals);
}
}
log.info("Loaded {} synonym mappings for {}", synonymMappings.size(), getResourceId());
}
@SuppressWarnings("unchecked")
@Override
protected Object applyUpdatesToManagedData(Object updates) {
if (!(updates instanceof Map)) {
throw new ResourceException(Status.CLIENT_ERROR_BAD_REQUEST,
"Unsupported data format (" + updates.getClass().getName() + "); expected a JSON object (Map)!");
}
boolean ignoreCase = getIgnoreCase();
boolean madeChanges = false;
Map<String,Object> jsonMap = (Map<String,Object>)updates;
for (String term : jsonMap.keySet()) {
term = applyCaseSetting(ignoreCase, term);
Set<String> output = synonymMappings.get(term);
Object val = jsonMap.get(term);
if (val instanceof String) {
String strVal = applyCaseSetting(ignoreCase, (String)val);
if (output == null) {
output = new TreeSet<>();
synonymMappings.put(term, output);
}
if (output.add(strVal)) {
madeChanges = true;
}
} else if (val instanceof List) {
List<String> vals = (List<String>)val;
if (output == null) {
output = new TreeSet<>();
synonymMappings.put(term, output);
}
for (String nextVal : vals) {
if (output.add(applyCaseSetting(ignoreCase, nextVal))) {
madeChanges = true;
}
}
} else {
throw new ResourceException(Status.CLIENT_ERROR_BAD_REQUEST, "Unsupported value "+val+
" for "+term+"; expected single value or a JSON array!");
}
}
return madeChanges ? synonymMappings : null;
}
/**
* Handles a change in the ignoreCase setting for synonyms, which requires
* a full rebuild of the synonymMappings.
*/
@Override
protected boolean updateInitArgs(NamedList<?> updatedArgs) {
if (updatedArgs == null || updatedArgs.size() == 0) {
return false;
}
boolean currentIgnoreCase = getIgnoreCase(managedInitArgs);
boolean updatedIgnoreCase = getIgnoreCase(updatedArgs);
if (currentIgnoreCase == true && updatedIgnoreCase == false) {
throw new SolrException(ErrorCode.BAD_REQUEST,
"Changing a managed word set's ignoreCase arg from true to false is not permitted.");
} else if (currentIgnoreCase == false && updatedIgnoreCase == true) {
// ignore case policy changed ... rebuild the map
Map<String,Set<String>> rebuild = new TreeMap<>();
for (String curr : synonymMappings.keySet()) {
Set<String> newMappings = new TreeSet<>();
for (String next : synonymMappings.get(curr)) {
newMappings.add(applyCaseSetting(updatedIgnoreCase, next));
}
rebuild.put(applyCaseSetting(updatedIgnoreCase, curr), newMappings);
}
synonymMappings = rebuild;
}
return super.updateInitArgs(updatedArgs);
}
protected String applyCaseSetting(boolean ignoreCase, String str) {
return (ignoreCase && str != null) ? str.toLowerCase(Locale.ROOT) : str;
}
public boolean getIgnoreCase() {
return getIgnoreCase(managedInitArgs);
}
public boolean getIgnoreCase(NamedList<?> initArgs) {
Boolean ignoreCase = initArgs.getBooleanArg(IGNORE_CASE_INIT_ARG);
// ignoreCase = false by default
return null == ignoreCase ? false : ignoreCase;
}
@Override
public void doGet(BaseSolrResource endpoint, String childId) {
SolrQueryResponse response = endpoint.getSolrResponse();
if (childId != null) {
boolean ignoreCase = getIgnoreCase();
String key = applyCaseSetting(ignoreCase, childId);
Set<String> output = synonymMappings.get(key);
if (output == null) {
throw new SolrException(ErrorCode.NOT_FOUND,
String.format(Locale.ROOT, "%s not found in %s", key, getResourceId()));
}
response.add(key, output);
} else {
response.add(SYNONYM_MAPPINGS, buildMapToStore(synonymMappings));
}
}
@Override
public synchronized void doDeleteChild(BaseSolrResource endpoint, String childId) {
boolean ignoreCase = getIgnoreCase();
String key = applyCaseSetting(ignoreCase, childId);
Set<String> output = synonymMappings.get(key);
if (output == null)
throw new SolrException(ErrorCode.NOT_FOUND,
String.format(Locale.ROOT, "%s not found in %s", key, getResourceId()));
synonymMappings.remove(key);
storeManagedData(synonymMappings);
log.info("Removed synonym mappings for: {}", key);
}
}
/**
* Custom SynonymMap.Parser implementation that provides synonym
* mappings from the managed JSON in this class during SynonymMap
* building.
*/
private class ManagedSynonymParser extends SynonymMap.Parser {
SynonymManager synonymManager;
public ManagedSynonymParser(SynonymManager synonymManager, boolean dedup, Analyzer analyzer) {
super(dedup, analyzer);
this.synonymManager = synonymManager;
}
/**
* Add the managed synonyms and their mappings into the SynonymMap builder.
*/
@Override
public void parse(Reader in) throws IOException, ParseException {
for (String term : synonymManager.synonymMappings.keySet()) {
for (String mapping : synonymManager.synonymMappings.get(term)) {
add(new CharsRef(term), new CharsRef(mapping), false);
}
}
}
}
protected SynonymFilterFactory delegate;
public ManagedSynonymFilterFactory(Map<String,String> args) {
super(args);
}
@Override
public String getResourceId() {
return "/schema/analysis/synonyms/"+handle;
}
protected Class<? extends ManagedResource> getManagedResourceImplClass() {
return SynonymManager.class;
}
/**
* Called once, during core initialization, to initialize any analysis components
* that depend on the data managed by this resource. It is important that the
* analysis component is only initialized once during core initialization so that
* text analysis is consistent, especially in a distributed environment, as we
* don't want one server applying a different set of stop words than other servers.
*/
@SuppressWarnings("unchecked")
@Override
public void onManagedResourceInitialized(NamedList<?> initArgs, final ManagedResource res)
throws SolrException
{
NamedList<Object> args = (NamedList<Object>)initArgs;
args.add("synonyms", getResourceId());
args.add("expand", "false");
args.add("format", "solr");
Map<String,String> filtArgs = new HashMap<>();
for (Map.Entry<String,?> entry : args) {
filtArgs.put(entry.getKey(), entry.getValue().toString());
}
// create the actual filter factory that pulls the synonym mappings
// from synonymMappings using a custom parser implementation
delegate = new SynonymFilterFactory(filtArgs) {
@Override
protected SynonymMap loadSynonyms
(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer)
throws IOException, ParseException {
ManagedSynonymParser parser =
new ManagedSynonymParser((SynonymManager)res, dedup, analyzer);
// null is safe here because there's no actual parsing done against a input Reader
parser.parse(null);
return parser.build();
}
};
try {
delegate.inform(res.getResourceLoader());
} catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
@Override
public TokenStream create(TokenStream input) {
if (delegate == null)
throw new IllegalStateException(this.getClass().getName()+
" not initialized correctly! The SynonymFilterFactory delegate was not initialized.");
return delegate.create(input);
}
}

View File

@ -457,6 +457,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ManagedStopFilterFactory" managed="english" />
<filter class="solr.ManagedSynonymFilterFactory" managed="english" />
</analyzer>
</fieldtype>

View File

@ -0,0 +1,179 @@
package org.apache.solr.rest.schema.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.commons.io.FileUtils;
import org.apache.solr.util.RestTestBase;
import org.eclipse.jetty.servlet.ServletHolder;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.noggit.JSONUtil;
import org.restlet.ext.servlet.ServerServlet;
public class TestManagedSynonymFilterFactory extends RestTestBase {
private static File tmpSolrHome;
/**
* Setup to make the schema mutable
*/
@Before
public void before() throws Exception {
tmpSolrHome = new File(dataDir + File.separator + TestManagedStopFilterFactory.class.getSimpleName()
+ System.currentTimeMillis());
FileUtils.copyDirectory(new File(TEST_HOME()), tmpSolrHome.getAbsoluteFile());
final SortedMap<ServletHolder,String> extraServlets = new TreeMap<>();
final ServletHolder solrRestApi = new ServletHolder("SolrSchemaRestApi", ServerServlet.class);
solrRestApi.setInitParameter("org.restlet.application", "org.apache.solr.rest.SolrSchemaRestApi");
extraServlets.put(solrRestApi, "/schema/*");
System.setProperty("managed.schema.mutable", "true");
System.setProperty("enable.update.log", "false");
createJettyAndHarness(tmpSolrHome.getAbsolutePath(), "solrconfig-managed-schema.xml", "schema-rest.xml",
"/solr", true, extraServlets);
}
@After
private void after() throws Exception {
jetty.stop();
jetty = null;
FileUtils.deleteDirectory(tmpSolrHome);
System.clearProperty("managed.schema.mutable");
System.clearProperty("enable.update.log");
}
@Test
public void testManagedSynonyms() throws Exception {
// this endpoint depends on at least one field type containing the following
// declaration in the schema-rest.xml:
//
// <filter class="solr.ManagedSynonymFilterFactory" managed="english" />
//
String endpoint = "/schema/analysis/synonyms/english";
assertJQ(endpoint,
"/synonymMappings/initArgs/ignoreCase==false",
"/synonymMappings/managedMap=={}");
// put a new mapping into the synonyms
Map<String,List<String>> syns = new HashMap<>();
syns.put("happy", Arrays.asList("glad","cheerful","joyful"));
assertJPut(endpoint,
JSONUtil.toJSON(syns),
"/responseHeader/status==0");
assertJQ(endpoint,
"/synonymMappings/managedMap/happy==['cheerful','glad','joyful']");
// request to a specific mapping
assertJQ(endpoint+"/happy",
"/happy==['cheerful','glad','joyful']");
// does not exist
assertJQ(endpoint+"/sad",
"/error/code==404");
// verify the user can update the ignoreCase initArg
assertJPut(endpoint,
json("{ 'initArgs':{ 'ignoreCase':true } }"),
"responseHeader/status==0");
assertJQ(endpoint,
"/synonymMappings/initArgs/ignoreCase==true");
syns = new HashMap<>();
syns.put("sad", Arrays.asList("unhappy"));
syns.put("SAD", Arrays.asList("Unhappy"));
assertJPut(endpoint,
JSONUtil.toJSON(syns),
"/responseHeader/status==0");
assertJQ(endpoint,
"/synonymMappings/managedMap/sad==['unhappy']");
// verify delete works
assertJDelete(endpoint+"/sad",
"/responseHeader/status==0");
assertJQ(endpoint,
"/synonymMappings/managedMap=={'happy':['cheerful','glad','joyful']}");
// should fail with 404 as foo doesn't exist
assertJDelete(endpoint+"/foo",
"/error/code==404");
// verify that a newly added synonym gets expanded on the query side after core reload
String newFieldName = "managed_en_field";
// make sure the new field doesn't already exist
assertQ("/schema/fields/" + newFieldName + "?indent=on&wt=xml",
"count(/response/lst[@name='field']) = 0",
"/response/lst[@name='responseHeader']/int[@name='status'] = '404'",
"/response/lst[@name='error']/int[@name='code'] = '404'");
// add the new field
assertJPut("/schema/fields/" + newFieldName, json("{'type':'managed_en'}"),
"/responseHeader/status==0");
// make sure the new field exists now
assertQ("/schema/fields/" + newFieldName + "?indent=on&wt=xml",
"count(/response/lst[@name='field']) = 1",
"/response/lst[@name='responseHeader']/int[@name='status'] = '0'");
assertU(adoc(newFieldName, "I am a happy test today but yesterday I was angry", "id", "5150"));
assertU(commit());
assertQ("/select?q=" + newFieldName + ":angry",
"/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
"/response/result[@name='response'][@numFound='1']",
"/response/result[@name='response']/doc/str[@name='id'][.='5150']");
// add a mapping that will expand a query for "mad" to match docs with "angry"
syns = new HashMap<>();
syns.put("mad", Arrays.asList("angry"));
assertJPut(endpoint,
JSONUtil.toJSON(syns),
"/responseHeader/status==0");
assertJQ(endpoint,
"/synonymMappings/managedMap/mad==['angry']");
// should not match as the synonym mapping between mad and angry does not
// get applied until core reload
assertQ("/select?q=" + newFieldName + ":mad",
"/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
"/response/result[@name='response'][@numFound='0']");
restTestHarness.reload();
// now query for mad and we should see our test doc
assertQ("/select?q=" + newFieldName + ":mad",
"/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
"/response/result[@name='response'][@numFound='1']",
"/response/result[@name='response']/doc/str[@name='id'][.='5150']");
}
}

View File

@ -0,0 +1,11 @@
{
"initArgs":{
"ignoreCase":true,
"format":"solr"
},
"managedMap":{
"GB":["GiB","Gigabyte"],
"happy":["glad","joyful"],
"TV":["Television"]
}
}

View File

@ -453,6 +453,7 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ManagedStopFilterFactory" managed="english" />
<filter class="solr.ManagedSynonymFilterFactory" managed="english" />
</analyzer>
</fieldType>