mirror of https://github.com/apache/lucene.git
SOLR-6015: improved strategy for handling managed synonyms when ignoreCase=true
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1596928 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1cc70e75fc
commit
8a220db194
|
@ -86,6 +86,9 @@ Other Changes
|
|||
replication factor for an update request (single or batch) by sending
|
||||
an optional parameter "min_rf". (Timothy Potter)
|
||||
|
||||
* SOLR-6015: Better way to handle managed synonyms when ignoreCase=true
|
||||
(Timothy Potter)
|
||||
|
||||
================== 4.9.0 ==================
|
||||
|
||||
Versions of Major Components
|
||||
|
|
|
@ -57,6 +57,42 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
public static final String SYNONYM_MAPPINGS = "synonymMappings";
|
||||
public static final String IGNORE_CASE_INIT_ARG = "ignoreCase";
|
||||
|
||||
/**
|
||||
* Used internally to preserve the case of synonym mappings regardless
|
||||
* of the ignoreCase setting.
|
||||
*/
|
||||
private static class CasePreservedSynonymMappings {
|
||||
Map<String,Set<String>> mappings = new TreeMap<>();
|
||||
|
||||
/**
|
||||
* Provides a view of the mappings for a given term; specifically, if
|
||||
* ignoreCase is true, then the returned "view" contains the mappings
|
||||
* for all known cases of the term, if it is false, then only the
|
||||
* mappings for the specific case is returned.
|
||||
*/
|
||||
Set<String> getMappings(boolean ignoreCase, String key) {
|
||||
Set<String> synMappings = null;
|
||||
if (ignoreCase) {
|
||||
// TODO: should we return the mapped values in all lower-case here?
|
||||
if (mappings.size() == 1) {
|
||||
// if only one in the map (which is common) just return it directly
|
||||
return mappings.values().iterator().next();
|
||||
}
|
||||
|
||||
synMappings = new TreeSet<>();
|
||||
for (Set<String> next : mappings.values())
|
||||
synMappings.addAll(next);
|
||||
} else {
|
||||
synMappings = mappings.get(key);
|
||||
}
|
||||
return synMappings;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return mappings.toString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* ManagedResource implementation for synonyms, which are so specialized that
|
||||
* it makes sense to implement this class as an inner class as it has little
|
||||
|
@ -65,11 +101,7 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
public static class SynonymManager extends ManagedResource
|
||||
implements ManagedResource.ChildResourceSupport
|
||||
{
|
||||
|
||||
// TODO: Maybe hold this using a SoftReference / WeakReference to
|
||||
// reduce memory in case the set of synonyms is large and the JVM
|
||||
// is running low on memory?
|
||||
protected Map<String,Set<String>> synonymMappings;
|
||||
protected Map<String,CasePreservedSynonymMappings> synonymMappings;
|
||||
|
||||
public SynonymManager(String resourceId, SolrResourceLoader loader, StorageIO storageIO)
|
||||
throws SolrException {
|
||||
|
@ -94,11 +126,20 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
if (initArgs.get(IGNORE_CASE_INIT_ARG) == null) {
|
||||
initArgs.add(IGNORE_CASE_INIT_ARG, Boolean.FALSE);
|
||||
}
|
||||
|
||||
boolean ignoreCase = getIgnoreCase(managedInitArgs);
|
||||
synonymMappings = new TreeMap<>();
|
||||
if (managedData != null) {
|
||||
Map<String,Object> storedSyns = (Map<String,Object>)managedData;
|
||||
for (String key : storedSyns.keySet()) {
|
||||
|
||||
String caseKey = applyCaseSetting(ignoreCase, key);
|
||||
CasePreservedSynonymMappings cpsm = synonymMappings.get(caseKey);
|
||||
if (cpsm == null) {
|
||||
cpsm = new CasePreservedSynonymMappings();
|
||||
synonymMappings.put(caseKey, cpsm);
|
||||
}
|
||||
|
||||
// given the nature of our JSON parsing solution, we really have
|
||||
// no guarantees on what is in the file
|
||||
Object mapping = storedSyns.get(key);
|
||||
|
@ -108,21 +149,11 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
" but got "+mapping.getClass().getName());
|
||||
}
|
||||
|
||||
// if we're configured to ignoreCase, then we build the mappings with all lower
|
||||
List<String> vals = (List<String>)storedSyns.get(key);
|
||||
Set<String> sortedVals = new TreeSet<>();
|
||||
if (ignoreCase) {
|
||||
for (String next : vals) {
|
||||
sortedVals.add(applyCaseSetting(ignoreCase, next));
|
||||
}
|
||||
} else {
|
||||
sortedVals.addAll(vals);
|
||||
}
|
||||
|
||||
synonymMappings.put(applyCaseSetting(ignoreCase, key), sortedVals);
|
||||
sortedVals.addAll((List<String>)storedSyns.get(key));
|
||||
cpsm.mappings.put(key, sortedVals);
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Loaded {} synonym mappings for {}", synonymMappings.size(), getResourceId());
|
||||
}
|
||||
|
||||
|
@ -138,17 +169,24 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
Map<String,Object> jsonMap = (Map<String,Object>)updates;
|
||||
for (String term : jsonMap.keySet()) {
|
||||
|
||||
String origTerm = term;
|
||||
term = applyCaseSetting(ignoreCase, term);
|
||||
|
||||
Set<String> output = synonymMappings.get(term);
|
||||
// find the mappings using the case aware key
|
||||
CasePreservedSynonymMappings cpsm = synonymMappings.get(term);
|
||||
if (cpsm == null) {
|
||||
cpsm = new CasePreservedSynonymMappings();
|
||||
}
|
||||
|
||||
Object val = jsonMap.get(term);
|
||||
Set<String> output = cpsm.mappings.get(origTerm);
|
||||
|
||||
Object val = jsonMap.get(origTerm); // IMPORTANT: use the original
|
||||
if (val instanceof String) {
|
||||
String strVal = applyCaseSetting(ignoreCase, (String)val);
|
||||
String strVal = (String)val;
|
||||
|
||||
if (output == null) {
|
||||
output = new TreeSet<>();
|
||||
synonymMappings.put(term, output);
|
||||
cpsm.mappings.put(origTerm, output);
|
||||
}
|
||||
|
||||
if (output.add(strVal)) {
|
||||
|
@ -159,11 +197,11 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
|
||||
if (output == null) {
|
||||
output = new TreeSet<>();
|
||||
synonymMappings.put(term, output);
|
||||
cpsm.mappings.put(origTerm, output);
|
||||
}
|
||||
|
||||
for (String nextVal : vals) {
|
||||
if (output.add(applyCaseSetting(ignoreCase, nextVal))) {
|
||||
if (output.add(nextVal)) {
|
||||
madeChanges = true;
|
||||
}
|
||||
}
|
||||
|
@ -172,39 +210,28 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
throw new ResourceException(Status.CLIENT_ERROR_BAD_REQUEST, "Unsupported value "+val+
|
||||
" for "+term+"; expected single value or a JSON array!");
|
||||
}
|
||||
}
|
||||
|
||||
return madeChanges ? synonymMappings : null;
|
||||
// only add the cpsm to the synonymMappings if it has valid data
|
||||
if (!synonymMappings.containsKey(term) && cpsm.mappings.get(origTerm) != null) {
|
||||
synonymMappings.put(term, cpsm);
|
||||
}
|
||||
}
|
||||
return madeChanges ? getStoredView() : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles a change in the ignoreCase setting for synonyms, which requires
|
||||
* a full rebuild of the synonymMappings.
|
||||
* Returns a Map of how we store and load data managed by this resource,
|
||||
* which is different than how it is managed at runtime in order to support
|
||||
* the ignoreCase setting.
|
||||
*/
|
||||
@Override
|
||||
protected boolean updateInitArgs(NamedList<?> updatedArgs) {
|
||||
if (updatedArgs == null || updatedArgs.size() == 0) {
|
||||
return false;
|
||||
}
|
||||
boolean currentIgnoreCase = getIgnoreCase(managedInitArgs);
|
||||
boolean updatedIgnoreCase = getIgnoreCase(updatedArgs);
|
||||
if (currentIgnoreCase == true && updatedIgnoreCase == false) {
|
||||
throw new SolrException(ErrorCode.BAD_REQUEST,
|
||||
"Changing a managed word set's ignoreCase arg from true to false is not permitted.");
|
||||
} else if (currentIgnoreCase == false && updatedIgnoreCase == true) {
|
||||
// ignore case policy changed ... rebuild the map
|
||||
Map<String,Set<String>> rebuild = new TreeMap<>();
|
||||
for (String curr : synonymMappings.keySet()) {
|
||||
Set<String> newMappings = new TreeSet<>();
|
||||
for (String next : synonymMappings.get(curr)) {
|
||||
newMappings.add(applyCaseSetting(updatedIgnoreCase, next));
|
||||
}
|
||||
rebuild.put(applyCaseSetting(updatedIgnoreCase, curr), newMappings);
|
||||
protected Map<String,Set<String>> getStoredView() {
|
||||
Map<String,Set<String>> storedView = new TreeMap<>();
|
||||
for (CasePreservedSynonymMappings cpsm : synonymMappings.values()) {
|
||||
for (String key : cpsm.mappings.keySet()) {
|
||||
storedView.put(key, cpsm.mappings.get(key));
|
||||
}
|
||||
synonymMappings = rebuild;
|
||||
}
|
||||
|
||||
return super.updateInitArgs(updatedArgs);
|
||||
return storedView;
|
||||
}
|
||||
|
||||
protected String applyCaseSetting(boolean ignoreCase, String str) {
|
||||
|
@ -227,14 +254,19 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
if (childId != null) {
|
||||
boolean ignoreCase = getIgnoreCase();
|
||||
String key = applyCaseSetting(ignoreCase, childId);
|
||||
Set<String> output = synonymMappings.get(key);
|
||||
if (output == null) {
|
||||
|
||||
// if ignoreCase==true, then we get the mappings using the lower-cased key
|
||||
// and then return a union of all case-sensitive keys, if false, then
|
||||
// we only return the mappings for the exact case requested
|
||||
CasePreservedSynonymMappings cpsm = synonymMappings.get(key);
|
||||
Set<String> mappings = (cpsm != null) ? cpsm.getMappings(ignoreCase, childId) : null;
|
||||
if (mappings == null)
|
||||
throw new SolrException(ErrorCode.NOT_FOUND,
|
||||
String.format(Locale.ROOT, "%s not found in %s", key, getResourceId()));
|
||||
}
|
||||
response.add(key, output);
|
||||
String.format(Locale.ROOT, "%s not found in %s", childId, getResourceId()));
|
||||
|
||||
response.add(childId, mappings);
|
||||
} else {
|
||||
response.add(SYNONYM_MAPPINGS, buildMapToStore(synonymMappings));
|
||||
response.add(SYNONYM_MAPPINGS, buildMapToStore(getStoredView()));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -242,14 +274,32 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
public synchronized void doDeleteChild(BaseSolrResource endpoint, String childId) {
|
||||
boolean ignoreCase = getIgnoreCase();
|
||||
String key = applyCaseSetting(ignoreCase, childId);
|
||||
Set<String> output = synonymMappings.get(key);
|
||||
if (output == null)
|
||||
throw new SolrException(ErrorCode.NOT_FOUND,
|
||||
String.format(Locale.ROOT, "%s not found in %s", key, getResourceId()));
|
||||
|
||||
synonymMappings.remove(key);
|
||||
storeManagedData(synonymMappings);
|
||||
log.info("Removed synonym mappings for: {}", key);
|
||||
CasePreservedSynonymMappings cpsm = synonymMappings.get(key);
|
||||
if (cpsm == null)
|
||||
throw new SolrException(ErrorCode.NOT_FOUND,
|
||||
String.format(Locale.ROOT, "%s not found in %s", childId, getResourceId()));
|
||||
|
||||
if (ignoreCase) {
|
||||
// delete all mappings regardless of case
|
||||
synonymMappings.remove(key);
|
||||
} else {
|
||||
// just delete the mappings for the specific case-sensitive key
|
||||
if (cpsm.mappings.containsKey(childId)) {
|
||||
cpsm.mappings.remove(childId);
|
||||
|
||||
if (cpsm.mappings.isEmpty())
|
||||
synonymMappings.remove(key);
|
||||
} else {
|
||||
throw new SolrException(ErrorCode.NOT_FOUND,
|
||||
String.format(Locale.ROOT, "%s not found in %s", childId, getResourceId()));
|
||||
}
|
||||
}
|
||||
|
||||
// store the updated data (using the stored view)
|
||||
storeManagedData(getStoredView());
|
||||
|
||||
log.info("Removed synonym mappings for: {}", childId);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -272,9 +322,15 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
|||
*/
|
||||
@Override
|
||||
public void parse(Reader in) throws IOException, ParseException {
|
||||
for (String term : synonymManager.synonymMappings.keySet()) {
|
||||
for (String mapping : synonymManager.synonymMappings.get(term)) {
|
||||
add(new CharsRef(term), new CharsRef(mapping), false);
|
||||
boolean ignoreCase = synonymManager.getIgnoreCase();
|
||||
for (CasePreservedSynonymMappings cpsm : synonymManager.synonymMappings.values()) {
|
||||
for (String term : cpsm.mappings.keySet()) {
|
||||
for (String mapping : cpsm.mappings.get(term)) {
|
||||
// apply the case setting to match the behavior of the SynonymMap builder
|
||||
String casedTerm = synonymManager.applyCaseSetting(ignoreCase, term);
|
||||
String casedMapping = synonymManager.applyCaseSetting(ignoreCase, mapping);
|
||||
add(new CharsRef(casedTerm), new CharsRef(casedMapping), false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -106,13 +106,19 @@ public class TestManagedSynonymFilterFactory extends RestTestBase {
|
|||
|
||||
syns = new HashMap<>();
|
||||
syns.put("sad", Arrays.asList("unhappy"));
|
||||
syns.put("SAD", Arrays.asList("Unhappy"));
|
||||
syns.put("SAD", Arrays.asList("bummed"));
|
||||
assertJPut(endpoint,
|
||||
JSONUtil.toJSON(syns),
|
||||
"/responseHeader/status==0");
|
||||
|
||||
assertJQ(endpoint,
|
||||
"/synonymMappings/managedMap/sad==['unhappy']");
|
||||
assertJQ(endpoint,
|
||||
"/synonymMappings/managedMap/SAD==['bummed']");
|
||||
|
||||
// expect a union of values when requesting the "sad" child
|
||||
assertJQ(endpoint+"/sad",
|
||||
"/sad==['bummed','unhappy']");
|
||||
|
||||
// verify delete works
|
||||
assertJDelete(endpoint+"/sad",
|
||||
|
@ -174,5 +180,20 @@ public class TestManagedSynonymFilterFactory extends RestTestBase {
|
|||
"/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
|
||||
"/response/result[@name='response'][@numFound='1']",
|
||||
"/response/result[@name='response']/doc/str[@name='id'][.='5150']");
|
||||
|
||||
// test for SOLR-6015
|
||||
syns = new HashMap<>();
|
||||
syns.put("mb", Arrays.asList("megabyte"));
|
||||
assertJPut(endpoint,
|
||||
JSONUtil.toJSON(syns),
|
||||
"/responseHeader/status==0");
|
||||
|
||||
syns.put("MB", Arrays.asList("MiB", "Megabyte"));
|
||||
assertJPut(endpoint,
|
||||
JSONUtil.toJSON(syns),
|
||||
"/responseHeader/status==0");
|
||||
|
||||
assertJQ(endpoint+"/MB",
|
||||
"/MB==['Megabyte','MiB','megabyte']");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue