mirror of https://github.com/apache/lucene.git
SOLR-6015: improved strategy for handling managed synonyms when ignoreCase=true
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1596928 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1cc70e75fc
commit
8a220db194
|
@ -86,6 +86,9 @@ Other Changes
|
||||||
replication factor for an update request (single or batch) by sending
|
replication factor for an update request (single or batch) by sending
|
||||||
an optional parameter "min_rf". (Timothy Potter)
|
an optional parameter "min_rf". (Timothy Potter)
|
||||||
|
|
||||||
|
* SOLR-6015: Better way to handle managed synonyms when ignoreCase=true
|
||||||
|
(Timothy Potter)
|
||||||
|
|
||||||
================== 4.9.0 ==================
|
================== 4.9.0 ==================
|
||||||
|
|
||||||
Versions of Major Components
|
Versions of Major Components
|
||||||
|
|
|
@ -57,6 +57,42 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
public static final String SYNONYM_MAPPINGS = "synonymMappings";
|
public static final String SYNONYM_MAPPINGS = "synonymMappings";
|
||||||
public static final String IGNORE_CASE_INIT_ARG = "ignoreCase";
|
public static final String IGNORE_CASE_INIT_ARG = "ignoreCase";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Used internally to preserve the case of synonym mappings regardless
|
||||||
|
* of the ignoreCase setting.
|
||||||
|
*/
|
||||||
|
private static class CasePreservedSynonymMappings {
|
||||||
|
Map<String,Set<String>> mappings = new TreeMap<>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provides a view of the mappings for a given term; specifically, if
|
||||||
|
* ignoreCase is true, then the returned "view" contains the mappings
|
||||||
|
* for all known cases of the term; if it is false, then only the
|
||||||
|
* mappings for the specific case are returned.
|
||||||
|
*/
|
||||||
|
Set<String> getMappings(boolean ignoreCase, String key) {
|
||||||
|
Set<String> synMappings = null;
|
||||||
|
if (ignoreCase) {
|
||||||
|
// TODO: should we return the mapped values in all lower-case here?
|
||||||
|
if (mappings.size() == 1) {
|
||||||
|
// if only one in the map (which is common) just return it directly
|
||||||
|
return mappings.values().iterator().next();
|
||||||
|
}
|
||||||
|
|
||||||
|
synMappings = new TreeSet<>();
|
||||||
|
for (Set<String> next : mappings.values())
|
||||||
|
synMappings.addAll(next);
|
||||||
|
} else {
|
||||||
|
synMappings = mappings.get(key);
|
||||||
|
}
|
||||||
|
return synMappings;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return mappings.toString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ManagedResource implementation for synonyms, which are so specialized that
|
* ManagedResource implementation for synonyms, which are so specialized that
|
||||||
* it makes sense to implement this class as an inner class as it has little
|
* it makes sense to implement this class as an inner class as it has little
|
||||||
|
@ -65,11 +101,7 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
public static class SynonymManager extends ManagedResource
|
public static class SynonymManager extends ManagedResource
|
||||||
implements ManagedResource.ChildResourceSupport
|
implements ManagedResource.ChildResourceSupport
|
||||||
{
|
{
|
||||||
|
protected Map<String,CasePreservedSynonymMappings> synonymMappings;
|
||||||
// TODO: Maybe hold this using a SoftReference / WeakReference to
|
|
||||||
// reduce memory in case the set of synonyms is large and the JVM
|
|
||||||
// is running low on memory?
|
|
||||||
protected Map<String,Set<String>> synonymMappings;
|
|
||||||
|
|
||||||
public SynonymManager(String resourceId, SolrResourceLoader loader, StorageIO storageIO)
|
public SynonymManager(String resourceId, SolrResourceLoader loader, StorageIO storageIO)
|
||||||
throws SolrException {
|
throws SolrException {
|
||||||
|
@ -94,11 +126,20 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
if (initArgs.get(IGNORE_CASE_INIT_ARG) == null) {
|
if (initArgs.get(IGNORE_CASE_INIT_ARG) == null) {
|
||||||
initArgs.add(IGNORE_CASE_INIT_ARG, Boolean.FALSE);
|
initArgs.add(IGNORE_CASE_INIT_ARG, Boolean.FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean ignoreCase = getIgnoreCase(managedInitArgs);
|
boolean ignoreCase = getIgnoreCase(managedInitArgs);
|
||||||
synonymMappings = new TreeMap<>();
|
synonymMappings = new TreeMap<>();
|
||||||
if (managedData != null) {
|
if (managedData != null) {
|
||||||
Map<String,Object> storedSyns = (Map<String,Object>)managedData;
|
Map<String,Object> storedSyns = (Map<String,Object>)managedData;
|
||||||
for (String key : storedSyns.keySet()) {
|
for (String key : storedSyns.keySet()) {
|
||||||
|
|
||||||
|
String caseKey = applyCaseSetting(ignoreCase, key);
|
||||||
|
CasePreservedSynonymMappings cpsm = synonymMappings.get(caseKey);
|
||||||
|
if (cpsm == null) {
|
||||||
|
cpsm = new CasePreservedSynonymMappings();
|
||||||
|
synonymMappings.put(caseKey, cpsm);
|
||||||
|
}
|
||||||
|
|
||||||
// given the nature of our JSON parsing solution, we really have
|
// given the nature of our JSON parsing solution, we really have
|
||||||
// no guarantees on what is in the file
|
// no guarantees on what is in the file
|
||||||
Object mapping = storedSyns.get(key);
|
Object mapping = storedSyns.get(key);
|
||||||
|
@ -108,21 +149,11 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
" but got "+mapping.getClass().getName());
|
" but got "+mapping.getClass().getName());
|
||||||
}
|
}
|
||||||
|
|
||||||
// if we're configured to ignoreCase, then we build the mappings with all lower
|
|
||||||
List<String> vals = (List<String>)storedSyns.get(key);
|
|
||||||
Set<String> sortedVals = new TreeSet<>();
|
Set<String> sortedVals = new TreeSet<>();
|
||||||
if (ignoreCase) {
|
sortedVals.addAll((List<String>)storedSyns.get(key));
|
||||||
for (String next : vals) {
|
cpsm.mappings.put(key, sortedVals);
|
||||||
sortedVals.add(applyCaseSetting(ignoreCase, next));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
sortedVals.addAll(vals);
|
|
||||||
}
|
|
||||||
|
|
||||||
synonymMappings.put(applyCaseSetting(ignoreCase, key), sortedVals);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Loaded {} synonym mappings for {}", synonymMappings.size(), getResourceId());
|
log.info("Loaded {} synonym mappings for {}", synonymMappings.size(), getResourceId());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -138,17 +169,24 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
Map<String,Object> jsonMap = (Map<String,Object>)updates;
|
Map<String,Object> jsonMap = (Map<String,Object>)updates;
|
||||||
for (String term : jsonMap.keySet()) {
|
for (String term : jsonMap.keySet()) {
|
||||||
|
|
||||||
|
String origTerm = term;
|
||||||
term = applyCaseSetting(ignoreCase, term);
|
term = applyCaseSetting(ignoreCase, term);
|
||||||
|
|
||||||
Set<String> output = synonymMappings.get(term);
|
// find the mappings using the case aware key
|
||||||
|
CasePreservedSynonymMappings cpsm = synonymMappings.get(term);
|
||||||
|
if (cpsm == null) {
|
||||||
|
cpsm = new CasePreservedSynonymMappings();
|
||||||
|
}
|
||||||
|
|
||||||
Object val = jsonMap.get(term);
|
Set<String> output = cpsm.mappings.get(origTerm);
|
||||||
|
|
||||||
|
Object val = jsonMap.get(origTerm); // IMPORTANT: use the original
|
||||||
if (val instanceof String) {
|
if (val instanceof String) {
|
||||||
String strVal = applyCaseSetting(ignoreCase, (String)val);
|
String strVal = (String)val;
|
||||||
|
|
||||||
if (output == null) {
|
if (output == null) {
|
||||||
output = new TreeSet<>();
|
output = new TreeSet<>();
|
||||||
synonymMappings.put(term, output);
|
cpsm.mappings.put(origTerm, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (output.add(strVal)) {
|
if (output.add(strVal)) {
|
||||||
|
@ -159,11 +197,11 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
|
|
||||||
if (output == null) {
|
if (output == null) {
|
||||||
output = new TreeSet<>();
|
output = new TreeSet<>();
|
||||||
synonymMappings.put(term, output);
|
cpsm.mappings.put(origTerm, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (String nextVal : vals) {
|
for (String nextVal : vals) {
|
||||||
if (output.add(applyCaseSetting(ignoreCase, nextVal))) {
|
if (output.add(nextVal)) {
|
||||||
madeChanges = true;
|
madeChanges = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -172,39 +210,28 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
throw new ResourceException(Status.CLIENT_ERROR_BAD_REQUEST, "Unsupported value "+val+
|
throw new ResourceException(Status.CLIENT_ERROR_BAD_REQUEST, "Unsupported value "+val+
|
||||||
" for "+term+"; expected single value or a JSON array!");
|
" for "+term+"; expected single value or a JSON array!");
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return madeChanges ? synonymMappings : null;
|
// only add the cpsm to the synonymMappings if it has valid data
|
||||||
|
if (!synonymMappings.containsKey(term) && cpsm.mappings.get(origTerm) != null) {
|
||||||
|
synonymMappings.put(term, cpsm);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return madeChanges ? getStoredView() : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handles a change in the ignoreCase setting for synonyms, which requires
|
* Returns a Map of how we store and load data managed by this resource,
|
||||||
* a full rebuild of the synonymMappings.
|
* which is different than how it is managed at runtime in order to support
|
||||||
|
* the ignoreCase setting.
|
||||||
*/
|
*/
|
||||||
@Override
|
protected Map<String,Set<String>> getStoredView() {
|
||||||
protected boolean updateInitArgs(NamedList<?> updatedArgs) {
|
Map<String,Set<String>> storedView = new TreeMap<>();
|
||||||
if (updatedArgs == null || updatedArgs.size() == 0) {
|
for (CasePreservedSynonymMappings cpsm : synonymMappings.values()) {
|
||||||
return false;
|
for (String key : cpsm.mappings.keySet()) {
|
||||||
|
storedView.put(key, cpsm.mappings.get(key));
|
||||||
}
|
}
|
||||||
boolean currentIgnoreCase = getIgnoreCase(managedInitArgs);
|
|
||||||
boolean updatedIgnoreCase = getIgnoreCase(updatedArgs);
|
|
||||||
if (currentIgnoreCase == true && updatedIgnoreCase == false) {
|
|
||||||
throw new SolrException(ErrorCode.BAD_REQUEST,
|
|
||||||
"Changing a managed word set's ignoreCase arg from true to false is not permitted.");
|
|
||||||
} else if (currentIgnoreCase == false && updatedIgnoreCase == true) {
|
|
||||||
// ignore case policy changed ... rebuild the map
|
|
||||||
Map<String,Set<String>> rebuild = new TreeMap<>();
|
|
||||||
for (String curr : synonymMappings.keySet()) {
|
|
||||||
Set<String> newMappings = new TreeSet<>();
|
|
||||||
for (String next : synonymMappings.get(curr)) {
|
|
||||||
newMappings.add(applyCaseSetting(updatedIgnoreCase, next));
|
|
||||||
}
|
}
|
||||||
rebuild.put(applyCaseSetting(updatedIgnoreCase, curr), newMappings);
|
return storedView;
|
||||||
}
|
|
||||||
synonymMappings = rebuild;
|
|
||||||
}
|
|
||||||
|
|
||||||
return super.updateInitArgs(updatedArgs);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String applyCaseSetting(boolean ignoreCase, String str) {
|
protected String applyCaseSetting(boolean ignoreCase, String str) {
|
||||||
|
@ -227,14 +254,19 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
if (childId != null) {
|
if (childId != null) {
|
||||||
boolean ignoreCase = getIgnoreCase();
|
boolean ignoreCase = getIgnoreCase();
|
||||||
String key = applyCaseSetting(ignoreCase, childId);
|
String key = applyCaseSetting(ignoreCase, childId);
|
||||||
Set<String> output = synonymMappings.get(key);
|
|
||||||
if (output == null) {
|
// if ignoreCase==true, then we get the mappings using the lower-cased key
|
||||||
|
// and then return a union of all case-sensitive keys, if false, then
|
||||||
|
// we only return the mappings for the exact case requested
|
||||||
|
CasePreservedSynonymMappings cpsm = synonymMappings.get(key);
|
||||||
|
Set<String> mappings = (cpsm != null) ? cpsm.getMappings(ignoreCase, childId) : null;
|
||||||
|
if (mappings == null)
|
||||||
throw new SolrException(ErrorCode.NOT_FOUND,
|
throw new SolrException(ErrorCode.NOT_FOUND,
|
||||||
String.format(Locale.ROOT, "%s not found in %s", key, getResourceId()));
|
String.format(Locale.ROOT, "%s not found in %s", childId, getResourceId()));
|
||||||
}
|
|
||||||
response.add(key, output);
|
response.add(childId, mappings);
|
||||||
} else {
|
} else {
|
||||||
response.add(SYNONYM_MAPPINGS, buildMapToStore(synonymMappings));
|
response.add(SYNONYM_MAPPINGS, buildMapToStore(getStoredView()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -242,14 +274,32 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
public synchronized void doDeleteChild(BaseSolrResource endpoint, String childId) {
|
public synchronized void doDeleteChild(BaseSolrResource endpoint, String childId) {
|
||||||
boolean ignoreCase = getIgnoreCase();
|
boolean ignoreCase = getIgnoreCase();
|
||||||
String key = applyCaseSetting(ignoreCase, childId);
|
String key = applyCaseSetting(ignoreCase, childId);
|
||||||
Set<String> output = synonymMappings.get(key);
|
|
||||||
if (output == null)
|
|
||||||
throw new SolrException(ErrorCode.NOT_FOUND,
|
|
||||||
String.format(Locale.ROOT, "%s not found in %s", key, getResourceId()));
|
|
||||||
|
|
||||||
|
CasePreservedSynonymMappings cpsm = synonymMappings.get(key);
|
||||||
|
if (cpsm == null)
|
||||||
|
throw new SolrException(ErrorCode.NOT_FOUND,
|
||||||
|
String.format(Locale.ROOT, "%s not found in %s", childId, getResourceId()));
|
||||||
|
|
||||||
|
if (ignoreCase) {
|
||||||
|
// delete all mappings regardless of case
|
||||||
synonymMappings.remove(key);
|
synonymMappings.remove(key);
|
||||||
storeManagedData(synonymMappings);
|
} else {
|
||||||
log.info("Removed synonym mappings for: {}", key);
|
// just delete the mappings for the specific case-sensitive key
|
||||||
|
if (cpsm.mappings.containsKey(childId)) {
|
||||||
|
cpsm.mappings.remove(childId);
|
||||||
|
|
||||||
|
if (cpsm.mappings.isEmpty())
|
||||||
|
synonymMappings.remove(key);
|
||||||
|
} else {
|
||||||
|
throw new SolrException(ErrorCode.NOT_FOUND,
|
||||||
|
String.format(Locale.ROOT, "%s not found in %s", childId, getResourceId()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// store the updated data (using the stored view)
|
||||||
|
storeManagedData(getStoredView());
|
||||||
|
|
||||||
|
log.info("Removed synonym mappings for: {}", childId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -272,9 +322,15 @@ public class ManagedSynonymFilterFactory extends BaseManagedTokenFilterFactory {
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void parse(Reader in) throws IOException, ParseException {
|
public void parse(Reader in) throws IOException, ParseException {
|
||||||
for (String term : synonymManager.synonymMappings.keySet()) {
|
boolean ignoreCase = synonymManager.getIgnoreCase();
|
||||||
for (String mapping : synonymManager.synonymMappings.get(term)) {
|
for (CasePreservedSynonymMappings cpsm : synonymManager.synonymMappings.values()) {
|
||||||
add(new CharsRef(term), new CharsRef(mapping), false);
|
for (String term : cpsm.mappings.keySet()) {
|
||||||
|
for (String mapping : cpsm.mappings.get(term)) {
|
||||||
|
// apply the case setting to match the behavior of the SynonymMap builder
|
||||||
|
String casedTerm = synonymManager.applyCaseSetting(ignoreCase, term);
|
||||||
|
String casedMapping = synonymManager.applyCaseSetting(ignoreCase, mapping);
|
||||||
|
add(new CharsRef(casedTerm), new CharsRef(casedMapping), false);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,13 +106,19 @@ public class TestManagedSynonymFilterFactory extends RestTestBase {
|
||||||
|
|
||||||
syns = new HashMap<>();
|
syns = new HashMap<>();
|
||||||
syns.put("sad", Arrays.asList("unhappy"));
|
syns.put("sad", Arrays.asList("unhappy"));
|
||||||
syns.put("SAD", Arrays.asList("Unhappy"));
|
syns.put("SAD", Arrays.asList("bummed"));
|
||||||
assertJPut(endpoint,
|
assertJPut(endpoint,
|
||||||
JSONUtil.toJSON(syns),
|
JSONUtil.toJSON(syns),
|
||||||
"/responseHeader/status==0");
|
"/responseHeader/status==0");
|
||||||
|
|
||||||
assertJQ(endpoint,
|
assertJQ(endpoint,
|
||||||
"/synonymMappings/managedMap/sad==['unhappy']");
|
"/synonymMappings/managedMap/sad==['unhappy']");
|
||||||
|
assertJQ(endpoint,
|
||||||
|
"/synonymMappings/managedMap/SAD==['bummed']");
|
||||||
|
|
||||||
|
// expect a union of values when requesting the "sad" child
|
||||||
|
assertJQ(endpoint+"/sad",
|
||||||
|
"/sad==['bummed','unhappy']");
|
||||||
|
|
||||||
// verify delete works
|
// verify delete works
|
||||||
assertJDelete(endpoint+"/sad",
|
assertJDelete(endpoint+"/sad",
|
||||||
|
@ -174,5 +180,20 @@ public class TestManagedSynonymFilterFactory extends RestTestBase {
|
||||||
"/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
|
"/response/lst[@name='responseHeader']/int[@name='status'] = '0'",
|
||||||
"/response/result[@name='response'][@numFound='1']",
|
"/response/result[@name='response'][@numFound='1']",
|
||||||
"/response/result[@name='response']/doc/str[@name='id'][.='5150']");
|
"/response/result[@name='response']/doc/str[@name='id'][.='5150']");
|
||||||
|
|
||||||
|
// test for SOLR-6015
|
||||||
|
syns = new HashMap<>();
|
||||||
|
syns.put("mb", Arrays.asList("megabyte"));
|
||||||
|
assertJPut(endpoint,
|
||||||
|
JSONUtil.toJSON(syns),
|
||||||
|
"/responseHeader/status==0");
|
||||||
|
|
||||||
|
syns.put("MB", Arrays.asList("MiB", "Megabyte"));
|
||||||
|
assertJPut(endpoint,
|
||||||
|
JSONUtil.toJSON(syns),
|
||||||
|
"/responseHeader/status==0");
|
||||||
|
|
||||||
|
assertJQ(endpoint+"/MB",
|
||||||
|
"/MB==['Megabyte','MiB','megabyte']");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue