SOLR-7539: Upgrade the clustering plugin to Carrot2 3.15.0.

This commit is contained in:
Dawid Weiss 2016-11-04 16:02:00 +01:00
parent 86e8d16da3
commit 1c32cc9a49
14 changed files with 77 additions and 82 deletions

View File

@ -222,7 +222,7 @@ org.bouncycastle.version = 1.45
/org.carrot2.attributes/attributes-binder = 1.3.1
/org.carrot2.shaded/carrot2-guava = 18.0
/org.carrot2/carrot2-mini = 3.12.0
/org.carrot2/carrot2-mini = 3.15.0
org.carrot2.morfologik.version = 2.1.1
/org.carrot2/morfologik-fsa = ${org.carrot2.morfologik.version}

View File

@ -23,7 +23,7 @@ Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this r
Versions of Major Components
---------------------
Apache Tika 1.13
Carrot2 3.12.0
Carrot2 3.15.0
Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.4.6
@ -58,6 +58,8 @@ Bug Fixes
Other Changes
----------------------
* SOLR-7539: Upgrade the clustering plugin to Carrot2 3.15.0. (Dawid Weiss)
* SOLR-9621: Remove several Guava & Apache Commons calls in favor of java 8 alternatives.
(Michael Braun via David Smiley)

View File

@ -19,6 +19,7 @@ package org.apache.solr.handler.clustering;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
@ -44,9 +45,6 @@ import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Maps;
/**
* Provides a plugin for performing cluster analysis. This can either be applied to
* search results (e.g., via <a href="http://project.carrot2.org">Carrot<sup>2</sup></a>) or for
@ -68,12 +66,12 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
/**
* Declaration-order list of search clustering engines.
*/
private final LinkedHashMap<String, SearchClusteringEngine> searchClusteringEngines = Maps.newLinkedHashMap();
private final LinkedHashMap<String, SearchClusteringEngine> searchClusteringEngines = new LinkedHashMap<>();
/**
* Declaration order list of document clustering engines.
*/
private final LinkedHashMap<String, DocumentClusteringEngine> documentClusteringEngines = Maps.newLinkedHashMap();
private final LinkedHashMap<String, DocumentClusteringEngine> documentClusteringEngines = new LinkedHashMap<>();
/**
* An unmodifiable view of {@link #searchClusteringEngines}.
@ -173,7 +171,7 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
if (engine != null) {
checkAvailable(name, engine);
DocListAndSet results = rb.getResults();
Map<SolrDocument,Integer> docIds = Maps.newHashMapWithExpectedSize(results.docList.size());
Map<SolrDocument,Integer> docIds = new HashMap<>(results.docList.size());
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList(
results.docList, rb.req.getSearcher(), engine.getFieldsToLoad(rb.req), docIds);
Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);

View File

@ -58,6 +58,8 @@ import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.shaded.guava.common.base.MoreObjects;
import org.carrot2.shaded.guava.common.base.Strings;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
@ -69,12 +71,6 @@ import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Objects;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
*
@ -155,7 +151,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
// of this component. This by-name convention lookup is used to simplify configuring algorithms.
String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
log.info("Initializing Clustering Engine '" + Objects.firstNonNull(componentName, "<no 'name' attribute>") + "'");
log.info("Initializing Clustering Engine '" +
MoreObjects.firstNonNull(componentName, "<no 'name' attribute>") + "'");
if (!Strings.isNullOrEmpty(componentName)) {
IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
@ -268,7 +265,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
SolrParams solrParams = sreq.getParams();
HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
HashSet<String> fields = new HashSet<>(getFieldsForClustering(sreq));
fields.add(idFieldName);
fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
fields.addAll(getCustomFieldsMap(solrParams).keySet());
@ -295,7 +292,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
+ " must not be blank.");
}
final Set<String> fields = Sets.newHashSet();
final Set<String> fields = new HashSet<>();
fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
return fields;
@ -319,7 +316,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
Map<String, String> customFields = getCustomFieldsMap(solrParams);
// Parse language code map string into a map
Map<String, String> languageCodeMap = Maps.newHashMap();
Map<String, String> languageCodeMap = new HashMap<>();
if (StringUtils.isNotBlank(languageField)) {
for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) {
final String[] split = pair.split(":");
@ -340,7 +337,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
if (produceSummary) {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
Map<String, Object> args = Maps.newHashMap();
Map<String, Object> args = new HashMap<>();
snippetFieldAry = snippetFieldSpec.split("[, ]");
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
@ -466,10 +463,10 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
* custom field names.
*/
private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
Map<String, String> customFields = Maps.newHashMap();
Map<String, String> customFields = new HashMap<>();
String [] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
if (customFieldsSpec != null) {
customFields = Maps.newHashMap();
customFields = new HashMap<>();
for (String customFieldSpec : customFieldsSpec) {
String [] split = customFieldSpec.split(":");
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
@ -501,7 +498,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
SolrParams solrParams) {
List<NamedList<Object>> result = Lists.newArrayList();
List<NamedList<Object>> result = new ArrayList<>();
clustersToNamedList(carrotClusters, result, solrParams.getBool(
CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@ -534,7 +531,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Add documents
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
List<Object> docList = Lists.newArrayList();
List<Object> docList = new ArrayList<>();
cluster.add("docs", docList);
for (Document doc : docs) {
docList.add(doc.getField(SOLR_DOCUMENT_ID));
@ -542,7 +539,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Add subclusters
if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
List<NamedList<Object>> subclusters = Lists.newArrayList();
List<NamedList<Object>> subclusters = new ArrayList<>();
cluster.add("clusters", subclusters);
clustersToNamedList(outCluster.getSubclusters(), subclusters,
outputSubClusters, maxLabels);

View File

@ -16,10 +16,10 @@
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import com.google.common.collect.ImmutableSet;
/**
* Carrot2 parameter mapping (recognized and mapped if passed via Solr configuration).
* @lucene.experimental
@ -50,7 +50,7 @@ public final class CarrotParams {
*/
public static String RESOURCES_DIR = CARROT_PREFIX + "resourcesDir";
static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
static final Set<String> CARROT_PARAM_NAMES = new HashSet<>(Arrays.asList(
ALGORITHM,
TITLE_FIELD_NAME,
@ -66,7 +66,7 @@ public final class CarrotParams {
NUM_DESCRIPTIONS,
OUTPUT_SUB_CLUSTERS,
RESOURCES_DIR,
LANGUAGE_CODE_MAP);
LANGUAGE_CODE_MAP));
/** No instances. */
private CarrotParams() {}

View File

@ -16,7 +16,9 @@
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.Collection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
@ -26,6 +28,7 @@ import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
@ -37,9 +40,6 @@ import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
/**
* An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
* words from a field's StopFilter to the default stop words used in Carrot2,
@ -67,7 +67,7 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFacto
/**
* A lazily-built cache of stop words per field.
*/
private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
private HashMap<String, List<CharArraySet>> solrStopWords = new HashMap<>();
/**
* Carrot2's default lexical resources to use in addition to Solr's stop
@ -79,32 +79,35 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFacto
* Obtains stop words for a field from the associated
* {@link StopFilterFactory}, if any.
*/
private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
// No need to synchronize here, Carrot2 ensures that instances
// of this class are not used by multiple threads at a time.
synchronized (solrStopWords) {
if (!solrStopWords.containsKey(fieldName)) {
final Analyzer fieldAnalyzer = core.getLatestSchema().getFieldType(fieldName)
.getIndexAnalyzer();
solrStopWords.put(fieldName, new ArrayList<>());
IndexSchema schema = core.getLatestSchema();
final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
if (fieldAnalyzer instanceof TokenizerChain) {
final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
.getTokenFilterFactories();
final TokenFilterFactory[] filterFactories =
((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
for (TokenFilterFactory factory : filterFactories) {
if (factory instanceof StopFilterFactory) {
// StopFilterFactory holds the stop words in a CharArraySet
solrStopWords.put(fieldName,
((StopFilterFactory) factory).getStopWords());
CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords();
solrStopWords.get(fieldName).add(stopWords);
}
if (factory instanceof CommonGramsFilterFactory) {
solrStopWords.put(fieldName,
((CommonGramsFilterFactory) factory)
.getCommonWords());
CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords();
solrStopWords.get(fieldName).add(commonWords);
}
}
}
}
return solrStopWords.get(fieldName);
}
}
@Override
public ILexicalData getLexicalData(LanguageCode languageCode) {

View File

@ -17,6 +17,9 @@
package org.apache.solr.handler.clustering.carrot2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -45,9 +48,6 @@ import org.carrot2.core.LanguageCode;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
/**
*
*/
@ -211,7 +211,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
// stoplabels.mt, so we're expecting only one cluster with label "online".
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine(engineName), 1, params);
assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online"));
}
@Test
@ -226,7 +226,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
// only one cluster with label "online".
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 1, params);
assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online"));
}
@Test
@ -243,9 +243,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 2, params);
assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
assertEquals(ImmutableList.of("solrownstopword"),
getLabels(clusters.get(1)));
assertEquals(Collections.singletonList("online"), getLabels(clusters.get(0)));
assertEquals(Collections.singletonList("solrownstopword"), getLabels(clusters.get(1)));
}
@Test
@ -395,8 +394,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-default");
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
assertEquals(
Lists.newArrayList("stc", "default", "mock"),
Lists.newArrayList(engines.keySet()));
Arrays.asList("stc", "default", "mock"),
new ArrayList<>(engines.keySet()));
assertEquals(
LingoClusteringAlgorithm.class,
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
@ -407,8 +406,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-decl-order");
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
assertEquals(
Lists.newArrayList("unavailable", "lingo", "stc", "mock", "default"),
Lists.newArrayList(engines.keySet()));
Arrays.asList("unavailable", "lingo", "stc", "mock", "default"),
new ArrayList<>(engines.keySet()));
assertEquals(
LingoClusteringAlgorithm.class,
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
@ -419,8 +418,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-dups");
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
assertEquals(
Lists.newArrayList("", "default"),
Lists.newArrayList(engines.keySet()));
Arrays.asList("", "default"),
new ArrayList<>(engines.keySet()));
assertEquals(
MockClusteringAlgorithm.class,
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());

View File

@ -15,6 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
@ -29,8 +30,6 @@ import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
import com.google.common.collect.Lists;
/**
* A mock Carrot2 clustering algorithm that outputs input documents as clusters.
* Useful only in tests.
@ -56,7 +55,7 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayListWithCapacity(documents.size());
clusters = new ArrayList<>();
for (Document document : documents) {
final Cluster cluster = new Cluster();

View File

@ -16,6 +16,7 @@
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
@ -36,8 +37,6 @@ import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
import com.google.common.collect.Lists;
/**
* A mock Carrot2 clustering algorithm that outputs stem of each token of each
* document as a separate cluster. Useful only in tests.
@ -64,7 +63,7 @@ public class EchoStemsClusteringAlgorithm extends ProcessingComponentBase
final AllTokens allTokens = preprocessingContext.allTokens;
final AllWords allWords = preprocessingContext.allWords;
final AllStems allStems = preprocessingContext.allStems;
clusters = Lists.newArrayListWithCapacity(allTokens.image.length);
clusters = new ArrayList<>();
for (int i = 0; i < allTokens.image.length; i++) {
if (allTokens.wordIndex[i] >= 0) {
clusters.add(new Cluster(new String(

View File

@ -16,6 +16,7 @@
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
@ -33,7 +34,6 @@ import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
import com.google.common.collect.Lists;
/**
* A mock Carrot2 clustering algorithm that outputs each token of each document
@ -58,8 +58,7 @@ public class EchoTokensClusteringAlgorithm extends ProcessingComponentBase
public void process() throws ProcessingException {
final PreprocessingContext preprocessingContext = preprocessing.preprocess(
documents, "", LanguageCode.ENGLISH);
clusters = Lists
.newArrayListWithCapacity(preprocessingContext.allTokens.image.length);
clusters = new ArrayList<>();
for (char[] token : preprocessingContext.allTokens.image) {
if (token != null) {
clusters.add(new Cluster(new String(token)));

View File

@ -16,6 +16,7 @@
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
@ -33,8 +34,6 @@ import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
import com.google.common.collect.Lists;
/**
* A mock implementation of Carrot2 clustering algorithm for testing whether the
* customized lexical resource lookup works correctly. This algorithm ignores
@ -60,7 +59,7 @@ public class LexicalResourcesCheckClusteringAlgorithm extends
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
clusters = new ArrayList<>();
if (wordsToCheck == null) {
return;
}

View File

@ -15,13 +15,13 @@
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import com.google.common.collect.Lists;
import org.carrot2.core.*;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;
import java.util.ArrayList;
import java.util.List;
@Bindable(prefix = "MockClusteringAlgorithm")
@ -62,7 +62,7 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
clusters = new ArrayList<>();
if (documents == null) {
return;
}

View File

@ -1 +0,0 @@
9d8b42afe43ba5c0a0c5d67208d5c919e45c3584

View File

@ -0,0 +1 @@
5d76ec388711056bfaaacc354ed04ffa6811c7b7