mirror of https://github.com/apache/lucene.git
SOLR-7539: Upgrade the clustering plugin to Carrot2 3.15.0.
This commit is contained in:
parent
86e8d16da3
commit
1c32cc9a49
|
@ -222,7 +222,7 @@ org.bouncycastle.version = 1.45
|
||||||
/org.carrot2.attributes/attributes-binder = 1.3.1
|
/org.carrot2.attributes/attributes-binder = 1.3.1
|
||||||
/org.carrot2.shaded/carrot2-guava = 18.0
|
/org.carrot2.shaded/carrot2-guava = 18.0
|
||||||
|
|
||||||
/org.carrot2/carrot2-mini = 3.12.0
|
/org.carrot2/carrot2-mini = 3.15.0
|
||||||
|
|
||||||
org.carrot2.morfologik.version = 2.1.1
|
org.carrot2.morfologik.version = 2.1.1
|
||||||
/org.carrot2/morfologik-fsa = ${org.carrot2.morfologik.version}
|
/org.carrot2/morfologik-fsa = ${org.carrot2.morfologik.version}
|
||||||
|
|
|
@ -23,7 +23,7 @@ Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this r
|
||||||
Versions of Major Components
|
Versions of Major Components
|
||||||
---------------------
|
---------------------
|
||||||
Apache Tika 1.13
|
Apache Tika 1.13
|
||||||
Carrot2 3.12.0
|
Carrot2 3.15.0
|
||||||
Velocity 1.7 and Velocity Tools 2.0
|
Velocity 1.7 and Velocity Tools 2.0
|
||||||
Apache UIMA 2.3.1
|
Apache UIMA 2.3.1
|
||||||
Apache ZooKeeper 3.4.6
|
Apache ZooKeeper 3.4.6
|
||||||
|
@ -58,6 +58,8 @@ Bug Fixes
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
* SOLR-7539: Upgrade the clustering plugin to Carrot2 3.15.0. (Dawid Weiss)
|
||||||
|
|
||||||
* SOLR-9621: Remove several Guava & Apache Commons calls in favor of java 8 alternatives.
|
* SOLR-9621: Remove several Guava & Apache Commons calls in favor of java 8 alternatives.
|
||||||
(Michael Braun via David Smiley)
|
(Michael Braun via David Smiley)
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.solr.handler.clustering;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.invoke.MethodHandles;
|
import java.lang.invoke.MethodHandles;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -44,9 +45,6 @@ import org.apache.solr.util.plugin.SolrCoreAware;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides a plugin for performing cluster analysis. This can either be applied to
|
* Provides a plugin for performing cluster analysis. This can either be applied to
|
||||||
* search results (e.g., via <a href="http://project.carrot2.org">Carrot<sup>2</sup></a>) or for
|
* search results (e.g., via <a href="http://project.carrot2.org">Carrot<sup>2</sup></a>) or for
|
||||||
|
@ -68,12 +66,12 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
|
||||||
/**
|
/**
|
||||||
* Declaration-order list of search clustering engines.
|
* Declaration-order list of search clustering engines.
|
||||||
*/
|
*/
|
||||||
private final LinkedHashMap<String, SearchClusteringEngine> searchClusteringEngines = Maps.newLinkedHashMap();
|
private final LinkedHashMap<String, SearchClusteringEngine> searchClusteringEngines = new LinkedHashMap<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Declaration order list of document clustering engines.
|
* Declaration order list of document clustering engines.
|
||||||
*/
|
*/
|
||||||
private final LinkedHashMap<String, DocumentClusteringEngine> documentClusteringEngines = Maps.newLinkedHashMap();
|
private final LinkedHashMap<String, DocumentClusteringEngine> documentClusteringEngines = new LinkedHashMap<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An unmodifiable view of {@link #searchClusteringEngines}.
|
* An unmodifiable view of {@link #searchClusteringEngines}.
|
||||||
|
@ -173,7 +171,7 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
|
||||||
if (engine != null) {
|
if (engine != null) {
|
||||||
checkAvailable(name, engine);
|
checkAvailable(name, engine);
|
||||||
DocListAndSet results = rb.getResults();
|
DocListAndSet results = rb.getResults();
|
||||||
Map<SolrDocument,Integer> docIds = Maps.newHashMapWithExpectedSize(results.docList.size());
|
Map<SolrDocument,Integer> docIds = new HashMap<>(results.docList.size());
|
||||||
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList(
|
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList(
|
||||||
results.docList, rb.req.getSearcher(), engine.getFieldsToLoad(rb.req), docIds);
|
results.docList, rb.req.getSearcher(), engine.getFieldsToLoad(rb.req), docIds);
|
||||||
Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
|
Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
|
||||||
|
|
|
@ -58,6 +58,8 @@ import org.carrot2.core.Document;
|
||||||
import org.carrot2.core.IClusteringAlgorithm;
|
import org.carrot2.core.IClusteringAlgorithm;
|
||||||
import org.carrot2.core.LanguageCode;
|
import org.carrot2.core.LanguageCode;
|
||||||
import org.carrot2.core.attribute.AttributeNames;
|
import org.carrot2.core.attribute.AttributeNames;
|
||||||
|
import org.carrot2.shaded.guava.common.base.MoreObjects;
|
||||||
|
import org.carrot2.shaded.guava.common.base.Strings;
|
||||||
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
|
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
|
||||||
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
|
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
|
||||||
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
|
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
|
||||||
|
@ -69,12 +71,6 @@ import org.carrot2.util.resource.ResourceLookup;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.common.base.Objects;
|
|
||||||
import com.google.common.base.Strings;
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Search results clustering engine based on Carrot2 clustering algorithms.
|
* Search results clustering engine based on Carrot2 clustering algorithms.
|
||||||
*
|
*
|
||||||
|
@ -155,7 +151,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
// Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
|
// Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
|
||||||
// of this component. This by-name convention lookup is used to simplify configuring algorithms.
|
// of this component. This by-name convention lookup is used to simplify configuring algorithms.
|
||||||
String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
|
String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
|
||||||
log.info("Initializing Clustering Engine '" + Objects.firstNonNull(componentName, "<no 'name' attribute>") + "'");
|
log.info("Initializing Clustering Engine '" +
|
||||||
|
MoreObjects.firstNonNull(componentName, "<no 'name' attribute>") + "'");
|
||||||
|
|
||||||
if (!Strings.isNullOrEmpty(componentName)) {
|
if (!Strings.isNullOrEmpty(componentName)) {
|
||||||
IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
|
IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
|
||||||
|
@ -268,7 +265,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
|
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
|
||||||
SolrParams solrParams = sreq.getParams();
|
SolrParams solrParams = sreq.getParams();
|
||||||
|
|
||||||
HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
|
HashSet<String> fields = new HashSet<>(getFieldsForClustering(sreq));
|
||||||
fields.add(idFieldName);
|
fields.add(idFieldName);
|
||||||
fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
|
fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
|
||||||
fields.addAll(getCustomFieldsMap(solrParams).keySet());
|
fields.addAll(getCustomFieldsMap(solrParams).keySet());
|
||||||
|
@ -295,7 +292,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
+ " must not be blank.");
|
+ " must not be blank.");
|
||||||
}
|
}
|
||||||
|
|
||||||
final Set<String> fields = Sets.newHashSet();
|
final Set<String> fields = new HashSet<>();
|
||||||
fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
|
fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
|
||||||
fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
|
fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
|
||||||
return fields;
|
return fields;
|
||||||
|
@ -319,7 +316,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
Map<String, String> customFields = getCustomFieldsMap(solrParams);
|
Map<String, String> customFields = getCustomFieldsMap(solrParams);
|
||||||
|
|
||||||
// Parse language code map string into a map
|
// Parse language code map string into a map
|
||||||
Map<String, String> languageCodeMap = Maps.newHashMap();
|
Map<String, String> languageCodeMap = new HashMap<>();
|
||||||
if (StringUtils.isNotBlank(languageField)) {
|
if (StringUtils.isNotBlank(languageField)) {
|
||||||
for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) {
|
for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) {
|
||||||
final String[] split = pair.split(":");
|
final String[] split = pair.split(":");
|
||||||
|
@ -340,7 +337,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
if (produceSummary) {
|
if (produceSummary) {
|
||||||
highlighter = HighlightComponent.getHighlighter(core);
|
highlighter = HighlightComponent.getHighlighter(core);
|
||||||
if (highlighter != null){
|
if (highlighter != null){
|
||||||
Map<String, Object> args = Maps.newHashMap();
|
Map<String, Object> args = new HashMap<>();
|
||||||
snippetFieldAry = snippetFieldSpec.split("[, ]");
|
snippetFieldAry = snippetFieldSpec.split("[, ]");
|
||||||
args.put(HighlightParams.FIELDS, snippetFieldAry);
|
args.put(HighlightParams.FIELDS, snippetFieldAry);
|
||||||
args.put(HighlightParams.HIGHLIGHT, "true");
|
args.put(HighlightParams.HIGHLIGHT, "true");
|
||||||
|
@ -466,10 +463,10 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
* custom field names.
|
* custom field names.
|
||||||
*/
|
*/
|
||||||
private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
|
private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
|
||||||
Map<String, String> customFields = Maps.newHashMap();
|
Map<String, String> customFields = new HashMap<>();
|
||||||
String [] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
|
String [] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
|
||||||
if (customFieldsSpec != null) {
|
if (customFieldsSpec != null) {
|
||||||
customFields = Maps.newHashMap();
|
customFields = new HashMap<>();
|
||||||
for (String customFieldSpec : customFieldsSpec) {
|
for (String customFieldSpec : customFieldsSpec) {
|
||||||
String [] split = customFieldSpec.split(":");
|
String [] split = customFieldSpec.split(":");
|
||||||
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
|
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
|
||||||
|
@ -501,7 +498,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
|
|
||||||
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
|
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
|
||||||
SolrParams solrParams) {
|
SolrParams solrParams) {
|
||||||
List<NamedList<Object>> result = Lists.newArrayList();
|
List<NamedList<Object>> result = new ArrayList<>();
|
||||||
clustersToNamedList(carrotClusters, result, solrParams.getBool(
|
clustersToNamedList(carrotClusters, result, solrParams.getBool(
|
||||||
CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
|
CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
|
||||||
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
|
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
|
||||||
|
@ -534,7 +531,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
|
|
||||||
// Add documents
|
// Add documents
|
||||||
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
|
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
|
||||||
List<Object> docList = Lists.newArrayList();
|
List<Object> docList = new ArrayList<>();
|
||||||
cluster.add("docs", docList);
|
cluster.add("docs", docList);
|
||||||
for (Document doc : docs) {
|
for (Document doc : docs) {
|
||||||
docList.add(doc.getField(SOLR_DOCUMENT_ID));
|
docList.add(doc.getField(SOLR_DOCUMENT_ID));
|
||||||
|
@ -542,7 +539,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
|
|
||||||
// Add subclusters
|
// Add subclusters
|
||||||
if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
|
if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
|
||||||
List<NamedList<Object>> subclusters = Lists.newArrayList();
|
List<NamedList<Object>> subclusters = new ArrayList<>();
|
||||||
cluster.add("clusters", subclusters);
|
cluster.add("clusters", subclusters);
|
||||||
clustersToNamedList(outCluster.getSubclusters(), subclusters,
|
clustersToNamedList(outCluster.getSubclusters(), subclusters,
|
||||||
outputSubClusters, maxLabels);
|
outputSubClusters, maxLabels);
|
||||||
|
|
|
@ -16,10 +16,10 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.handler.clustering.carrot2;
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.google.common.collect.ImmutableSet;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Carrot2 parameter mapping (recognized and mapped if passed via Solr configuration).
|
* Carrot2 parameter mapping (recognized and mapped if passed via Solr configuration).
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
|
@ -50,7 +50,7 @@ public final class CarrotParams {
|
||||||
*/
|
*/
|
||||||
public static String RESOURCES_DIR = CARROT_PREFIX + "resourcesDir";
|
public static String RESOURCES_DIR = CARROT_PREFIX + "resourcesDir";
|
||||||
|
|
||||||
static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
|
static final Set<String> CARROT_PARAM_NAMES = new HashSet<>(Arrays.asList(
|
||||||
ALGORITHM,
|
ALGORITHM,
|
||||||
|
|
||||||
TITLE_FIELD_NAME,
|
TITLE_FIELD_NAME,
|
||||||
|
@ -66,7 +66,7 @@ public final class CarrotParams {
|
||||||
NUM_DESCRIPTIONS,
|
NUM_DESCRIPTIONS,
|
||||||
OUTPUT_SUB_CLUSTERS,
|
OUTPUT_SUB_CLUSTERS,
|
||||||
RESOURCES_DIR,
|
RESOURCES_DIR,
|
||||||
LANGUAGE_CODE_MAP);
|
LANGUAGE_CODE_MAP));
|
||||||
|
|
||||||
/** No instances. */
|
/** No instances. */
|
||||||
private CarrotParams() {}
|
private CarrotParams() {}
|
||||||
|
|
|
@ -16,7 +16,9 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.handler.clustering.carrot2;
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
@ -26,6 +28,7 @@ import org.apache.lucene.analysis.core.StopFilterFactory;
|
||||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
import org.apache.solr.analysis.TokenizerChain;
|
import org.apache.solr.analysis.TokenizerChain;
|
||||||
import org.apache.solr.core.SolrCore;
|
import org.apache.solr.core.SolrCore;
|
||||||
|
import org.apache.solr.schema.IndexSchema;
|
||||||
import org.carrot2.core.LanguageCode;
|
import org.carrot2.core.LanguageCode;
|
||||||
import org.carrot2.core.attribute.Init;
|
import org.carrot2.core.attribute.Init;
|
||||||
import org.carrot2.core.attribute.Processing;
|
import org.carrot2.core.attribute.Processing;
|
||||||
|
@ -37,9 +40,6 @@ import org.carrot2.util.attribute.Attribute;
|
||||||
import org.carrot2.util.attribute.Bindable;
|
import org.carrot2.util.attribute.Bindable;
|
||||||
import org.carrot2.util.attribute.Input;
|
import org.carrot2.util.attribute.Input;
|
||||||
|
|
||||||
import com.google.common.collect.HashMultimap;
|
|
||||||
import com.google.common.collect.Multimap;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
|
* An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
|
||||||
* words from a field's StopFilter to the default stop words used in Carrot2,
|
* words from a field's StopFilter to the default stop words used in Carrot2,
|
||||||
|
@ -67,7 +67,7 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFacto
|
||||||
/**
|
/**
|
||||||
* A lazily-built cache of stop words per field.
|
* A lazily-built cache of stop words per field.
|
||||||
*/
|
*/
|
||||||
private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
|
private HashMap<String, List<CharArraySet>> solrStopWords = new HashMap<>();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Carrot2's default lexical resources to use in addition to Solr's stop
|
* Carrot2's default lexical resources to use in addition to Solr's stop
|
||||||
|
@ -79,31 +79,34 @@ public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFacto
|
||||||
* Obtains stop words for a field from the associated
|
* Obtains stop words for a field from the associated
|
||||||
* {@link StopFilterFactory}, if any.
|
* {@link StopFilterFactory}, if any.
|
||||||
*/
|
*/
|
||||||
private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
|
private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
|
||||||
// No need to synchronize here, Carrot2 ensures that instances
|
// No need to synchronize here, Carrot2 ensures that instances
|
||||||
// of this class are not used by multiple threads at a time.
|
// of this class are not used by multiple threads at a time.
|
||||||
if (!solrStopWords.containsKey(fieldName)) {
|
synchronized (solrStopWords) {
|
||||||
final Analyzer fieldAnalyzer = core.getLatestSchema().getFieldType(fieldName)
|
if (!solrStopWords.containsKey(fieldName)) {
|
||||||
.getIndexAnalyzer();
|
solrStopWords.put(fieldName, new ArrayList<>());
|
||||||
if (fieldAnalyzer instanceof TokenizerChain) {
|
|
||||||
final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
|
|
||||||
.getTokenFilterFactories();
|
|
||||||
for (TokenFilterFactory factory : filterFactories) {
|
|
||||||
if (factory instanceof StopFilterFactory) {
|
|
||||||
// StopFilterFactory holds the stop words in a CharArraySet
|
|
||||||
solrStopWords.put(fieldName,
|
|
||||||
((StopFilterFactory) factory).getStopWords());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (factory instanceof CommonGramsFilterFactory) {
|
IndexSchema schema = core.getLatestSchema();
|
||||||
solrStopWords.put(fieldName,
|
final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
|
||||||
((CommonGramsFilterFactory) factory)
|
if (fieldAnalyzer instanceof TokenizerChain) {
|
||||||
.getCommonWords());
|
final TokenFilterFactory[] filterFactories =
|
||||||
|
((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
|
||||||
|
for (TokenFilterFactory factory : filterFactories) {
|
||||||
|
if (factory instanceof StopFilterFactory) {
|
||||||
|
// StopFilterFactory holds the stop words in a CharArraySet
|
||||||
|
CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords();
|
||||||
|
solrStopWords.get(fieldName).add(stopWords);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (factory instanceof CommonGramsFilterFactory) {
|
||||||
|
CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords();
|
||||||
|
solrStopWords.get(fieldName).add(commonWords);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return solrStopWords.get(fieldName);
|
||||||
}
|
}
|
||||||
return solrStopWords.get(fieldName);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -17,6 +17,9 @@
|
||||||
package org.apache.solr.handler.clustering.carrot2;
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -45,9 +48,6 @@ import org.carrot2.core.LanguageCode;
|
||||||
import org.carrot2.util.attribute.AttributeUtils;
|
import org.carrot2.util.attribute.AttributeUtils;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import com.google.common.collect.ImmutableList;
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
@ -211,7 +211,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
|
||||||
// stoplabels.mt, so we're expecting only one cluster with label "online".
|
// stoplabels.mt, so we're expecting only one cluster with label "online".
|
||||||
final List<NamedList<Object>> clusters = checkEngine(
|
final List<NamedList<Object>> clusters = checkEngine(
|
||||||
getClusteringEngine(engineName), 1, params);
|
getClusteringEngine(engineName), 1, params);
|
||||||
assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
|
assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -226,7 +226,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
|
||||||
// only one cluster with label "online".
|
// only one cluster with label "online".
|
||||||
final List<NamedList<Object>> clusters = checkEngine(
|
final List<NamedList<Object>> clusters = checkEngine(
|
||||||
getClusteringEngine("lexical-resource-check"), 1, params);
|
getClusteringEngine("lexical-resource-check"), 1, params);
|
||||||
assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
|
assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online"));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -243,9 +243,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
|
||||||
|
|
||||||
final List<NamedList<Object>> clusters = checkEngine(
|
final List<NamedList<Object>> clusters = checkEngine(
|
||||||
getClusteringEngine("lexical-resource-check"), 2, params);
|
getClusteringEngine("lexical-resource-check"), 2, params);
|
||||||
assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
|
assertEquals(Collections.singletonList("online"), getLabels(clusters.get(0)));
|
||||||
assertEquals(ImmutableList.of("solrownstopword"),
|
assertEquals(Collections.singletonList("solrownstopword"), getLabels(clusters.get(1)));
|
||||||
getLabels(clusters.get(1)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -395,8 +394,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
|
||||||
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-default");
|
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-default");
|
||||||
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
|
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
|
||||||
assertEquals(
|
assertEquals(
|
||||||
Lists.newArrayList("stc", "default", "mock"),
|
Arrays.asList("stc", "default", "mock"),
|
||||||
Lists.newArrayList(engines.keySet()));
|
new ArrayList<>(engines.keySet()));
|
||||||
assertEquals(
|
assertEquals(
|
||||||
LingoClusteringAlgorithm.class,
|
LingoClusteringAlgorithm.class,
|
||||||
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
|
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
|
||||||
|
@ -407,8 +406,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
|
||||||
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-decl-order");
|
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-decl-order");
|
||||||
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
|
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
|
||||||
assertEquals(
|
assertEquals(
|
||||||
Lists.newArrayList("unavailable", "lingo", "stc", "mock", "default"),
|
Arrays.asList("unavailable", "lingo", "stc", "mock", "default"),
|
||||||
Lists.newArrayList(engines.keySet()));
|
new ArrayList<>(engines.keySet()));
|
||||||
assertEquals(
|
assertEquals(
|
||||||
LingoClusteringAlgorithm.class,
|
LingoClusteringAlgorithm.class,
|
||||||
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
|
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
|
||||||
|
@ -419,8 +418,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
|
||||||
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-dups");
|
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-dups");
|
||||||
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
|
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
|
||||||
assertEquals(
|
assertEquals(
|
||||||
Lists.newArrayList("", "default"),
|
Arrays.asList("", "default"),
|
||||||
Lists.newArrayList(engines.keySet()));
|
new ArrayList<>(engines.keySet()));
|
||||||
assertEquals(
|
assertEquals(
|
||||||
MockClusteringAlgorithm.class,
|
MockClusteringAlgorithm.class,
|
||||||
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
|
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.handler.clustering.carrot2;
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.carrot2.core.Cluster;
|
import org.carrot2.core.Cluster;
|
||||||
|
@ -29,8 +30,6 @@ import org.carrot2.util.attribute.Bindable;
|
||||||
import org.carrot2.util.attribute.Input;
|
import org.carrot2.util.attribute.Input;
|
||||||
import org.carrot2.util.attribute.Output;
|
import org.carrot2.util.attribute.Output;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A mock Carrot2 clustering algorithm that outputs input documents as clusters.
|
* A mock Carrot2 clustering algorithm that outputs input documents as clusters.
|
||||||
* Useful only in tests.
|
* Useful only in tests.
|
||||||
|
@ -56,7 +55,7 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process() throws ProcessingException {
|
public void process() throws ProcessingException {
|
||||||
clusters = Lists.newArrayListWithCapacity(documents.size());
|
clusters = new ArrayList<>();
|
||||||
|
|
||||||
for (Document document : documents) {
|
for (Document document : documents) {
|
||||||
final Cluster cluster = new Cluster();
|
final Cluster cluster = new Cluster();
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.handler.clustering.carrot2;
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.carrot2.core.Cluster;
|
import org.carrot2.core.Cluster;
|
||||||
|
@ -36,8 +37,6 @@ import org.carrot2.util.attribute.Bindable;
|
||||||
import org.carrot2.util.attribute.Input;
|
import org.carrot2.util.attribute.Input;
|
||||||
import org.carrot2.util.attribute.Output;
|
import org.carrot2.util.attribute.Output;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A mock Carrot2 clustering algorithm that outputs stem of each token of each
|
* A mock Carrot2 clustering algorithm that outputs stem of each token of each
|
||||||
* document as a separate cluster. Useful only in tests.
|
* document as a separate cluster. Useful only in tests.
|
||||||
|
@ -64,7 +63,7 @@ public class EchoStemsClusteringAlgorithm extends ProcessingComponentBase
|
||||||
final AllTokens allTokens = preprocessingContext.allTokens;
|
final AllTokens allTokens = preprocessingContext.allTokens;
|
||||||
final AllWords allWords = preprocessingContext.allWords;
|
final AllWords allWords = preprocessingContext.allWords;
|
||||||
final AllStems allStems = preprocessingContext.allStems;
|
final AllStems allStems = preprocessingContext.allStems;
|
||||||
clusters = Lists.newArrayListWithCapacity(allTokens.image.length);
|
clusters = new ArrayList<>();
|
||||||
for (int i = 0; i < allTokens.image.length; i++) {
|
for (int i = 0; i < allTokens.image.length; i++) {
|
||||||
if (allTokens.wordIndex[i] >= 0) {
|
if (allTokens.wordIndex[i] >= 0) {
|
||||||
clusters.add(new Cluster(new String(
|
clusters.add(new Cluster(new String(
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.handler.clustering.carrot2;
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.carrot2.core.Cluster;
|
import org.carrot2.core.Cluster;
|
||||||
|
@ -33,7 +34,6 @@ import org.carrot2.util.attribute.Bindable;
|
||||||
import org.carrot2.util.attribute.Input;
|
import org.carrot2.util.attribute.Input;
|
||||||
import org.carrot2.util.attribute.Output;
|
import org.carrot2.util.attribute.Output;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A mock Carrot2 clustering algorithm that outputs each token of each document
|
* A mock Carrot2 clustering algorithm that outputs each token of each document
|
||||||
|
@ -58,8 +58,7 @@ public class EchoTokensClusteringAlgorithm extends ProcessingComponentBase
|
||||||
public void process() throws ProcessingException {
|
public void process() throws ProcessingException {
|
||||||
final PreprocessingContext preprocessingContext = preprocessing.preprocess(
|
final PreprocessingContext preprocessingContext = preprocessing.preprocess(
|
||||||
documents, "", LanguageCode.ENGLISH);
|
documents, "", LanguageCode.ENGLISH);
|
||||||
clusters = Lists
|
clusters = new ArrayList<>();
|
||||||
.newArrayListWithCapacity(preprocessingContext.allTokens.image.length);
|
|
||||||
for (char[] token : preprocessingContext.allTokens.image) {
|
for (char[] token : preprocessingContext.allTokens.image) {
|
||||||
if (token != null) {
|
if (token != null) {
|
||||||
clusters.add(new Cluster(new String(token)));
|
clusters.add(new Cluster(new String(token)));
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.handler.clustering.carrot2;
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.carrot2.core.Cluster;
|
import org.carrot2.core.Cluster;
|
||||||
|
@ -33,8 +34,6 @@ import org.carrot2.util.attribute.Bindable;
|
||||||
import org.carrot2.util.attribute.Input;
|
import org.carrot2.util.attribute.Input;
|
||||||
import org.carrot2.util.attribute.Output;
|
import org.carrot2.util.attribute.Output;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A mock implementation of Carrot2 clustering algorithm for testing whether the
|
* A mock implementation of Carrot2 clustering algorithm for testing whether the
|
||||||
* customized lexical resource lookup works correctly. This algorithm ignores
|
* customized lexical resource lookup works correctly. This algorithm ignores
|
||||||
|
@ -60,7 +59,7 @@ public class LexicalResourcesCheckClusteringAlgorithm extends
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process() throws ProcessingException {
|
public void process() throws ProcessingException {
|
||||||
clusters = Lists.newArrayList();
|
clusters = new ArrayList<>();
|
||||||
if (wordsToCheck == null) {
|
if (wordsToCheck == null) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,13 +15,13 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.handler.clustering.carrot2;
|
package org.apache.solr.handler.clustering.carrot2;
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
import org.carrot2.core.*;
|
import org.carrot2.core.*;
|
||||||
import org.carrot2.core.attribute.AttributeNames;
|
import org.carrot2.core.attribute.AttributeNames;
|
||||||
import org.carrot2.core.attribute.Processing;
|
import org.carrot2.core.attribute.Processing;
|
||||||
import org.carrot2.util.attribute.*;
|
import org.carrot2.util.attribute.*;
|
||||||
import org.carrot2.util.attribute.constraint.IntRange;
|
import org.carrot2.util.attribute.constraint.IntRange;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@Bindable(prefix = "MockClusteringAlgorithm")
|
@Bindable(prefix = "MockClusteringAlgorithm")
|
||||||
|
@ -62,7 +62,7 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process() throws ProcessingException {
|
public void process() throws ProcessingException {
|
||||||
clusters = Lists.newArrayList();
|
clusters = new ArrayList<>();
|
||||||
if (documents == null) {
|
if (documents == null) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
9d8b42afe43ba5c0a0c5d67208d5c919e45c3584
|
|
|
@ -0,0 +1 @@
|
||||||
|
5d76ec388711056bfaaacc354ed04ffa6811c7b7
|
Loading…
Reference in New Issue