SOLR-14926: Modernize and clean up search results clustering contrib.

Dawid Weiss 2020-11-03 09:31:53 +01:00 committed by GitHub
parent 5c02737918
commit 0f871b2c56
100 changed files with 3042 additions and 5114 deletions

View File

@@ -91,7 +91,7 @@ and Edmond Nolan.
 The Polish analyzer (stempel) comes with a default
 stopword list that is BSD-licensed created by the Carrot2 project. The file resides
 in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
-See http://project.carrot2.org/license.html.
+See https://project.carrot2.org/license.html.
 The SmartChineseAnalyzer source code (smartcn) was
 provided by Xiaoping Gao and copyright 2009 by www.imdict.net.

View File

@@ -1,5 +1,5 @@
 # This file was created from the carrot2 project and is distributed under the BSD license.
-# See http://project.carrot2.org/license.html
+# See https://project.carrot2.org/license.html
 # Also see http://www.opensource.org/licenses/bsd-license.html
 # From trunk/core/carrot2-util-text/src-resources/stopwords.pl
 vol

View File

@@ -135,9 +135,10 @@ public class MatchHighlighter {
   /**
    * Always fetch the given set of fields for all input documents.
    */
-  public void alwaysFetchFields(String field, String... otherFields) {
-    Stream.concat(Stream.of(field), Stream.of(otherFields))
-        .forEach(fld -> fieldsAlwaysReturned.add(Objects.requireNonNull(fld)));
+  public void alwaysFetchFields(String... fields) {
+    for (String fld : fields) {
+      fieldsAlwaysReturned.add(Objects.requireNonNull(fld));
+    }
   }

   /**
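For context, a minimal self-contained sketch of the simplified varargs pattern
(FieldCollector is an illustrative stand-in, not part of this commit; a real
MatchHighlighter additionally needs a configured searcher and analyzer):

    import java.util.LinkedHashSet;
    import java.util.Objects;
    import java.util.Set;

    class FieldCollector {
      private final Set<String> fieldsAlwaysReturned = new LinkedHashSet<>();

      // Same idea as the new signature above: a single varargs parameter
      // replaces the mandatory first argument plus varargs rest.
      void alwaysFetchFields(String... fields) {
        for (String fld : fields) {
          fieldsAlwaysReturned.add(Objects.requireNonNull(fld));
        }
      }
    }

Note one trade-off of the change: a call such as alwaysFetchFields("title", "url")
still works, but a zero-argument call now also compiles, which the previous
two-parameter signature deliberately prevented at compile time.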

View File

@@ -42,6 +42,13 @@ Improvements
 * SOLR-14972: Change default port of prometheus exporter to 8989 because it clashed with default embedded zookeeper port (janhoy)

+* SOLR-14926, SOLR-13506: Modernize and clean up the search results clustering contrib. This issue upgrades
+  the clustering contrib to the new Carrot2 4.x line, dropping several CVE-prone dependencies along the way.
+  The parameters and configuration of the contrib extensions have changed. The documentation in the Solr ref
+  guide has been rewritten from scratch to be up to date. The clustering code has been rewritten to work
+  properly regardless of the mode (standalone, distributed). The API has been stripped of ancient, unused
+  interfaces and simplified. (Dawid Weiss)
+
 Other Changes
 ----------------------
 * SOLR-14656: Autoscaling framework removed (Ishan Chattopadhyaya, noble, Ilan Ginzburg)
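As a hedged illustration of the new request surface, a SolrJ query against a
hypothetical "techproducts" collection (the component and an engine must
already be configured in solrconfig.xml; the field names are made up):

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.client.solrj.response.Cluster;
    import org.apache.solr.client.solrj.response.QueryResponse;

    public class ClusteringQueryExample {
      public static void main(String[] args) throws Exception {
        try (HttpSolrClient client =
            new HttpSolrClient.Builder("http://localhost:8983/solr/techproducts").build()) {
          SolrQuery query = new SolrQuery("memory");
          query.set("clustering", true);                   // enable the component
          query.set("clustering.fields", "name,features"); // content fields to cluster

          QueryResponse response = client.query(query);
          for (Cluster cluster : response.getClusteringResponse().getClusters()) {
            System.out.println(cluster.getLabels() + " -> " + cluster.getDocs());
          }
        }
      }
    }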

View File

@@ -221,7 +221,7 @@ and Edmond Nolan.
 The Polish analyzer (stempel) comes with a default
 stopword list that is BSD-licensed created by the Carrot2 project. The file resides
 in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
-See http://project.carrot2.org/license.html.
+See https://project.carrot2.org/license.html.
 The SmartChineseAnalyzer source code (smartcn) was
 provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
@@ -439,13 +439,12 @@ http://sourceforge.jp/projects/jsonic/
 =========================================================================
 == Carrot2 Notice ==
 =========================================================================
-Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
+Copyright (C) 2002-2020, Dawid Weiss, Stanislaw Osinski.
 Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
 All rights reserved.
 This product includes software developed by the Carrot2 Project.
-See http://project.carrot2.org/
+See https://project.carrot2.org/
 =========================================================================
 == Guava Notice ==

View File

@@ -18,14 +18,12 @@
 apply plugin: 'java-library'

-description = 'Clustering Integration'
+description = 'Search Results Clustering Integration'

 dependencies {
   implementation project(':solr:core')
   implementation project(':lucene:analysis:common')
-  implementation('org.carrot2:carrot2-mini', {
-    exclude group: "org.simpleframework", module: "simple-xml"
-  })
+  implementation 'org.carrot2:carrot2-core'

   testImplementation project(':solr:test-framework')
 }

View File

@@ -16,378 +16,496 @@
  */
 package org.apache.solr.handler.clustering;

-import java.io.IOException;
-import java.lang.invoke.MethodHandles;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.lang3.StringUtils;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexableField;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
-import org.apache.solr.common.SolrException;
-import org.apache.solr.common.SolrException.ErrorCode;
-import org.apache.solr.common.params.CommonParams;
-import org.apache.solr.common.params.SolrParams;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.core.SolrCore;
-import org.apache.solr.core.SolrResourceLoader;
-import org.apache.solr.handler.clustering.carrot2.CarrotClusteringEngine;
-import org.apache.solr.handler.component.ResponseBuilder;
-import org.apache.solr.handler.component.SearchComponent;
-import org.apache.solr.handler.component.ShardRequest;
-import org.apache.solr.schema.IndexSchema;
-import org.apache.solr.schema.SchemaField;
-import org.apache.solr.search.DocIterator;
-import org.apache.solr.search.DocList;
-import org.apache.solr.search.DocListAndSet;
-import org.apache.solr.search.SolrIndexSearcher;
-import org.apache.solr.util.plugin.SolrCoreAware;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Provides a plugin for performing cluster analysis. This can either be applied to
- * search results (e.g., via <a href="http://project.carrot2.org">Carrot<sup>2</sup></a>) or for
- * clustering documents (e.g., via <a href="http://mahout.apache.org/">Mahout</a>).
- * <p>
- * See Solr example for configuration examples.</p>
- *
- * @lucene.experimental
- */
-public class ClusteringComponent extends SearchComponent implements SolrCoreAware {
-  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
-  /**
-   * Base name for all component parameters. This name is also used to
-   * register this component with SearchHandler.
-   */
-  public static final String COMPONENT_NAME = "clustering";
-
-  /**
-   * Declaration-order list of search clustering engines.
-   */
-  private final LinkedHashMap<String, SearchClusteringEngine> searchClusteringEngines = new LinkedHashMap<>();
-
-  /**
-   * Declaration-order list of document clustering engines.
-   */
-  private final LinkedHashMap<String, DocumentClusteringEngine> documentClusteringEngines = new LinkedHashMap<>();
-
-  /**
-   * An unmodifiable view of {@link #searchClusteringEngines}.
-   */
-  private final Map<String, SearchClusteringEngine> searchClusteringEnginesView = Collections.unmodifiableMap(searchClusteringEngines);
-
-  /**
-   * Initialization parameters temporarily saved here, the component
-   * is initialized in {@link #inform(SolrCore)} because we need to know
-   * the core's {@link SolrResourceLoader}.
-   *
-   * @see #init(NamedList)
-   */
-  private NamedList<Object> initParams;
-
-  /**
-   * Convert a DocList to a SolrDocumentList.
-   *
-   * The optional param "ids" is populated with the lucene document id
-   * for each SolrDocument.
-   *
-   * @param docs The {@link org.apache.solr.search.DocList} to convert
-   * @param searcher The {@link org.apache.solr.search.SolrIndexSearcher} to use to load the docs from the Lucene index
-   * @param fields The names of the Fields to load
-   * @param ids A map to store the ids of the docs
-   * @return The new {@link SolrDocumentList} containing all the loaded docs
-   * @throws IOException if there was a problem loading the docs
-   * @since solr 1.4
-   */
-  public static SolrDocumentList docListToSolrDocumentList(
-      DocList docs,
-      SolrIndexSearcher searcher,
-      Set<String> fields,
-      Map<SolrDocument, Integer> ids) throws IOException {
-    IndexSchema schema = searcher.getSchema();
-    SolrDocumentList list = new SolrDocumentList();
-    list.setNumFound(docs.matches());
-    list.setMaxScore(docs.maxScore());
-    list.setStart(docs.offset());
-
-    DocIterator dit = docs.iterator();
-
-    while (dit.hasNext()) {
-      int docid = dit.nextDoc();
-
-      Document luceneDoc = searcher.doc(docid, fields);
-      SolrDocument doc = new SolrDocument();
-
-      for (IndexableField field : luceneDoc) {
-        if (null == fields || fields.contains(field.name())) {
-          SchemaField sf = schema.getField(field.name());
-          doc.addField(field.name(), sf.getType().toObject(field));
-        }
-      }
-      if (docs.hasScores() && (null == fields || fields.contains("score"))) {
-        doc.addField("score", dit.score());
-      }
-
-      list.add(doc);
-
-      if (ids != null) {
-        ids.put(doc, docid);
-      }
-    }
-    return list;
-  }
-
-  @Override
-  @SuppressWarnings({"rawtypes", "unchecked"})
-  public void init(NamedList args) {
-    this.initParams = args;
-    super.init(args);
-  }
-
-  @SuppressWarnings("unchecked")
-  @Override
-  public void inform(SolrCore core) {
-    if (initParams != null) {
-      log.info("Initializing Clustering Engines");
-
-      // Our target list of engines, split into search-results and document clustering.
-      SolrResourceLoader loader = core.getResourceLoader();
-
-      for (Map.Entry<String, Object> entry : initParams) {
-        if ("engine".equals(entry.getKey())) {
-          NamedList<Object> engineInitParams = (NamedList<Object>) entry.getValue();
-          Boolean optional = engineInitParams.getBooleanArg("optional");
-          optional = (optional == null ? Boolean.FALSE : optional);
-
-          String engineClassName = StringUtils.defaultIfBlank(
-              (String) engineInitParams.get("classname"),
-              CarrotClusteringEngine.class.getName());
-
-          // Instantiate the clustering engine and split to appropriate map.
-          final ClusteringEngine engine = loader.newInstance(engineClassName, ClusteringEngine.class);
-          final String name = StringUtils.defaultIfBlank(engine.init(engineInitParams, core), "");
-
-          if (!engine.isAvailable()) {
-            if (optional) {
-              log.info("Optional clustering engine not available: {}", name);
-            } else {
-              throw new SolrException(ErrorCode.SERVER_ERROR,
-                  "A required clustering engine failed to initialize, check the logs: " + name);
-            }
-          }
-
-          final ClusteringEngine previousEntry;
-          if (engine instanceof SearchClusteringEngine) {
-            previousEntry = searchClusteringEngines.put(name, (SearchClusteringEngine) engine);
-          } else if (engine instanceof DocumentClusteringEngine) {
-            previousEntry = documentClusteringEngines.put(name, (DocumentClusteringEngine) engine);
-          } else {
-            log.warn("Unknown type of a clustering engine for class: {}", engineClassName);
-            continue;
-          }
-          if (previousEntry != null) {
-            log.warn("Duplicate clustering engine component named '{}'.", name);
-          }
-        }
-      }
-
-      // Set up the default engine key for both types of engines.
-      setupDefaultEngine("search results clustering", searchClusteringEngines);
-      setupDefaultEngine("document clustering", documentClusteringEngines);
-
-      log.info("Finished Initializing Clustering Engines");
-    }
-  }
-
-  @Override
-  public void prepare(ResponseBuilder rb) throws IOException {
-    SolrParams params = rb.req.getParams();
-    if (!params.getBool(COMPONENT_NAME, false)) {
-      return;
-    }
-  }
-
-  @Override
-  public void process(ResponseBuilder rb) throws IOException {
-    SolrParams params = rb.req.getParams();
-    if (!params.getBool(COMPONENT_NAME, false)) {
-      return;
-    }
-
-    final String name = getClusteringEngineName(rb);
-    boolean useResults = params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false);
-    if (useResults == true) {
-      SearchClusteringEngine engine = searchClusteringEngines.get(name);
-      if (engine != null) {
-        checkAvailable(name, engine);
-        DocListAndSet results = rb.getResults();
-        Map<SolrDocument, Integer> docIds = new HashMap<>(results.docList.size());
-        SolrDocumentList solrDocList = docListToSolrDocumentList(
-            results.docList, rb.req.getSearcher(), engine.getFieldsToLoad(rb.req), docIds);
-        Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
-        rb.rsp.add("clusters", clusters);
-      } else {
-        log.warn("No engine named: {}", name);
-      }
-    }
-
-    boolean useCollection = params.getBool(ClusteringParams.USE_COLLECTION, false);
-    if (useCollection == true) {
-      DocumentClusteringEngine engine = documentClusteringEngines.get(name);
-      if (engine != null) {
-        checkAvailable(name, engine);
-        boolean useDocSet = params.getBool(ClusteringParams.USE_DOC_SET, false);
-        NamedList<?> nl = null;
-
-        // TODO: This likely needs to be made into a background task that runs in an executor
-        if (useDocSet == true) {
-          nl = engine.cluster(rb.getResults().docSet, params);
-        } else {
-          nl = engine.cluster(params);
-        }
-        rb.rsp.add("clusters", nl);
-      } else {
-        log.warn("No engine named: {}", name);
-      }
-    }
-  }
-
-  private void checkAvailable(String name, ClusteringEngine engine) {
-    if (!engine.isAvailable()) {
-      throw new SolrException(ErrorCode.SERVER_ERROR,
-          "Clustering engine declared, but not available, check the logs: " + name);
-    }
-  }
-
-  private String getClusteringEngineName(ResponseBuilder rb) {
-    return rb.req.getParams().get(ClusteringParams.ENGINE_NAME, ClusteringEngine.DEFAULT_ENGINE_NAME);
-  }
-
-  @Override
-  public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
-    SolrParams params = rb.req.getParams();
-    if (!params.getBool(COMPONENT_NAME, false) || !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
-      return;
-    }
-    sreq.params.remove(COMPONENT_NAME);
-    if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
-      String fl = sreq.params.get(CommonParams.FL, "*");
-      // if fl=* then we don't need to check.
-      if (fl.indexOf('*') >= 0) {
-        return;
-      }
-      String name = getClusteringEngineName(rb);
-      SearchClusteringEngine engine = searchClusteringEngines.get(name);
-      if (engine != null) {
-        checkAvailable(name, engine);
-        Set<String> fields = engine.getFieldsToLoad(rb.req);
-        if (fields == null || fields.size() == 0) {
-          return;
-        }
-
-        StringBuilder sb = new StringBuilder();
-        String[] flparams = fl.split("[,\\s]+");
-        Set<String> flParamSet = new HashSet<>(flparams.length);
-        for (String flparam : flparams) {
-          // no need trim() because of split() by \s+
-          flParamSet.add(flparam);
-        }
-        for (String aFieldToLoad : fields) {
-          if (!flParamSet.contains(aFieldToLoad)) {
-            sb.append(',').append(aFieldToLoad);
-          }
-        }
-        if (sb.length() > 0) {
-          sreq.params.set(CommonParams.FL, fl + sb.toString());
-        }
-      } else {
-        log.warn("No engine named: {}", name);
-      }
-    }
-  }
-
-  @Override
-  public void finishStage(ResponseBuilder rb) {
-    SolrParams params = rb.req.getParams();
-    if (!params.getBool(COMPONENT_NAME, false) ||
-        !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
-      return;
-    }
-    if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
-      String name = getClusteringEngineName(rb);
-      SearchClusteringEngine engine = searchClusteringEngines.get(name);
-      if (engine != null) {
-        checkAvailable(name, engine);
-        SolrDocumentList solrDocList = (SolrDocumentList) rb.rsp.getResponse();
-        // TODO: Currently, docIds is set to null in distributed environment.
-        // This causes CarrotParams.PRODUCE_SUMMARY doesn't work.
-        // To work CarrotParams.PRODUCE_SUMMARY under distributed mode, we can choose either one of:
-        // (a) In each shard, ClusteringComponent produces summary and finishStage()
-        //     merges these summaries.
-        // (b) Adding doHighlighting(SolrDocumentList, ...) method to SolrHighlighter and
-        //     making SolrHighlighter uses "external text" rather than stored values to produce snippets.
-        Map<SolrDocument, Integer> docIds = null;
-        Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
-        rb.rsp.add("clusters", clusters);
-      } else {
-        log.warn("No engine named: {}", name);
-      }
-    }
-  }
-
-  /**
-   * @return Expose for tests.
-   */
-  Map<String, SearchClusteringEngine> getSearchClusteringEngines() {
-    return searchClusteringEnginesView;
-  }
-
-  @Override
-  public String getDescription() {
-    return "A Clustering component";
-  }
-
-  /**
-   * Setup the default clustering engine.
-   * @see "https://issues.apache.org/jira/browse/SOLR-5219"
-   */
-  private static <T extends ClusteringEngine> void setupDefaultEngine(String type, LinkedHashMap<String, T> map) {
-    // If there's already a default algorithm, leave it as is.
-    String engineName = ClusteringEngine.DEFAULT_ENGINE_NAME;
-    T defaultEngine = map.get(engineName);
-
-    if (defaultEngine == null ||
-        !defaultEngine.isAvailable()) {
-      // If there's no default algorithm, and there are any algorithms available,
-      // the first definition becomes the default algorithm.
-      for (Map.Entry<String, T> e : map.entrySet()) {
-        if (e.getValue().isAvailable()) {
-          engineName = e.getKey();
-          defaultEngine = e.getValue();
-          map.put(ClusteringEngine.DEFAULT_ENGINE_NAME, defaultEngine);
-          break;
-        }
-      }
-    }
-
-    if (defaultEngine != null) {
-      if (log.isInfoEnabled()) {
-        log.info("Default engine for {}: {} [{}]", type, engineName, defaultEngine.getClass().getSimpleName());
-      }
-    } else {
-      log.warn("No default engine for {}.", type);
-    }
-  }
-}
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TotalHits;
+import org.apache.solr.client.solrj.response.ClusteringResponse;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.params.HighlightParams;
+import org.apache.solr.common.params.ShardParams;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.SimpleOrderedMap;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.handler.component.HighlightComponent;
+import org.apache.solr.handler.component.ResponseBuilder;
+import org.apache.solr.handler.component.SearchComponent;
+import org.apache.solr.handler.component.ShardRequest;
+import org.apache.solr.highlight.SolrHighlighter;
+import org.apache.solr.request.LocalSolrQueryRequest;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.DocList;
+import org.apache.solr.search.DocSlice;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.util.plugin.SolrCoreAware;
+import org.carrot2.clustering.Cluster;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+/**
+ * A {@link SearchComponent} for dynamic, unsupervised grouping of
+ * search results based on the content of their text fields or contextual
+ * snippets around query-matching regions.
+ *
+ * <p>
+ * The default implementation uses clustering algorithms from the
+ * <a href="https://project.carrot2.org">Carrot<sup>2</sup> project</a>.
+ *
+ * @lucene.experimental
+ */
+public class ClusteringComponent extends SearchComponent implements SolrCoreAware {
+  /**
+   * Default component name and parameter prefix.
+   */
+  public static final String COMPONENT_NAME = "clustering";
+
+  /**
+   * Request parameter that selects one of the {@link Engine} configurations
+   * out of many possibly defined in the component's initialization parameters.
+   */
+  public static final String REQUEST_PARAM_ENGINE = COMPONENT_NAME + ".engine";
+
+  /**
+   * Engine configuration initialization block name.
+   */
+  public static final String INIT_SECTION_ENGINE = "engine";
+
+  /**
+   * Response section name containing output clusters.
+   */
+  public static final String RESPONSE_SECTION_CLUSTERS = "clusters";
+
+  /**
+   * Default log sink.
+   */
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  /**
+   * An internal request parameter for shard requests used for collecting
+   * input documents for clustering.
+   */
+  private static final String REQUEST_PARAM_COLLECT_INPUTS = COMPONENT_NAME + ".collect-inputs";
+
+  /**
+   * Shard request response section name containing partial document inputs.
+   */
+  private static final String RESPONSE_SECTION_INPUT_DOCUMENTS = "clustering-inputs";
+
+  /**
+   * All engines declared in this component's initialization block.
+   */
+  private final List<EngineEntry> declaredEngines = new ArrayList<>();
+
+  /**
+   * Declaration-order list of available search clustering engines.
+   */
+  private final LinkedHashMap<String, EngineEntry> engines = new LinkedHashMap<>();
+
+  private static boolean isComponentEnabled(ResponseBuilder rb) {
+    return rb.req.getParams().getBool(COMPONENT_NAME, false);
+  }
+
+  private static List<InputDocument> documentsFromNamedList(List<NamedList<Object>> docList) {
+    return docList.stream()
+        .map(docProps -> {
+          InputDocument doc = new InputDocument(
+              docProps.get("id"),
+              (String) docProps.get("language"));
+          docProps.forEach((fieldName, value) -> {
+            doc.addClusteredField(fieldName, (String) value);
+          });
+          return doc;
+        })
+        .collect(Collectors.toList());
+  }
+
+  private static List<NamedList<Object>> documentsToNamedList(List<InputDocument> documents) {
+    return documents.stream()
+        .map(doc -> {
+          NamedList<Object> docProps = new SimpleOrderedMap<>();
+          docProps.add("id", doc.getId());
+          docProps.add("language", doc.language());
+          doc.visitFields(docProps::add);
+          return docProps;
+        })
+        .collect(Collectors.toList());
+  }
+
+  private static List<NamedList<Object>> clustersToNamedList(List<InputDocument> documents,
+                                                             List<Cluster<InputDocument>> clusters,
+                                                             EngineParameters params) {
+    List<NamedList<Object>> result = new ArrayList<>();
+    clustersToNamedListRecursive(clusters, result, params);
+
+    if (params.includeOtherTopics()) {
+      LinkedHashSet<InputDocument> clustered = new LinkedHashSet<>();
+      clusters.forEach(cluster -> collectUniqueDocuments(cluster, clustered));
+      List<InputDocument> unclustered = documents.stream()
+          .filter(doc -> !clustered.contains(doc))
+          .collect(Collectors.toList());
+
+      if (!unclustered.isEmpty()) {
+        NamedList<Object> cluster = new SimpleOrderedMap<>();
+        result.add(cluster);
+        cluster.add(ClusteringResponse.IS_OTHER_TOPICS, true);
+        cluster.add(ClusteringResponse.LABELS_NODE, Collections.singletonList("Other topics"));
+        cluster.add(ClusteringResponse.SCORE_NODE, 0d);
+        cluster.add(ClusteringResponse.DOCS_NODE, unclustered.stream().map(InputDocument::getId)
+            .collect(Collectors.toList()));
+      }
+    }
+
+    return result;
+  }
+
+  private static void clustersToNamedListRecursive(
+      List<Cluster<InputDocument>> outputClusters,
+      List<NamedList<Object>> parent, EngineParameters params) {
+    for (Cluster<InputDocument> cluster : outputClusters) {
+      NamedList<Object> converted = new SimpleOrderedMap<>();
+      parent.add(converted);
+
+      // Add labels
+      List<String> labels = cluster.getLabels();
+      if (labels.size() > params.maxLabels()) {
+        labels = labels.subList(0, params.maxLabels());
+      }
+      converted.add(ClusteringResponse.LABELS_NODE, labels);
+
+      // Add cluster score
+      final Double score = cluster.getScore();
+      if (score != null) {
+        converted.add(ClusteringResponse.SCORE_NODE, score);
+      }
+
+      List<InputDocument> docs;
+      if (params.includeSubclusters()) {
+        docs = cluster.getDocuments();
+      } else {
+        docs = new ArrayList<>(collectUniqueDocuments(cluster, new LinkedHashSet<>()));
+      }
+
+      converted.add(ClusteringResponse.DOCS_NODE, docs.stream().map(InputDocument::getId)
+          .collect(Collectors.toList()));
+
+      if (params.includeSubclusters() && !cluster.getClusters().isEmpty()) {
+        List<NamedList<Object>> subclusters = new ArrayList<>();
+        converted.add(ClusteringResponse.CLUSTERS_NODE, subclusters);
+        clustersToNamedListRecursive(cluster.getClusters(), subclusters, params);
+      }
+    }
+  }
+
+  private static LinkedHashSet<InputDocument> collectUniqueDocuments(Cluster<InputDocument> cluster, LinkedHashSet<InputDocument> unique) {
+    unique.addAll(cluster.getDocuments());
+    for (Cluster<InputDocument> sub : cluster.getClusters()) {
+      collectUniqueDocuments(sub, unique);
+    }
+    return unique;
+  }
+
+  @Override
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  public void init(NamedList args) {
+    super.init(args);
+
+    if (args != null) {
+      @SuppressWarnings("unchecked")
+      NamedList<Object> initParams = (NamedList<Object>) args;
+      for (Map.Entry<String, Object> entry : initParams) {
+        if (!INIT_SECTION_ENGINE.equals(entry.getKey())) {
+          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+              "Unrecognized configuration entry: " + entry.getKey());
+        }
+        declaredEngines.add(new EngineEntry(((NamedList<Object>) entry.getValue()).toSolrParams()));
+      }
+    }
+  }
+
+  @SuppressWarnings("unchecked")
+  @Override
+  public void inform(SolrCore core) {
+    declaredEngines.forEach(engineEntry -> {
+      if (!engineEntry.initialize(core)) {
+        if (engineEntry.optional) {
+          if (log.isInfoEnabled()) {
+            log.info("Optional clustering engine is not available: {}", engineEntry.engineName);
+          }
+        } else {
+          throw new SolrException(ErrorCode.SERVER_ERROR,
+              "A required clustering engine failed to initialize, check the logs: " + engineEntry.engineName);
+        }
+      } else {
+        if (engines.put(engineEntry.engineName, engineEntry) != null) {
+          throw new SolrException(ErrorCode.SERVER_ERROR,
+              String.format(Locale.ROOT,
+                  "Duplicate clustering engine named '%s'.", engineEntry.engineName));
+        }
+      }
+    });
+
+    if (engines.size() > 0) {
+      if (log.isInfoEnabled()) {
+        log.info("The following clustering engines are available: {}",
+            String.join(", ", engines.keySet()));
+      }
+    } else {
+      log.warn("No clustering engines are available.");
+    }
+  }
+
+  @Override
+  public void prepare(ResponseBuilder rb) {
+    // Do nothing.
+  }
+
+  /**
+   * Entry point for clustering in local server mode (non-distributed).
+   *
+   * @param rb The {@link ResponseBuilder}.
+   * @throws IOException Propagated if an I/O exception occurs.
+   */
+  @Override
+  public void process(ResponseBuilder rb) throws IOException {
+    if (!isComponentEnabled(rb)) {
+      return;
+    }
+
+    EngineEntry engine = getEngine(rb);
+    EngineParameters parameters = engine.defaults.derivedFrom(rb.req.getParams());
+
+    List<InputDocument> inputs = getDocuments(rb, parameters);
+
+    if (rb.req.getParams().getBool(ShardParams.IS_SHARD, false) &&
+        rb.req.getParams().getBool(REQUEST_PARAM_COLLECT_INPUTS, false)) {
+      rb.rsp.add(RESPONSE_SECTION_INPUT_DOCUMENTS, documentsToNamedList(inputs));
+    } else {
+      doCluster(rb, engine, inputs, parameters);
+    }
+  }
+
+  @Override
+  public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
+    if (!isComponentEnabled(rb)) {
+      return;
+    }
+
+    // Make sure the component is enabled for shard requests.
+    assert sreq.params.getBool(COMPONENT_NAME, false) :
+        "Shard request should propagate clustering component enabled state?";
+
+    // Piggyback collecting inputs for clustering on top of the get-fields request.
+    if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
+      sreq.params.set(REQUEST_PARAM_COLLECT_INPUTS, true);
+    }
+  }
+
+  @Override
+  public void finishStage(ResponseBuilder rb) {
+    if (!isComponentEnabled(rb)) {
+      return;
+    }
+
+    if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
+      List<InputDocument> inputs = new ArrayList<>();
+      rb.finished.stream()
+          .filter(shardRequest -> (shardRequest.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0)
+          .flatMap(shardRequest -> shardRequest.responses.stream())
+          .filter(rsp -> rsp.getException() == null)
+          .map(rsp -> rsp.getSolrResponse().getResponse())
+          .forEach(response -> {
+            @SuppressWarnings("unchecked")
+            List<NamedList<Object>> partialInputs = (List<NamedList<Object>>) response.get(RESPONSE_SECTION_INPUT_DOCUMENTS);
+            if (partialInputs != null) {
+              inputs.addAll(documentsFromNamedList(partialInputs));
+            }
+          });
+
+      EngineEntry engine = getEngine(rb);
+      EngineParameters parameters = engine.defaults.derivedFrom(rb.req.getParams());
+      doCluster(rb, engine, inputs, parameters);
+    }
+  }
+
+  /**
+   * Run clustering of input documents and append the result to the response.
+   */
+  private void doCluster(ResponseBuilder rb, EngineEntry engine, List<InputDocument> inputs, EngineParameters parameters) {
+    List<Cluster<InputDocument>> clusters = engine.get().cluster(parameters, rb.getQuery(), inputs);
+    rb.rsp.add(RESPONSE_SECTION_CLUSTERS, clustersToNamedList(inputs, clusters, parameters));
+  }
+
+  /**
+   * Prepares input documents for clustering.
+   */
+  private List<InputDocument> getDocuments(ResponseBuilder responseBuilder,
+                                           EngineParameters requestParameters) throws IOException {
+    SolrQueryRequest solrRequest = responseBuilder.req;
+    Query query = responseBuilder.getQuery();
+    SolrIndexSearcher indexSearcher = responseBuilder.req.getSearcher();
+    SolrCore core = solrRequest.getCore();
+
+    String[] fieldsToCluster = requestParameters.fields().toArray(String[]::new);
+
+    IndexSchema schema = indexSearcher.getSchema();
+
+    boolean preferQueryContext = requestParameters.preferQueryContext();
+    SolrQueryRequest req = null;
+    SolrHighlighter highlighter = null;
+    if (preferQueryContext) {
+      highlighter = ((HighlightComponent) core.getSearchComponents().get(HighlightComponent.COMPONENT_NAME)).getHighlighter();
+      if (highlighter != null) {
+        Map<String, Object> args = new HashMap<>();
+        args.put(HighlightParams.FIELDS, fieldsToCluster);
+        args.put(HighlightParams.HIGHLIGHT, "true");
+        // We don't want any highlight marks.
+        args.put(HighlightParams.SIMPLE_PRE, "");
+        args.put(HighlightParams.SIMPLE_POST, "");
+        args.put(HighlightParams.FRAGSIZE, requestParameters.contextSize());
+        args.put(HighlightParams.SNIPPETS, requestParameters.contextCount());
+        req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
+          @Override
+          public SolrIndexSearcher getSearcher() {
+            return indexSearcher;
+          }
+        };
+      } else {
+        log.warn("No highlighter configured, cannot produce summary");
+        preferQueryContext = false;
+      }
+    }
+
+    Map<String, Function<IndexableField, String>> fieldsToLoad = new LinkedHashMap<>();
+    for (String fld : requestParameters.getFieldsToLoad()) {
+      FieldType type = schema.getField(fld).getType();
+      fieldsToLoad.put(fld, (fieldValue) -> type.toObject(fieldValue).toString());
+    }
+
+    Function<Map<String, String>, String> docLanguage;
+    String languageField = requestParameters.languageField();
+    if (languageField != null) {
+      docLanguage = (doc) -> doc.getOrDefault(languageField, requestParameters.language());
+    } else {
+      docLanguage = (doc) -> requestParameters.language();
+    }
+
+    List<InputDocument> result = new ArrayList<>();
+    DocIterator it = responseBuilder.getResults().docList.iterator();
+    while (it.hasNext()) {
+      int docId = it.nextDoc();
+
+      Map<String, String> docFieldValues = new LinkedHashMap<>();
+      for (IndexableField indexableField : indexSearcher.doc(docId, fieldsToLoad.keySet())) {
+        String fieldName = indexableField.name();
+        Function<IndexableField, String> toString = fieldsToLoad.get(fieldName);
+        if (toString != null) {
+          String value = toString.apply(indexableField);
+          docFieldValues.compute(fieldName, (k, v) -> {
+            if (v == null) {
+              return value;
+            } else {
+              return v + " . " + value;
+            }
+          });
+        }
+      }
+
+      InputDocument inputDocument = new InputDocument(
+          docFieldValues.get(requestParameters.docIdField()),
+          docLanguage.apply(docFieldValues));
+      result.add(inputDocument);
+
+      Function<String, String> snippetProvider = (field) -> null;
+      if (preferQueryContext) {
+        DocList docAsList = new DocSlice(0, 1,
+            new int[]{docId},
+            new float[]{1.0f},
+            1,
+            1.0f,
+            TotalHits.Relation.EQUAL_TO);
+        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, query, req, fieldsToCluster);
+        if (highlights != null && highlights.size() == 1) {
+          @SuppressWarnings("unchecked")
+          NamedList<String[]> tmp = (NamedList<String[]>) highlights.getVal(0);
+          snippetProvider = (field) -> {
+            String[] values = tmp.get(field);
+            if (values == null) {
+              return null;
+            } else {
+              return String.join(" . ", Arrays.asList(values));
+            }
+          };
+        }
+      }
+
+      Function<String, String> fullValueProvider = docFieldValues::get;
+
+      for (String field : fieldsToCluster) {
+        String values = snippetProvider.apply(field);
+        if (values == null) {
+          values = fullValueProvider.apply(field);
+        }
+        if (values != null) {
+          inputDocument.addClusteredField(field, values);
+        }
+      }
+    }
+
+    return result;
+  }
+
+  private EngineEntry getEngine(ResponseBuilder rb) {
+    if (engines.isEmpty()) {
+      throw new SolrException(ErrorCode.SERVER_ERROR,
+          "No clustering engines are defined or loaded.");
+    }
+
+    EngineEntry engine;
+    String name = rb.req.getParams().get(REQUEST_PARAM_ENGINE, null);
+    if (name != null) {
+      engine = engines.get(name);
+      if (engine == null) {
+        throw new SolrException(ErrorCode.SERVER_ERROR,
+            "Clustering engine unknown or not loaded: " + name);
+      }
+    } else {
+      engine = engines.values().iterator().next();
+    }
+    return engine;
+  }
+
+  /**
+   * @return A set of initialized clustering engine names, exposed for tests only.
+   */
+  Set<String> getEngineNames() {
+    return engines.keySet();
+  }
+
+  @Override
+  public String getDescription() {
+    return "Search results clustering component";
+  }
+}
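The distributed flow above hinges on shards serializing their clustering
inputs as NamedList structures inside the shard response. A stand-alone
sketch of that round-trip idea (the field names are illustrative, and
InputDocument itself is omitted):

    import org.apache.solr.common.util.NamedList;
    import org.apache.solr.common.util.SimpleOrderedMap;

    public class ShardRoundTripSketch {
      public static void main(String[] args) {
        // On a shard: flatten one input document into an ordered NamedList,
        // mirroring documentsToNamedList() above.
        NamedList<Object> docProps = new SimpleOrderedMap<>();
        docProps.add("id", "doc-1");
        docProps.add("language", "English");
        docProps.add("title", "Apache Solr result clustering");

        // On the aggregator: entries come back in declaration order,
        // mirroring documentsFromNamedList() above.
        docProps.forEach((name, value) -> System.out.println(name + " = " + value));
      }
    }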

View File

@ -1,41 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
/**
* A base class for {@link SearchClusteringEngine} and {@link DocumentClusteringEngine}.
* @lucene.experimental
*/
public abstract class ClusteringEngine {
public static final String ENGINE_NAME = "name";
public static final String DEFAULT_ENGINE_NAME = "default";
private String name;
public String init(NamedList<?> config, SolrCore core) {
name = (String) config.get(ENGINE_NAME);
return name;
}
public String getName() {
return name;
}
public abstract boolean isAvailable();
}

View File

@ -1,35 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
/**
* @lucene.experimental
*/
public interface ClusteringParams {
public static final String CLUSTERING_PREFIX = "clustering.";
public static final String ENGINE_NAME = CLUSTERING_PREFIX + "engine";
public static final String USE_SEARCH_RESULTS = CLUSTERING_PREFIX + "results";
public static final String USE_COLLECTION = CLUSTERING_PREFIX + "collection";
/**
* When clustering full documents, cluster on the Doc Set.
*/
public static final String USE_DOC_SET = CLUSTERING_PREFIX + "docs.useDocSet";
}

View File

@ -1,47 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.search.DocSet;
/**
* @lucene.experimental
*/
public abstract class DocumentClusteringEngine extends ClusteringEngine {
/**
* Experimental. Subject to change before the next release
*
* Cluster all the documents in the index. Clustering is often an expensive task that can take a long time.
* @param solrParams The params controlling clustering
* @return The clustering results
*/
public abstract NamedList<?> cluster(SolrParams solrParams);
/**
* Experimental. Subject to change before the next release
*
* Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time.
* @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}
* @param solrParams The params controlling the clustering
* @return The results.
*/
public abstract NamedList<?> cluster(DocSet docs, SolrParams solrParams);
}

View File

@@ -0,0 +1,195 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.core.SolrCore;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.language.LanguageComponents;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.invoke.MethodHandles;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
*
* @lucene.experimental
* @see "https://project.carrot2.org"
*/
final class Engine {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* All resources required for the clustering engine.
*/
private EngineContext engineContext;
boolean init(String engineName, SolrCore core, EngineParameters defaultParams) {
log.info("Initializing clustering engine: {}", engineName);
this.engineContext = new EngineContext(defaultParams.resources(), core);
{
ClusteringAlgorithm defaultAlgorithm = engineContext.getAlgorithm(defaultParams.algorithmName());
LanguageComponents defaultLanguage = engineContext.getLanguage(defaultParams.language());
if (defaultAlgorithm == null) {
log.warn("The default clustering algorithm for engine '{}' is not available: {}",
engineName, defaultParams.algorithmName());
}
if (defaultLanguage == null) {
log.warn("The default language for engine {} is not available: {}",
engineName, defaultParams.language());
}
return (defaultAlgorithm != null && defaultLanguage != null);
}
}
List<Cluster<InputDocument>> cluster(EngineParameters parameters, Query query, List<InputDocument> documents) {
try {
checkParameters(parameters);
ClusteringAlgorithm algorithm = engineContext.getAlgorithm(parameters.algorithmName());
populateAlgorithmParameters(query, parameters, algorithm);
// Sort documents by ID so that results are not order-sensitive.
documents.sort(Comparator.comparing(a -> a.getId().toString()));
// Split documents into language groups.
String defaultLanguage = parameters.language();
Map<String, List<InputDocument>> documentsByLanguage =
documents.stream()
.collect(
Collectors.groupingBy(
doc -> {
String lang = doc.language();
return lang == null ? defaultLanguage : lang;
}));
// Cluster documents within each language group.
HashSet<String> warnOnce = new HashSet<>();
LinkedHashMap<String, List<Cluster<InputDocument>>> clustersByLanguage =
new LinkedHashMap<>();
for (Map.Entry<String, List<InputDocument>> e : documentsByLanguage.entrySet()) {
String lang = e.getKey();
if (!engineContext.isLanguageSupported(lang)) {
if (warnOnce.add(lang)) {
log.warn(
"Language '{}' is not supported, documents in this "
+ "language will not be clustered.", lang);
}
} else {
LanguageComponents langComponents = engineContext.getLanguage(lang);
if (!algorithm.supports(langComponents)) {
if (warnOnce.add(lang)) {
log.warn(
"Language '{}' is not supported by algorithm '{}', documents in this "
+ "language will not be clustered.", lang, parameters.algorithmName());
}
} else {
clustersByLanguage.put(
lang, algorithm.cluster(e.getValue().stream(), langComponents));
}
}
}
List<Cluster<InputDocument>> clusters;
if (clustersByLanguage.size() == 1) {
clusters = clustersByLanguage.values().iterator().next();
} else {
clusters = clustersByLanguage.entrySet().stream()
.map(e -> {
Cluster<InputDocument> cluster = new Cluster<>();
cluster.addLabel(e.getKey());
e.getValue().forEach(cluster::addCluster);
return cluster;
})
.collect(Collectors.toList());
}
return clusters;
} catch (Exception e) {
log.error("Clustering request failed.", e);
throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
}
}
private void populateAlgorithmParameters(Query query, EngineParameters requestParameters, ClusteringAlgorithm algorithm) {
LinkedHashMap<String, String> attrs = requestParameters.otherParameters();
// Set the optional query hint; we extract just the query terms.
if (!attrs.containsKey("queryHint")) {
Set<String> termSet = new LinkedHashSet<>();
query.visit(new QueryVisitor() {
@Override
public void consumeTerms(Query query, Term... terms) {
for (Term t : terms) {
termSet.add(t.text());
}
}
});
attrs.put("queryHint", String.join(" ", termSet));
}
algorithm.accept(new FlatKeysAttrVisitor(attrs));
}
private void checkParameters(EngineParameters parameters) {
ClusteringAlgorithm algorithm = engineContext.getAlgorithm(parameters.algorithmName());
if (algorithm == null) {
throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
"Algorithm '%s' not found.",
parameters.algorithmName()));
}
String defaultLanguage = parameters.language();
LanguageComponents languageComponents = engineContext.getLanguage(defaultLanguage);
if (languageComponents == null) {
throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
"Language '%s' is not supported.",
defaultLanguage));
}
if (!algorithm.supports(languageComponents)) {
throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
"Language '%s' is not supported by algorithm '%s'.",
defaultLanguage,
parameters.algorithmName()));
}
if (parameters.fields().isEmpty()) {
throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
"At least one field name specifying content for clustering is required in parameter '%s'.",
EngineParameters.PARAM_FIELDS));
}
}
}
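The queryHint attribute above feeds the clustering algorithm the plain query
terms. A self-contained sketch of the same QueryVisitor-based term extraction
(the query contents are made up):

    import java.util.LinkedHashSet;
    import java.util.Set;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.QueryVisitor;
    import org.apache.lucene.search.TermQuery;

    public class QueryHintSketch {
      public static void main(String[] args) {
        Query query = new BooleanQuery.Builder()
            .add(new TermQuery(new Term("title", "solr")), BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term("title", "clustering")), BooleanClause.Occur.SHOULD)
            .build();

        // Collect unique term texts in visiting order.
        Set<String> termSet = new LinkedHashSet<>();
        query.visit(new QueryVisitor() {
          @Override
          public void consumeTerms(Query q, Term... terms) {
            for (Term t : terms) {
              termSet.add(t.text());
            }
          }
        });

        System.out.println(String.join(" ", termSet)); // -> solr clustering
      }
    }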

View File

@@ -0,0 +1,177 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.apache.solr.core.SolrCore;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.ClusteringAlgorithmProvider;
import org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.clustering.stc.STCClusteringAlgorithm;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LanguageComponentsLoader;
import org.carrot2.language.LoadedLanguages;
import org.carrot2.util.ChainedResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.ServiceLoader;
import java.util.function.Supplier;
import java.util.stream.Collectors;
/**
* Clustering engine context: algorithms, preloaded language
* resources and initial validation.
*/
final class EngineContext {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final LinkedHashMap<String, LanguageComponents> languages;
private final Map<String, ClusteringAlgorithmProvider> algorithmProviders;
private final static Map<String, String> aliasedNames;
static {
aliasedNames = new HashMap<>();
aliasedNames.put(LingoClusteringAlgorithm.class.getName(), LingoClusteringAlgorithm.NAME);
aliasedNames.put(STCClusteringAlgorithm.class.getName(), STCClusteringAlgorithm.NAME);
aliasedNames.put(BisectingKMeansClusteringAlgorithm.class.getName(), BisectingKMeansClusteringAlgorithm.NAME);
}
EngineContext(String resourcesPath, SolrCore core) {
LanguageComponentsLoader loader = LanguageComponents.loader();
List<Path> resourceLocations = new ArrayList<>();
Path configDir = Paths.get(core.getResourceLoader().getConfigDir());
if (resourcesPath != null && !resourcesPath.trim().isEmpty()) {
configDir = configDir.resolve(resourcesPath);
resourceLocations.add(configDir);
}
if (!resourceLocations.isEmpty()) {
log.info(
"Clustering algorithm resources first looked up relative to: {}", resourceLocations);
loader.withResourceLookup(
(provider) ->
new ChainedResourceLookup(
Arrays.asList(
new PathResourceLookup(resourceLocations),
provider.defaultResourceLookup())));
} else {
log.info("Resources read from defaults (JARs).");
}
ClassLoader classLoader = getClass().getClassLoader();
algorithmProviders =
ServiceLoader.load(ClusteringAlgorithmProvider.class, classLoader)
.stream()
.map(ServiceLoader.Provider::get)
.collect(Collectors.toMap(ClusteringAlgorithmProvider::name, e -> e));
// Only load the resources of algorithms we're interested in.
loader.limitToAlgorithms(
algorithmProviders.values().stream()
.map(Supplier::get)
.toArray(ClusteringAlgorithm[]::new));
languages = new LinkedHashMap<>();
try {
LoadedLanguages loadedLanguages = loader.load();
for (String lang : loadedLanguages.languages()) {
languages.put(lang, loadedLanguages.language(lang));
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
// Debug info about loaded languages.
for (String lang : languages.keySet()) {
if (log.isTraceEnabled()) {
log.trace(
"Loaded language '{}' with components:\n - {}",
lang,
languages.get(lang).components().stream()
.map(Class::getSimpleName)
.collect(Collectors.joining("\n - ")));
}
}
// Remove algorithms that support none of the loaded languages.
algorithmProviders
.entrySet()
.removeIf(e -> !isAlgorithmAvailable(e.getValue(), languages.values()));
algorithmProviders.forEach(
(name, prov) -> {
String supportedLanguages =
languages.values().stream()
.filter(lc -> prov.get().supports(lc))
.map(LanguageComponents::language)
.collect(Collectors.joining(", "));
log.info(
"Clustering algorithm {} loaded with support for the following languages: {}",
name,
supportedLanguages);
});
}
ClusteringAlgorithm getAlgorithm(String algorithmName) {
if (!algorithmProviders.containsKey(algorithmName)
&& aliasedNames.containsKey(algorithmName)) {
algorithmName = aliasedNames.get(algorithmName);
}
ClusteringAlgorithmProvider provider = algorithmProviders.get(algorithmName);
return provider == null ? null : provider.get();
}
LanguageComponents getLanguage(String language) {
return languages.get(language);
}
boolean isLanguageSupported(String language) {
return languages.containsKey(language);
}
private boolean isAlgorithmAvailable(
ClusteringAlgorithmProvider provider, Collection<LanguageComponents> languages) {
ClusteringAlgorithm algorithm = provider.get();
Optional<LanguageComponents> first = languages.stream().filter(algorithm::supports).findFirst();
if (first.isEmpty()) {
log.warn("Algorithm does not support any of the available languages: {}", provider.name());
return false;
} else {
return true;
}
}
}
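Algorithm discovery above relies on the standard java.util.ServiceLoader
mechanism: each Carrot2 algorithm ships a ClusteringAlgorithmProvider
registration. A minimal sketch of the same lookup (assumes carrot2-core on
the classpath; the printed names depend on the algorithms present):

    import java.util.ServiceLoader;
    import org.carrot2.clustering.ClusteringAlgorithmProvider;

    public class AlgorithmDiscoverySketch {
      public static void main(String[] args) {
        // Lists provider names such as "Lingo" or "STC", depending on
        // which algorithm JARs are on the classpath.
        ServiceLoader.load(ClusteringAlgorithmProvider.class)
            .forEach(provider -> System.out.println(provider.name()));
      }
    }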

View File

@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SchemaField;
import java.util.function.Supplier;
/**
* Parses a single clustering engine's configuration from its
* initialization parameters.
*/
final class EngineEntry implements Supplier<Engine> {
/**
* Marks the engine as optional (if unavailable).
*/
private static final String PARAM_OPTIONAL = "optional";
/**
* Unique engine name parameter.
*/
private static final String PARAM_NAME = "name";
final boolean optional;
final String engineName;
final EngineParameters defaults;
/**
* Preinitialized instance of a clustering engine.
*/
private Engine engine;
/**
* {@code true} if the engine has been initialized properly and is available.
*/
private boolean available;
EngineEntry(SolrParams params) {
this.optional = params.getBool(PARAM_OPTIONAL, false);
this.engineName = params.get(PARAM_NAME, "");
defaults = new EngineParameters(params);
}
boolean initialize(SolrCore core) {
SchemaField uniqueField = core.getLatestSchema().getUniqueKeyField();
if (uniqueField == null) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
ClusteringComponent.class.getSimpleName() + " requires the declaration of uniqueKeyField in the schema.");
}
String docIdField = uniqueField.getName();
defaults.setDocIdField(docIdField);
engine = new Engine();
available = engine.init(engineName, core, defaults);
return available;
}
@Override
public Engine get() {
return engine;
}
}

View File

@@ -0,0 +1,353 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.params.SolrParams;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;
/**
* {@link Engine} configuration parameters (and other parameters that
* may tweak clustering algorithms on a per-request basis).
*
* @lucene.experimental
*/
public final class EngineParameters implements Cloneable {
/**
* Common prefix for configuration of engine settings.
*/
private static final String PARAM_PREFIX = "clustering.";
/**
* @see #algorithmName()
*/
public static final String PARAM_ALGORITHM = PARAM_PREFIX + "algorithm";
/**
* @see #maxLabels()
*/
public static final String PARAM_MAX_LABELS = PARAM_PREFIX + "maxLabels";
/**
* @see #includeSubclusters()
*/
public static final String PARAM_INCLUDE_SUBCLUSTERS = PARAM_PREFIX + "includeSubclusters";
/**
* @see #includeOtherTopics()
*/
public static final String PARAM_INCLUDE_OTHER_TOPICS = PARAM_PREFIX + "includeOtherTopics";
/**
* @see #language()
*/
public static final String PARAM_LANGUAGE = PARAM_PREFIX + "language";
/**
* @see #languageField()
*/
public static final String PARAM_LANGUAGE_FIELD = PARAM_PREFIX + "languageField";
/**
* @see #resources()
*/
public static final String PARAM_RESOURCES = PARAM_PREFIX + "resources";
/**
* @see #fields()
*/
public static final String PARAM_FIELDS = PARAM_PREFIX + "fields";
/**
* @see #preferQueryContext()
*/
public static final String PARAM_PREFER_QUERY_CONTEXT = PARAM_PREFIX + "preferQueryContext";
/**
* @see #contextSize()
*/
public static final String PARAM_CONTEXT_SIZE = PARAM_PREFIX + "contextSize";
/**
* @see #contextCount()
*/
public static final String PARAM_CONTEXT_COUNT = PARAM_PREFIX + "contextCount";
/**
* @see #PARAM_MAX_LABELS
*/
private int maxLabels = Integer.MAX_VALUE;
/**
* @see #PARAM_INCLUDE_SUBCLUSTERS
*/
private boolean includeSubclusters = true;
/**
* @see #PARAM_INCLUDE_OTHER_TOPICS
*/
private boolean includeOtherTopics = true;
/**
* @see #PARAM_ALGORITHM
*/
private String algorithmName;
/**
* @see #PARAM_RESOURCES
*/
private String resources;
/**
* @see #PARAM_LANGUAGE
*/
private String language = "English";
/**
* @see #PARAM_LANGUAGE_FIELD
*/
private String languageField;
/**
* @see #PARAM_PREFER_QUERY_CONTEXT
*/
private boolean preferQueryContext;
/**
* @see #PARAM_CONTEXT_SIZE
*/
private int contextSize = 80 * 4;
/**
* @see #PARAM_CONTEXT_COUNT
*/
private int contextCount = 3;
/**
* @see #PARAM_FIELDS
*/
private LinkedHashSet<String> fields = new LinkedHashSet<>();
/**
* Non-engine configuration parameters (algorithm parameters).
*/
private LinkedHashMap<String, String> otherParameters = new LinkedHashMap<>();
/**
* Unique-value document identifier field. This is required for clustering since clusters
* only reference documents by their ID field's value.
*/
private String docIdField;
EngineParameters(SolrParams params) {
extractFrom(params);
}
/**
* Extract parameter values from the given {@link SolrParams}.
*/
private EngineParameters extractFrom(SolrParams params) {
params.stream().forEachOrdered(e -> {
switch (e.getKey()) {
case PARAM_MAX_LABELS:
maxLabels = params.getInt(PARAM_MAX_LABELS);
break;
case PARAM_INCLUDE_SUBCLUSTERS:
includeSubclusters = params.getBool(PARAM_INCLUDE_SUBCLUSTERS);
break;
case PARAM_INCLUDE_OTHER_TOPICS:
includeOtherTopics = params.getBool(PARAM_INCLUDE_OTHER_TOPICS);
break;
case PARAM_ALGORITHM:
algorithmName = params.get(PARAM_ALGORITHM);
break;
case PARAM_RESOURCES:
resources = params.get(PARAM_RESOURCES);
break;
case PARAM_LANGUAGE:
language = params.get(PARAM_LANGUAGE);
break;
case PARAM_LANGUAGE_FIELD:
languageField = params.get(PARAM_LANGUAGE_FIELD);
break;
case PARAM_PREFER_QUERY_CONTEXT:
preferQueryContext = params.getBool(PARAM_PREFER_QUERY_CONTEXT);
break;
case PARAM_CONTEXT_COUNT:
contextCount = params.getPrimitiveInt(PARAM_CONTEXT_COUNT);
break;
case PARAM_CONTEXT_SIZE:
contextSize = params.getPrimitiveInt(PARAM_CONTEXT_SIZE);
break;
case PARAM_FIELDS:
fields.addAll(Arrays.asList(params.get(PARAM_FIELDS).split("[,\\s]+")));
break;
default:
// Unrecognized parameter. Preserve it.
String[] value = e.getValue();
if (value != null) {
if (value.length == 1) {
otherParameters.put(e.getKey(), value[0]);
} else {
otherParameters.put(e.getKey(), String.join(", ", value));
}
}
break;
}
});
return this;
}
/**
* @return Maximum number of returned cluster labels (even if the algorithm
* returns more).
*/
int maxLabels() {
return maxLabels;
}
/**
* @return If {@code true}, include subclusters in response (if the algorithm
* produces hierarchical clustering).
*/
boolean includeSubclusters() {
return includeSubclusters;
}
/**
* @return If {@code true}, include a synthetic cluster called "Other Topics" that
* consists of all documents not assigned to any other cluster.
*/
boolean includeOtherTopics() {
return includeOtherTopics;
}
/**
* @return Name of the clustering algorithm to use (as loaded via the service
* extension point {@link org.carrot2.clustering.ClusteringAlgorithm}).
*/
String algorithmName() {
return algorithmName;
}
/**
* @return The language resources path, relative to the Solr component's configuration.
*/
String resources() {
return resources;
}
/**
* @return Name of the default language to use for clustering. The corresponding
* {@link org.carrot2.language.LanguageComponents} must be available (loaded via
* service provider extension).
*/
String language() {
return language;
}
/**
* @return Name of the field that carries each document's language. {@code null} value
* means all documents will be clustered according to the default {@link #language()}.
* If not {@code null} and the document's field has a missing value, it will be clustered
* using the default {@link #language()} as well.
*/
String languageField() {
return languageField;
}
/**
* @return Names of all fields whose textual content will be passed to the clustering engine.
* Comma or space separated.
*/
Set<String> fields() {
return fields;
}
/**
* @return Returns {@code true} if clustering should try to extract context fragments
* around the matching query regions rather than use full field content. Such context snippets
* typically cluster well because they carry more compact, query-related information.
*/
boolean preferQueryContext() {
return preferQueryContext;
}
/**
* @return Returns the maximum query context window to use if {@link #preferQueryContext()} is {@code true}.
*/
int contextSize() {
return contextSize;
}
/**
* @return Returns the maximum number of different, non-contiguous query context snippets from a single field
* if {@link #preferQueryContext()} is {@code true}.
*/
int contextCount() {
return contextCount;
}
LinkedHashMap<String, String> otherParameters() {
return otherParameters;
}
@Override
protected EngineParameters clone() {
try {
EngineParameters clone = (EngineParameters) super.clone();
clone.otherParameters = new LinkedHashMap<>(this.otherParameters);
clone.fields = new LinkedHashSet<>(this.fields);
return clone;
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
}
/**
* @return Return a copy of this object with any parameters present in
* {@code params} overriding its defaults.
*/
EngineParameters derivedFrom(SolrParams params) {
EngineParameters cloned = this.clone();
cloned.extractFrom(params);
return cloned;
}
String docIdField() {
return Objects.requireNonNull(docIdField);
}
void setDocIdField(String docIdField) {
this.docIdField = Objects.requireNonNull(docIdField);
}
Set<String> getFieldsToLoad() {
Set<String> fields = new LinkedHashSet<>(fields());
fields.add(docIdField());
String languageField = languageField();
if (StringUtils.isNotBlank(languageField)) {
fields.add(languageField);
}
return fields;
}
}
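For illustration only, a minimal sketch of how per-request parameters translate into an EngineParameters instance. The example class is hypothetical and assumes it lives in the same org.apache.solr.handler.clustering package (the constructor is package-private); ModifiableSolrParams is Solr's standard mutable SolrParams implementation.

import org.apache.solr.common.params.ModifiableSolrParams;

class EngineParametersExample {
  static EngineParameters example() {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(EngineParameters.PARAM_ALGORITHM, "Lingo");        // clustering.algorithm
    params.set(EngineParameters.PARAM_FIELDS, "title, snippet");  // clustering.fields
    params.set(EngineParameters.PARAM_MAX_LABELS, 5);             // clustering.maxLabels
    // Keys not recognized above are preserved verbatim in otherParameters()
    // and later passed on to the clustering algorithm itself:
    params.set("desiredClusterCount", "10");
    return new EngineParameters(params);
  }
}

Engine-level defaults configured in solrconfig.xml can then be combined with per-request overrides via derivedFrom(SolrParams), which clones the defaults before applying the request's values.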

View File

@@ -0,0 +1,194 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.attrs.AcceptingVisitor;
import org.carrot2.attrs.AliasMapper;
import org.carrot2.attrs.AttrBoolean;
import org.carrot2.attrs.AttrDouble;
import org.carrot2.attrs.AttrEnum;
import org.carrot2.attrs.AttrInteger;
import org.carrot2.attrs.AttrObject;
import org.carrot2.attrs.AttrObjectArray;
import org.carrot2.attrs.AttrString;
import org.carrot2.attrs.AttrStringArray;
import org.carrot2.attrs.AttrVisitor;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Objects;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Function;
/**
* {@link AttrVisitor} that responds to "flattened" key paths and values, updating
* corresponding algorithm parameters with values contained in the map.
*/
class FlatKeysAttrVisitor implements AttrVisitor {
final Function<String, Object> classToInstance = AliasMapper.SPI_DEFAULTS::fromName;
final ArrayDeque<String> keyPath = new ArrayDeque<>();
final LinkedHashMap<String, String> attrs;
/**
* @param attrs A map of attributes to set. Note the map has ordered keys:
* this is required for complex sub-types so that instantiation of
* a value precedes setting its attributes.
*/
FlatKeysAttrVisitor(LinkedHashMap<String, String> attrs) {
this.attrs = attrs;
}
@Override
public void visit(String key, AttrBoolean attr) {
ifKeyExists(key, (path, value) -> {
attr.set(value == null ? null : Boolean.parseBoolean(value));
});
}
@Override
public void visit(String key, AttrInteger attr) {
ifKeyExists(key, (path, value) -> {
attr.set(value == null ? null : Integer.parseInt(value));
});
}
@Override
public void visit(String key, AttrDouble attr) {
ifKeyExists(key, (path, value) -> {
attr.set(value == null ? null : Double.parseDouble(value));
});
}
@Override
public void visit(String key, AttrString attr) {
ifKeyExists(key, (path, value) -> {
attr.set(value);
});
}
@Override
public void visit(String key, AttrStringArray attr) {
ifKeyExists(key, (path, value) -> {
if (value == null) {
attr.set(new String[0]);
} else {
attr.set(value.split(",\\s*"));
}
});
}
@Override
public <T extends Enum<T>> void visit(String key, AttrEnum<T> attr) {
ifKeyExists(key, (path, value) -> {
try {
attr.set(Enum.valueOf(attr.enumClass(), value));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Value at key '%s' should be an enum constant of class '%s', but no such " +
"constant exists: '%s' (available constants: %s)",
key,
attr.enumClass().getSimpleName(),
toDebugString(value),
EnumSet.allOf(attr.enumClass())));
}
});
}
@Override
public <T extends AcceptingVisitor> void visit(String key, AttrObject<T> attr) {
ifKeyExists(key, (path, value) -> {
if (value == null) {
attr.set(null);
} else {
T t = safeCast(classToInstance.apply(value), key, attr.getInterfaceClass());
attr.set(t);
}
});
T t = attr.get();
if (t != null) {
withKey(key, path -> {
t.accept(this);
});
}
}
@Override
public <T extends AcceptingVisitor> void visit(String key, AttrObjectArray<T> attr) {
ifKeyExists(key, (path, value) -> {
throw new RuntimeException("Setting arrays of objects not implemented for attribute: "
+ key + " (" + attr.getDescription() + ")");
});
}
private <T> T safeCast(Object value, String key, Class<T> clazz) {
if (value == null) {
return null;
} else {
if (!clazz.isInstance(value)) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Value at key '%s' should be an instance of '%s', but encountered class '%s': '%s'",
key,
clazz.getSimpleName(),
value.getClass().getSimpleName(),
toDebugString(value)));
}
return clazz.cast(value);
}
}
private String toDebugString(Object value) {
if (value == null) {
return "[null]";
} else if (value instanceof Object[]) {
return Arrays.deepToString(((Object[]) value));
} else {
return Objects.toString(value);
}
}
private void withKey(String key, Consumer<String> pathConsumer) {
keyPath.addLast(key);
try {
String path = String.join(".", keyPath);
pathConsumer.accept(path);
} finally {
keyPath.removeLast();
}
}
private void ifKeyExists(String key, BiConsumer<String, String> pathConsumer) {
withKey(key, (path) -> {
if (attrs.containsKey(path)) {
String value = attrs.get(path);
if (value.trim().isEmpty()) {
value = null;
}
pathConsumer.accept(path, value);
}
});
}
}
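A hypothetical usage sketch (same-package access assumed; the attribute paths shown are illustrative and depend on the algorithm's actual attribute tree in Carrot2 4.x). The algorithm walks its own attributes and the visitor reacts only to the flattened paths present in the map.

import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import java.util.LinkedHashMap;

class FlatKeysAttrVisitorExample {
  static void example() {
    LinkedHashMap<String, String> attrs = new LinkedHashMap<>();
    attrs.put("desiredClusterCount", "15");          // a top-level attribute
    attrs.put("preprocessing.wordDfThreshold", "2"); // a nested attribute path
    LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();
    // The visitor is driven by the algorithm's attribute tree; nested
    // AttrObject values push their key onto keyPath, so the flat keys
    // above can address attributes at any depth.
    algorithm.accept(new FlatKeysAttrVisitor(attrs));
  }
}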

View File

@@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.clustering.Document;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;
/**
* Representation of a single logical "document" for clustering.
*/
final class InputDocument implements Document {
private final Object id;
private final Map<String, String> clusteredFields = new LinkedHashMap<>();
private final String language;
InputDocument(Object docId, String language) {
this.id = Objects.requireNonNull(docId);
this.language = language;
}
@Override
public void visitFields(BiConsumer<String, String> fieldConsumer) {
clusteredFields.forEach(fieldConsumer);
}
Object getId() {
return id;
}
String language() {
return language;
}
void addClusteredField(String fieldName, String fieldValue) {
assert !clusteredFields.containsKey(fieldName);
clusteredFields.put(fieldName, fieldValue);
}
@Override
public String toString() {
return String.format(Locale.ROOT,
"doc[%s, lang=%s, fields=%s]",
getId(),
language,
clusteredFields.entrySet().stream().map(e -> e.getKey() + ": " + e.getValue()).collect(Collectors.joining(", ")));
}
}
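A minimal sketch of assembling clustering input (hypothetical; assumes the same package, since the class is package-private):

class InputDocumentExample {
  static void example() {
    InputDocument doc = new InputDocument("doc-1", "English");
    doc.addClusteredField("title", "Solr clustering");
    doc.addClusteredField("snippet", "Search results clustering with Carrot2.");
    // Carrot2 consumes the text through the Document interface:
    doc.visitFields((field, value) -> System.out.println(field + " -> " + value));
  }
}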

View File

@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.util.ResourceLookup;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
/**
* Carrot2 resource provider from the provided list of filesystem paths.
*/
final class PathResourceLookup implements ResourceLookup {
private final List<Path> locations;
PathResourceLookup(List<Path> locations) {
if (locations == null || locations.isEmpty()) {
throw new RuntimeException("At least one resource location is required.");
}
this.locations = locations;
}
@Override
public InputStream open(String resource) throws IOException {
Path p = locate(resource);
if (p == null) {
throw new IOException(
"Resource "
+ resource
+ " not found relative to: "
+ locations.stream()
.map(path -> path.toAbsolutePath().toString())
.collect(Collectors.joining(", ")));
}
return new BufferedInputStream(Files.newInputStream(p));
}
@Override
public boolean exists(String resource) {
return locate(resource) != null;
}
@Override
public String pathOf(String resource) {
return "["
+ locations.stream()
.map(path -> path.resolve(resource).toAbsolutePath().toString())
.collect(Collectors.joining(" | "))
+ "]";
}
private Path locate(String resource) {
for (Path base : locations) {
Path p = base.resolve(resource);
if (Files.exists(p)) {
return p;
}
}
return null;
}
}
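A minimal sketch of resolving resources against an ordered list of directories (the paths and the resource name are illustrative; same-package access assumed):

import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;

class PathResourceLookupExample {
  static void example() throws Exception {
    List<Path> locations = Arrays.asList(
        Paths.get("conf", "clustering", "resources"), // consulted first
        Paths.get("defaults", "clustering"));         // fallback
    PathResourceLookup lookup = new PathResourceLookup(locations);
    if (lookup.exists("english.stopwords.utf8")) {
      try (InputStream is = lookup.open("english.stopwords.utf8")) {
        // read the resource ...
      }
    }
  }
}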

View File

@@ -1,52 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
/**
* Base class for clustering engines performing cluster analysis on search
* results.
*
* @lucene.experimental
*/
public abstract class SearchClusteringEngine extends ClusteringEngine {
/**
* Do the clustering, return a clusters structure to be appended to
* {@link SolrQueryResponse}.
*/
public abstract Object cluster(Query query, SolrDocumentList solrDocumentList,
Map<SolrDocument,Integer> docIds, SolrQueryRequest sreq);
/**
* Returns the set of field names to load.
* Concrete classes can override this method if needed.
* Default implementation returns null, that is, all stored fields are loaded.
*
* @return The set of field names to load.
*/
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
return null;
}
}

View File

@@ -1,565 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.function.Supplier;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHits;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.SuppressForbidden;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.shaded.guava.common.base.MoreObjects;
import org.carrot2.shaded.guava.common.base.Strings;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
import org.carrot2.util.attribute.AttributeValueSet;
import org.carrot2.util.attribute.AttributeValueSets;
import org.carrot2.util.resource.ClassLoaderLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
*
* @see "http://project.carrot2.org"
* @lucene.experimental
*/
public class CarrotClusteringEngine extends SearchClusteringEngine {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/**
* The subdirectory in Solr config dir to read customized Carrot2 resources from.
*/
static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
/**
* Name of Carrot2 document's field containing Solr document's identifier.
*/
private static final String SOLR_DOCUMENT_ID = "solrId";
/**
* Name of Solr document's field containing the document's identifier. To avoid
* repeating the content of documents in clusters on output, each cluster contains
* identifiers of documents it contains.
*/
private String idFieldName;
/**
* Carrot2 controller that manages instances of clustering algorithms
*/
private Controller controller = ControllerFactory.createPooling();
/**
* {@link IClusteringAlgorithm} class used for actual clustering.
*/
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
/** Solr core we're bound to. */
private SolrCore core;
@Override
public boolean isAvailable() {
return clusteringAlgorithmClass != null;
}
@Override
@SuppressWarnings("rawtypes")
public String init(NamedList config, final SolrCore core) {
this.core = core;
String result = super.init(config, core);
final SolrParams initParams = config.toSolrParams();
// Initialization attributes for Carrot2 controller.
HashMap<String, Object> initAttributes = new HashMap<>();
// Customize Carrot2's resource lookup to first look for resources
// using Solr's resource loader. If that fails, try loading from the classpath.
ResourceLookup resourceLookup = new ResourceLookup(
// Solr-specific resource loading.
new SolrResourceLocator(core, initParams),
// Using the class loader directly because this time we want to omit the prefix
new ClassLoaderLocator(core.getResourceLoader().getClassLoader()));
DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
.resourceLookup(resourceLookup);
// Make sure the requested Carrot2 clustering algorithm class is available
String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
try {
this.clusteringAlgorithmClass = core.getResourceLoader().findClass(
carrotAlgorithmClassName, IClusteringAlgorithm.class);
} catch (SolrException s) {
if (!(s.getCause() instanceof ClassNotFoundException)) {
throw s;
}
}
// Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
// of this component. This by-name convention lookup is used to simplify configuring algorithms.
String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
if (log.isInfoEnabled()) {
log.info("Initializing Clustering Engine '{}'", MoreObjects.firstNonNull(componentName, "<no 'name' attribute>"));
}
if (!Strings.isNullOrEmpty(componentName)) {
IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
if (attributeXmls.length > 0) {
if (attributeXmls.length > 1) {
log.warn("More than one attribute file found, first one will be used: {}"
, Arrays.toString(attributeXmls)); // nowarn
}
withContextClassLoader(core.getResourceLoader().getClassLoader(), () -> {
try {
AttributeValueSets avs = AttributeValueSets.deserialize(attributeXmls[0].open());
AttributeValueSet defaultSet = avs.getDefaultAttributeValueSet();
initAttributes.putAll(defaultSet.getAttributeValues());
} catch (Exception e) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Could not read attributes XML for clustering component: " + componentName, e);
}
return null;
});
}
}
// Extract solrconfig attributes, they take precedence.
extractCarrotAttributes(initParams, initAttributes);
// Customize the stemmer and tokenizer factories. The implementations we provide here
// are included in the code base of Solr, so that it's possible to refactor
// the Lucene APIs the factories rely on if needed.
// Additionally, we set a custom lexical resource factory for Carrot2 that
// will use both Carrot2 default stop words as well as stop words from
// the StopFilter defined on the field.
final AttributeBuilder attributeBuilder = BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes);
attributeBuilder.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.TOKENIZER_FACTORY)) {
attributeBuilder.tokenizerFactory(LuceneCarrot2TokenizerFactory.class);
}
if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.STEMMER_FACTORY)) {
attributeBuilder.stemmerFactory(LuceneCarrot2StemmerFactory.class);
}
// Pass the schema (via the core) to SolrStopwordsCarrot2LexicalDataFactory.
initAttributes.put("solrCore", core);
// Carrot2 uses current thread's context class loader to get
// certain classes (e.g. custom tokenizer/stemmer) at initialization time.
// To make sure classes from contrib JARs are available,
// we swap the context class loader for the time of clustering.
withContextClassLoader(core.getResourceLoader().getClassLoader(), () -> this.controller.init(initAttributes));
SchemaField uniqueField = core.getLatestSchema().getUniqueKeyField();
if (uniqueField == null) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
CarrotClusteringEngine.class.getSimpleName() + " requires the schema to have a uniqueKeyField");
}
this.idFieldName = uniqueField.getName();
return result;
}
@Override
public Object cluster(Query query, SolrDocumentList solrDocList,
Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
try {
// Prepare attributes for Carrot2 clustering call
Map<String, Object> attributes = new HashMap<>();
List<Document> documents = getDocuments(solrDocList, docIds, query, sreq);
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
// Pass the fields on which clustering runs.
attributes.put("solrFieldNames", getFieldsForClustering(sreq));
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
// Perform clustering and convert to an output structure of clusters.
//
// Carrot2 uses current thread's context class loader to get
// certain classes (e.g. custom tokenizer/stemmer) at runtime.
// To make sure classes from contrib JARs are available,
// we swap the context class loader for the time of clustering.
return withContextClassLoader(core.getResourceLoader().getClassLoader(),
() -> clustersToNamedList(controller.process(attributes,
clusteringAlgorithmClass).getClusters(), sreq.getParams()));
} catch (Exception e) {
log.error("Carrot2 clustering failed", e);
throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
}
}
@Override
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
SolrParams solrParams = sreq.getParams();
HashSet<String> fields = new HashSet<>(getFieldsForClustering(sreq));
fields.add(idFieldName);
fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
fields.addAll(getCustomFieldsMap(solrParams).keySet());
String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME);
if (StringUtils.isNotBlank(languageField)) {
fields.add(languageField);
}
return fields;
}
/**
* Returns the names of fields that will be delivering the actual
* content for clustering. Currently, there are two such fields: document
* title and document content.
*/
private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
SolrParams solrParams = sreq.getParams();
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
if (StringUtils.isBlank(snippetFieldSpec)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
final Set<String> fields = new HashSet<>();
fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
return fields;
}
/**
* Prepares Carrot2 documents for clustering.
*/
private List<Document> getDocuments(SolrDocumentList solrDocList, Map<SolrDocument, Integer> docIds,
Query query, final SolrQueryRequest sreq) throws IOException {
SolrHighlighter highlighter = null;
SolrParams solrParams = sreq.getParams();
SolrCore core = sreq.getCore();
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);
// Maps Solr field names to Carrot2 custom field names
Map<String, String> customFields = getCustomFieldsMap(solrParams);
// Parse language code map string into a map
Map<String, String> languageCodeMap = new HashMap<>();
if (StringUtils.isNotBlank(languageField)) {
for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) {
final String[] split = pair.split(":");
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
languageCodeMap.put(split[0], split[1]);
} else {
log.warn("Unsupported format for {}: '{}'. Skipping this mapping."
, CarrotParams.LANGUAGE_CODE_MAP, pair);
}
}
}
// Get the documents
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
SolrQueryRequest req = null;
String[] snippetFieldAry = null;
if (produceSummary) {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
Map<String, Object> args = new HashMap<>();
snippetFieldAry = snippetFieldSpec.split("[, ]");
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
args.put(HighlightParams.SIMPLE_POST, "");
args.put(HighlightParams.FRAGSIZE, solrParams.getInt(CarrotParams.SUMMARY_FRAGSIZE, solrParams.getInt(HighlightParams.FRAGSIZE, 100)));
args.put(HighlightParams.SNIPPETS, solrParams.getInt(CarrotParams.SUMMARY_SNIPPETS, solrParams.getInt(HighlightParams.SNIPPETS, 1)));
req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
@Override
public SolrIndexSearcher getSearcher() {
return sreq.getSearcher();
}
};
} else {
log.warn("No highlighter configured, cannot produce summary");
produceSummary = false;
}
}
Iterator<SolrDocument> docsIter = solrDocList.iterator();
List<Document> result = new ArrayList<>(solrDocList.size());
float[] scores = {1.0f};
int[] docsHolder = new int[1];
Query theQuery = query;
while (docsIter.hasNext()) {
SolrDocument sdoc = docsIter.next();
String snippet = null;
// TODO: docIds will be null when running distributed search.
// See comment in ClusteringComponent#finishStage().
if (produceSummary && docIds != null) {
docsHolder[0] = docIds.get(sdoc).intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f, TotalHits.Relation.EQUAL_TO);
NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {
// should only be one value given our setup
// should only be one document
@SuppressWarnings("unchecked")
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
final StringBuilder sb = new StringBuilder();
for (int j = 0; j < snippetFieldAry.length; j++) {
// Join fragments with a period so that Carrot2 does not create
// cross-fragment phrases; such phrases rarely make sense.
String [] highlt = tmp.get(snippetFieldAry[j]);
if (highlt != null && highlt.length > 0) {
for (int i = 0; i < highlt.length; i++) {
sb.append(highlt[i]);
sb.append(" . ");
}
}
}
snippet = sb.toString();
}
}
// If summaries not enabled or summary generation failed, use full content.
if (snippet == null) {
snippet = getConcatenated(sdoc, snippetFieldSpec);
}
// Create a Carrot2 document
Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
snippet, Objects.toString(sdoc.getFieldValue(urlField), ""));
// Store Solr id of the document, we need it to map document instances
// found in clusters back to identifiers.
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
// Set language
if (StringUtils.isNotBlank(languageField)) {
Collection<Object> languages = sdoc.getFieldValues(languageField);
if (languages != null) {
// Use the first Carrot2-supported language
for (Object l : languages) {
String lang = Objects.toString(l, "");
if (languageCodeMap.containsKey(lang)) {
lang = languageCodeMap.get(lang);
}
// Language detection Library for Java uses dashes to separate
// language variants, such as 'zh-cn', but Carrot2 uses underscores.
if (lang.indexOf('-') > 0) {
lang = lang.replace('-', '_');
}
// If the language is supported by Carrot2, we'll get a non-null value
final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
if (carrot2Language != null) {
carrotDocument.setLanguage(carrot2Language);
break;
}
}
}
}
// Add custom fields
if (customFields != null) {
for (Entry<String, String> entry : customFields.entrySet()) {
carrotDocument.setField(entry.getValue(), sdoc.getFieldValue(entry.getKey()));
}
}
result.add(carrotDocument);
}
return result;
}
/**
* Expose clustering algorithm class for tests.
*/
Class<? extends IClusteringAlgorithm> getClusteringAlgorithmClass() {
return clusteringAlgorithmClass;
}
/**
* Prepares a map of Solr field names (keys) to the corresponding Carrot2
* custom field names.
*/
private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
Map<String, String> customFields = new HashMap<>();
String [] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
if (customFieldsSpec != null) {
customFields = new HashMap<>();
for (String customFieldSpec : customFieldsSpec) {
String [] split = customFieldSpec.split(":");
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
customFields.put(split[0], split[1]);
} else {
log.warn("Unsupported format for {}: '{}'. Skipping this field definition."
, CarrotParams.CUSTOM_FIELD_NAME, customFieldSpec);
}
}
}
return customFields;
}
private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
StringBuilder result = new StringBuilder();
for (String field : fieldsSpec.split("[, ]")) {
Collection<Object> vals = sdoc.getFieldValues(field);
if (vals == null) continue;
Iterator<Object> ite = vals.iterator();
while(ite.hasNext()){
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append(Objects.toString(ite.next(), "")).append(" . ");
}
}
return result.toString().trim();
}
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
SolrParams solrParams) {
List<NamedList<Object>> result = new ArrayList<>();
clustersToNamedList(carrotClusters, result, solrParams.getBool(
CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
return result;
}
private void clustersToNamedList(List<Cluster> outputClusters,
List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
for (Cluster outCluster : outputClusters) {
NamedList<Object> cluster = new SimpleOrderedMap<>();
parent.add(cluster);
// Add labels
List<String> labels = outCluster.getPhrases();
if (labels.size() > maxLabels) {
labels = labels.subList(0, maxLabels);
}
cluster.add("labels", labels);
// Add cluster score
final Double score = outCluster.getScore();
if (score != null) {
cluster.add("score", score);
}
// Add other topics marker
if (outCluster.isOtherTopics()) {
cluster.add("other-topics", outCluster.isOtherTopics());
}
// Add documents
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
List<Object> docList = new ArrayList<>();
cluster.add("docs", docList);
for (Document doc : docs) {
docList.add(doc.getField(SOLR_DOCUMENT_ID));
}
// Add subclusters
if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
List<NamedList<Object>> subclusters = new ArrayList<>();
cluster.add("clusters", subclusters);
clustersToNamedList(outCluster.getSubclusters(), subclusters,
outputSubClusters, maxLabels);
}
}
}
/**
* Extracts parameters that can possibly match some attributes of Carrot2 algorithms.
*/
private void extractCarrotAttributes(SolrParams solrParams,
Map<String, Object> attributes) {
// Extract all non-predefined parameters. This way, we'll be able to set all
// parameters of Carrot2 algorithms without defining their names as constants.
for (Iterator<String> paramNames = solrParams.getParameterNamesIterator(); paramNames
.hasNext();) {
String paramName = paramNames.next();
if (!CarrotParams.CARROT_PARAM_NAMES.contains(paramName)) {
attributes.put(paramName, solrParams.get(paramName));
}
}
}
@SuppressForbidden(reason = "Uses context class loader as a workaround to inject correct classloader to 3rd party libs")
private static <T> T withContextClassLoader(ClassLoader loader, Supplier<T> action) {
Thread ct = Thread.currentThread();
ClassLoader prev = ct.getContextClassLoader();
try {
ct.setContextClassLoader(loader);
return action.get();
} finally {
ct.setContextClassLoader(prev);
}
}
}

View File

@@ -1,73 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
/**
* Carrot2 parameter mapping (recognized and mapped if passed via Solr configuration).
* @lucene.experimental
*/
public final class CarrotParams {
private static String CARROT_PREFIX = "carrot.";
public static String ALGORITHM = CARROT_PREFIX + "algorithm";
public static String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
public static String URL_FIELD_NAME = CARROT_PREFIX + "url";
public static String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
public static String LANGUAGE_FIELD_NAME = CARROT_PREFIX + "lang";
public static String CUSTOM_FIELD_NAME = CARROT_PREFIX + "custom";
public static String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
public static String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
public static String SUMMARY_SNIPPETS = CARROT_PREFIX + "summarySnippets";
public static String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
public static String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
public static String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";
/**
* Points to Carrot<sup>2</sup> resources
*/
public static String RESOURCES_DIR = CARROT_PREFIX + "resourcesDir";
static final Set<String> CARROT_PARAM_NAMES = new HashSet<>(Arrays.asList(
ALGORITHM,
TITLE_FIELD_NAME,
URL_FIELD_NAME,
SNIPPET_FIELD_NAME,
LANGUAGE_FIELD_NAME,
CUSTOM_FIELD_NAME,
PRODUCE_SUMMARY,
SUMMARY_FRAGSIZE,
SUMMARY_SNIPPETS,
NUM_DESCRIPTIONS,
OUTPUT_SUB_CLUSTERS,
RESOURCES_DIR,
LANGUAGE_CODE_MAP));
/** No instances. */
private CarrotParams() {}
}

View File

@@ -1,246 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.lang.invoke.MethodHandles;
import java.nio.CharBuffer;
import java.util.HashMap;
import org.apache.lucene.analysis.ar.ArabicNormalizer;
import org.apache.lucene.analysis.ar.ArabicStemmer;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IStemmerFactory;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.DanishStemmer;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.EnglishStemmer;
import org.tartarus.snowball.ext.FinnishStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import org.tartarus.snowball.ext.GermanStemmer;
import org.tartarus.snowball.ext.HungarianStemmer;
import org.tartarus.snowball.ext.ItalianStemmer;
import org.tartarus.snowball.ext.NorwegianStemmer;
import org.tartarus.snowball.ext.PortugueseStemmer;
import org.tartarus.snowball.ext.RomanianStemmer;
import org.tartarus.snowball.ext.RussianStemmer;
import org.tartarus.snowball.ext.SpanishStemmer;
import org.tartarus.snowball.ext.SwedishStemmer;
import org.tartarus.snowball.ext.TurkishStemmer;
/**
* An implementation of Carrot2's {@link IStemmerFactory} based on Lucene's
* APIs. Should the relevant Lucene APIs need to change, the changes can be made
* in this class.
*
* @lucene.experimental
*/
public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
public IStemmer getStemmer(LanguageCode language) {
switch (language) {
case ARABIC:
return ArabicStemmerFactory.createStemmer();
case CHINESE_SIMPLIFIED:
return IdentityStemmer.INSTANCE;
default:
/*
* For other languages, try to use snowball's stemming.
*/
return SnowballStemmerFactory.createStemmer(language);
}
}
/**
* Factory of {@link IStemmer} implementations from the <code>snowball</code>
* project.
*/
private final static class SnowballStemmerFactory {
/**
* Static hard mapping from language codes to stemmer classes in Snowball.
* This mapping is not dynamic because we want to keep the possibility to
* obfuscate these classes.
*/
private static HashMap<LanguageCode, Class<? extends SnowballStemmer>> snowballStemmerClasses;
static {
snowballStemmerClasses = new HashMap<>();
snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
snowballStemmerClasses
.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
snowballStemmerClasses
.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
PortugueseStemmer.class);
snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
}
/**
* An adapter converting Snowball programs into {@link IStemmer} interface.
*/
private static class SnowballStemmerAdapter implements IStemmer {
private final SnowballStemmer snowballStemmer;
public SnowballStemmerAdapter(SnowballStemmer snowballStemmer) {
this.snowballStemmer = snowballStemmer;
}
@Override
public CharSequence stem(CharSequence word) {
snowballStemmer.setCurrent(word.toString());
if (snowballStemmer.stem()) {
return snowballStemmer.getCurrent();
} else {
return null;
}
}
}
/**
* Create and return an {@link IStemmer} adapter for a
* {@link SnowballStemmer} for a given language code. An identity stemmer is
* returned for unknown languages.
*/
public static IStemmer createStemmer(LanguageCode language) {
final Class<? extends SnowballStemmer> stemmerClazz = snowballStemmerClasses
.get(language);
if (stemmerClazz == null) {
log.warn("No Snowball stemmer class for: {}. "
+ "Quality of clustering may be degraded.", language.name());
return IdentityStemmer.INSTANCE;
}
try {
return new SnowballStemmerAdapter(stemmerClazz.getConstructor().newInstance());
} catch (Exception e) {
log.warn("Could not instantiate snowball stemmer for language: {}"
+ ". Quality of clustering may be degraded."
, language.name(), e);
return IdentityStemmer.INSTANCE;
}
}
}
/**
* Factory of {@link IStemmer} implementations for the
* {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
* to be present in classpath, otherwise an empty (identity) stemmer is
* returned.
*/
private static class ArabicStemmerFactory {
static {
try {
ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
} catch (ClassNotFoundException e) {
log
.warn(
"Could not instantiate Lucene stemmer for Arabic, clustering quality "
+ "of Arabic content may be degraded. For best quality clusters, "
+ "make sure Lucene's Arabic analyzer JAR is in the classpath",
e);
}
}
/**
* Adapter to lucene-contrib Arabic analyzers.
*/
private static class LuceneStemmerAdapter implements IStemmer {
private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
private char[] buffer = new char[0];
private LuceneStemmerAdapter() {
delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
}
@Override
public CharSequence stem(CharSequence word) {
if (word.length() > buffer.length) {
buffer = new char[word.length()];
}
for (int i = 0; i < word.length(); i++) {
buffer[i] = word.charAt(i);
}
int newLen = normalizer.normalize(buffer, word.length());
newLen = delegate.stem(buffer, newLen);
if (newLen != word.length() || !equals(buffer, newLen, word)) {
return CharBuffer.wrap(buffer, 0, newLen);
}
// Same-same.
return null;
}
private boolean equals(char[] buffer, int len, CharSequence word) {
assert len == word.length();
for (int i = 0; i < len; i++) {
if (buffer[i] != word.charAt(i))
return false;
}
return true;
}
}
public static IStemmer createStemmer() {
try {
return new LuceneStemmerAdapter();
} catch (Exception e) {
return IdentityStemmer.INSTANCE;
}
}
}
/**
* An implementation of {@link IStemmer} that always returns <code>null</code>
* which means no stemming.
*/
private static class IdentityStemmer implements IStemmer {
private final static IdentityStemmer INSTANCE = new IdentityStemmer();
@Override
public CharSequence stem(CharSequence word) {
return null;
}
}
}

View File

@@ -1,167 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* An implementation of Carrot2's {@link ITokenizerFactory} based on Lucene's
* Smart Chinese tokenizer. If Smart Chinese tokenizer is not available in
* classpath at runtime, the default Carrot2's tokenizer is used. Should the
* Lucene APIs need to change, the changes can be made in this class.
*
* @lucene.experimental
*/
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@Override
public ITokenizer getTokenizer(LanguageCode language) {
switch (language) {
case CHINESE_SIMPLIFIED:
return ChineseTokenizerFactory.createTokenizer();
/*
* We use our own analyzer for Arabic. Lucene's version has special
* support for Nonspacing-Mark characters (see
* http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
* have them included as letters in the parser.
*/
case ARABIC:
// Intentional fall-through.
default:
return new ExtendedWhitespaceTokenizer();
}
}
/**
* Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
* {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
* factory will fall back to the default white space tokenizer.
*/
private static final class ChineseTokenizerFactory {
static {
try {
ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
} catch (Throwable e) {
log
.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+ "of Chinese content may be degraded. For best quality clusters, "
+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
if (e instanceof Error) {
throw (Error) e;
}
}
}
static ITokenizer createTokenizer() {
try {
return new ChineseTokenizer();
} catch (Throwable e) {
if (e instanceof OutOfMemoryError) {
throw (OutOfMemoryError) e;
}
return new ExtendedWhitespaceTokenizer();
}
}
private final static class ChineseTokenizer implements ITokenizer {
private final static Pattern numeric = Pattern
.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
private Tokenizer sentenceTokenizer;
private TokenStream wordTokenFilter;
private CharTermAttribute term = null;
private final MutableCharArray tempCharSequence;
private final Class<?> tokenFilterClass;
private ChineseTokenizer() throws Exception {
this.tempCharSequence = new MutableCharArray(new char[0]);
// As Smart Chinese is not available during compile time,
// we need to resort to reflection.
final Class<?> tokenizerClass = ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
Reader.class).newInstance((Reader) null);
this.tokenFilterClass = ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
}
@Override
public short nextToken() throws IOException {
final boolean hasNextToken = wordTokenFilter.incrementToken();
if (hasNextToken) {
short flags = 0;
final char[] image = term.buffer();
final int length = term.length();
tempCharSequence.reset(image, 0, length);
if (length == 1 && image[0] == ',') {
// ChineseTokenizer seems to convert all punctuation to ','
// characters
flags = ITokenizer.TT_PUNCTUATION;
} else if (numeric.matcher(tempCharSequence).matches()) {
flags = ITokenizer.TT_NUMERIC;
} else {
flags = ITokenizer.TT_TERM;
}
return flags;
}
return ITokenizer.TT_EOF;
}
@Override
public void setTermBuffer(MutableCharArray array) {
array.reset(term.buffer(), 0, term.length());
}
@Override
public void reset(Reader input) {
try {
sentenceTokenizer.setReader(input);
wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
TokenStream.class).newInstance(sentenceTokenizer);
term = wordTokenFilter.addAttribute(CharTermAttribute.class);
} catch (Exception e) {
throw ExceptionUtils.wrapAsRuntimeException(e);
}
}
}
}
}

View File

@@ -1,142 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.IResourceLocator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A {@link IResourceLocator} that delegates resource searches to {@link SolrCore}.
*
* @lucene.experimental
*/
class SolrResourceLocator implements IResourceLocator {
private final SolrResourceLoader resourceLoader;
private final String carrot2ResourcesDir;
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
public SolrResourceLocator(SolrCore core, SolrParams initParams) {
resourceLoader = core.getResourceLoader();
String resourcesDir = initParams.get(CarrotParams.RESOURCES_DIR);
carrot2ResourcesDir = firstNonNull(resourcesDir, CarrotClusteringEngine.CARROT_RESOURCES_PREFIX);
}
@SuppressWarnings("unchecked")
public static <T> T firstNonNull(T... args) {
for (T t : args) {
if (t != null) return t;
}
throw new NullPointerException("At least one element has to be non-null.");
}
@Override
public IResource[] getAll(final String resource) {
final String resourceName = carrot2ResourcesDir + "/" + resource;
log.debug("Looking for Solr resource: {}", resourceName);
InputStream resourceStream = null;
final byte [] asBytes;
try {
resourceStream = resourceLoader.openResource(resourceName);
asBytes = IOUtils.toByteArray(resourceStream);
} catch (IOException e) {
log.debug("Resource not found in Solr's config: {}. Using the default {} from Carrot JAR."
, resourceName, resource);
return new IResource[] {};
} finally {
if (resourceStream != null) {
try {
resourceStream.close();
} catch (IOException e) {
// ignore.
}
}
}
log.info("Loaded Solr resource: {}", resourceName);
final IResource foundResource = new IResource() {
@Override
public InputStream open() {
return new ByteArrayInputStream(asBytes);
}
@Override
public int hashCode() {
// In case multiple resources are found they will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.hashCode();
}
@Override
public boolean equals(Object obj) {
// In case multiple resources are found they will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.equals(obj);
}
@Override
public String toString() {
return "Solr config resource: " + resourceName;
}
};
return new IResource[] { foundResource };
}
@Override
public int hashCode() {
// In case multiple locations are used locators will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.hashCode();
}
@Override
public boolean equals(Object obj) {
// In case multiple locations are used locators will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.equals(obj);
}
@Override
public String toString() {
String configDir = "";
try {
configDir = "configDir=" + new File(resourceLoader.getConfigDir()).getAbsolutePath() + ", ";
} catch (Exception ignored) {
// If we get the exception, the resource loader implementation
// probably does not support getConfigDir(). Not a big problem.
}
return "SolrResourceLocator, " + configDir
+ "Carrot2 relative lexicalResourcesDir=" + carrot2ResourcesDir;
}
}
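For reference, a minimal usage sketch of the locator above (assuming an already-initialized SolrCore `core` and component init parameters `params`; the resource name is hypothetical):

    SolrResourceLocator locator = new SolrResourceLocator(core, params);
    // Either a single in-memory copy of the resource, or an empty array.
    IResource[] candidates = locator.getAll("stopwords.en");
    if (candidates.length > 0) {
      try (InputStream is = candidates[0].open()) {
        // The bytes were buffered eagerly in getAll(), so this stream
        // reads from memory rather than the original config file.
      }
    }

Note that getAll() returns an empty array instead of throwing when the resource is missing, which lets Carrot2 fall back to the defaults bundled in its own JAR.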


@ -1,140 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
/**
* An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
* words from a field's StopFilter to the default stop words used in Carrot2,
* for all languages Carrot2 supports. Completely replacing Carrot2 stop words
* with Solr's wouldn't make much sense because clustering needs more aggressive
 * stop word removal. In other words, if something is a stop word during
* indexing, then it should also be a stop word during clustering, but not the
* other way round.
*
* @lucene.experimental
*/
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFactory {
@Init
@Input
@Attribute(key = "solrCore")
public SolrCore core;
@Processing
@Input
@Attribute(key = "solrFieldNames")
public Set<String> fieldNames;
/**
* A lazily-built cache of stop words per field.
*/
private HashMap<String, List<CharArraySet>> solrStopWords = new HashMap<>();
/**
* Carrot2's default lexical resources to use in addition to Solr's stop
* words.
*/
public DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
/**
* Obtains stop words for a field from the associated
* {@link StopFilterFactory}, if any.
*/
private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
// Carrot2 does not use an instance of this class from multiple threads
// at a time, but synchronize on the cache map defensively.
synchronized (solrStopWords) {
if (!solrStopWords.containsKey(fieldName)) {
solrStopWords.put(fieldName, new ArrayList<>());
IndexSchema schema = core.getLatestSchema();
final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
if (fieldAnalyzer instanceof TokenizerChain) {
final TokenFilterFactory[] filterFactories =
((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
for (TokenFilterFactory factory : filterFactories) {
if (factory instanceof StopFilterFactory) {
// StopFilterFactory holds the stop words in a CharArraySet
CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords();
solrStopWords.get(fieldName).add(stopWords);
}
if (factory instanceof CommonGramsFilterFactory) {
CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords();
solrStopWords.get(fieldName).add(commonWords);
}
}
}
}
return solrStopWords.get(fieldName);
}
}
@Override
public ILexicalData getLexicalData(LanguageCode languageCode) {
final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
.getLexicalData(languageCode);
return new ILexicalData() {
@Override
public boolean isStopLabel(CharSequence word) {
// Nothing in Solr maps to the concept of a stop label,
// so return Carrot2's default here.
return carrot2LexicalData.isStopLabel(word);
}
@Override
public boolean isCommonWord(MutableCharArray word) {
// Loop over the fields involved in clustering first
for (String fieldName : fieldNames) {
for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
if (stopWords.contains(word)) {
return true;
}
}
}
// Check default Carrot2 stop words too
return carrot2LexicalData.isCommonWord(word);
}
};
}
}
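A rough sketch of how Carrot2 consults this factory; in real use the `core` and `fieldNames` fields are injected through Carrot2's attribute binding, so the direct assignments below are purely illustrative:

    SolrStopwordsCarrot2LexicalDataFactory factory =
        new SolrStopwordsCarrot2LexicalDataFactory();
    factory.core = core;                              // normally bound via the "solrCore" attribute
    factory.fieldNames = Set.of("title", "snippet");  // normally bound via the "solrFieldNames" attribute
    ILexicalData lexicalData = factory.getLexicalData(LanguageCode.ENGLISH);
    // True if the word is a stop word in any clustered field's StopFilter
    // (or CommonGramsFilter), or one of Carrot2's default English stop words.
    boolean common = lexicalData.isCommonWord(new MutableCharArray("solrownstopword"));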


@ -16,8 +16,14 @@
 */
/**
 * A {@link org.apache.solr.handler.component.SearchComponent} for dynamic,
 * unsupervised grouping of search results based on the content of their text
 * fields or contextual snippets around query-matching regions.
 *
 * <p>
 * The default implementation uses clustering algorithms from the
 * <a href="https://project.carrot2.org">Carrot<sup>2</sup> project</a>.
 */
package org.apache.solr.handler.clustering;
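As a hypothetical illustration of how a client would request clustered results from a handler that lists this component (the parameter and engine names follow the test configuration later in this commit and should be read as assumptions, not a reference):

    // SolrJ sketch: query a handler that has the clustering component enabled.
    SolrQuery query = new SolrQuery("data mining");
    query.set("clustering", true);            // turn the component on
    query.set("clustering.engine", "lingo");  // pick a configured engine by name
    QueryResponse rsp = solrClient.query("collection1", query);
    // Clusters are appended to the response alongside the regular doc list.
    Object clusters = rsp.getResponse().get("clusters");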


@ -0,0 +1,3 @@
org.apache.solr.handler.clustering.MockClusteringAlgorithmProvider
org.apache.solr.handler.clustering.EchoClusteringAlgorithmProvider
org.apache.solr.handler.clustering.ResourceCheckAlgorithmProvider
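These entries register the test algorithm providers. Presumably they are discovered at runtime through Java's ServiceLoader mechanism (an assumption about Carrot2 4.x's SPI, not something this file states), roughly like this:

    // Sketch only: assumes a Carrot2 4.x ClusteringAlgorithmProvider SPI
    // loaded via java.util.ServiceLoader; exact method names may differ.
    ServiceLoader<ClusteringAlgorithmProvider> loader =
        ServiceLoader.load(ClusteringAlgorithmProvider.class);
    for (ClusteringAlgorithmProvider provider : loader) {
      System.out.println(provider.name()); // e.g. "MockClusteringAlgorithm"
    }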


@ -1,10 +0,0 @@
<attribute-sets default="overridden-attributes">
<attribute-set id="overridden-attributes">
<value-set>
<label>defaults</label>
<attribute key="MockClusteringAlgorithm.depth"><value value="1" /></attribute>
<attribute key="MockClusteringAlgorithm.labels"><value value="3" /></attribute>
<attribute key="MockClusteringAlgorithm.maxClusters"><value value="13" /></attribute>
</value-set>
</attribute-set>
</attribute-sets>


@ -1,246 +0,0 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Syntax:
# "source" => "target"
# "source".length() > 0 (source cannot be empty.)
# "target".length() >= 0 (target can be empty.)
# example:
# "À" => "A"
# "\u00C0" => "A"
# "\u00C0" => "\u0041"
# "ß" => "ss"
# "\t" => " "
# "\n" => ""
# À => A
"\u00C0" => "A"
# Á => A
"\u00C1" => "A"
# Â => A
"\u00C2" => "A"
# Ã => A
"\u00C3" => "A"
# Ä => A
"\u00C4" => "A"
# Å => A
"\u00C5" => "A"
# Æ => AE
"\u00C6" => "AE"
# Ç => C
"\u00C7" => "C"
# È => E
"\u00C8" => "E"
# É => E
"\u00C9" => "E"
# Ê => E
"\u00CA" => "E"
# Ë => E
"\u00CB" => "E"
# Ì => I
"\u00CC" => "I"
# Í => I
"\u00CD" => "I"
# Î => I
"\u00CE" => "I"
# Ï => I
"\u00CF" => "I"
# IJ => IJ
"\u0132" => "IJ"
# Ð => D
"\u00D0" => "D"
# Ñ => N
"\u00D1" => "N"
# Ò => O
"\u00D2" => "O"
# Ó => O
"\u00D3" => "O"
# Ô => O
"\u00D4" => "O"
# Õ => O
"\u00D5" => "O"
# Ö => O
"\u00D6" => "O"
# Ø => O
"\u00D8" => "O"
# Œ => OE
"\u0152" => "OE"
# Þ
"\u00DE" => "TH"
# Ù => U
"\u00D9" => "U"
# Ú => U
"\u00DA" => "U"
# Û => U
"\u00DB" => "U"
# Ü => U
"\u00DC" => "U"
# Ý => Y
"\u00DD" => "Y"
# Ÿ => Y
"\u0178" => "Y"
# à => a
"\u00E0" => "a"
# á => a
"\u00E1" => "a"
# â => a
"\u00E2" => "a"
# ã => a
"\u00E3" => "a"
# ä => a
"\u00E4" => "a"
# å => a
"\u00E5" => "a"
# æ => ae
"\u00E6" => "ae"
# ç => c
"\u00E7" => "c"
# è => e
"\u00E8" => "e"
# é => e
"\u00E9" => "e"
# ê => e
"\u00EA" => "e"
# ë => e
"\u00EB" => "e"
# ì => i
"\u00EC" => "i"
# í => i
"\u00ED" => "i"
# î => i
"\u00EE" => "i"
# ï => i
"\u00EF" => "i"
# ij => ij
"\u0133" => "ij"
# ð => d
"\u00F0" => "d"
# ñ => n
"\u00F1" => "n"
# ò => o
"\u00F2" => "o"
# ó => o
"\u00F3" => "o"
# ô => o
"\u00F4" => "o"
# õ => o
"\u00F5" => "o"
# ö => o
"\u00F6" => "o"
# ø => o
"\u00F8" => "o"
# œ => oe
"\u0153" => "oe"
# ß => ss
"\u00DF" => "ss"
# þ => th
"\u00FE" => "th"
# ù => u
"\u00F9" => "u"
# ú => u
"\u00FA" => "u"
# û => u
"\u00FB" => "u"
# ü => u
"\u00FC" => "u"
# ý => y
"\u00FD" => "y"
# ÿ => y
"\u00FF" => "y"
# ff => ff
"\uFB00" => "ff"
# fi => fi
"\uFB01" => "fi"
# fl => fl
"\uFB02" => "fl"
# ffi => ffi
"\uFB03" => "ffi"
# ffl => ffl
"\uFB04" => "ffl"
# ſt => ft
"\uFB05" => "ft"
# st => st
"\uFB06" => "st"


@ -1,193 +1,27 @@
<?xml version="1.0" encoding="UTF-8" ?> <schema name="example" version="1.6">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
This is the Solr schema file. This file should be named "schema.xml" and
should be in the conf directory under the solr home
(i.e. ./solr/conf/schema.xml by default)
or located where the classloader for the Solr webapp can find it.
This example schema is the recommended starting point for users.
It should be kept correct and concise, usable out-of-the-box.
For more information on how to customize this file, please see
http://wiki.apache.org/solr/SchemaXml
-->
<schema name="example" version="1.1">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.1" is Solr's version number for the schema syntax and semantics. It should
not normally be changed by applications.
1.0: multiValued attribute did not exist, all fields are multiValued by nature
1.1: multiValued attribute introduced, false by default -->
<!-- field type definitions. The "name" attribute is
just a label to be used by field definitions. The "class"
attribute and any other attributes determine the real
behavior of the fieldType.
Class names starting with "solr" refer to java classes in the
org.apache.solr.analysis package.
-->
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
- StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values which
exceed a certain size (in characters).
-->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/> <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!-- The optional sortMissingLast and sortMissingFirst attributes are
currently supported on types that are sorted internally as strings.
- If sortMissingLast="true", then a sort on this field will cause documents
without the field to come after documents with the field,
regardless of the requested sort order (asc or desc).
- If sortMissingFirst="true", then a sort on this field will cause documents
without the field to come before documents with the field,
regardless of the requested sort order.
- If sortMissingLast="false" and sortMissingFirst="false" (the default),
then default lucene sorting will be used which places docs without the
field first in an ascending sort and last in a descending sort.
-->
<!--
Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
-->
<fieldType name="int" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="float" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="double" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
<!--
Numeric field types that index each value at various levels of precision
to accelerate range queries when the number of values between the range
endpoints is large. See the javadoc for LegacyNumericRangeQuery for internal
implementation details.
Smaller precisionStep values (specified in bits) will lead to more tokens
indexed per value, slightly larger index size, and faster range queries.
A precisionStep of 0 disables indexing at different precision levels.
-->
<fieldType name="tint" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tfloat" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tlong" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tdouble" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
<!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
is a more restricted form of the canonical representation of dateTime
http://www.w3.org/TR/xmlschema-2/#dateTime
The trailing "Z" designates UTC time and is mandatory.
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
All other components are mandatory.
Expressions can also be used to denote calculations that should be
performed relative to "NOW" to determine the value, ie...
NOW/HOUR
... Round to the start of the current hour
NOW-1DAY
... Exactly 1 day prior to now
NOW/DAY+6MONTHS+3DAYS
... 6 months and 3 days in the future from the start of
the current day
Consult the TrieDateField javadocs for more information.
-->
<fieldType name="date" class="${solr.tests.DateFieldType}" docValues="${solr.tests.numeric.dv}" sortMissingLast="true" omitNorms="true"/>
<!-- The "RandomSortField" is not used to store or search any
data. You can declare fields of this type in your schema
to generate pseudo-random orderings of your docs for sorting
purposes. The ordering is generated based on the field name
and the version of the index. As long as the index version
remains unchanged, and the same field name is reused,
the ordering of the docs will be consistent.
If you want different pseudo-random orderings of documents,
for the same version of the index, use a dynamicField and
change the name.
-->
<fieldType name="random" class="solr.RandomSortField" indexed="true"/>
<!-- solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying.
The optional positionIncrementGap puts space between multiple fields of
this type on the same document, with the purpose of preventing false phrase
matching across fields.
For more info on customizing your analyzer chain, please see
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
-->
<!-- One can also specify an existing Analyzer class that has a
default constructor via the class attribute on the analyzer element
<fieldType name="text_greek" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
</fieldType>
-->
<!-- A text field that only splits on whitespace for exact matching of words -->
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- A text field that uses WordDelimiterGraphFilter to enable splitting and matching of
words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled.
Duplicate tokens at the same position (which may result from Stemmed Synonyms or
WordDelim parts) are removed.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100"> <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index"> <analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/> <tokenizer class="solr.MockTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
<filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
-->
<!-- Case insensitive stop word removal.
-->
<filter class="solr.StopFilterFactory" <filter class="solr.StopFilterFactory"
ignoreCase="true" ignoreCase="true"
words="stopwords.txt" words="stopwords.txt"/>
/> <filter class="solr.WordDelimiterGraphFilterFactory"
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/> <filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/> <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
<filter class="solr.FlattenGraphFilterFactory" /> <filter class="solr.FlattenGraphFilterFactory"/>
</analyzer> </analyzer>
<analyzer type="query"> <analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/> <tokenizer class="solr.MockTokenizerFactory"/>
<!--<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>--> <filter class="solr.WordDelimiterGraphFilterFactory"
<!--<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>--> generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" splitOnCaseChange="1"/>
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/> <filter class="solr.PorterStemFilterFactory"/>
@ -195,156 +29,11 @@
  </analyzer>
</fieldType>
<!-- Less flexible matching, but less false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
<fieldType name="textTight" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
<!--<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>-->
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
<filter class="solr.FlattenGraphFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
<!--<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>-->
<filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!--
Setup simple analysis for spell checking
-->
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- This is an example of using the KeywordTokenizer along
With various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text
-->
<fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
<analyzer>
<!-- KeywordTokenizer does no actual tokenizing, so the entire
input string is preserved as a single token
-->
<tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
<!-- The LowerCase TokenFilter does what you expect, which can be useful
when you want your sorting to be case insensitive
-->
<filter class="solr.LowerCaseFilterFactory"/>
<!-- The TrimFilter removes any leading or trailing whitespace -->
<filter class="solr.TrimFilterFactory"/>
<!-- The PatternReplaceFilter gives you the flexibility to use
Java regular expressions to replace any sequence of characters
matching a pattern with an arbitrary replacement string,
which may include back references to portions of the original
string matched by the pattern.
See the Java Regular Expression documentation for more
information on pattern and replacement string syntax.
http://docs.oracle.com/javase/8/docs/api/java/util/regex/package-summary.html
-->
<filter class="solr.PatternReplaceFilterFactory"
pattern="([^a-z])" replacement="" replace="all"
/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed, any data added to
them will be ignored outright
-->
<fieldType name="ignored" stored="false" indexed="false" class="solr.StrField"/>
<!-- Valid attributes for fields:
name: mandatory - the name for the field
type: mandatory - the name of a previously defined type from the <fieldType>s
indexed: true if this field should be indexed (searchable or sortable)
stored: true if this field should be retrievable
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with
this field (this disables length normalization and index-time
boosting for the field, and saves some memory). Only full-text
fields or fields that need an index-time boost need norms.
termVectors: [false] set to true to store the term vector for a given field.
When using MoreLikeThis, fields used for similarity should be stored for
best performance.
-->
<field name="id" type="string" indexed="true" stored="true" required="true"/> <field name="id" type="string" indexed="true" stored="true" required="true"/>
<field name="url" type="string" indexed="true" stored="true" required="false"/>
<field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true"/>
<field name="title" type="text" indexed="true" stored="true" multiValued="true"/> <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="heading" type="text" indexed="true" stored="true" multiValued="true"/> <field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true"/>
<field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/> <field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="body" type="text" indexed="true" stored="true" multiValued="true"/> <field name="testSet" type="string" indexed="true" stored="false" multiValued="false" required="true" />
<!-- catchall field, containing all other searchable text fields (implemented
via copyField further on in this schema -->
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
a "*" only at the start or the end.
EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
Longer patterns will be matched first. If equal size patterns
both match, the first appearing in the schema will be used. -->
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="random*" type="random"/>
<dynamicField name="*_dynamic" type="string" indexed="true" stored="true"/>
<dynamicField name="dynamic_*" type="string" indexed="true" stored="true"/>
<!-- uncomment the following to ignore any fields that don't already match an existing
field name or dynamic field, rather than reporting them as an error.
alternately, change the type="ignored" to some other type e.g. "text" if you want
unknown fields indexed and/or stored by default -->
<!--dynamicField name="*" type="ignored" /-->
<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field
-->
<uniqueKey>id</uniqueKey>
<!-- copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching. -->
<copyField source="url" dest="text"/>
<copyField source="title" dest="text"/>
<copyField source="body" dest="text"/>
<copyField source="snippet" dest="text"/>
<!-- dynamic destination -->
<copyField source="*_dynamic" dest="dynamic_*"/>
<copyField source="id" dest="range_facet_l"/>
</schema>


@ -1,4 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?> <?xml version="1.0" encoding="UTF-8" ?>
<!-- <!--
Licensed to the Apache Software Foundation (ASF) under one or more Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with contributor license agreements. See the NOTICE file distributed with
@ -32,409 +33,97 @@
<useCompoundFile>${useCompoundFile:false}</useCompoundFile>
</indexConfig>
<!-- Enables JMX if and only if an existing MBeanServer is found, use
this if you want to configure JMX through JVM parameters. Remove
this to disable exposing Solr configuration and statistics to JMX.
If you want to connect to a particular server, specify the agentId
e.g. <jmx agentId="myAgent" />
If you want to start a new MBeanServer, specify the serviceUrl
e.g <jmx serviceurl="service:jmx:rmi:///jndi/rmi://localhost:9999/solr" />
For more details see http://wiki.apache.org/solr/SolrJmx
-->
<jmx />
<!-- the default high-performance update handler -->
<updateHandler class="solr.DirectUpdateHandler2">
<!-- A prefix of "solr." for class names is an alias that
causes solr to search appropriate packages, including
org.apache.solr.(search|update|request|core|analysis)
-->
<!-- Perform a <commit/> automatically under certain conditions:
maxDocs - number of updates since last commit is greater than this
maxTime - oldest uncommitted update (in ms) is this long ago
<autoCommit>
<maxDocs>10000</maxDocs>
<maxTime>1000</maxTime>
</autoCommit>
-->
</updateHandler>
<query>
<!-- Maximum number of clauses in a boolean query... can affect
range or prefix queries that expand to big boolean
queries. An exception is thrown if exceeded. -->
<maxBooleanClauses>${solr.max.booleanClauses:1024}</maxBooleanClauses>
<!-- Cache used by SolrIndexSearcher for filters (DocSets),
unordered sets of *all* documents that match a query.
When a new searcher is opened, its caches may be prepopulated
or "autowarmed" using data from caches in the old searcher.
autowarmCount is the number of items to prepopulate. For CaffeineCache,
the autowarmed items will be the most recently accessed items.
Parameters:
class - the SolrCache implementation (currently only CaffeineCache)
size - the maximum number of entries in the cache
initialSize - the initial capacity (number of entries) of
the cache. (see java.util.HashMap)
autowarmCount - the number of entries to prepopulate from
an old cache.
-->
<filterCache
class="solr.CaffeineCache"
size="512"
initialSize="512"
autowarmCount="128"/>
<!-- queryResultCache caches results of searches - ordered lists of
document ids (DocList) based on a query, a sort, and the range
of documents requested. -->
<queryResultCache
class="solr.CaffeineCache"
size="512"
initialSize="512"
autowarmCount="32"/>
<!-- documentCache caches Lucene Document objects (the stored fields for each document).
Since Lucene internal document ids are transient, this cache will not be autowarmed. -->
<documentCache
class="solr.CaffeineCache"
size="512"
initialSize="512"
autowarmCount="0"/>
<!-- If true, stored fields that are not requested will be loaded lazily.
This can result in a significant speed improvement if the usual case is to
not load all stored fields, especially if the skipped fields are large compressed
text fields.
-->
<enableLazyFieldLoading>true</enableLazyFieldLoading>
<!-- Example of a generic cache. These caches may be accessed by name
through SolrIndexSearcher.getCache(),cacheLookup(), and cacheInsert().
The purpose is to enable easy caching of user/application level data.
The regenerator argument should be specified as an implementation
of solr.search.CacheRegenerator if autowarming is desired. -->
<!--
<cache name="myUserCache"
class="solr.CaffeineCache"
size="4096"
initialSize="1024"
autowarmCount="1024"
regenerator="org.mycompany.mypackage.MyRegenerator"
/>
-->
<!-- An optimization that attempts to use a filter to satisfy a search.
If the requested sort does not include score, then the filterCache
will be checked for a filter matching the query. If found, the filter
will be used as the source of document ids, and then the sort will be
applied to that.
<useFilterForSortedQuery>true</useFilterForSortedQuery>
-->
<!-- An optimization for use with the queryResultCache. When a search
is requested, a superset of the requested number of document ids
are collected. For example, if a search for a particular query
requests matching documents 10 through 19, and queryWindowSize is 50,
then documents 0 through 49 will be collected and cached. Any further
requests in that range can be satisfied via the cache. -->
<queryResultWindowSize>50</queryResultWindowSize>
<!-- Maximum number of documents to cache for any entry in the
queryResultCache. -->
<queryResultMaxDocsCached>200</queryResultMaxDocsCached>
<!-- a newSearcher event is fired whenever a new searcher is being prepared
and there is a current searcher handling requests (aka registered). -->
<!-- QuerySenderListener takes an array of NamedList and executes a
local query request for each NamedList in sequence. -->
<listener event="newSearcher" class="solr.QuerySenderListener">
<arr name="queries">
<lst> <str name="q">solr</str> <str name="start">0</str> <str name="rows">10</str> </lst>
<lst> <str name="q">rocks</str> <str name="start">0</str> <str name="rows">10</str> </lst>
<lst><str name="q">static newSearcher warming query from solrconfig.xml</str></lst>
</arr>
</listener>
<!-- a firstSearcher event is fired whenever a new searcher is being
prepared but there is no current registered searcher to handle
requests or to gain autowarming data from. -->
<listener event="firstSearcher" class="solr.QuerySenderListener">
<arr name="queries">
<lst> <str name="q">fast_warm</str> <str name="start">0</str> <str name="rows">10</str> </lst>
<lst><str name="q">static firstSearcher warming query from solrconfig.xml</str></lst>
</arr>
</listener>
<!-- If a search request comes in and there is no current registered searcher,
then immediately register the still warming searcher and use it. If
"false" then all requests will block until the first searcher is done
warming. -->
<useColdSearcher>false</useColdSearcher>
</query>
<requestDispatcher>
<!--Make sure your system has some authentication before enabling remote streaming!
<requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="-1" />
-->
<!-- Set HTTP caching related parameters (for proxy caches and clients).
To get the behaviour of Solr 1.2 (ie: no caching related headers)
use the never304="true" option and do not specify a value for
<cacheControl>
-->
<!-- <httpCaching never304="true"> -->
<httpCaching lastModifiedFrom="openTime"
etagSeed="Solr">
<!-- lastModFrom="openTime" is the default, the Last-Modified value
(and validation against If-Modified-Since requests) will all be
relative to when the current Searcher was opened.
You can change it to lastModFrom="dirLastMod" if you want the
value to exactly correspond to when the physical index was last
modified.
etagSeed="..." is an option you can change to force the ETag
header (and validation against If-None-Match requests) to be
different even if the index has not changed (ie: when making
significant changes to your config file)
lastModifiedFrom and etagSeed are both ignored if you use the
never304="true" option.
-->
<!-- If you include a <cacheControl> directive, it will be used to
generate a Cache-Control header, as well as an Expires header
if the value contains "max-age="
By default, no Cache-Control header is generated.
You can use the <cacheControl> option even if you have set
never304="true"
-->
<!-- <cacheControl>max-age=30, public</cacheControl> -->
</httpCaching>
</requestDispatcher>
<requestHandler name="/select" class="solr.SearchHandler"> <requestHandler name="/select" class="solr.SearchHandler">
<!-- default values for query parameters --> <!-- default values for query parameters -->
<lst name="defaults"> <lst name="defaults">
<str name="echoParams">explicit</str> <str name="echoParams">explicit</str>
<!--
<int name="rows">10</int>
<str name="fl">*</str>
<str name="version">2.1</str>
-->
</lst> </lst>
<arr name="last-components"> <arr name="last-components">
<str>clustering</str> <str>clustering</str>
</arr> </arr>
</requestHandler> </requestHandler>
<requestHandler name="docClustering" class="solr.SearchHandler">
<!-- default values for query parameters -->
<lst name="defaults">
<str name="echoParams">explicit</str>
<!--
<int name="rows">10</int>
<str name="fl">*</str>
<str name="version">2.1</str>
-->
</lst>
<arr name="last-components">
<str>doc-clustering</str>
</arr>
</requestHandler>
<!-- DisMaxRequestHandler allows easy searching across multiple fields
for simple user-entered phrases. Its implementation is now
just the standard SearchHandler with a default query parser
of "dismax".
see http://wiki.apache.org/solr/DisMaxRequestHandler
-->
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering">
<!-- Declare an engine -->
<lst name="engine">
<!-- The name, only one can be named "default" -->
<str name="name">default</str>
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">stc</str>
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">mock</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">mock-external-attrs</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
<!-- takes precedence over external XML -->
<int name="MockClusteringAlgorithm.labels">4</int>
</lst>
<lst name="engine">
<str name="name">echo</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">lexical-resource-check</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">lexical-resource-check-custom-resource-dir</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
<str name="carrot.resourcesDir">clustering/custom</str>
</lst>
<lst name="engine">
<str name="name">custom-duplicating-tokenizer</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoTokensClusteringAlgorithm</str>
<str name="PreprocessingPipeline.tokenizerFactory">org.apache.solr.handler.clustering.carrot2.DuplicatingTokenizerFactory</str>
</lst>
<lst name="engine">
<str name="name">custom-duplicating-stemmer</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoStemsClusteringAlgorithm</str>
<str name="PreprocessingPipeline.stemmerFactory">org.apache.solr.handler.clustering.carrot2.DuplicatingStemmerFactory</str>
</lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">
<!-- Declare an engine -->
<lst name="engine">
<!-- The name, only one can be named "default" -->
<str name="name">mock</str>
<str name="classname">org.apache.solr.handler.clustering.MockDocumentClusteringEngine</str>
</lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering-name-default">
<lst name="engine">
<str name="name">stc</str>
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
</lst>
<lst name="engine"> <lst name="engine">
<str name="name">default</str> <str name="name">default</str>
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> <str name="clustering.fields">title</str>
<str name="clustering.algorithm">MockClusteringAlgorithm</str>
</lst> </lst>
<lst name="engine">
<str name="name">mock</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering-name-decl-order">
<lst name="engine">
<bool name="optional">true</bool>
<str name="name">unavailable</str>
<str name="carrot.algorithm">org.carrot2.clustering.lingo.UnavailableAlgorithm</str>
</lst>
<lst name="engine"> <lst name="engine">
<str name="name">lingo</str> <str name="name">lingo</str>
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> <str name="clustering.fields">title, snippet</str>
<str name="clustering.algorithm">Lingo</str>
</lst> </lst>
<lst name="engine"> <lst name="engine">
<str name="name">stc</str> <str name="name">stc</str>
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str> <str name="clustering.fields">title, snippet</str>
<str name="clustering.algorithm">STC</str>
</lst> </lst>
<lst name="engine">
<str name="name">kmeans</str>
<str name="clustering.fields">title, snippet</str>
<str name="clustering.algorithm">Bisecting K-Means</str>
</lst>
<lst name="engine"> <lst name="engine">
<str name="name">mock</str> <str name="name">mock</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str> <str name="clustering.fields">title</str>
<str name="clustering.algorithm">MockClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">mock-solrconfig-attrs</str>
<str name="clustering.fields">title, snippet</str>
<str name="clustering.algorithm">MockClusteringAlgorithm</str>
<bool name="clustering.includeOtherTopics">false</bool>
<int name="maxClusters">2</int>
<int name="hierarchyDepth">1</int>
</lst>
<lst name="engine">
<str name="name">echo</str>
<str name="clustering.algorithm">EchoClusteringAlgorithm</str>
<str name="clustering.fields">title, snippet</str>
</lst>
<lst name="engine">
<str name="name">testCustomLanguageResources</str>
<str name="clustering.algorithm">ResourceCheckAlgorithm</str>
<str name="clustering.fields">title</str>
<str name="clustering.resources">testCustomLanguageResources</str>
<bool name="clustering.includeOtherTopics">false</bool>
<str name="text">
was
bar
baz
</str>
</lst>
<lst name="engine">
<str name="name">testParamDefaultLanguage</str>
<str name="clustering.fields">title</str>
<str name="clustering.algorithm">ResourceCheckAlgorithm</str>
<bool name="clustering.includeOtherTopics">false</bool>
<str name="clustering.language">German</str>
<str name="text">
abc
</str>
</lst>
<lst name="engine">
<str name="name">testParamLanguageField</str>
<str name="clustering.algorithm">ResourceCheckAlgorithm</str>
<bool name="clustering.includeOtherTopics">false</bool>
<str name="clustering.fields">title</str>
<str name="clustering.languageField">lang</str>
<str name="clustering.language">Italian</str>
<str name="text">test</str>
  </lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering-name-dups">
<lst name="engine">
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
</searchComponent>
<highlighting>
<!-- Configure the standard fragmenter -->
<!-- This could most likely be commented out in the "default" case -->
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
<lst name="defaults">
<int name="hl.fragsize">100</int>
</lst>
</fragmenter>
<!-- A regular-expression-based fragmenter (f.i., for sentence extraction) -->
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
<lst name="defaults">
<!-- slightly smaller fragsizes work better because of slop -->
<int name="hl.fragsize">70</int>
<!-- allow 50% slop on fragment sizes -->
<float name="hl.regex.slop">0.5</float>
<!-- a basic sentence pattern -->
<str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str>
</lst>
</fragmenter>
<!-- Configure the standard formatter -->
<formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
<lst name="defaults">
<str name="hl.simple.pre"><![CDATA[<em>]]></str>
<str name="hl.simple.post"><![CDATA[</em>]]></str>
</lst>
</formatter>
</highlighting>
<!-- queryResponseWriter plugins... query responses will be written using the
writer specified by the 'wt' request parameter matching the name of a registered
writer.
The "default" writer is the default and will be used if 'wt' is not specified
in the request. XMLResponseWriter will be used if nothing is specified here.
The json, python, and ruby writers are also available by default.
<queryResponseWriter name="xml" class="solr.XMLResponseWriter" default="true"/>
<queryResponseWriter name="json" class="solr.JSONResponseWriter"/>
<queryResponseWriter name="python" class="solr.PythonResponseWriter"/>
<queryResponseWriter name="ruby" class="solr.RubyResponseWriter"/>
<queryResponseWriter name="php" class="solr.PHPResponseWriter"/>
<queryResponseWriter name="phps" class="solr.PHPSerializedResponseWriter"/>
<queryResponseWriter name="custom" class="com.example.MyResponseWriter"/>
-->
<!-- XSLT response writer transforms the XML output by any xslt file found
in Solr's conf/xslt directory. Changes to xslt files are checked for
every xsltCacheLifetimeSeconds.
-->
<queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
<int name="xsltCacheLifetimeSeconds">5</int>
</queryResponseWriter>
<!-- example of registering a query parser
<queryParser name="lucene" class="org.apache.solr.search.LuceneQParserPlugin"/>
-->
<!-- example of registering a custom function parser
<valueSourceParser name="myfunc" class="com.mycompany.MyValueSourceParser" />
-->
<!-- config for the admin interface -->
<admin>
<defaultQuery>solr</defaultQuery>
</admin>
</config>


@ -14,12 +14,12 @@
# limitations under the License.
#-----------------------------------------------------------------------
# A couple of test stopwords to test that the words are really being
# configured from this file:
stopworda
stopwordb
# Standard english stop words taken from Lucene's StopAnalyzer
a
an
and
@ -56,4 +56,3 @@ was
will
with
solrownstopword


@ -1,31 +0,0 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
# Some test synonym mappings unlikely to appear in real input text
aaa => aaaa
bbb => bbbb1 bbbb2
ccc => cccc1,cccc2
a\=>a => b\=>b
a\,a => b\,b
fooaaa,baraaa,bazaaa
# Some synonym groups specific to this example
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
pixima => pixma


@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<Configuration>
<Appenders>
<Console name="STDERR" target="SYSTEM_ERR">
<PatternLayout>
<Pattern>
%maxLen{%-4r %-5p (%t) [%X{node_name} %X{collection} %X{shard} %X{replica} %X{core} %X{trace_id}] %c{1.} %m%notEmpty{
=>%ex{short}}}{10240}%n
</Pattern>
</PatternLayout>
</Console>
</Appenders>
<Loggers>
<Logger name="org.apache.zookeeper" level="WARN"/>
<Logger name="org.apache.hadoop" level="WARN"/>
<Logger name="org.apache.directory" level="WARN"/>
<Logger name="org.apache.solr.hadoop" level="INFO"/>
<Logger name="org.eclipse.jetty" level="INFO"/>
<Root level="INFO">
<AppenderRef ref="STDERR"/>
</Root>
</Loggers>
</Configuration>


@ -0,0 +1,17 @@
- Knowledge Discovery [6]
- Patterns [6]
- Data Mining Applications [5]
- Statistical Analysis [4]
- Computer [3]
- Creating [3]
- Data Mining Solutions [3]
- Known as Data Mining [3]
- Text Mining [3]
- Databases KDD [2]
- Extraction of Hidden Predictive [2]
- Information from Large [2]
- Open [2]
- Powers [2]
- Searching [2]
- Tools [2]
- Other topics [1]


@ -0,0 +1,10 @@
- Knowledge Discovery [8]
- Databases [6]
- Patterns [6]
- Analysis [5]
- Applications [5]
- Software [5]
- Businesses [4]
- Predictive [4]
- Process [4]
- Other topics [2]


@ -0,0 +1,2 @@
- Lang: English
- was[-, -] bar[ignoredWord, ignoredLabel] baz[-, ignoredLabel]


@ -0,0 +1,8 @@
- Foundations; Includes; Tutorials [4]
- Institute; DMI; Agencies; Analyzing; Different; Group; June; Knowledge-discovery; Microsoft; Office; Perspectives; Projects; Reported; SourceWatch; Started; Stores; Summarizing; UW-Madison [4]
- Integrated; Page; St@tServ [4]
- Oracle; Social; Media; Pentaho; Visualization [4]
- Patterns; Extraction; Managers [4]
- SQL; Server; Techniques [4]
- Predictive; Enterprise; Analytics [3]
- Text; Searching; Correlations; Discovering; Fuel; Gleaned; Investor; Involves; Iterative; Raw; Relationships; SAS; Smarter; Snooping; Unnoticed [3]


@ -0,0 +1,17 @@
- Knowledge Discovery [6]
- Patterns [6]
- Data Mining Applications [5]
- Statistical Analysis [4]
- Computer [3]
- Creating [3]
- Data Mining Solutions [3]
- Known as Data Mining [3]
- Text Mining [3]
- Databases KDD [2]
- Extraction of Hidden Predictive [2]
- Information from Large [2]
- Open [2]
- Powers [2]
- Searching [2]
- Tools [2]
- Other topics [1]


@ -0,0 +1,9 @@
- English
- Lang: English
- test[-, -]
- French
- Lang: French
- test[-, -]
- German
- Lang: German
- test[-, -]


@ -0,0 +1,12 @@
- Cluster 1
- Cluster 1.1 [3]
- Cluster 1.2 [3]
- Cluster 1.3 [3]
- Cluster 2
- Cluster 2.1 [3]
- Cluster 2.2 [3]
- Cluster 2.3 [3]
- Cluster 3
- Cluster 3.1 [3]
- Cluster 3.2 [3]
- Cluster 3.3 [3]


@ -0,0 +1,4 @@
- Cluster 1 [9]
- Cluster 2 [9]
- Cluster 3 [9]
- Other topics [3]


@ -0,0 +1,13 @@
- Cluster 1
- Cluster 1.1 [3]
- Cluster 1.2 [3]
- Cluster 1.3 [3]
- Cluster 2
- Cluster 2.1 [3]
- Cluster 2.2 [3]
- Cluster 2.3 [3]
- Cluster 3
- Cluster 3.1 [3]
- Cluster 3.2 [3]
- Cluster 3.3 [3]
- Other topics [3]


@ -0,0 +1,10 @@
- Knowledge Discovery [8]
- Databases [6]
- Patterns [6]
- Analysis [5]
- Applications [5]
- Software [5]
- Businesses [4]
- Predictive [4]
- Process [4]
- Other topics [2]


@ -1,250 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import java.io.File;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.junit.BeforeClass;
/**
 * Base class for clustering contrib tests: sets up a test core and indexes
 * a small collection of sample documents before any test runs.
 */
public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
protected static int numberOfDocs = 0;
@BeforeClass
public static void beforeClass() throws Exception {
File testHome = createTempDir().toFile();
FileUtils.copyDirectory(getFile("clustering/solr"), testHome);
initCore("solrconfig.xml", "schema.xml", testHome.getAbsolutePath());
numberOfDocs = 0;
for (String[] doc : DOCUMENTS) {
assertNull(h.validateUpdate(adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2])));
numberOfDocs++;
}
// Add a multi-valued snippet
final SolrInputDocument multiValuedSnippet = new SolrInputDocument();
multiValuedSnippet.addField("id", numberOfDocs++);
multiValuedSnippet.addField("title", "Title");
multiValuedSnippet.addField("url", "URL");
multiValuedSnippet.addField("snippet", "First value of multi field. Some more text. And still more.");
multiValuedSnippet.addField("snippet", "Second value of multi field. Some more text. And still more.");
multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
assertNull(h.validateUpdate(adoc(multiValuedSnippet)));
// Add a document with multi-field title and snippet
final SolrInputDocument multiFieldDoc = new SolrInputDocument();
multiFieldDoc.addField("id", numberOfDocs++);
multiFieldDoc.addField("title", "Title field");
multiFieldDoc.addField("heading", "Heading field");
multiFieldDoc.addField("url", "URL");
multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
assertNull(h.validateUpdate(adoc(multiFieldDoc)));
// Add a document with one language supported by Carrot2
final SolrInputDocument docWithOneSupportedLanguage = new SolrInputDocument();
docWithOneSupportedLanguage.addField("id", numberOfDocs++);
docWithOneSupportedLanguage.addField("title", "");
docWithOneSupportedLanguage.addField("url", "one_supported_language");
docWithOneSupportedLanguage.addField("lang", "zh-cn");
assertNull(h.validateUpdate(adoc(docWithOneSupportedLanguage)));
// Add a document with more languages, one supported by Carrot2
final SolrInputDocument docWithOneSupportedLanguageOfMany = new SolrInputDocument();
docWithOneSupportedLanguageOfMany.addField("id", numberOfDocs++);
docWithOneSupportedLanguageOfMany.addField("url", "one_supported_language_of_many");
docWithOneSupportedLanguageOfMany.addField("lang", "zh-tw");
docWithOneSupportedLanguageOfMany.addField("lang", "POLISH");
docWithOneSupportedLanguageOfMany.addField("lang", "de");
assertNull(h.validateUpdate(adoc(docWithOneSupportedLanguageOfMany)));
// Add a document with custom fields
final SolrInputDocument docWithCustomFields = new SolrInputDocument();
docWithCustomFields.addField("id", numberOfDocs++);
docWithCustomFields.addField("url", "custom_fields");
docWithCustomFields.addField("intfield_i", 10);
docWithCustomFields.addField("floatfield_f", 10.5);
docWithCustomFields.addField("heading", "first");
docWithCustomFields.addField("heading", "second");
assertNull(h.validateUpdate(adoc(docWithCustomFields)));
assertNull(h.validateUpdate(commit()));
}
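For illustration, a hypothetical subclass could sanity-check this fixture before exercising the clustering engines (the class and test names are invented; assertQ and req are inherited from SolrTestCaseJ4):

    public class MyClusteringTest extends AbstractClusteringTestCase {
      @Test
      public void allSampleDocsAreSearchable() {
        // numberOfDocs is tracked by the fixture setup above.
        assertQ(req("q", "*:*", "rows", "0"),
            "//result[@numFound='" + numberOfDocs + "']");
      }
    }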
/**
* Expose package-scope methods from {@link ClusteringComponent} to tests.
*/
protected final Map<String,SearchClusteringEngine> getSearchClusteringEngines(ClusteringComponent comp) {
return comp.getSearchClusteringEngines();
}
final static String[][] DOCUMENTS = new String[][]{
{"http://en.wikipedia.org/wiki/Data_mining",
"Data Mining - Wikipedia",
"Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."},
{"http://en.wikipedia.org/wiki/Datamining",
"Data mining - Wikipedia, the free encyclopedia",
"Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..."},
{"http://www.statsoft.com/textbook/stdatmin.html",
"Electronic Statistics Textbook: Data Mining Techniques",
"Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques."},
{"http://www.thearling.com/text/dmwhite/dmwhite.htm",
"An Introduction to Data Mining",
"Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..."},
{"http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
"Data Mining: What is Data Mining?",
"Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."},
{"http://www.spss.com/datamine",
"Data Mining Software, Data Mining Applications and Data Mining Solutions",
"The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..."},
{"http://www.kdnuggets.com/",
"KD Nuggets",
"Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."},
{"http://www.answers.com/topic/data-mining",
"data mining: Definition from Answers.com",
"data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..."},
{"http://www.statsoft.com/products/dataminer.htm",
"STATISTICA Data Mining and Predictive Modeling Solutions",
"GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..."},
{"http://datamining.typepad.com/",
"Data Mining: Text Mining, Visualization and Social Media",
"Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..."},
{"http://www.twocrows.com/",
"Two Crows Corporation",
"Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use."},
{"http://www.thearling.com/",
"Thearling.com",
"Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies."},
{"http://www.ccsu.edu/datamining/",
"CCSU - Data Mining",
"Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling."},
{"http://www.oracle.com/technology/products/bi/odm",
"Oracle Data Mining",
"Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..."},
{"http://databases.about.com/od/datamining/a/datamining.htm",
"Data Mining: An Introduction",
"About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques."},
{"http://www.dmoz.org/Computers/Software/Databases/Data_Mining/",
"Open Directory - Computers: Software: Databases: Data Mining",
"Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..."},
{"http://www.cs.wisc.edu/dmi/",
"DMI:Data Mining Institute",
"Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..."},
{"http://www.the-data-mine.com/",
"The Data Mine",
"Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining."},
{"http://www.statserv.com/datamining.html",
"St@tServ - About Data Mining",
"St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..."},
{"http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome/index.htm",
"MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...",
"Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..."},
{"http://www.pentaho.com/products/data_mining/",
"Pentaho Commercial Open Source Business Intelligence: Data Mining",
"For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..."},
{"http://www.investorhome.com/mining.htm",
"Investor Home - Data Mining",
"Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..."},
{"http://www.datamining.com/",
"Predictive Modeling and Predictive Analytics Solutions | Enterprise ...",
"Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics."},
{"http://www.sourcewatch.org/index.php?title=Data_mining",
"Data mining - SourceWatch",
"These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..."},
{"http://www.autonlab.org/tutorials/",
"Statistical Data Mining Tutorials",
"Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms."},
{"http://www.microstrategy.com/data-mining/index.asp",
"Data Mining",
"With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..."},
{"http://www.datamininglab.com/",
"Elder Research",
"Provides consulting and short courses in data mining and pattern discovery patterns in data."},
{"http://www.sqlserverdatamining.com/",
"SQL Server Data Mining > Home",
"SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..."},
{"http://databases.about.com/cs/datamining/g/dmining.htm",
"Data Mining",
"What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..."},
{"http://www.sas.com/technologies/analytics/datamining/index.html",
"Data Mining Software and Text Mining | SAS",
"... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..."}
};
}

View File

@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.response.Cluster;
import org.apache.solr.client.solrj.response.ClusteringResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.List;
import java.util.function.Consumer;
import java.util.stream.Collectors;
@SuppressSSL
public class ClusteringComponentDistributedTest extends BaseDistributedSearchTestCase {
private final static String QUERY_TESTSET_SAMPLE_DOCUMENTS = "testSet:sampleDocs";
@Override
public String getSolrHome() {
return getFile("clustering/solr/collection1").getParent();
}
@Before
public void indexDocs() throws Exception {
del("*:*");
String[] languages = {
"English",
"French",
"German",
"Unknown",
};
int docId = 0;
for (String[] doc : SampleData.SAMPLE_DOCUMENTS) {
index(
"id", Integer.toString(docId),
"title", doc[0],
"snippet", doc[1],
"testSet", "sampleDocs",
"lang", languages[docId % languages.length]
);
docId++;
}
commit();
}
@Test
@ShardsFixed(num = 2)
public void testLingoAlgorithm() throws Exception {
compareToExpected(clusters(QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
params.add(ClusteringComponent.REQUEST_PARAM_ENGINE, "lingo");
}));
}
@Test
@ShardsFixed(num = 2)
public void testStcAlgorithm() throws Exception {
compareToExpected(clusters(QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
params.add(ClusteringComponent.REQUEST_PARAM_ENGINE, "stc");
}));
}
private void compareToExpected(List<Cluster> actual) throws IOException {
String resourceSuffix = "";
String expected = ClusteringComponentTest.getTestResource(getClass(), resourceSuffix);
ClusteringComponentTest.compareWhitespaceNormalized(toString(actual), expected);
}
private List<Cluster> clusters(String query, Consumer<ModifiableSolrParams> paramsConsumer) throws Exception {
handle.clear();
handle.put("responseHeader", SKIP);
handle.put("response", SKIP);
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CommonParams.Q, query);
params.add(CommonParams.ROWS, "1000");
params.add(CommonParams.SORT, id + " desc");
params.add(ClusteringComponent.COMPONENT_NAME, "true");
paramsConsumer.accept(params);
QueryResponse response = query(true, params);
ClusteringResponse clusteringResponse = response.getClusteringResponse();
Assert.assertNotNull(clusteringResponse);
return clusteringResponse.getClusters();
}
private String toString(List<Cluster> clusters) {
return toString(clusters, "", new StringBuilder()).toString();
}
private StringBuilder toString(List<Cluster> clusters, String indent, StringBuilder sb) {
clusters.forEach(c -> {
sb.append(indent);
sb.append("- " + c.getLabels().stream().collect(Collectors.joining("; ")));
if (!c.getDocs().isEmpty()) {
sb.append(" [" + c.getDocs().size() + "]");
}
sb.append("\n");
if (!c.getClusters().isEmpty()) {
toString(c.getClusters(), indent + " ", sb);
}
});
return sb;
}
}
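
For reference, a client-side sketch of the request this test issues through the distributed harness. The SolrClient instance ("solrClient") and collection name ("collection1") below are assumptions for illustration, not part of this commit:

// Enable the clustering component and pick the engine by name.
SolrQuery query = new SolrQuery("testSet:sampleDocs");
query.setRows(1000);
query.set(ClusteringComponent.COMPONENT_NAME, "true");
query.set(ClusteringComponent.REQUEST_PARAM_ENGINE, "lingo");
// Clusters arrive in a dedicated section of the response.
QueryResponse response = solrClient.query("collection1", query);
List<Cluster> clusters = response.getClusteringResponse().getClusters();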

View File

@ -16,128 +16,380 @@
*/
package org.apache.solr.handler.clustering;
import com.carrotsearch.randomizedtesting.RandomizedContext;
import org.apache.commons.io.FileUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.response.ClusteringResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.SearchHandler;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.ResultContext;
import org.apache.solr.response.SolrQueryResponse;
import org.carrot2.clustering.Cluster;
import org.hamcrest.MatcherAssert;
import org.hamcrest.Matchers;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* Tests {@link Engine}.
*/
public class ClusteringComponentTest extends SolrTestCaseJ4 {
private final static String QUERY_TESTSET_SAMPLE_DOCUMENTS = "testSet:sampleDocs";
@BeforeClass
public static void beforeClass() throws Exception {
File testHome = createTempDir().toFile();
FileUtils.copyDirectory(getFile("clustering/solr"), testHome);
initCore("solrconfig.xml", "schema.xml", testHome.getAbsolutePath());
String[] languages = {
"English",
"French",
"German",
"Unknown",
};
int docId = 0;
for (String[] doc : SampleData.SAMPLE_DOCUMENTS) {
assertNull(h.validateUpdate(adoc(
"id", Integer.toString(docId),
"title", doc[0],
"snippet", doc[1],
"testSet", "sampleDocs",
"lang", languages[docId % languages.length])));
docId++;
}
assertNull(h.validateUpdate(commit()));
}
@Test
public void testLingoAlgorithm() throws Exception {
compareToExpected(clusters("lingo", QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
@Test
public void testStcAlgorithm() throws Exception {
compareToExpected(clusters("stc", QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
@Test
public void testKmeansAlgorithm() throws Exception {
compareToExpected(clusters("kmeans", QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
@Test
public void testParamSubclusters() throws Exception {
compareToExpected("off", clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
params.set(EngineParameters.PARAM_INCLUDE_SUBCLUSTERS, false);
}));
compareToExpected("on", clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
params.set(EngineParameters.PARAM_INCLUDE_SUBCLUSTERS, true);
}));
}
@Test
public void testParamOtherTopics() throws Exception {
compareToExpected(clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
params.set(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, false);
}));
}
/**
* We'll make two queries, one with and another without summaries,
* and assert that document labels are shorter when the highlighter is in use.
*/
@Test
public void testClusteringOnHighlights() throws Exception {
String query = "+snippet:mine +" + QUERY_TESTSET_SAMPLE_DOCUMENTS;
Consumer<ModifiableSolrParams> common = params -> {
params.add(EngineParameters.PARAM_FIELDS, "title, snippet");
params.add(EngineParameters.PARAM_CONTEXT_SIZE, Integer.toString(80));
params.add(EngineParameters.PARAM_CONTEXT_COUNT, Integer.toString(1));
};
List<Cluster<SolrDocument>> highlighted = clusters("echo", query,
common.andThen(params -> {
params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "true");
}));
List<Cluster<SolrDocument>> full = clusters("echo", query,
common.andThen(params -> {
params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "false");
}));
// Echo clustering algorithm just returns document fields as cluster labels
// so highlighted snippets should never be longer than full field content.
Assert.assertEquals(highlighted.size(), full.size());
for (int i = 0; i < highlighted.size(); i++) {
List<String> labels1 = highlighted.get(i).getLabels();
List<String> labels2 = full.get(i).getLabels();
assertEquals(labels1.size(), labels2.size());
for (int j = 0; j < labels1.size(); j++) {
MatcherAssert.assertThat("Summary shorter than original document?",
labels1.get(j).length(),
Matchers.lessThanOrEqualTo(labels2.get(j).length()));
}
}
}
/**
* We'll make two queries, one with short summaries and another with longer
* summaries, and check that the results differ.
*/
@Test
public void testSummaryFragSize() throws Exception {
String query = "+snippet:mine +" + QUERY_TESTSET_SAMPLE_DOCUMENTS;
Consumer<ModifiableSolrParams> common = params -> {
params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "true");
params.add(EngineParameters.PARAM_FIELDS, "title, snippet");
params.add(EngineParameters.PARAM_CONTEXT_COUNT, Integer.toString(1));
};
List<Cluster<SolrDocument>> shortSummaries = clusters("echo", query,
common.andThen(params -> {
params.add(EngineParameters.PARAM_CONTEXT_SIZE, Integer.toString(30));
}));
List<Cluster<SolrDocument>> longSummaries = clusters("echo", query,
common.andThen(params -> {
params.add(EngineParameters.PARAM_CONTEXT_SIZE, Integer.toString(80));
}));
Assert.assertEquals(shortSummaries.size(), longSummaries.size());
for (int i = 0; i < shortSummaries.size(); i++) {
List<String> shortLabels = shortSummaries.get(i).getLabels();
List<String> longLabels = longSummaries.get(i).getLabels();
assertEquals(shortLabels.size(), longLabels.size());
for (int j = 0; j < shortLabels.size(); j++) {
MatcherAssert.assertThat("Shorter summary is longer than longer summary?",
shortLabels.get(j).length(),
Matchers.lessThanOrEqualTo(longLabels.get(j).length()));
}
}
}
/**
* Test passing algorithm parameters via SolrParams.
*/
@Test
public void testPassingAttributes() throws Exception {
compareToExpected(clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
params.set("maxClusters", 2);
params.set("hierarchyDepth", 1);
params.add(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, "false");
}));
}
/**
* Test passing algorithm parameters via Solr configuration file.
*/
@Test
public void testPassingAttributesViaSolrConfig() throws Exception {
compareToExpected(clusters("mock-solrconfig-attrs", QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
/**
* Test maximum label truncation.
*/
@Test
public void testParamMaxLabels() throws Exception {
List<Cluster<SolrDocument>> clusters = clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
params.set("labelsPerCluster", "5");
params.set(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, "false");
params.set(EngineParameters.PARAM_MAX_LABELS, "3");
});
clusters.forEach(c -> {
MatcherAssert.assertThat(c.getLabels(), Matchers.hasSize(3));
});
}
@Test
public void testCustomLanguageResources() throws Exception {
compareToExpected(clusters(
"testCustomLanguageResources",
QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
@Test
public void testParamDefaultLanguage() throws Exception {
compareToExpected(clusters(
"testParamDefaultLanguage",
QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
/**
* Verify that documents with an explicit language name
* field are clustered in separate batches.
*
* @see EngineParameters#PARAM_LANGUAGE_FIELD
*/
@Test
public void testParamLanguageField() throws Exception {
compareToExpected(clusters(
"testParamLanguageField",
QUERY_TESTSET_SAMPLE_DOCUMENTS));
}
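// The "testParamLanguageField" engine used above is configured in the test's
// solrconfig.xml (not shown in this diff). A hypothetical per-request
// equivalent would point the engine at the field carrying language names:
//   params.set(EngineParameters.PARAM_LANGUAGE_FIELD, "lang");
// Documents are then split into per-language batches and each batch is
// clustered with the matching Carrot2 language components.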
private void compareToExpected(List<Cluster<SolrDocument>> clusters) throws IOException {
compareToExpected("", clusters);
}
private void compareToExpected(String resourceSuffix,
List<Cluster<SolrDocument>> clusters) throws IOException {
String actual = toString(clusters);
String expected = getTestResource(getClass(), resourceSuffix);
compareWhitespaceNormalized(actual, expected);
}
static void compareWhitespaceNormalized(String actual, String expected) {
Function<String, String> normalize = v -> v.replaceAll("\r", "").replaceAll("[ \t]+", " ").trim();
if (!normalize.apply(expected).equals(normalize.apply(actual))) {
throw new AssertionError(String.format(Locale.ROOT,
"The actual clusters structure differs from the expected one. Expected:\n%s\n\nActual:\n%s",
expected,
actual));
}
}
static String getTestResource(Class<?> clazz, String expectedResourceSuffix) throws IOException {
RandomizedContext ctx = RandomizedContext.current();
String resourceName = String.format(Locale.ROOT,
"%s-%s%s.txt",
ctx.getTargetClass().getSimpleName(),
ctx.getTargetMethod().getName(),
expectedResourceSuffix.isEmpty() ? "" : "-" + expectedResourceSuffix);
String expected;
try (InputStream is = clazz.getResourceAsStream(resourceName)) {
if (is == null) {
throw new AssertionError("Test resource not found: " + resourceName + " (class-relative to " +
clazz.getName() + ")");
}
expected = new String(is.readAllBytes(), StandardCharsets.UTF_8);
}
return expected;
}
private String toString(List<Cluster<SolrDocument>> clusters) {
return toString(clusters, "", new StringBuilder()).toString();
}
private StringBuilder toString(List<Cluster<SolrDocument>> clusters, String indent, StringBuilder sb) {
clusters.forEach(c -> {
sb.append(indent);
sb.append("- " + c.getLabels().stream().collect(Collectors.joining("; ")));
if (!c.getDocuments().isEmpty()) {
sb.append(" [" + c.getDocuments().size() + "]");
}
sb.append("\n");
if (!c.getClusters().isEmpty()) {
toString(c.getClusters(), indent + " ", sb);
}
});
return sb;
}
private List<Cluster<SolrDocument>> clusters(String engineName, String query, Consumer<ModifiableSolrParams> paramsConsumer) {
return clusters("/select", engineName, query, paramsConsumer);
}
private List<Cluster<SolrDocument>> clusters(String engineName, String query) {
return clusters("/select", engineName, query, params -> {
});
}
private List<Cluster<SolrDocument>> clusters(String handlerName, String engineName, String query,
Consumer<ModifiableSolrParams> paramsConsumer) {
SolrCore core = h.getCore();
ModifiableSolrParams reqParams = new ModifiableSolrParams();
reqParams.add(ClusteringComponent.COMPONENT_NAME, "true");
reqParams.add(ClusteringComponent.REQUEST_PARAM_ENGINE, engineName);
reqParams.add(CommonParams.Q, query);
reqParams.add(CommonParams.ROWS, "1000");
paramsConsumer.accept(reqParams);
SearchHandler handler = (SearchHandler) core.getRequestHandler(handlerName);
assertTrue("Clustering engine named '" + engineName + "' exists.", handler.getComponents().stream()
.filter(c -> c instanceof ClusteringComponent)
.flatMap(c -> ((ClusteringComponent) c).getEngineNames().stream())
.anyMatch(localName -> Objects.equals(localName, engineName)));
SolrQueryResponse rsp = new SolrQueryResponse();
rsp.addResponseHeader(new SimpleOrderedMap<>());
try (SolrQueryRequest req = new LocalSolrQueryRequest(core, reqParams)) {
handler.handleRequest(req, rsp);
NamedList<?> values = rsp.getValues();
@SuppressWarnings("unchecked")
List<NamedList<Object>> clusters = (List<NamedList<Object>>) values.get("clusters");
String idField = core.getLatestSchema().getUniqueKeyField().getName();
Map<String, SolrDocument> idToDoc = new HashMap<>();
ResultContext resultContext = (ResultContext) rsp.getResponse();
for (Iterator<SolrDocument> it = resultContext.getProcessedDocuments(); it.hasNext(); ) {
SolrDocument doc = it.next();
idToDoc.put(doc.getFirstValue(idField).toString(), doc);
}
return clusters.stream().map(c -> toCluster(c, idToDoc)).collect(Collectors.toList());
}
}
@SuppressWarnings("unchecked")
private Cluster<SolrDocument> toCluster(NamedList<Object> v, Map<String, SolrDocument> idToDoc) {
Cluster<SolrDocument> c = new Cluster<>();
v.forEach((key, value) -> {
switch (key) {
case ClusteringResponse.DOCS_NODE:
((List<String>) value).forEach(docId -> c.addDocument(idToDoc.get(docId)));
break;
case ClusteringResponse.LABELS_NODE:
((List<String>) value).forEach(c::addLabel);
break;
case ClusteringResponse.SCORE_NODE:
c.setScore(((Number) value).doubleValue());
break;
case ClusteringResponse.CLUSTERS_NODE:
((List<NamedList<Object>>) value).forEach(sub -> {
c.addCluster(toCluster(sub, idToDoc));
});
break;
case ClusteringResponse.IS_OTHER_TOPICS:
// Just ignore the attribute.
break;
default:
throw new RuntimeException("Unknown output property " + key + " in cluster: " + v.jsonStr());
}
});
return c;
}
}

View File

@ -1,54 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.common.params.CommonParams;
import org.junit.Test;
@SuppressSSL
public class DistributedClusteringComponentTest extends
BaseDistributedSearchTestCase {
@Override
public String getSolrHome() {
return getFile("clustering/solr/collection1").getParent();
}
@Test
public void test() throws Exception {
del("*:*");
int numberOfDocs = 0;
for (String[] doc : AbstractClusteringTestCase.DOCUMENTS) {
index(id, Integer.toString(numberOfDocs++), "url", doc[0], "title", doc[1], "snippet", doc[2]);
}
commit();
handle.clear();
// Only really care about the clusters for this test case, so drop the header and response
handle.put("responseHeader", SKIP);
handle.put("response", SKIP);
query(
ClusteringComponent.COMPONENT_NAME, "true",
CommonParams.Q, "*:*",
CommonParams.SORT, id + " desc",
ClusteringParams.USE_SEARCH_RESULTS, "true");
// destroy is not needed because distribTearDown method of base class does it.
//destroyServers();
}
}

View File

@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.attrs.AttrComposite;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.language.LanguageComponents;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
/**
* Test-only pseudo clustering algorithm that creates
* a cluster for each input document and sets the labels
* of this cluster to the full content of clustered input
* fields.
*/
public class EchoClusteringAlgorithm extends AttrComposite implements ClusteringAlgorithm {
@Override
public boolean supports(LanguageComponents languageComponents) {
return true;
}
@Override
public Set<Class<?>> requiredLanguageComponents() {
return Collections.emptySet();
}
@Override
public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T> documentStream, LanguageComponents languageComponents) {
List<Cluster<T>> clusters = new ArrayList<>();
documentStream.forEach(document -> {
final Cluster<T> cluster = new Cluster<>();
cluster.addDocument(document);
document.visitFields((field, value) -> {
cluster.addLabel(field + ":" + value);
});
clusters.add(cluster);
});
return clusters;
}
}

View File

@ -15,28 +15,20 @@
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.clustering.ClusteringAlgorithmProvider;
/**
* SPI provider of {@link EchoClusteringAlgorithm}.
*/
public class EchoClusteringAlgorithmProvider implements ClusteringAlgorithmProvider {
@Override
public String name() {
return EchoClusteringAlgorithm.class.getSimpleName();
}
@Override
public EchoClusteringAlgorithm get() {
return new EchoClusteringAlgorithm();
}
}
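
Carrot2 4.x discovers providers like the one above through Java's ServiceLoader. A minimal sketch, assuming the test jar registers the class in META-INF/services/org.carrot2.clustering.ClusteringAlgorithmProvider (the registration file is not shown in this diff):

// Enumerate all clustering algorithm providers visible on the classpath.
ServiceLoader<ClusteringAlgorithmProvider> providers =
ServiceLoader.load(ClusteringAlgorithmProvider.class);
providers.forEach(p -> System.out.println(p.name()));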

View File

@ -0,0 +1,129 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.attrs.AttrComposite;
import org.carrot2.attrs.AttrInteger;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.language.LanguageComponents;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Creates a stable set of synthetic clusters based on the provided parameters.
*/
public class MockClusteringAlgorithm extends AttrComposite implements ClusteringAlgorithm {
public AttrInteger docsInCluster =
attributes.register(
"docsInCluster",
AttrInteger.builder().label("Number of documents in each cluster.")
.min(1)
.max(5)
.defaultValue(3));
public AttrInteger hierarchyDepth =
attributes.register(
"hierarchyDepth",
AttrInteger.builder().label("Levels of clusters hierarchy.")
.min(1)
.max(3)
.defaultValue(2));
public AttrInteger maxClusters =
attributes.register(
"maxClusters",
AttrInteger.builder().label("Maximum number of clusters at each hierarchy level.")
.min(2)
.max(100)
.defaultValue(3));
public AttrInteger labelsPerCluster =
attributes.register(
"labelsPerCluster",
AttrInteger.builder().label("Number of labels generated for each cluster.")
.min(1)
.max(5)
.defaultValue(1));
@Override
public boolean supports(LanguageComponents languageComponents) {
return true;
}
@Override
public Set<Class<?>> requiredLanguageComponents() {
return Collections.emptySet();
}
@Override
public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T> documentStream,
LanguageComponents languageComponents) {
List<T> documents = documentStream.collect(Collectors.toList());
if (docsInCluster.get() > documents.size()) {
throw new AssertionError();
}
Supplier<T> docSupplier = new Supplier<>() {
Iterator<T> i = documents.iterator();
@Override
public T get() {
if (!i.hasNext()) {
i = documents.iterator();
}
return i.next();
}
};
return createClusters(hierarchyDepth.get(), "Cluster ", docSupplier);
}
private <T extends Document> List<Cluster<T>> createClusters(int level, String prefix,
Supplier<T> docSupplier) {
ArrayList<Cluster<T>> clusters = new ArrayList<>();
for (int count = maxClusters.get(), idx = 1; count > 0; count--, idx++) {
String label = prefix + (prefix.endsWith(" ") ? "" : ".") + idx;
Cluster<T> c = new Cluster<>();
c.addLabel(label);
for (int cnt = 1, max = labelsPerCluster.get(); cnt < max; cnt++) {
c.addLabel("Label " + cnt);
}
c.setScore(level * count * 0.01);
if (level == 1) {
for (int j = docsInCluster.get(); j > 0; j--) {
c.addDocument(docSupplier.get());
}
} else {
createClusters(level - 1, label, docSupplier).forEach(c::addCluster);
}
clusters.add(c);
}
return clusters;
}
}
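
A minimal sketch of configuring the mock directly; the attr set(...) calls are an assumption about the Carrot2 4.x attrs API. In the Solr engine the same attribute names ("maxClusters", "hierarchyDepth", ...) arrive as request parameters, as testPassingAttributes shows:

MockClusteringAlgorithm algorithm = new MockClusteringAlgorithm();
// Attribute values registered via AttrComposite are plain mutable holders.
algorithm.maxClusters.set(2);
algorithm.hierarchyDepth.set(1);
algorithm.labelsPerCluster.set(3);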

View File

@ -14,20 +14,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.clustering.ClusteringAlgorithmProvider;
public class MockClusteringAlgorithmProvider implements ClusteringAlgorithmProvider {
@Override
public String name() {
return MockClusteringAlgorithm.class.getSimpleName();
}
@Override
public MockClusteringAlgorithm get() {
return new MockClusteringAlgorithm();
}
}

View File

@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.attrs.AttrComposite;
import org.carrot2.attrs.AttrString;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LexicalData;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Creates synthetic clusters with diagnostics of
* {@link LanguageComponents} passed to the clustering method.
*/
class ResourceCheckAlgorithm extends AttrComposite implements ClusteringAlgorithm {
public AttrString text =
attributes.register(
"text",
AttrString.builder().label("Input text to analyze.")
.defaultValue(null));
@Override
public Set<Class<?>> requiredLanguageComponents() {
return Set.of(LexicalData.class);
}
@Override
public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T> documentStream,
LanguageComponents languageComponents) {
ArrayList<Cluster<T>> clusters = new ArrayList<>();
Cluster<T> cluster = new Cluster<>();
cluster.addLabel("Lang: " + languageComponents.language());
clusters.add(cluster);
cluster = new Cluster<>();
clusters.add(cluster);
LexicalData lexicalData = languageComponents.get(LexicalData.class);
cluster.addLabel(Arrays.stream(text.get().trim().split("[\\s]+"))
.map(term -> String.format(Locale.ROOT,
"%s[%s, %s]",
term,
lexicalData.ignoreWord(term) ? "ignoredWord" : "-",
lexicalData.ignoreLabel(term) ? "ignoredLabel" : "-"))
.collect(Collectors.joining(" ")));
return clusters;
}
}
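
The per-term checks above can be reproduced outside the algorithm. A minimal sketch, assuming the Carrot2 4.x LanguageComponents.load entry point and the default English resources on the classpath:

// Load default language resources and query the lexical data directly.
LanguageComponents english = LanguageComponents.load("English");
LexicalData lexicalData = english.get(LexicalData.class);
boolean ignoredWord = lexicalData.ignoreWord("the");   // stop word?
boolean ignoredLabel = lexicalData.ignoreLabel("the"); // stop label?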

View File

@ -14,12 +14,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
import org.carrot2.clustering.ClusteringAlgorithmProvider;
public class ResourceCheckAlgorithmProvider implements ClusteringAlgorithmProvider {
@Override
public String name() {
return ResourceCheckAlgorithm.class.getSimpleName();
}
@Override
public ResourceCheckAlgorithm get() {
return new ResourceCheckAlgorithm();
}
}

View File

@ -0,0 +1,146 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering;
/**
* Sample data for tests.
*/
final class SampleData {
static final String[][] SAMPLE_DOCUMENTS =
new String[][]{
{
"Data Mining - Wikipedia",
"Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."
},
{
"Data mining - Wikipedia, the free encyclopedia",
"Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..."
},
{
"Electronic Statistics Textbook: Data Mining Techniques",
"Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques."
},
{
"An Introduction to Data Mining",
"Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..."
},
{
"Data Mining: What is Data Mining?",
"Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."
},
{
"Data Mining Software, Data Mining Applications and Data Mining Solutions",
"The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..."
},
{
"KD Nuggets",
"Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."
},
{
"data mining: Definition from Answers.com",
"data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..."
},
{
"STATISTICA Data Mining and Predictive Modeling Solutions",
"GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..."
},
{
"Data Mining: Text Mining, Visualization and Social Media",
"Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..."
},
{
"Two Crows Corporation",
"Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use."
},
{
"Thearling.com",
"Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies."
},
{
"CCSU - Data Mining",
"Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling."
},
{
"Oracle Data Mining",
"Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..."
},
{
"Data Mining: An Introduction",
"About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques."
},
{
"Open Directory - Computers: Software: Databases: Data Mining",
"Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..."
},
{
"DMI:Data Mining Institute",
"Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..."
},
{
"The Data Mine",
"Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining."
},
{
"St@tServ - About Data Mining",
"St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..."
},
{
"MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...",
"Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..."
},
{
"Pentaho Commercial Open Source Business Intelligence: Data Mining",
"For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..."
},
{
"Investor Home - Data Mining",
"Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..."
},
{
"Predictive Modeling and Predictive Analytics Solutions | Enterprise ...",
"Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics."
},
{
"Data mining - SourceWatch",
"These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..."
},
{
"Statistical Data Mining Tutorials",
"Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms."
},
{
"Data Mining",
"With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..."
},
{
"Elder Research",
"Provides consulting and short courses in data mining and pattern discovery patterns in data."
},
{
"SQL Server Data Mining > Home",
"SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..."
},
{
"Data Mining",
"What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..."
},
{
"Data Mining Software and Text Mining | SAS",
"... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..."
}
};
}

View File

@ -1,542 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.clustering.AbstractClusteringTestCase;
import org.apache.solr.handler.clustering.ClusteringComponent;
import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.search.DocList;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
/**
*
*/
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testCarrotLingo() throws Exception {
// Note: the expected number of clusters may change after upgrading Carrot2
// due to e.g. internal improvements or tuning of Carrot2 clustering.
final int expectedNumClusters = 10;
checkEngine(getClusteringEngine("default"), expectedNumClusters);
}
@Test
public void testProduceSummary() throws Exception {
// We'll make two queries, one with- and another one without summary
// and assert that documents are shorter when highlighter is in use.
final List<NamedList<Object>> noSummaryClusters = clusterWithHighlighting(false, 80);
final List<NamedList<Object>> summaryClusters = clusterWithHighlighting(true, 80);
assertEquals("Equal number of clusters", noSummaryClusters.size(), summaryClusters.size());
for (int i = 0; i < noSummaryClusters.size(); i++) {
assertTrue("Summary shorter than original document",
getLabels(noSummaryClusters.get(i)).get(1).length() >
getLabels(summaryClusters.get(i)).get(1).length());
}
}
@Test
public void testSummaryFragSize() throws Exception {
// We'll make two queries, one short summaries and another one with longer
// summaries and will check that the results differ.
final List<NamedList<Object>> shortSummaryClusters = clusterWithHighlighting(true, 30);
final List<NamedList<Object>> longSummaryClusters = clusterWithHighlighting(true, 80);
assertEquals("Equal number of clusters", shortSummaryClusters.size(), longSummaryClusters.size());
for (int i = 0; i < shortSummaryClusters.size(); i++) {
assertTrue("Summary shorter than original document",
getLabels(shortSummaryClusters.get(i)).get(1).length() <
getLabels(longSummaryClusters.get(i)).get(1).length());
}
}
private List<NamedList<Object>> clusterWithHighlighting(
boolean enableHighlighting, int fragSize) throws IOException {
// Some documents don't have mining in the snippet
return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 7);
}
private List<NamedList<Object>> clusterWithHighlighting(
boolean enableHighlighting, int fragSize, int summarySnippets,
String term, int expectedNumDocuments) throws IOException {
final TermQuery query = new TermQuery(new Term("snippet", term));
final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
summaryParams.add(CarrotParams.PRODUCE_SUMMARY,
Boolean.toString(enableHighlighting));
summaryParams
.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
summaryParams
.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(summarySnippets));
final List<NamedList<Object>> summaryClusters = checkEngine(
getClusteringEngine("echo"), expectedNumDocuments,
expectedNumDocuments, query, summaryParams);
return summaryClusters;
}
@Test
public void testCarrotStc() throws Exception {
checkEngine(getClusteringEngine("stc"), 3);
}
@Test
public void testWithoutSubclusters() throws Exception {
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
1, 1, 0);
}
@Test
public void testExternalXmlAttributesFile() throws Exception {
checkClusters(
checkEngine(getClusteringEngine("mock-external-attrs"), 13),
1, 4, 0);
}
@Test
public void testWithSubclusters() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
}
@Test
public void testNumDescriptions() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
@Test
public void testClusterScores() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
AbstractClusteringTestCase.numberOfDocs, params);
int i = 1;
for (NamedList<Object> cluster : clusters) {
final Double score = getScore(cluster);
assertNotNull(score);
assertEquals(0.25 * i++, score, 0);
}
}
@Test
public void testOtherTopics() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
AbstractClusteringTestCase.numberOfDocs, params);
int i = 1;
for (NamedList<Object> cluster : clusters) {
assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
}
}
@Test
public void testCarrotAttributePassing() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
@Test
public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
checkLexicalResourcesFromSolrConfig("lexical-resource-check",
"online,customsolrstopword,customsolrstoplabel");
}
@Test
public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
}
private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
throws IOException {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
wordsToCheck);
// "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
// stoplabels.mt, so we're expecting only one cluster with label "online".
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine(engineName), 1, params);
assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online"));
}
@Test
public void testSolrStopWordsUsedInCarrot2Clustering() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
"online,solrownstopword");
// "solrownstopword" is in stopwords.txt, so we're expecting
// only one cluster with label "online".
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 1, params);
assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online"));
}
@Test
public void testSolrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
// Force string fields to be used for clustering. Does not make sense
// in the real world, but does the job in the test.
params.set(CarrotParams.TITLE_FIELD_NAME, "url");
params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
"online,solrownstopword");
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 2, params);
assertEquals(Collections.singletonList("online"), getLabels(clusters.get(0)));
assertEquals(Collections.singletonList("solrownstopword"), getLabels(clusters.get(1)));
}
@Test
public void testHighlightingOfMultiValueField() throws Exception {
final String snippetWithoutSummary = getLabels(clusterWithHighlighting(
false, 30, 3, "multi", 1).get(0)).get(1);
assertTrue("Snippet contains first value", snippetWithoutSummary.contains("First"));
assertTrue("Snippet contains second value", snippetWithoutSummary.contains("Second"));
assertTrue("Snippet contains third value", snippetWithoutSummary.contains("Third"));
final String snippetWithSummary = getLabels(clusterWithHighlighting(
true, 30, 3, "multi", 1).get(0)).get(1);
assertTrue("Snippet with summary shorter than full snippet",
snippetWithoutSummary.length() > snippetWithSummary.length());
assertTrue("Summary covers first value", snippetWithSummary.contains("First"));
assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
}
@Test
public void testConcatenatingMultipleFields() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("body",
"snippet")), params).get(0));
assertTrue("Snippet contains third value", labels.get(0).contains("Title field"));
assertTrue("Snippet contains third value", labels.get(0).contains("Heading field"));
assertTrue("Snippet contains third value", labels.get(1).contains("Snippet field"));
assertTrue("Snippet contains third value", labels.get(1).contains("Body field"));
}
@Test
public void testHighlightingMultipleFields() throws Exception {
final TermQuery query = new TermQuery(new Term("snippet", "content"));
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));
final String snippetWithoutSummary = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));
params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));
final String snippetWithSummary = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
assertTrue("Snippet with summary shorter than full snippet",
snippetWithoutSummary.length() > snippetWithSummary.length());
assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
}
@Test
public void testOneCarrot2SupportedLanguage() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
}
@Test
public void testOneCarrot2SupportedLanguageOfMany() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language_of_many")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
}
@Test
public void testLanguageCodeMapping() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language_of_many")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
}
@Test
public void testPassingOfCustomFields() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.CUSTOM_FIELD_NAME, "intfield_i:intfield");
params.add(CarrotParams.CUSTOM_FIELD_NAME, "floatfield_f:floatfield");
params.add(CarrotParams.CUSTOM_FIELD_NAME, "heading:multi");
// Let the echo mock clustering algorithm know which custom field to echo
params.add("custom-fields", "intfield,floatfield,multi");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"custom_fields")), params).get(0));
assertEquals(5, labels.size());
assertEquals("Integer field", "10", labels.get(2));
assertEquals("Float field", "10.5", labels.get(3));
assertEquals("List field", "[first, second]", labels.get(4));
}
@Test
public void testCustomTokenizer() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("custom-duplicating-tokenizer"), 1, 15, new TermQuery(new Term("title",
"field")), params).get(0));
// The custom test tokenizer duplicates each token's text
assertTrue("First token", labels.get(0).contains("TitleTitle"));
}
@Test
public void testCustomStemmer() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("custom-duplicating-stemmer"), 1, 12, new TermQuery(new Term("title",
"field")), params).get(0));
// The custom test stemmer duplicates and lowercases each token's text
assertTrue("First token", labels.get(0).contains("titletitle"));
}
@Test
public void testDefaultEngineOrder() throws Exception {
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-default");
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
assertEquals(
Arrays.asList("stc", "default", "mock"),
new ArrayList<>(engines.keySet()));
assertEquals(
LingoClusteringAlgorithm.class,
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
}
@Test
public void testDeclarationEngineOrder() throws Exception {
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-decl-order");
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
assertEquals(
Arrays.asList("unavailable", "lingo", "stc", "mock", "default"),
new ArrayList<>(engines.keySet()));
assertEquals(
LingoClusteringAlgorithm.class,
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
}
@Test
public void testDeclarationNameDuplicates() throws Exception {
ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-dups");
Map<String,SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
assertEquals(
Arrays.asList("", "default"),
new ArrayList<>(engines.keySet()));
assertEquals(
MockClusteringAlgorithm.class,
((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
}
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
.getSearchComponent("clustering");
assertNotNull("clustering component should not be null", comp);
CarrotClusteringEngine engine =
(CarrotClusteringEngine) getSearchClusteringEngines(comp).get(engineName);
assertNotNull("clustering engine for name: " + engineName
+ " should not be null", engine);
return engine;
}
private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
}
private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters, SolrParams clusteringParams) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
}
private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
// Get all documents to cluster
return h.getCore().withSearcher(searcher -> {
DocList docList = searcher.getDocList(query, (Query) null, new Sort(), 0,
numberOfDocs);
assertEquals("docList size", expectedNumDocs, docList.matches());
ModifiableSolrParams solrParams = new ModifiableSolrParams();
solrParams.add(clusteringParams);
// Perform clustering
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
Map<SolrDocument,Integer> docIds = new HashMap<>(docList.size());
SolrDocumentList solrDocList = ClusteringComponent.docListToSolrDocumentList(docList, searcher, engine.getFieldsToLoad(req), docIds);
@SuppressWarnings("unchecked")
List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
return results;
});
}
private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
for (int i = 0; i < results.size(); i++) {
NamedList<Object> cluster = results.get(i);
checkCluster(cluster, expectedDocCount, expectedLabelCount,
expectedSubclusterCount);
}
}
private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
for (int i = 0; i < results.size(); i++) {
checkCluster(results.get(i), hasSubclusters);
}
}
private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
List<Object> docs = getDocs(cluster);
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
Object id = docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
List<String> labels = getLabels(cluster);
assertNotNull("labels is null but it shouldn't be", labels);
if (hasSubclusters) {
List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertNotNull("subclusters is null but it shouldn't be", subclusters);
}
}
private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
checkCluster(cluster, expectedSubclusterCount > 0);
assertEquals("number of docs in cluster", expectedDocCount,
getDocs(cluster).size());
assertEquals("number of labels in cluster", expectedLabelCount,
getLabels(cluster).size());
if (expectedSubclusterCount > 0) {
List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
assertEquals("number of subclusters in cluster",
expectedSubclusterCount, subclusters.size());
}
}
@SuppressWarnings("unchecked")
private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
return (List<NamedList<Object>>) cluster.get("clusters");
}
@SuppressWarnings("unchecked")
private List<String> getLabels(NamedList<Object> cluster) {
return (List<String>) cluster.get("labels");
}
private Double getScore(NamedList<Object> cluster) {
return (Double) cluster.get("score");
}
private Boolean isOtherTopics(NamedList<Object> cluster) {
return (Boolean)cluster.get("other-topics");
}
@SuppressWarnings("unchecked")
private List<Object> getDocs(NamedList<Object> cluster) {
return (List<Object>) cluster.get("docs");
}
}

View File

@ -1,51 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.io.IOException;
import java.io.Reader;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
public class DuplicatingTokenizerFactory implements ITokenizerFactory {
@Override
public ITokenizer getTokenizer(LanguageCode language) {
return new ITokenizer() {
private final ExtendedWhitespaceTokenizer delegate = new ExtendedWhitespaceTokenizer();
@Override
public void setTermBuffer(MutableCharArray buffer) {
delegate.setTermBuffer(buffer);
buffer.reset(buffer.toString() + buffer.toString());
}
@Override
public void reset(Reader input) {
delegate.reset(input);
}
@Override
public short nextToken() throws IOException {
return delegate.nextToken();
}
};
}
}
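The duplicating stemmer referenced by testCustomStemmer above is not part of this hunk. A minimal sketch of what such a factory might look like, assuming the Carrot2 3.x IStemmerFactory/IStemmer contract (the class name is hypothetical):
package org.apache.solr.handler.clustering.carrot2;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IStemmerFactory;
public class DuplicatingStemmerFactory implements IStemmerFactory {
  @Override
  public IStemmer getStemmer(LanguageCode language) {
    // Stem = lowercased token text repeated twice, e.g. "Title" -> "titletitle",
    // matching the assertion in testCustomStemmer.
    return word -> {
      String lower = word.toString().toLowerCase(java.util.Locale.ROOT);
      return lower + lower;
    };
  }
}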

View File

@ -1,76 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
/**
* A mock Carrot2 clustering algorithm that outputs input documents as clusters.
* Useful only in tests.
*/
@Bindable(prefix = "EchoClusteringAlgorithm")
public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
IClusteringAlgorithm {
@Input
@Processing
@Attribute(key = AttributeNames.DOCUMENTS)
public List<Document> documents;
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
public List<Cluster> clusters;
@Input
@Processing
@Attribute(key = "custom-fields")
public String customFields = "";
@Override
public void process() throws ProcessingException {
clusters = new ArrayList<>();
for (Document document : documents) {
final Cluster cluster = new Cluster();
cluster.addPhrases(document.getTitle(), document.getSummary());
if (document.getLanguage() != null) {
cluster.addPhrases(document.getLanguage().name());
}
for (String field : customFields.split(",")) {
Object value = document.getField(field);
if (value != null) {
cluster.addPhrases(value.toString());
}
}
cluster.addDocuments(document);
clusters.add(cluster);
}
}
}
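As a usage note, a hedged sketch of running this mock outside Solr through the Carrot2 3.x simple controller; the Controller.process overload and the two-argument Document constructor are assumptions about that API line:
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingResult;
public class EchoClusteringDemo {
  public static void main(String[] args) {
    List<Document> documents = new ArrayList<>();
    documents.add(new Document("Title field", "Snippet field"));
    Controller controller = ControllerFactory.createSimple();
    ProcessingResult result = controller.process(documents, null, EchoClusteringAlgorithm.class);
    // One cluster per input document, labeled with the document's title and summary.
    for (Cluster cluster : result.getClusters()) {
      System.out.println(cluster.getPhrases());
    }
  }
}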

View File

@ -1,74 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.PreprocessingContext.AllStems;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
/**
* A mock Carrot2 clustering algorithm that outputs stem of each token of each
* document as a separate cluster. Useful only in tests.
*/
@Bindable(prefix = "EchoTokensClusteringAlgorithm")
public class EchoStemsClusteringAlgorithm extends ProcessingComponentBase
implements IClusteringAlgorithm {
@Input
@Processing
@Attribute(key = AttributeNames.DOCUMENTS)
public List<Document> documents;
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
public List<Cluster> clusters;
public BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
@Override
public void process() throws ProcessingException {
final PreprocessingContext preprocessingContext = preprocessing.preprocess(
documents, "", LanguageCode.ENGLISH);
final AllTokens allTokens = preprocessingContext.allTokens;
final AllWords allWords = preprocessingContext.allWords;
final AllStems allStems = preprocessingContext.allStems;
clusters = new ArrayList<>();
for (int i = 0; i < allTokens.image.length; i++) {
if (allTokens.wordIndex[i] >= 0) {
clusters.add(new Cluster(new String(
allStems.image[allWords.stemIndex[allTokens.wordIndex[i]]])));
}
}
}
}
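The loop above chains Carrot2's parallel preprocessing arrays. A short sketch of the lookup for a single token index t, using the field names declared on PreprocessingContext:
// Tokens with wordIndex[t] < 0 are separators/punctuation and are skipped.
int wordIdx = preprocessingContext.allTokens.wordIndex[t];
int stemIdx = preprocessingContext.allWords.stemIndex[wordIdx];
char[] stemImage = preprocessingContext.allStems.image[stemIdx];
// stemImage holds the character image of token t's stem.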

View File

@ -1,68 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
/**
* A mock Carrot2 clustering algorithm that outputs each token of each document
* as a separate cluster. Useful only in tests.
*/
@Bindable(prefix = "EchoTokensClusteringAlgorithm")
public class EchoTokensClusteringAlgorithm extends ProcessingComponentBase
implements IClusteringAlgorithm {
@Input
@Processing
@Attribute(key = AttributeNames.DOCUMENTS)
public List<Document> documents;
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
public List<Cluster> clusters;
public BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
@Override
public void process() throws ProcessingException {
final PreprocessingContext preprocessingContext = preprocessing.preprocess(
documents, "", LanguageCode.ENGLISH);
clusters = new ArrayList<>();
for (char[] token : preprocessingContext.allTokens.image) {
if (token != null) {
clusters.add(new Cluster(new String(token)));
}
}
}
}

View File

@ -1,79 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.util.ArrayList;
import java.util.List;
import org.carrot2.core.Cluster;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
/**
* A mock implementation of Carrot2 clustering algorithm for testing whether the
* customized lexical resource lookup works correctly. This algorithm ignores
* the input documents and instead for each word from {@link #wordsToCheck}, it
* outputs a cluster labeled with the word only if the word is neither a stop
* word nor a stop label.
*/
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
ProcessingComponentBase implements IClusteringAlgorithm {
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
public List<Cluster> clusters;
@Input
@Processing
@Attribute
public String wordsToCheck;
public BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
@Override
public void process() throws ProcessingException {
clusters = new ArrayList<>();
if (wordsToCheck == null) {
return;
}
// Test with Maltese so that the English clustering performed in other tests
// is not affected by the test stopwords and stoplabels.
ILexicalData lexicalData = preprocessing.lexicalDataFactory
.getLexicalData(LanguageCode.MALTESE);
for (String word : wordsToCheck.split(",")) {
if (!lexicalData.isCommonWord(new MutableCharArray(word))
&& !lexicalData.isStopLabel(word)) {
clusters.add(new Cluster(word));
}
}
}
}
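The tests drive the wordsToCheck input through Solr request parameters; the attribute key is derived from the @Bindable prefix. A sketch, assuming Carrot2 3.x AttributeUtils key resolution:
ModifiableSolrParams params = new ModifiableSolrParams();
// Resolves to "LexicalResourcesCheckClusteringAlgorithm.wordsToCheck".
params.set(AttributeUtils.getKey(
    LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
    "online,customsolrstopword");
// "customsolrstopword" is declared as a stop word in the test resources,
// so only "online" comes back as a cluster label.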

View File

@ -1,103 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import org.carrot2.core.*;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;
import java.util.ArrayList;
import java.util.List;
@Bindable(prefix = "MockClusteringAlgorithm")
public class MockClusteringAlgorithm extends ProcessingComponentBase implements
IClusteringAlgorithm {
@Input
@Processing
@Attribute(key = AttributeNames.DOCUMENTS)
public List<Document> documents;
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
public List<Cluster> clusters;
@Input
@Processing
@Attribute
@IntRange(min = 1, max = 5)
public int depth = 2;
@Input
@Processing
@Attribute
@IntRange(min = 1, max = 5)
public int labels = 1;
@Input
@Processing
@Attribute
@IntRange(min = 0)
public int maxClusters = 0;
@Input
@Processing
@Attribute
public int otherTopicsModulo = 0;
@Override
public void process() throws ProcessingException {
clusters = new ArrayList<>();
if (documents == null) {
return;
}
if (maxClusters > 0) {
documents = documents.subList(0, maxClusters);
}
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);
Cluster cluster = createCluster(label.toString(), documentIndex, document);
clusters.add(cluster);
for (int i = 1; i <= depth; i++) {
label.append(".");
label.append(i);
Cluster newCluster = createCluster(label.toString(), documentIndex, document);
cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
cluster = newCluster;
}
documentIndex++;
}
}
private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
Cluster cluster = new Cluster();
cluster.setScore(documentIndex * 0.25);
if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
{
cluster.setOtherTopics(true);
}
for (int i = 0; i < labels; i++) {
cluster.addPhrases(labelBase + "#" + (i + 1));
}
cluster.addDocuments(documents);
return cluster;
}
}
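A hedged sketch of exercising this mock directly with an attribute map; the map-based Controller.process overload and key resolution via the @Bindable prefix are assumptions about the Carrot2 3.x API:
List<Document> documents = Arrays.asList(new Document("one"), new Document("two"));
Map<String, Object> attributes = new HashMap<>();
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
attributes.put(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 2);
ProcessingResult result =
    ControllerFactory.createSimple().process(attributes, MockClusteringAlgorithm.class);
// Document i (1-based) yields a cluster scored 0.25 * i with labels
// "Cluster i#1" and "Cluster i#2" and one level of subclusters.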

View File

@ -1 +0,0 @@
7f13f63e2e213f6ea38364836408d2dc11f29804

View File

@ -1,202 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -1,9 +0,0 @@
=========================================================================
== Carrot2 Attributes Binder Notice ==
=========================================================================
Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
All rights reserved.
This product includes software developed by the Carrot2 Project.
See http://project.carrot2.org/

View File

@ -1,7 +1,7 @@
Carrot2 Project
Copyright (C) 2002-2013, Dawid Weiss, Stanisław Osiński.
Copyright (C) 2002-2020, Dawid Weiss, Stanisław Osiński.
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,

View File

@ -1,10 +1,10 @@
=========================================================================
== Carrot2 Notice ==
=========================================================================
Copyright (C) 2002-2013, Dawid Weiss, Stanislaw Osinski.
Copyright (C) 2002-2020, Dawid Weiss, Stanislaw Osinski.
Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
All rights reserved.
This product includes software developed by the Carrot2 Project.
See http://project.carrot2.org/
See https://project.carrot2.org/

View File

@ -0,0 +1 @@
fb60ab80cfd69abe6cad1939f24bd5210501b177

View File

@ -1 +0,0 @@
539317dc171b8c92cca964e87686602800cf19b0

View File

@ -1,202 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

View File

@ -1,5 +0,0 @@
This product includes software developed by
Google, Inc. (http://code.google.com/p/guava-libraries/)
Repacked Carrot2 Guava at:
https://github.com/carrot2/lib-repackaged

View File

@ -1 +0,0 @@
decabb42b88a8d40c1894984f4af8adb833f766b

View File

@ -1 +0,0 @@
045fda5ac6087bc82a209d8cdb73f8d0dbdcfc7b

View File

@ -1,202 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
View File
@ -1,2 +0,0 @@
This product includes software developed by
the SimpleXML project (http://simple.sourceforge.net).
View File
@ -1,11 +0,0 @@
An override location of the clustering algorithm's resources
attribute definitions and lexical resources.
A directory from which to load algorithm-specific stop words,
stop labels and attribute definition XMLs.
For an overview of Carrot2 lexical resources, see:
http://download.carrot2.org/head/manual/#chapter.lexical-resources
For an overview of Lingo3G lexical resources, see:
http://download.carrotsearch.com/lingo3g/manual/#chapter.lexical-resources
View File
@ -1,19 +0,0 @@
<!--
Default configuration for the bisecting k-means clustering algorithm.
This file can be loaded (and saved) by Carrot2 Workbench.
http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
<attribute-set id="attributes">
<value-set>
<label>attributes</label>
<attribute key="MultilingualClustering.defaultLanguage">
<value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
</attribute>
<attribute key="MultilingualClustering.languageAggregationStrategy">
<value type="org.carrot2.text.clustering.MultilingualClustering$LanguageAggregationStrategy" value="FLATTEN_MAJOR_LANGUAGE"/>
</attribute>
</value-set>
</attribute-set>
</attribute-sets>
View File
@ -1,24 +0,0 @@
<!--
Default configuration for the Lingo clustering algorithm.
This file can be loaded (and saved) by Carrot2 Workbench.
http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
<attribute-set id="attributes">
<value-set>
<label>attributes</label>
<!--
The language to assume for clustered documents.
For a list of allowed values, see:
http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
-->
<attribute key="MultilingualClustering.defaultLanguage">
<value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
</attribute>
<attribute key="LingoClusteringAlgorithm.desiredClusterCountBase">
<value type="java.lang.Integer" value="20"/>
</attribute>
</value-set>
</attribute-set>
</attribute-sets>
View File
@ -1,19 +0,0 @@
<!--
Default configuration for the STC clustering algorithm.
This file can be loaded (and saved) by Carrot2 Workbench.
http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
<attribute-set id="attributes">
<value-set>
<label>attributes</label>
<attribute key="MultilingualClustering.defaultLanguage">
<value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
</attribute>
<attribute key="MultilingualClustering.languageAggregationStrategy">
<value type="org.carrot2.text.clustering.MultilingualClustering$LanguageAggregationStrategy" value="FLATTEN_MAJOR_LANGUAGE"/>
</attribute>
</value-set>
</attribute-set>
</attribute-sets>
View File
@ -1002,7 +1002,7 @@
</arr> </arr>
</requestHandler> </requestHandler>
<!-- Clustering Component <!-- Search results clustering component
You'll need to set the solr.clustering.enabled system property You'll need to set the solr.clustering.enabled system property
when running solr to run with clustering enabled: when running solr to run with clustering enabled:
@ -1014,69 +1014,64 @@
enable="${solr.clustering.enabled:false}" enable="${solr.clustering.enabled:false}"
class="solr.clustering.ClusteringComponent" > class="solr.clustering.ClusteringComponent" >
<!-- <!--
Declaration of "engines" (clustering algorithms). Declaration of "engines" (named sets of configuration parameters).
The open source algorithms from Carrot2.org project: Various algorithms are available (names are loaded via service provider
* org.carrot2.clustering.lingo.LingoClusteringAlgorithm extension point). The open source algorithms from Carrot2.org project:
* org.carrot2.clustering.stc.STCClusteringAlgorithm * Lingo
* org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm * STC
* Bisecting K-Means
Commercial algorithm Lingo3G (needs to be installed separately): Commercial algorithm Lingo3G from Carrot Search (needs to be installed separately):
* com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm * Lingo3G
--> -->
<lst name="engine"> <lst name="engine">
<str name="name">lingo3g</str> <str name="name">lingo3g</str>
<bool name="optional">true</bool> <bool name="optional">true</bool>
<str name="carrot.algorithm">com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm</str> <str name="clustering.algorithm">Lingo3G</str>
<str name="carrot.resourcesDir">clustering/carrot2</str> <str name="clustering.fields">name, features</str>
<bool name="clustering.includeOtherTopics">true</bool>
<str name="clustering.language">English</str>
</lst> </lst>
<lst name="engine"> <lst name="engine">
<str name="name">lingo</str> <str name="name">lingo</str>
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> <str name="clustering.algorithm">Lingo</str>
<str name="carrot.resourcesDir">clustering/carrot2</str> <str name="clustering.fields">name, features</str>
<bool name="clustering.includeOtherTopics">true</bool>
<str name="clustering.language">English</str>
</lst> </lst>
<lst name="engine"> <lst name="engine">
<str name="name">stc</str> <str name="name">stc</str>
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str> <str name="clustering.algorithm">STC</str>
<str name="carrot.resourcesDir">clustering/carrot2</str> <str name="clustering.fields">name, features</str>
<bool name="clustering.includeOtherTopics">true</bool>
<str name="clustering.language">English</str>
</lst> </lst>
<lst name="engine"> <lst name="engine">
<str name="name">kmeans</str> <str name="name">kmeans</str>
<str name="carrot.algorithm">org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm</str> <str name="clustering.algorithm">Bisecting K-Means</str>
<str name="carrot.resourcesDir">clustering/carrot2</str> <str name="clustering.fields">name, features</str>
<bool name="clustering.includeOtherTopics">true</bool>
<str name="clustering.language">English</str>
</lst> </lst>
</searchComponent> </searchComponent>
<!-- A request handler for demonstrating the clustering component. <!-- A request handler for demonstrating the clustering component.
This is meant as an example. This is meant as an example - in reality you will likely want
In reality you will likely want to add the component to your to add the clustering component to your default request handler.
already specified request handlers.
--> -->
<requestHandler name="/clustering" <requestHandler name="/clustering"
startup="lazy" startup="lazy"
enable="${solr.clustering.enabled:false}" enable="${solr.clustering.enabled:false}"
class="solr.SearchHandler"> class="solr.SearchHandler">
<lst name="defaults"> <lst name="defaults">
<!-- Enable clustering component by default. -->
<bool name="clustering">true</bool> <bool name="clustering">true</bool>
<bool name="clustering.results">true</bool>
<!-- Field name with the logical "title" of a each document (optional) -->
<str name="carrot.title">name</str>
<!-- Field name with the logical "URL" of a each document (optional) -->
<str name="carrot.url">id</str>
<!-- Field name with the logical "content" of a each document (optional) -->
<str name="carrot.snippet">features</str>
<!-- Apply highlighter to the title/ content and use this for clustering. -->
<bool name="carrot.produceSummary">true</bool>
<!-- the maximum number of labels per cluster -->
<!--<int name="carrot.numDescriptions">5</int>-->
<!-- produce sub clusters -->
<bool name="carrot.outputSubClusters">false</bool>
<!-- Configure the remaining request handler parameters. -->
<str name="defType">edismax</str> <str name="defType">edismax</str>
<str name="qf"> <str name="qf">
text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4 text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
@ -1085,6 +1080,8 @@
<str name="rows">100</str> <str name="rows">100</str>
<str name="fl">*,score</str> <str name="fl">*,score</str>
</lst> </lst>
<!-- Append the clustering component at the end of the search handler's list of components. -->
<arr name="last-components"> <arr name="last-components">
<str>clustering</str> <str>clustering</str>
</arr> </arr>
Binary file not shown. (added; 130 KiB)
Binary file not shown. (added; 130 KiB)
Binary file not shown. (removed; was 206 KiB)
Binary file not shown. (replaced; 394 KiB before, 622 KiB after)
View File
@ -16,35 +16,102 @@
// specific language governing permissions and limitations // specific language governing permissions and limitations
// under the License. // under the License.
[IMPORTANT]
====
The clustering component implementation and API (parameters) have changed significantly
in version 9.0. Please refer to the Solr Guide version matching your Solr release
exactly.
====
The *clustering* (or *cluster analysis*) plugin attempts to automatically discover groups of related search hits (documents) and assign human-readable labels to these groups. The *clustering* (or *cluster analysis*) plugin attempts to automatically discover groups of related search hits (documents) and assign human-readable labels to these groups.
By default in Solr, the clustering algorithm is applied to the search result of each single query -- this is called _on-line_ clustering. While Solr contains an extension for full-index clustering (_off-line_ clustering) this section will focus on discussing on-line clustering only. The clustering algorithm in Solr is applied to documents included in the search results of each single query -- this is called _on-line_ clustering.
Clusters discovered for a given query can be perceived as _dynamic facets_. This is beneficial when regular faceting is difficult (field values are not known in advance) or when the queries are exploratory in nature. Take a look at the https://search.carrot2.org/#/search/web/solr/treemap[Carrot2] project's demo page to see an example of search results clustering in action (the groups in the visualization have been discovered automatically in search results to the right; there is no external information involved). Clusters discovered for a given query can be perceived as _dynamic facets_. This is beneficial when regular faceting is difficult (field values are not known in advance) or when the queries are exploratory in nature. Take a look at the https://search.carrot2.org/#/search/web/apache%20solr/treemap[Carrot^2^] project's demo page to see an example of search results clustering in action (the groups in the visualization have been discovered automatically in search results to the right; there is no external information involved).
image::images/result-clustering/carrot2.png[image,width=900] image::images/result-clustering/carrot2.png[image,width=900]
The query issued to the system was _Solr_. It seems clear that faceting could not yield a similar set of groups, although the goals of both techniques are similar—to let the user explore the set of search results and either rephrase the query or narrow the focus to a subset of current documents. Clustering is also similar to <<result-grouping.adoc#result-grouping,Result Grouping>> in that it can help to look deeper into search results, beyond the top few hits. The query issued to the system was _Apache Solr_. It seems clear that faceting could not yield a similar set of groups, although the goals of both techniques are similar—to let the user explore the set of search results and either rephrase the query or narrow the focus to a subset of current documents. Clustering is also similar to <<result-grouping.adoc#result-grouping,Result Grouping>> in that it can help to look deeper into search results, beyond the top few hits.
== Clustering Concepts == Configuration Quick Starter
Each *document* passed to the clustering component is composed of several logical parts: The clustering extension works as a search component. It needs to be declared and configured in `solrconfig.xml`, for example:
* a unique identifier, [source,xml]
* origin URL, ----
* the title, <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering">
* the main content, <lst name="engine">
* a language code of the title and content. <str name="name">lingo</str>
<str name="clustering.fields">title, content</str>
<str name="clustering.algorithm">Lingo</str>
</lst>
</searchComponent>
----
The identifier part is mandatory, everything else is optional but at least one of the text fields (title or content) will be required to make the clustering process reasonable. It is important to remember that logical document parts must be mapped to a particular schema and its fields. The content (text) for clustering can be sourced from either a stored text field or context-filtered using a highlighter, all these options are explained below in the <<Clustering Configuration,configuration>> section. The above declares the clustering component with a single *engine* -- there may be
multiple engines declared and switched at runtime. We will return to the details of
how to configure engines later.
A *clustering algorithm* is the actual logic (implementation) that discovers relationships among the documents in the search result and forms human-readable cluster labels. Depending on the choice of the algorithm the clusters may (and probably will) vary. Solr comes with several algorithms implemented in the open source http://carrot2.org[Carrot2] project, commercial alternatives also exist. The clustering component must be attached to a `SearchHandler` and explicitly enabled
via property `clustering`. It is important to attach it as the *last* component in the handler's pipeline, as shown below:
== Clustering Quick Start Example [source,xml]
----
<requestHandler name="/select" class="solr.SearchHandler">
<lst name="defaults">
<bool name="clustering">true</bool>
<str name="clustering.engine">lingo</str>
</lst>
The "```techproducts```" example included with Solr is pre-configured with all the necessary components for result clustering -- but they are disabled by default. <arr name="last-components">
<str>clustering</str>
</arr>
</requestHandler>
----
To enable the clustering component contrib and a dedicated search handler configured to use it, specify a JVM System Property when running the example: Once attached, as in the example above, the clustering will be performed automatically
on all documents matching the search handler's query. The clustering extension will take into
account all text fields listed in the `clustering.fields` parameter of the engine and will
produce a section of the response called `clusters` containing the discovered structure of
groups, for example (a JSON response is shown for brevity):
[source,json]
----
{
"clusters": [
{
"labels": ["Memory"],
"score": 6.80,
"docs":[ "0579B002",
"EN7800GTX/2DHTV/256M",
"TWINX2048-3200PRO",
"VDBDB1A16",
"VS1GB400C3"]},
{
"labels":["Coins and Notes"],
"score":28.560285143284457,
"docs":["EUR",
"GBP",
"NOK",
"USD"]},
{
"labels":["TFT LCD"],
"score":15.355729924203429,
"docs":["3007WFP",
"9885A004",
"MA147LL/A",
"VA902B"]}
]
}
----
The `labels` element of each cluster is a dynamically discovered phrase that describes and applies to all document identifiers under the `docs` element.
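For reference, with the `/select` handler configured as shown earlier, a response of this shape can be requested with a plain query -- no extra parameters are needed, since clustering is enabled in the handler's defaults (host, port and collection name below are placeholders):

[source,text]
----
http://localhost:8983/solr/techproducts/select?q=memory&wt=json
----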
== Solr Distribution Example
The `techproducts` example included with Solr is pre-configured with all the necessary components for result clustering -- but they are disabled by default.
To enable the clustering component extension and the dedicated search handler configured to use it, specify a JVM System Property when running the example:
[source,bash] [source,bash]
---- ----
@ -129,66 +196,73 @@ The output XML should include search hits and an array of automatically discover
</response> </response>
---- ----
There were a few clusters discovered for this query (`\*:*`), separating search hits into various categories: DDR, iPod, Hard Drive, etc. Each cluster has a label and score that indicates the "goodness" of the cluster. The score is algorithm-specific and is meaningful only in relation to the scores of other clusters in the same set. In other words, if cluster _A_ has a higher score than cluster _B_, cluster _A_ should be of better quality (have a better label and/or more coherent document set). Each cluster has an array of identifiers of documents belonging to it. These identifiers correspond to the `uniqueKey` field declared in the schema. A few clusters discovered for this query (`\*:*`) separate all search hits into various categories: DDR, iPod, Hard Drive, etc. Each cluster has a label and score that indicates the "goodness" of the cluster. The score is algorithm-specific and is meaningful only in relation to the scores of other clusters in the same set. In other words, if cluster _A_ has a higher score than cluster _B_, cluster _A_ should be of better quality (have a better label and/or more coherent document set). Each cluster has an array of identifiers of documents belonging to it. These identifiers correspond to the `uniqueKey` field declared in the schema.
Depending on the quality of input documents, some clusters may not make much sense. Some documents may be left out and not be clustered at all; these will be assigned to the synthetic _Other Topics_ group, marked with the `other-topics` property set to `true` (see the XML dump above for an example). The score of the other topics group is zero. Sometimes cluster labels may not make much sense (this depends on many factors -- text in clustered fields, number of documents, algorithm parameters). Also, some documents may be left out and not be clustered at all; these will be assigned to the synthetic _Other Topics_ group, marked with the `other-topics` property set to `true` (see the XML dump above for an example). The score of the other topics group is zero.
== Installing the Clustering Contrib == Installation
The clustering contrib extension requires `dist/solr-clustering-*.jar` and all JARs under `contrib/clustering/lib`. The clustering contrib extension requires `dist/solr-clustering-*.jar` and all JARs under `contrib/clustering/lib`.
== Clustering Configuration You can include the required contrib JARs in `solrconfig.xml` as shown below (by default paths are relative to the Solr core so they may need adjustments to your configuration, or an explicit specification of the `$solr.install.dir`):
=== Declaration of the Clustering Search Component and Request Handler
Clustering extension is a search component and must be declared in `solrconfig.xml`. Such a component can be then appended to a request handler as the last component in the chain (because it requires search results which must be previously fetched by the search component).
An example configuration could look as shown below.
. Include the required contrib JARs. Note that by default paths are relative to the Solr core so they may need adjustments to your configuration, or an explicit specification of the `$solr.install.dir`.
+
[source,xml] [source,xml]
---- ----
<lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" /> <lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" /> <lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" />
---- ----
. Declaration of the search component. Each component can also declare multiple clustering pipelines ("engines"), which can be selected at runtime by passing `clustering.engine=(engine name)` URL parameter.
+ == Configuration
=== Component Configuration
The following properties control `ClusteringComponent` state.
`clustering`::
The component is by default disabled, even if properly declared and attached to a search handler. The `clustering` property must be set to `true` to enable it (this can be done by setting
up default parameters in the search handler -- see below).
`clustering.engine`::
Declares which engine to use. If not present, the first declared active engine is used.
=== Clustering Engines
The declaration of clustering component in `solrconfig.xml` must include one or more predefined configurations called _engines_. For example, consider the configuration below:
[source,xml] [source,xml]
---- ----
<searchComponent name="clustering" class="solr.clustering.ClusteringComponent"> <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering">
<!-- Lingo clustering algorithm -->
<lst name="engine"> <lst name="engine">
<str name="name">lingo</str> <str name="name">lingo</str>
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str> <str name="clustering.algorithm">Lingo</str>
<str name="clustering.fields">title, content</str>
</lst> </lst>
<!-- An example definition for the STC clustering algorithm. -->
<lst name="engine"> <lst name="engine">
<str name="name">stc</str> <str name="name">stc</str>
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str> <str name="clustering.algorithm">STC</str>
<str name="clustering.fields">title</str>
</lst> </lst>
</searchComponent> </searchComponent>
---- ----
. A request handler to which we append the clustering component declared above.
+ This declares two separate engines (`lingo` and `stc`): these configurations have a
+ different clustering algorithm and a different set of clustered document fields. The
active engine can be selected by passing the `clustering.engine=_name_` parameter
at runtime (via URL) or as the default within the search handler's configuration,
as shown below:
[source,xml] [source,xml]
---- ----
<requestHandler name="/clustering" <requestHandler name="/clustering" class="solr.SearchHandler">
class="solr.SearchHandler">
<lst name="defaults"> <lst name="defaults">
<!-- Clustering component enabled. -->
<bool name="clustering">true</bool> <bool name="clustering">true</bool>
<bool name="clustering.results">true</bool> <str name="clustering.engine">stc</str>
<!-- Logical field to physical field mapping. --> <!-- Cluster the top 100 search results - bump up the 'rows' parameter. -->
<str name="carrot.url">id</str>
<str name="carrot.title">doctitle</str>
<str name="carrot.snippet">content</str>
<!-- Configure any other request handler parameters. We will cluster the
top 100 search results so bump up the 'rows' parameter. -->
<str name="rows">100</str> <str name="rows">100</str>
<str name="fl">*,score</str>
</lst> </lst>
<!-- Append clustering at the end of the list of search components. --> <!-- Append clustering at the end of the list of search components. -->
@ -198,149 +272,205 @@ An example configuration could look as shown below.
</requestHandler> </requestHandler>
---- ----
=== Configuration Parameters of the Clustering Component === Clustering Engine Configuration Parameters
The following parameters of each clustering engine or the entire clustering component (depending where they are declared) are available. Each declared engine can be configured using a number of parameters described below.
`clustering`:: `clustering.fields` (_required_)::
When `true`, clustering component is enabled. A comma (or space) separated list of text fields which should contain the text
content for clustering. At least one field must be provided. The fields are separate from the search handler's `fl` parameter so that clustered fields don't have to be included in the response.
`clustering.engine`:: `clustering.algorithm` (_required_)::
Declares which clustering engine to use. If not present, the first declared engine will become the default one. The clustering algorithm is the actual logic (implementation) that discovers relationships among the documents and forms human-readable cluster labels. This parameter sets the name of the clustering algorithm this engine is going to use. Algorithms are supplied to Solr via Carrot^2^-defined service extension. By default, the following open-source algorithms should be available: `Lingo`, `STC`, `Bisecting K-Means`. A commercial clustering algorithm `Lingo3G` plugs into the same extension point and can be used, if it is available on classpath.
`clustering.results`:: .How to choose the Clustering Algorithm?
When `true`, the component will perform clustering of search results (this should be enabled). ****
The question of which algorithm to choose depends on the amount of traffic, the expected result, and the input data (each algorithm will cluster the input slightly differently). There is no one answer as to which algorithm is "the best": Lingo3G provides hierarchical clusters, Lingo and STC provide flat clusters. STC is faster than Lingo but arguably produces less intuitive clusters; Lingo3G is the fastest algorithm but is not free or open source. Experiment and pick one that suits your needs.
`clustering.collection`::
When `true`, the component will perform clustering of the whole document index (this section does not cover full-index clustering).
At the engine declaration level, the following parameters are supported.
`carrot.algorithm`::
The algorithm class.
`carrot.resourcesDir`::
Algorithm-specific resources and configuration files (stop words, other lexical resources, default settings). By default points to `conf/clustering/carrot2/`
`carrot.outputSubClusters`::
If `true` and the algorithm supports hierarchical clustering, sub-clusters will also be emitted. Default value: true.
`carrot.numDescriptions`::
Maximum number of per-cluster labels to return (if the algorithm assigns more than one label to a cluster).
The `carrot.algorithm` parameter should contain a fully qualified class name of an algorithm supported by the http://project.carrot2.org[Carrot2] framework. Currently, the following algorithms are available:
* `org.carrot2.clustering.lingo.LingoClusteringAlgorithm` (open source)
* `org.carrot2.clustering.stc.STCClusteringAlgorithm` (open source)
* `org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm` (open source)
* `com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm` (commercial)
For a comparison of characteristics of these algorithms see the following links: For a comparison of characteristics of these algorithms see the following links:
* http://doc.carrot2.org/#section.advanced-topics.fine-tuning.choosing-algorithm * https://carrot2.github.io/release/4.0.4/doc/algorithms/
* http://project.carrot2.org/algorithms.html * https://carrotsearch.com/lingo3g-comparison.html
* http://carrotsearch.com/lingo3g-comparison.html
The question of which algorithm to choose depends on the amount of traffic (STC is faster than Lingo, but arguably produces less intuitive clusters, Lingo3G is the fastest algorithm but is not free or open source), expected result (Lingo3G provides hierarchical clusters, Lingo and STC provide flat clusters), and the input data (each algorithm will cluster the input slightly differently). There is no one answer which algorithm is "the best". The clustering component lists all available algorithms, languages and algorithm-language compatibility at startup. You can peek at startup logs to see what's available
in your Solr installation.
****
=== Contextual and Full Field Clustering `clustering.maxLabels`::
Maximum number of returned cluster labels (if the algorithm returns more labels, the list will
be truncated). By default all labels are returned.
The clustering engine can apply clustering to the full content of (stored) fields or it can run an internal highlighter pass to extract context-snippets before clustering. Highlighting is recommended when the logical snippet field contains a lot of content (this would affect clustering performance). Highlighting can also increase the quality of clustering because the content passed to the algorithm will be more focused around the query (it will be query-specific context). The following parameters control the internal highlighter. `clustering.includeSubclusters`::
If `true`, sub-clusters are included in the response for algorithms that support hierarchical
clustering. `false` causes only top-level clusters to be returned.
`carrot.produceSummary`:: `clustering.includeOtherTopics`::
When `true` the clustering component will run a highlighter pass on the content of logical fields pointed to by `carrot.title` and `carrot.snippet`. Otherwise full content of those fields will be clustered. If `true`, a synthetic cluster called _Other Topics_, consisting of all documents not assigned to any other cluster is formed and returned. The default value of this parameter is `true` but if
there is no need for this synthetic cluster, it can be set to `false`.
`carrot.fragSize`:: `clustering.resources`::
The size, in characters, of the snippets (aka fragments) created by the highlighter. If not specified, the default highlighting fragsize (`hl.fragsize`) will be used. Location of algorithm-specific resources and configuration files (stop words, other lexical resources, default settings). This property is `null` by default and all resources are read
from their respective algorithm default resource pool (JARs). If this property is not empty,
it resolves relative to the Solr core's configuration directory. This parameter can be applied during Solr startup _only_; it can't be overridden per-request.
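Putting the above together, a fully parameterized engine declaration could look as follows (a sketch only -- the field names and values are illustrative):

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">title, content</str>
  <int name="clustering.maxLabels">3</int>
  <bool name="clustering.includeSubclusters">false</bool>
  <bool name="clustering.includeOtherTopics">false</bool>
  <str name="clustering.resources">clustering-resources</str>
</lst>
----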
`carrot.summarySnippets`:: The number of summary snippets to generate for clustering. If not specified, the default highlighting snippet count (`hl.snippets`) will be used. [.text-center]
🙪
=== Logical to Document Field Mapping There are more properties that apply to engine configuration. We describe these in the functional sections that follow.
As already mentioned in <<Clustering Concepts>>, the clustering component clusters "documents" consisting of logical parts that need to be mapped onto physical schema of data stored in Solr. The field mapping attributes provide a connection between fields and logical document parts. Note that the content of title and snippet fields must be *stored* so that it can be retrieved at search time. === Full Field and Query-Context (Snippet) Clustering
`carrot.title`:: The clustering algorithm can consume the full content of fields or just the left and right context around query-matching regions (so-called _snippets_). Contrary to intuition, using query contexts can increase the quality of clustering even if it feeds less data to the algorithm. This is typically caused by the fact that snippets are more focused around the phrases and terms surrounding the query and the algorithm has a better signal-to-noise ratio of data to work with.
The field (alternatively comma- or space-separated list of fields) that should be mapped to the logical document's title. The clustering algorithms typically give more weight to the content of the title field compared to the content (snippet). For best results, the field should contain concise, noise-free content. If there is no clear title in your data, you can leave this parameter blank.
`carrot.snippet`:: We recommend using query contexts when fields contain a lot of content (clustering the full field content would otherwise affect performance).
The field (alternatively comma- or space-separated list of fields) that should be mapped to the logical document's main content. If this mapping points to very large content fields the performance of clustering may drop significantly. An alternative then is to use query-context snippets for clustering instead of full field content. See the description of the `carrot.produceSummary` parameter for details.
`carrot.url`:: The following three properties control whether the context or full content are processed and how snippets are formed for clustering.
The field that should be mapped to the logical document's content URL. Leave blank if not required.
=== Clustering Multilingual Content `clustering.preferQueryContext`::
If `true`, the engine will try to extract context around the query matching regions and use these contexts as input for the clustering algorithm.
The field mapping specification can include a `carrot.lang` parameter, which defines the field that stores http://www.loc.gov/standards/iso639-2/php/code_list.php[ISO 639-1] code of the language in which the title and content of the document are written. This information can be stored in the index based on apriori knowledge of the documents' source or a language detection filter applied at indexing time. All algorithms inside the Carrot2 framework will accept ISO codes of languages defined in https://github.com/carrot2/carrot2/blob/master/core/carrot2-core/src/org/carrot2/core/LanguageCode.java[LanguageCode enum]. `clustering.contextSize`::
The maximum size, in characters, of each snippet created by the context retrieval algorithm (internal highlighter).
The language hint makes it easier for clustering algorithms to separate documents from different languages on input and to pick the right language resources for clustering. If you do have multi-lingual query results (or query results in a language different than English), it is strongly advised to map the language field appropriately. `clustering.contextCount`::
The maximum number of different, non-contiguous snippets from a single field.
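For example, the following engine declaration clusters up to three non-contiguous 160-character snippets per field instead of the full field content (a sketch using the parameters just described):

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">title, content</str>
  <bool name="clustering.preferQueryContext">true</bool>
  <int name="clustering.contextSize">160</int>
  <int name="clustering.contextCount">3</int>
</lst>
----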
`carrot.lang`:: === Default Clustering Language
The field that stores ISO 639-1 code of the language of the document's text fields.
`carrot.lcmap`:: The default implementations of clustering algorithms in Carrot^2^ (shipped with Solr)
A mapping of arbitrary strings into ISO 639 two-letter codes used by `carrot.lang`. The syntax of this parameter is the same as `langid.map.lcmap`, for example: `langid.map.lcmap=japanese:ja polish:pl english:en` have built-in support (stemming, stop words) for preprocessing a number of languages. It is important to provide the clustering algorithm with a hint of what language should be used for clustering. This can be done in two ways -- by passing the name of the default language or by providing the language as a field with each document. The following two engine configuration parameters control this:
The default language can also be set using Carrot2-specific algorithm attributes (in this case the http://doc.carrot2.org/#section.attribute.lingo.MultilingualClustering.defaultLanguage[MultilingualClustering.defaultLanguage] attribute). `clustering.language`::
Name of the default language to use for clustering. The default value of this parameter is `English`. The provided language must be available and the clustering algorithm must support it.
`clustering.languageField`::
Name of the document field that stores the document's language. If the field does not exist
for a document or the value is blank, the default language is used.
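For example, an engine that defaults to German but respects a per-document language field could be declared as follows (a sketch; the `lang` field name is an assumption about your schema):

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">title, content</str>
  <str name="clustering.language">German</str>
  <str name="clustering.languageField">lang</str>
</lst>
----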
The list of supported languages can change dynamically (languages are loaded via external service provider extension) and may depend on the selected algorithm (algorithms can support a subset of languages for which resources are available). The clustering component will log all supported algorithm-language pairs at Solr startup, so you can inspect what's supported on your particular Solr instance. For example:
[source,text]
----
2020-10-29 [...] Clustering algorithm Lingo3G loaded with support for the following languages: Dutch, English
2020-10-29 [...] Clustering algorithm Lingo loaded with support for the following languages: Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish, Swedish, Turkish
2020-10-29 [...] Clustering algorithm Bisecting K-Means loaded with support for the following languages: Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish, Swedish, Turkish
----
=== Handling Multilingual Content
It is often the case that the index (and query result) contains documents in _multiple_ languages. Clustering such search results is problematic. Ideally, the engine should translate (or understand) the content of documents and then group relevant information together, regardless of the language it is written in.
In reality, clustering algorithms are typically much simpler -- they infer similarity between documents from statistical properties of terms and phrases that occur in those documents. So texts written in different languages will not cluster too well.
To deal with this situation, the default clustering component implementation in Solr first groups all documents by their language and then applies clustering to each language sub-group. It is recommended to store the language of each document in a separate field and point at it using the `clustering.languageField` configuration property described above.
== Tweaking Algorithm Settings == Tweaking Algorithm Settings
The algorithms that come with Solr are using their default settings which may be inadequate for all data sets. All algorithms have lexical resources and resources (stop words, stemmers, parameters) that may require tweaking to get better clusters (and cluster labels). For Carrot2-based algorithms it is probably best to refer to a dedicated tuning application called Carrot2 Workbench (screenshot below). From this application one can export a set of algorithm attributes as an XML file, which can be then placed under the location pointed to by `carrot.resourcesDir`. The clustering algorithms that come with Solr use their default parameter values and language resources. We highly recommend tuning both for production uses. Improving the default language resources to include words and phrases common to a particular document domain will improve clustering quality significantly.
image::images/result-clustering/carrot2-workbench.png[image,scaledwidth=75.0%] Carrot^2^ algorithms have an extensive set of parameters and language resource tuning options. Please refer to the https://carrot2.github.io/release/latest/[up-to-date project documentation], in particular the language resources section and each algorithm's attributes section.
=== Providing Defaults for Clustering
The default attributes for all engines (algorithms) declared in the clustering component are placed under `carrot.resourcesDir` and with an expected file name of `engineName-attributes.xml`. So for an engine named `lingo` and the default value of `carrot.resourcesDir`, the attributes would be read from a file in `conf/clustering/carrot2/lingo-attributes.xml`. === Changing Clustering Algorithm Parameters
An example XML file changing the default language of documents to Polish is shown below. Clustering algorithm settings can be changed via Solr parameters either
permanently (in the Engine's declaration) or per-request (via Solr URL parameters).
For example, let's assume the following engine configuration:
[source,xml] [source,xml]
---- ----
<attribute-sets default="attributes"> <lst name="engine">
<attribute-set id="attributes"> <str name="name">lingo</str>
<value-set> <str name="clustering.algorithm">Lingo</str>
<label>attributes</label> <str name="clustering.fields">name, features</str>
<attribute key="MultilingualClustering.defaultLanguage"> <str name="clustering.language">English</str>
<value type="org.carrot2.core.LanguageCode" value="POLISH"/> </lst>
</attribute>
</value-set>
</attribute-set>
</attribute-sets>
---- ----
=== Tweaking Algorithms at Query-Time First, locate the configuration parameters for the Lingo algorithm
at https://carrot2.github.io/release/latest/[Carrot^2^ documentation site]:
The clustering component and Carrot2 clustering algorithms can accept query-time attribute overrides. Note that certain things (for example lexical resources) can only be initialized once (at startup, via the XML configuration files). image::images/result-clustering/carrot2-docs-attrs1.png[image,scaledwidth=75%]
An example query that changes the `LingoClusteringAlgorithm.desiredClusterCountBase` parameter for the Lingo algorithm: Then locate the particular setting you'd like to change and note the
REST API path to that setting (in this case the parameter is
`minClusterSize` and its path is `preprocessing.documentAssigner.minClusterSize`):
image::images/result-clustering/carrot2-docs-attrs2.png[image,scaledwidth=75%]
Now add the full path-value pair to the engine's configuration:
[source,xml]
----
<lst name="engine">
<str name="name">lingo</str>
<str name="clustering.algorithm">Lingo</str>
<str name="clustering.fields">name, features</str>
<str name="clustering.language">English</str>
<int name="preprocessing.documentAssigner.minClusterSize">3</int>
</lst>
----
The following rules apply.
* The type of the parameter must be consistent with the type listed in Carrot^2^ specification.
* If the parameter is added to the engine's configuration in `solrconfig.xml`, the core must be reloaded for the changes to be picked up. Alternatively, pass the parameter via the request URL to change things dynamically on a per-request basis. For example, if you have the `techproducts` example running, this will cut the clusters to only those containing at least three documents:
`http://localhost:8983/solr/techproducts/clustering?q=\*:*&rows=100&wt=json&preprocessing.documentAssigner.minClusterSize=3`
* For complex types, the parameter key with the name of the instantiated type must precede any of its own parameters.
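As an illustration of the last rule, a hypothetical complex attribute `foo.factory` instantiated as type `BarFactory`, with its own `iterations` parameter, would have to be declared in this order (the names here are made up for illustration only):

[source,xml]
----
<!-- Hypothetical attribute names; select the type first, then set its own parameters. -->
<str name="foo.factory">BarFactory</str>
<int name="foo.factory.iterations">10</int>
----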
=== Custom Language Resources
Clustering algorithms rely on language and domain-specific language resources to
improve the quality of clusters (by discarding domain-specific noise and boilerplate language).
By default, language resources are read from the engine-declared algorithm default JAR. You can pass a custom location for these resources by specifying the `clustering.resources` parameter. The value of this parameter resolves to a location relative to Solr core's configuration directory. For example, the following definition:
[source,xml]
----
<lst name="engine">
<str name="name">lingo</str>
<str name="clustering.algorithm">Lingo</str>
<str name="clustering.fields">name, features</str>
<str name="clustering.language">English</str>
<str name="clustering.resources">lingo-resources</str>
</lst>
----
would result in the following log entry and expected resource location:
[source,text] [source,text]
http://localhost:8983/solr/techproducts/clustering?q=*:*&rows=100&LingoClusteringAlgorithm.desiredClusterCountBase=20 ----
Clustering algorithm resources first looked up relative to: [.../example/techproducts/solr/techproducts/conf/lingo-resources]
----
The clustering engine (the algorithm declared in `solrconfig.xml`) can also be changed at runtime by passing `clustering.engine=name` request attribute: The best way to start tuning algorithm resources is to copy all the defaults from the algorithm's
corresponding Carrot^2^ JAR file (or Carrot^2^ distribution).
[source,text] == Performance Considerations
http://localhost:8983/solr/techproducts/clustering?q=*:*&rows=100&clustering.engine=kmeans
== Performance Considerations with Dynamic Clustering Clustering of search results comes with some performance considerations:
Dynamic clustering of search results comes with two major performance penalties: * The cost of fetching a larger-than-usual number of search results (50, 100 or more documents),
* Increased cost of fetching a larger-than-usual number of search results (50, 100 or more documents),
* Additional computational cost of the clustering itself. * Additional computational cost of the clustering itself.
* In distributed mode the content of document fields for clustering is collected from shards and adds some additional network overhead.
For simple queries, the clustering time will usually dominate the fetch time. If the document content is very long the retrieval of stored content can become a bottleneck. The performance impact of clustering can be lowered in several ways: For simple queries, the clustering time will usually dominate everything else. If document fields are very long, the retrieval of stored content can become a bottleneck.
* feed less content to the clustering algorithm by enabling `carrot.produceSummary` attribute, The performance impact of clustering can be lowered in several ways.
* perform clustering on selected fields (titles only) to make the input smaller,
* use a faster algorithm (STC instead of Lingo, Lingo3G instead of STC), * Cluster less data: use query context (snippets) instead of full field content (`clustering.preferQueryContext=true`).
* tune the performance attributes related directly to a specific algorithm. * Perform clustering on just a subset of document fields or curate fields for clustering (add abstracts at indexing-time) to make the input smaller.
* Tune the performance attributes related directly to a specific algorithm.
* Try a different, faster algorithm (STC instead of Lingo, Lingo3G instead of STC). A configuration sketch combining several of these options is shown below.
Some of these techniques are described in _Apache SOLR and Carrot2 integration strategies_ document, available at http://carrot2.github.io/solr-integration-strategies. The topic of improving performance is also included in the Carrot2 manual at http://doc.carrot2.org/#section.advanced-topics.fine-tuning.performance.
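Putting several of these tips together, a performance-leaning engine declaration might look like the following sketch (all parameters are described earlier on this page; the values are illustrative):

[source,xml]
----
<lst name="engine">
  <str name="name">stc-fast</str>
  <!-- STC is typically faster than Lingo. -->
  <str name="clustering.algorithm">STC</str>
  <!-- Cluster a single, short field only. -->
  <str name="clustering.fields">title</str>
  <!-- Cluster query-context snippets instead of full field content. -->
  <bool name="clustering.preferQueryContext">true</bool>
  <int name="clustering.contextSize">160</int>
  <int name="clustering.contextCount">2</int>
</lst>
----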
== Additional Resources == Additional Resources
The following resources provide additional information about the clustering component in Solr and its potential applications. The following resources provide additional information about the clustering component in Solr and its potential applications.
* Apache Solr and Carrot2 integration strategies: http://carrot2.github.io/solr-integration-strategies * Clustering and Visualization of Solr search results (Berlin BuzzWords conference, *2011*): http://2011.berlinbuzzwords.de/sites/2011.berlinbuzzwords.de/files/solr-clustering-visualization.pdf
* Clustering and Visualization of Solr search results (Berlin BuzzWords conference, 2011): http://2011.berlinbuzzwords.de/sites/2011.berlinbuzzwords.de/files/solr-clustering-visualization.pdf
View File
@ -26,7 +26,6 @@ import java.util.Objects;
* It is a direct mapping for the Json object Solr is returning. * It is a direct mapping for the Json object Solr is returning.
*/ */
public class Cluster { public class Cluster {
private List<String> labels; private List<String> labels;
private double score; private double score;
private List<String> docIds; private List<String> docIds;
@ -43,10 +42,10 @@ public class Cluster {
* @param docIds the list of document Ids belonging to the cluster * @param docIds the list of document Ids belonging to the cluster
*/ */
public Cluster(List<String> labels, double score, List<String> docIds, List<Cluster> subclusters, boolean otherTopics) { public Cluster(List<String> labels, double score, List<String> docIds, List<Cluster> subclusters, boolean otherTopics) {
this.labels = labels; this.labels = Objects.requireNonNullElse(labels, Collections.emptyList());
this.score = score; this.score = score;
this.docIds = docIds; this.docIds = Objects.requireNonNullElse(docIds, Collections.emptyList());
this.subclusters = subclusters; this.subclusters = Objects.requireNonNullElse(subclusters, Collections.emptyList());
this.otherTopics = otherTopics; this.otherTopics = otherTopics;
} }
@ -93,7 +92,7 @@ public class Cluster {
this.docIds = docIds; this.docIds = docIds;
} }
public List<Cluster> getSubclusters() { public List<Cluster> getClusters() {
return subclusters; return subclusters;
} }
View File
@ -26,11 +26,12 @@ import org.apache.solr.common.util.NamedList;
* Encapsulates responses from ClusteringComponent * Encapsulates responses from ClusteringComponent
*/ */
public class ClusteringResponse { public class ClusteringResponse {
private static final String CLUSTERS_NODE = "clusters"; public static final String CLUSTERS_NODE = "clusters";
private static final String LABELS_NODE = "labels"; public static final String LABELS_NODE = "labels";
private static final String DOCS_NODE = "docs"; public static final String DOCS_NODE = "docs";
private static final String SCORE_NODE = "score"; public static final String SCORE_NODE = "score";
private static final String IS_OTHER_TOPICS = "other-topics"; public static final String IS_OTHER_TOPICS = "other-topics";
private List<Cluster> clusters; private List<Cluster> clusters;
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
@ -53,7 +54,7 @@ public class ClusteringResponse {
break; break;
case SCORE_NODE: case SCORE_NODE:
score = (Double) e.getValue(); score = ((Number) e.getValue()).doubleValue();
break; break;
case CLUSTERS_NODE: case CLUSTERS_NODE:
View File
@ -57,7 +57,7 @@ public class TestClusteringResponse extends SolrJettyTestBase {
checkCluster(clusters.get(2), Arrays.asList("label3"), Arrays.asList("id7", "id8"), 1.26d, false); checkCluster(clusters.get(2), Arrays.asList("label3"), Arrays.asList("id7", "id8"), 1.26d, false);
checkCluster(clusters.get(3), Arrays.asList("label4"), Arrays.asList("id9"), 0d, true); checkCluster(clusters.get(3), Arrays.asList("label4"), Arrays.asList("id9"), 0d, true);
List<Cluster> sub = clusters.get(0).getSubclusters(); List<Cluster> sub = clusters.get(0).getClusters();
checkCluster(sub.get(0), Arrays.asList("label1.sub1"), Arrays.asList("id1", "id2"), 0.0d, false); checkCluster(sub.get(0), Arrays.asList("label1.sub1"), Arrays.asList("id1", "id2"), 0.0d, false);
checkCluster(sub.get(1), Arrays.asList("label1.sub2"), Arrays.asList("id2"), 0.0d, false); checkCluster(sub.get(1), Arrays.asList("label1.sub2"), Arrays.asList("id2"), 0.0d, false);
assertEquals(sub.size(), 2); assertEquals(sub.size(), 2);
View File
@ -1,14 +1,13 @@
# Run ./gradlew --write-locks to regenerate this file # Run ./gradlew --write-locks to regenerate this file
com.adobe.xmp:xmpcore:5.1.3 (1 constraints: 0b050a36) com.adobe.xmp:xmpcore:5.1.3 (1 constraints: 0b050a36)
com.carrotsearch:hppc:0.8.2 (2 constraints: b00ffaa6) com.carrotsearch:hppc:0.8.2 (2 constraints: ad0fc5a6)
com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.7.6 (1 constraints: 11051036) com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.7.6 (1 constraints: 11051036)
com.carrotsearch.thirdparty:simple-xml-safe:2.7.1 (1 constraints: a60a82ca)
com.cybozu.labs:langdetect:1.1-20120112 (1 constraints: 5c066d5e) com.cybozu.labs:langdetect:1.1-20120112 (1 constraints: 5c066d5e)
com.drewnoakes:metadata-extractor:2.11.0 (1 constraints: 3605323b) com.drewnoakes:metadata-extractor:2.11.0 (1 constraints: 3605323b)
com.epam:parso:2.0.11 (1 constraints: 36052c3b) com.epam:parso:2.0.11 (1 constraints: 36052c3b)
com.fasterxml.jackson.core:jackson-annotations:2.10.1 (2 constraints: 331dcd4e) com.fasterxml.jackson.core:jackson-annotations:2.10.1 (1 constraints: 84122d21)
com.fasterxml.jackson.core:jackson-core:2.10.1 (3 constraints: 633586b7) com.fasterxml.jackson.core:jackson-core:2.10.1 (2 constraints: b42a896b)
com.fasterxml.jackson.core:jackson-databind:2.10.1 (3 constraints: 941aba96) com.fasterxml.jackson.core:jackson-databind:2.10.1 (2 constraints: 840f2597)
com.fasterxml.jackson.dataformat:jackson-dataformat-smile:2.10.1 (1 constraints: 3605303b) com.fasterxml.jackson.dataformat:jackson-dataformat-smile:2.10.1 (1 constraints: 3605303b)
com.github.ben-manes.caffeine:caffeine:2.8.4 (1 constraints: 10051136) com.github.ben-manes.caffeine:caffeine:2.8.4 (1 constraints: 10051136)
com.github.virtuald:curvesapi:1.06 (1 constraints: db04f530) com.github.virtuald:curvesapi:1.06 (1 constraints: db04f530)
@ -78,7 +77,7 @@ org.apache.commons:commons-compress:1.19 (1 constraints: df04fa30)
org.apache.commons:commons-configuration2:2.1.1 (1 constraints: 0605f935) org.apache.commons:commons-configuration2:2.1.1 (1 constraints: 0605f935)
org.apache.commons:commons-csv:1.7 (1 constraints: ac04212c) org.apache.commons:commons-csv:1.7 (1 constraints: ac04212c)
org.apache.commons:commons-exec:1.3 (1 constraints: a8041d2c) org.apache.commons:commons-exec:1.3 (1 constraints: a8041d2c)
org.apache.commons:commons-lang3:3.9 (4 constraints: 702e84c7) org.apache.commons:commons-lang3:3.9 (3 constraints: 2b24bbb0)
org.apache.commons:commons-math3:3.6.1 (1 constraints: 0c050d36) org.apache.commons:commons-math3:3.6.1 (1 constraints: 0c050d36)
org.apache.commons:commons-text:1.6 (1 constraints: ab04202c) org.apache.commons:commons-text:1.6 (1 constraints: ab04202c)
org.apache.curator:curator-client:2.13.0 (1 constraints: 3805383b) org.apache.curator:curator-client:2.13.0 (1 constraints: 3805383b)
@ -127,12 +126,10 @@ org.bouncycastle:bcmail-jdk15on:1.64 (1 constraints: df04ff30)
org.bouncycastle:bcpkix-jdk15on:1.64 (1 constraints: df04ff30) org.bouncycastle:bcpkix-jdk15on:1.64 (1 constraints: df04ff30)
org.bouncycastle:bcprov-jdk15on:1.64 (1 constraints: df04ff30) org.bouncycastle:bcprov-jdk15on:1.64 (1 constraints: df04ff30)
org.brotli:dec:0.1.2 (1 constraints: 0505f035) org.brotli:dec:0.1.2 (1 constraints: 0505f035)
org.carrot2:carrot2-mini:3.16.2 (1 constraints: 3e05493b) org.carrot2:carrot2-core:4.0.4 (1 constraints: 0a050336)
org.carrot2:morfologik-fsa:2.1.5 (1 constraints: d70d9836) org.carrot2:morfologik-fsa:2.1.5 (1 constraints: d70d9836)
org.carrot2:morfologik-polish:2.1.5 (1 constraints: 0a05fd35) org.carrot2:morfologik-polish:2.1.5 (1 constraints: 0a05fd35)
org.carrot2:morfologik-stemming:2.1.5 (2 constraints: 0b12640c) org.carrot2:morfologik-stemming:2.1.5 (2 constraints: 0b12640c)
org.carrot2.attributes:attributes-binder:1.3.3 (1 constraints: a30a73ca)
org.carrot2.shaded:carrot2-guava:18.0 (2 constraints: b31b3b7b)
org.ccil.cowan.tagsoup:tagsoup:1.2.1 (1 constraints: 0605f735) org.ccil.cowan.tagsoup:tagsoup:1.2.1 (1 constraints: 0605f735)
org.checkerframework:checker-qual:2.0.0 (1 constraints: 140ae5b4) org.checkerframework:checker-qual:2.0.0 (1 constraints: 140ae5b4)
org.codehaus.janino:commons-compiler:3.0.9 (2 constraints: d910f7d1) org.codehaus.janino:commons-compiler:3.0.9 (2 constraints: d910f7d1)
@ -172,7 +169,7 @@ org.ow2.asm:asm:7.2 (2 constraints: 900e3e5e)
org.ow2.asm:asm-commons:7.2 (1 constraints: ad042e2c) org.ow2.asm:asm-commons:7.2 (1 constraints: ad042e2c)
org.rrd4j:rrd4j:3.5 (1 constraints: ac04252c) org.rrd4j:rrd4j:3.5 (1 constraints: ac04252c)
org.slf4j:jcl-over-slf4j:1.7.24 (1 constraints: 4005473b) org.slf4j:jcl-over-slf4j:1.7.24 (1 constraints: 4005473b)
org.slf4j:slf4j-api:1.7.24 (15 constraints: a3ba2a7b) org.slf4j:slf4j-api:1.7.24 (14 constraints: ccafc13c)
org.tallison:jmatio:1.5 (1 constraints: aa041f2c) org.tallison:jmatio:1.5 (1 constraints: aa041f2c)
org.tukaani:xz:1.8 (1 constraints: ad04222c) org.tukaani:xz:1.8 (1 constraints: ad04222c)
org.xerial.snappy:snappy-java:1.1.7.6 (1 constraints: 6f05a240) org.xerial.snappy:snappy-java:1.1.7.6 (1 constraints: 6f05a240)
View File
@ -76,7 +76,7 @@ org.aspectj:aspectjrt=1.8.0
org.bitbucket.b_c:jose4j=0.6.5 org.bitbucket.b_c:jose4j=0.6.5
org.bouncycastle:*=1.64 org.bouncycastle:*=1.64
org.brotli:dec=0.1.2 org.brotli:dec=0.1.2
org.carrot2:carrot2-mini=3.16.2 org.carrot2:carrot2-core=4.0.4
org.carrot2:morfologik-*=2.1.5 org.carrot2:morfologik-*=2.1.5
org.ccil.cowan.tagsoup:tagsoup=1.2.1 org.ccil.cowan.tagsoup:tagsoup=1.2.1
org.codehaus.janino:*=3.0.9 org.codehaus.janino:*=3.0.9