mirror of https://github.com/apache/lucene.git

SOLR-14926: Modernize and clean up search results clustering contrib.

This commit is contained in: parent 5c02737918, commit 0f871b2c56

@@ -91,7 +91,7 @@ and Edmond Nolan.
The Polish analyzer (stempel) comes with a default
stopword list that is BSD-licensed created by the Carrot2 project. The file resides
in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
See http://project.carrot2.org/license.html.
See https://project.carrot2.org/license.html.

The SmartChineseAnalyzer source code (smartcn) was
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.

@@ -1,5 +1,5 @@
# This file was created from the carrot2 project and is distributed under the BSD license.
# See http://project.carrot2.org/license.html
# See https://project.carrot2.org/license.html
# Also see http://www.opensource.org/licenses/bsd-license.html
# From trunk/core/carrot2-util-text/src-resources/stopwords.pl
vol

@@ -135,9 +135,10 @@ public class MatchHighlighter {
  /**
   * Always fetch the given set of fields for all input documents.
   */
  public void alwaysFetchFields(String field, String... otherFields) {
    Stream.concat(Stream.of(field), Stream.of(otherFields))
        .forEach(fld -> fieldsAlwaysReturned.add(Objects.requireNonNull(fld)));
  public void alwaysFetchFields(String... fields) {
    for (String fld : fields) {
      fieldsAlwaysReturned.add(Objects.requireNonNull(fld));
    }
  }

  /**
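
A minimal usage sketch of the new varargs signature in the hunk above (the helper
method and field names are hypothetical, not part of this change):

    // With the simpler varargs signature, one call covers every arity, including zero:
    static void configureFields(MatchHighlighter highlighter) {
      highlighter.alwaysFetchFields("id", "title", "body");
      highlighter.alwaysFetchFields(); // an empty call is now legal and adds nothing
    }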

@@ -42,6 +42,13 @@ Improvements

* SOLR-14972: Change default port of prometheus exporter to 8989 because it clashed with default embedded zookeeper port (janhoy)

* SOLR-14926, SOLR-13506: Modernize and clean up search results clustering contrib. This issue upgrades
  the clustering contrib to the new Carrot2 4.x line, dropping several CVE-prone dependencies along the way.
  The parameters and configuration of the contrib extensions have changed. The documentation in the Solr ref guide
  has been rewritten from scratch to be up to date. Clustering code has been rewritten from scratch to work
  properly regardless of the mode (standalone, distributed). The API has been stripped of ancient, unused interfaces
  and simplified. (Dawid Weiss)

Other Changes
----------------------
* SOLR-14656: Autoscaling framework removed (Ishan Chattopadhyaya, noble, Ilan Ginzburg)
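
Since the entry above notes that request parameters changed, here is a hedged SolrJ
sketch of enabling the reworked component per request. The parameter names come from
COMPONENT_NAME and REQUEST_PARAM_ENGINE in the ClusteringComponent diff below; the
query text, engine name "lingo", collection name and client instance are illustrative
assumptions only:

    SolrQuery query = new SolrQuery("gene sequencing");
    query.set("clustering", true);            // enable the clustering component
    query.set("clustering.engine", "lingo");  // select one of the configured engines
    QueryResponse rsp = client.query("collection1", query);
    ClusteringResponse clusters = rsp.getClusteringResponse();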

@@ -221,7 +221,7 @@ and Edmond Nolan.
The Polish analyzer (stempel) comes with a default
stopword list that is BSD-licensed created by the Carrot2 project. The file resides
in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
See http://project.carrot2.org/license.html.
See https://project.carrot2.org/license.html.

The SmartChineseAnalyzer source code (smartcn) was
provided by Xiaoping Gao and copyright 2009 by www.imdict.net.

@@ -439,13 +439,12 @@ http://sourceforge.jp/projects/jsonic/
=========================================================================
== Carrot2 Notice ==
=========================================================================
Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
Copyright (C) 2002-2020, Dawid Weiss, Stanislaw Osinski.
Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
All rights reserved.

This product includes software developed by the Carrot2 Project.

See http://project.carrot2.org/
See https://project.carrot2.org/

=========================================================================
== Guava Notice ==

@@ -18,14 +18,12 @@

apply plugin: 'java-library'

description = 'Clustering Integration'
description = 'Search Results Clustering Integration'

dependencies {
  implementation project(':solr:core')
  implementation project(':lucene:analysis:common')
  implementation('org.carrot2:carrot2-mini', {
    exclude group: "org.simpleframework", module: "simple-xml"
  })
  implementation 'org.carrot2:carrot2-core'

  testImplementation project(':solr:test-framework')
}

@@ -16,378 +16,496 @@
 */
package org.apache.solr.handler.clustering;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexableField;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHits;
import org.apache.solr.client.solrj.response.ClusteringResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.carrot2.CarrotClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.carrot2.clustering.Cluster;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
 * Provides a plugin for performing cluster analysis. This can either be applied to
 * search results (e.g., via <a href="http://project.carrot2.org">Carrot<sup>2</sup></a>) or for
 * clustering documents (e.g., via <a href="http://mahout.apache.org/">Mahout</a>).
 * A {@link SearchComponent} for dynamic, unsupervised grouping of
 * search results based on the content of their text fields or contextual
 * snippets around query-matching regions.
 *
 * <p>
 * See Solr example for configuration examples.</p>
 *
 * The default implementation uses clustering algorithms from the
 * <a href="https://project.carrot2.org">Carrot<sup>2</sup> project</a>.
 *
 * @lucene.experimental
 */
public class ClusteringComponent extends SearchComponent implements SolrCoreAware {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Base name for all component parameters. This name is also used to
   * register this component with SearchHandler.
   * Default component name and parameter prefix.
   */
  public static final String COMPONENT_NAME = "clustering";

  /**
   * Declaration-order list of search clustering engines.
   * Request parameter that selects one of the {@link Engine} configurations
   * out of many possibly defined in the component's initialization parameters.
   */
  private final LinkedHashMap<String, SearchClusteringEngine> searchClusteringEngines = new LinkedHashMap<>();
  public static final String REQUEST_PARAM_ENGINE = COMPONENT_NAME + ".engine";

  /**
   * Declaration order list of document clustering engines.
   * Engine configuration initialization block name.
   */
  private final LinkedHashMap<String, DocumentClusteringEngine> documentClusteringEngines = new LinkedHashMap<>();
  public static final String INIT_SECTION_ENGINE = "engine";

  /**
   * An unmodifiable view of {@link #searchClusteringEngines}.
   * Response section name containing output clusters.
   */
  private final Map<String, SearchClusteringEngine> searchClusteringEnginesView = Collections.unmodifiableMap(searchClusteringEngines);
  public static final String RESPONSE_SECTION_CLUSTERS = "clusters";

  /**
   * Initialization parameters temporarily saved here, the component
   * is initialized in {@link #inform(SolrCore)} because we need to know
   * the core's {@link SolrResourceLoader}.
   *
   * @see #init(NamedList)
   * Default log sink.
   */
  private NamedList<Object> initParams;
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Convert a DocList to a SolrDocumentList
   *
   * The optional param "ids" is populated with the lucene document id
   * for each SolrDocument.
   *
   * @param docs The {@link org.apache.solr.search.DocList} to convert
   * @param searcher The {@link org.apache.solr.search.SolrIndexSearcher} to use to load the docs from the Lucene index
   * @param fields The names of the Fields to load
   * @param ids A map to store the ids of the docs
   * @return The new {@link SolrDocumentList} containing all the loaded docs
   * @throws IOException if there was a problem loading the docs
   * @since solr 1.4
   * An internal request parameter for shard requests used for collecting
   * input documents for clustering.
   */
  public static SolrDocumentList docListToSolrDocumentList(
      DocList docs,
      SolrIndexSearcher searcher,
      Set<String> fields,
      Map<SolrDocument, Integer> ids) throws IOException
  {
    IndexSchema schema = searcher.getSchema();
  private static final String REQUEST_PARAM_COLLECT_INPUTS = COMPONENT_NAME + ".collect-inputs";

    SolrDocumentList list = new SolrDocumentList();
    list.setNumFound(docs.matches());
    list.setMaxScore(docs.maxScore());
    list.setStart(docs.offset());
  /**
   * Shard request response section name containing partial document inputs.
   */
  private static final String RESPONSE_SECTION_INPUT_DOCUMENTS = "clustering-inputs";

    DocIterator dit = docs.iterator();
  /**
   * All engines declared in this component's initialization block.
   */
  private final List<EngineEntry> declaredEngines = new ArrayList<>();

    while (dit.hasNext()) {
      int docid = dit.nextDoc();
  /**
   * Declaration-order list of available search clustering engines.
   */
  private final LinkedHashMap<String, EngineEntry> engines = new LinkedHashMap<>();

      Document luceneDoc = searcher.doc(docid, fields);
      SolrDocument doc = new SolrDocument();
  private static boolean isComponentEnabled(ResponseBuilder rb) {
    return rb.req.getParams().getBool(COMPONENT_NAME, false);
  }

      for (IndexableField field : luceneDoc) {
        if (null == fields || fields.contains(field.name())) {
          SchemaField sf = schema.getField(field.name());
          doc.addField(field.name(), sf.getType().toObject(field));
        }
      }
      if (docs.hasScores() && (null == fields || fields.contains("score"))) {
        doc.addField("score", dit.score());
      }
  private static List<InputDocument> documentsFromNamedList(List<NamedList<Object>> docList) {
    return docList.stream()
        .map(docProps -> {
          InputDocument doc = new InputDocument(
              docProps.get("id"),
              (String) docProps.get("language"));

      list.add(doc);
          docProps.forEach((fieldName, value) -> {
            doc.addClusteredField(fieldName, (String) value);
          });
          doc.visitFields(docProps::add);
          return doc;
        })
        .collect(Collectors.toList());
  }

      if (ids != null) {
        ids.put(doc, docid);
  private static List<NamedList<Object>> documentsToNamedList(List<InputDocument> documents) {
    return documents.stream()
        .map(doc -> {
          NamedList<Object> docProps = new SimpleOrderedMap<>();
          docProps.add("id", doc.getId());
          docProps.add("language", doc.language());
          doc.visitFields(docProps::add);
          return docProps;
        })
        .collect(Collectors.toList());
  }
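
  // Illustrative round-trip (hypothetical values; only API visible above is used):
  // documentsToNamedList and documentsFromNamedList are intended as inverses, so
  // collected inputs survive the shard round-trip:
  //   InputDocument doc = new InputDocument("doc-1", "English");
  //   doc.addClusteredField("title", "Apache Solr");
  //   List<InputDocument> copy = documentsFromNamedList(documentsToNamedList(List.of(doc)));
  //   // copy.get(0).getId() -> "doc-1", with its clustered fields preserved.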

  private static List<NamedList<Object>> clustersToNamedList(List<InputDocument> documents,
                                                             List<Cluster<InputDocument>> clusters,
                                                             EngineParameters params) {
    List<NamedList<Object>> result = new ArrayList<>();
    clustersToNamedListRecursive(clusters, result, params);

    if (params.includeOtherTopics()) {
      LinkedHashSet<InputDocument> clustered = new LinkedHashSet<>();
      clusters.forEach(cluster -> collectUniqueDocuments(cluster, clustered));
      List<InputDocument> unclustered = documents.stream()
          .filter(doc -> !clustered.contains(doc))
          .collect(Collectors.toList());

      if (!unclustered.isEmpty()) {
        NamedList<Object> cluster = new SimpleOrderedMap<>();
        result.add(cluster);
        cluster.add(ClusteringResponse.IS_OTHER_TOPICS, true);
        cluster.add(ClusteringResponse.LABELS_NODE, Collections.singletonList("Other topics"));
        cluster.add(ClusteringResponse.SCORE_NODE, 0d);
        cluster.add(ClusteringResponse.DOCS_NODE, unclustered.stream().map(InputDocument::getId)
            .collect(Collectors.toList()));
      }
    }
    return list;

    return result;
  }
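
  // A sketch of the resulting "clusters" response section. The node names below are
  // the ClusteringResponse constants used above; their exact string values and all
  // labels, scores and ids shown are assumptions for illustration only:
  //   clusters: [
  //     { labels: ["Data Mining"], score: 17.5, docs: ["doc-1", "doc-4"] },
  //     { labels: ["Other topics"], other-topics: true, score: 0.0, docs: ["doc-2"] }
  //   ]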

  private static void clustersToNamedListRecursive(
      List<Cluster<InputDocument>> outputClusters,
      List<NamedList<Object>> parent, EngineParameters params) {
    for (Cluster<InputDocument> cluster : outputClusters) {
      NamedList<Object> converted = new SimpleOrderedMap<>();
      parent.add(converted);

      // Add labels
      List<String> labels = cluster.getLabels();
      if (labels.size() > params.maxLabels()) {
        labels = labels.subList(0, params.maxLabels());
      }
      converted.add(ClusteringResponse.LABELS_NODE, labels);

      // Add cluster score
      final Double score = cluster.getScore();
      if (score != null) {
        converted.add(ClusteringResponse.SCORE_NODE, score);
      }

      List<InputDocument> docs;
      if (params.includeSubclusters()) {
        docs = cluster.getDocuments();
      } else {
        docs = new ArrayList<>(collectUniqueDocuments(cluster, new LinkedHashSet<>()));
      }

      converted.add(ClusteringResponse.DOCS_NODE, docs.stream().map(InputDocument::getId)
          .collect(Collectors.toList()));

      if (params.includeSubclusters() && !cluster.getClusters().isEmpty()) {
        List<NamedList<Object>> subclusters = new ArrayList<>();
        converted.add(ClusteringResponse.CLUSTERS_NODE, subclusters);
        clustersToNamedListRecursive(cluster.getClusters(), subclusters, params);
      }
    }
  }

  private static LinkedHashSet<InputDocument> collectUniqueDocuments(Cluster<InputDocument> cluster, LinkedHashSet<InputDocument> unique) {
    unique.addAll(cluster.getDocuments());
    for (Cluster<InputDocument> sub : cluster.getClusters()) {
      collectUniqueDocuments(sub, unique);
    }
    return unique;
  }

  @Override
  @SuppressWarnings({"rawtypes", "unchecked"})
  public void init(NamedList args) {
    this.initParams = args;
    super.init(args);

    if (args != null) {
      @SuppressWarnings("unchecked")
      NamedList<Object> initParams = (NamedList<Object>) args;
      for (Map.Entry<String, Object> entry : initParams) {
        if (!INIT_SECTION_ENGINE.equals(entry.getKey())) {
          throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
              "Unrecognized configuration entry: " + entry.getKey());
        }

        declaredEngines.add(new EngineEntry(((NamedList<Object>) entry.getValue()).toSolrParams()));
      }
    }
  }

  @SuppressWarnings("unchecked")
  @Override
  public void inform(SolrCore core) {
    if (initParams != null) {
      log.info("Initializing Clustering Engines");

      // Our target list of engines, split into search-results and document clustering.
      SolrResourceLoader loader = core.getResourceLoader();

      for (Map.Entry<String, Object> entry : initParams) {
        if ("engine".equals(entry.getKey())) {
          NamedList<Object> engineInitParams = (NamedList<Object>) entry.getValue();
          Boolean optional = engineInitParams.getBooleanArg("optional");
          optional = (optional == null ? Boolean.FALSE : optional);

          String engineClassName = StringUtils.defaultIfBlank(
              (String) engineInitParams.get("classname"),
              CarrotClusteringEngine.class.getName());

          // Instantiate the clustering engine and split to appropriate map.
          final ClusteringEngine engine = loader.newInstance(engineClassName, ClusteringEngine.class);
          final String name = StringUtils.defaultIfBlank(engine.init(engineInitParams, core), "");

          if (!engine.isAvailable()) {
            if (optional) {
              log.info("Optional clustering engine not available: {}", name);
            } else {
              throw new SolrException(ErrorCode.SERVER_ERROR,
                  "A required clustering engine failed to initialize, check the logs: " + name);
            }
          }

          final ClusteringEngine previousEntry;
          if (engine instanceof SearchClusteringEngine) {
            previousEntry = searchClusteringEngines.put(name, (SearchClusteringEngine) engine);
          } else if (engine instanceof DocumentClusteringEngine) {
            previousEntry = documentClusteringEngines.put(name, (DocumentClusteringEngine) engine);
          } else {
            log.warn("Unknown type of a clustering engine for class: {}", engineClassName);
            continue;
          }
          if (previousEntry != null) {
            log.warn("Duplicate clustering engine component named '{}'.", name);
    declaredEngines.forEach(engineEntry -> {
      if (!engineEntry.initialize(core)) {
        if (engineEntry.optional) {
          if (log.isInfoEnabled()) {
            log.info("Optional clustering engine is not available: {}", engineEntry.engineName);
          }
        } else {
          throw new SolrException(ErrorCode.SERVER_ERROR,
              "A required clustering engine failed to initialize, check the logs: " + engineEntry.engineName);
        }
      } else {
        if (engines.put(engineEntry.engineName, engineEntry) != null) {
          throw new SolrException(ErrorCode.SERVER_ERROR,
              String.format(Locale.ROOT,
                  "Duplicate clustering engine named '%s'.", engineEntry.engineName));
        }
      }
    });

      // Set up the default engine key for both types of engines.
      setupDefaultEngine("search results clustering", searchClusteringEngines);
      setupDefaultEngine("document clustering", documentClusteringEngines);

      log.info("Finished Initializing Clustering Engines");
    if (engines.size() > 0) {
      if (log.isInfoEnabled()) {
        log.info("The following clustering engines are available: {}",
            String.join(", ", engines.keySet()));
      }
    } else {
      log.warn("No clustering engines are available.");
    }
  }

  @Override
  public void prepare(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }
  public void prepare(ResponseBuilder rb) {
    // Do nothing.
  }

  /**
   * Entry point for clustering in local server mode (non-distributed).
   *
   * @param rb The {@link ResponseBuilder}.
   * @throws IOException Propagated if an I/O exception occurs.
   */
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
    if (!isComponentEnabled(rb)) {
      return;
    }

    final String name = getClusteringEngineName(rb);
    boolean useResults = params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false);
    if (useResults == true) {
      SearchClusteringEngine engine = searchClusteringEngines.get(name);
      if (engine != null) {
        checkAvailable(name, engine);
        DocListAndSet results = rb.getResults();
        Map<SolrDocument, Integer> docIds = new HashMap<>(results.docList.size());
        SolrDocumentList solrDocList = docListToSolrDocumentList(
            results.docList, rb.req.getSearcher(), engine.getFieldsToLoad(rb.req), docIds);
        Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
        rb.rsp.add("clusters", clusters);
      } else {
        log.warn("No engine named: {}", name);
      }
    EngineEntry engine = getEngine(rb);
    EngineParameters parameters = engine.defaults.derivedFrom(rb.req.getParams());

    List<InputDocument> inputs = getDocuments(rb, parameters);

    if (rb.req.getParams().getBool(ShardParams.IS_SHARD, false) &&
        rb.req.getParams().getBool(REQUEST_PARAM_COLLECT_INPUTS, false)) {
      rb.rsp.add(RESPONSE_SECTION_INPUT_DOCUMENTS, documentsToNamedList(inputs));
    } else {
      doCluster(rb, engine, inputs, parameters);
    }

    boolean useCollection = params.getBool(ClusteringParams.USE_COLLECTION, false);
    if (useCollection == true) {
      DocumentClusteringEngine engine = documentClusteringEngines.get(name);
      if (engine != null) {
        checkAvailable(name, engine);
        boolean useDocSet = params.getBool(ClusteringParams.USE_DOC_SET, false);
        NamedList<?> nl = null;

        // TODO: This likely needs to be made into a background task that runs in an executor
        if (useDocSet == true) {
          nl = engine.cluster(rb.getResults().docSet, params);
        } else {
          nl = engine.cluster(params);
        }
        rb.rsp.add("clusters", nl);
      } else {
        log.warn("No engine named: {}", name);
      }
    }
  }

  private void checkAvailable(String name, ClusteringEngine engine) {
    if (!engine.isAvailable()) {
      throw new SolrException(ErrorCode.SERVER_ERROR,
          "Clustering engine declared, but not available, check the logs: " + name);
    }
  }

  private String getClusteringEngineName(ResponseBuilder rb) {
    return rb.req.getParams().get(ClusteringParams.ENGINE_NAME, ClusteringEngine.DEFAULT_ENGINE_NAME);
  }

  @Override
  public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false) || !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
    if (!isComponentEnabled(rb)) {
      return;
    }
    sreq.params.remove(COMPONENT_NAME);
    if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
      String fl = sreq.params.get(CommonParams.FL, "*");
      // if fl=* then we don't need to check.
      if (fl.indexOf('*') >= 0) {
        return;
      }

      String name = getClusteringEngineName(rb);
      SearchClusteringEngine engine = searchClusteringEngines.get(name);
      if (engine != null) {
        checkAvailable(name, engine);
        Set<String> fields = engine.getFieldsToLoad(rb.req);
        if (fields == null || fields.size() == 0) {
          return;
        }

        StringBuilder sb = new StringBuilder();
        String[] flparams = fl.split("[,\\s]+");
        Set<String> flParamSet = new HashSet<>(flparams.length);
        for (String flparam : flparams) {
          // no need to trim() because of split() by \s+
          flParamSet.add(flparam);
        }
        for (String aFieldToLoad : fields) {
          if (!flParamSet.contains(aFieldToLoad)) {
            sb.append(',').append(aFieldToLoad);
          }
        }
        if (sb.length() > 0) {
          sreq.params.set(CommonParams.FL, fl + sb.toString());
        }
      } else {
        log.warn("No engine named: {}", name);
      }
    // Make sure the component is enabled for the shard request.
    assert sreq.params.getBool(COMPONENT_NAME, false) :
        "Shard request should propagate clustering component enabled state.";

    // Piggyback collecting inputs for clustering on top of the get-fields request.
    if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
      sreq.params.set(REQUEST_PARAM_COLLECT_INPUTS, true);
    }
  }

  @Override
  public void finishStage(ResponseBuilder rb) {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false) ||
        !params.getBool(ClusteringParams.USE_SEARCH_RESULTS, false)) {
    if (!isComponentEnabled(rb)) {
      return;
    }

    if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
      String name = getClusteringEngineName(rb);
      SearchClusteringEngine engine = searchClusteringEngines.get(name);
      if (engine != null) {
        checkAvailable(name, engine);
        SolrDocumentList solrDocList = (SolrDocumentList) rb.rsp.getResponse();
        // TODO: Currently, docIds is set to null in a distributed environment.
        // This means CarrotParams.PRODUCE_SUMMARY doesn't work.
        // To make CarrotParams.PRODUCE_SUMMARY work in distributed mode, we can choose either one of:
        // (a) In each shard, ClusteringComponent produces a summary and finishStage()
        //     merges these summaries.
        // (b) Adding a doHighlighting(SolrDocumentList, ...) method to SolrHighlighter and
        //     making SolrHighlighter use "external text" rather than stored values to produce snippets.
        Map<SolrDocument, Integer> docIds = null;
        Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
        rb.rsp.add("clusters", clusters);
      } else {
        log.warn("No engine named: {}", name);
      }
      List<InputDocument> inputs = new ArrayList<>();
      rb.finished.stream()
          .filter(shardRequest -> (shardRequest.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0)
          .flatMap(shardRequest -> shardRequest.responses.stream())
          .filter(rsp -> rsp.getException() == null)
          .map(rsp -> rsp.getSolrResponse().getResponse())
          .forEach(response -> {
            @SuppressWarnings("unchecked")
            List<NamedList<Object>> partialInputs = (List<NamedList<Object>>) response.get(RESPONSE_SECTION_INPUT_DOCUMENTS);
            if (partialInputs != null) {
              inputs.addAll(documentsFromNamedList(partialInputs));
            }
          });

      EngineEntry engine = getEngine(rb);
      EngineParameters parameters = engine.defaults.derivedFrom(rb.req.getParams());
      doCluster(rb, engine, inputs, parameters);
    }
  }
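
  // Descriptive summary of the distributed flow implemented above: each shard's
  // process() call serializes its matching documents into the "clustering-inputs"
  // response section; the coordinator's finishStage() merges those partial inputs
  // from all successful GET_FIELDS shard responses and clusters the combined set
  // once, so output clusters stay consistent with the non-distributed mode.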

  /**
   * @return Expose for tests.
   * Run clustering of input documents and append the result to the response.
   */
  Map<String, SearchClusteringEngine> getSearchClusteringEngines() {
    return searchClusteringEnginesView;
  }

  @Override
  public String getDescription() {
    return "A Clustering component";
  private void doCluster(ResponseBuilder rb, EngineEntry engine, List<InputDocument> inputs, EngineParameters parameters) {
    // log.warn("# CLUSTERING: " + inputs.size() + " document(s), contents:\n - "
    //    + inputs.stream().map(Object::toString).collect(Collectors.joining("\n - ")));
    List<Cluster<InputDocument>> clusters = engine.get().cluster(parameters, rb.getQuery(), inputs);
    rb.rsp.add(RESPONSE_SECTION_CLUSTERS, clustersToNamedList(inputs, clusters, parameters));
  }

  /**
   * Setup the default clustering engine.
   * @see "https://issues.apache.org/jira/browse/SOLR-5219"
   * Prepares input documents for clustering.
   */
  private static <T extends ClusteringEngine> void setupDefaultEngine(String type, LinkedHashMap<String, T> map) {
    // If there's already a default algorithm, leave it as is.
    String engineName = ClusteringEngine.DEFAULT_ENGINE_NAME;
    T defaultEngine = map.get(engineName);
  private List<InputDocument> getDocuments(ResponseBuilder responseBuilder,
                                           EngineParameters requestParameters) throws IOException {

    if (defaultEngine == null ||
        !defaultEngine.isAvailable()) {
      // If there's no default algorithm, and there are any algorithms available,
      // the first definition becomes the default algorithm.
      for (Map.Entry<String, T> e : map.entrySet()) {
        if (e.getValue().isAvailable()) {
          engineName = e.getKey();
          defaultEngine = e.getValue();
          map.put(ClusteringEngine.DEFAULT_ENGINE_NAME, defaultEngine);
          break;
    SolrQueryRequest solrRequest = responseBuilder.req;
    Query query = responseBuilder.getQuery();
    SolrIndexSearcher indexSearcher = responseBuilder.req.getSearcher();
    SolrCore core = solrRequest.getCore();
    String[] fieldsToCluster = requestParameters.fields().toArray(String[]::new);
    IndexSchema schema = indexSearcher.getSchema();

    boolean preferQueryContext = requestParameters.preferQueryContext();
    SolrQueryRequest req = null;
    SolrHighlighter highlighter = null;
    if (preferQueryContext) {
      highlighter = ((HighlightComponent) core.getSearchComponents().get(HighlightComponent.COMPONENT_NAME)).getHighlighter();
      if (highlighter != null) {
        Map<String, Object> args = new HashMap<>();
        args.put(HighlightParams.FIELDS, fieldsToCluster);
        args.put(HighlightParams.HIGHLIGHT, "true");
        // We don't want any highlight marks.
        args.put(HighlightParams.SIMPLE_PRE, "");
        args.put(HighlightParams.SIMPLE_POST, "");
        args.put(HighlightParams.FRAGSIZE, requestParameters.contextSize());
        args.put(HighlightParams.SNIPPETS, requestParameters.contextCount());
        req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
          @Override
          public SolrIndexSearcher getSearcher() {
            return indexSearcher;
          }
        };
      } else {
        log.warn("No highlighter configured, cannot produce summary");
        preferQueryContext = false;
      }
    }

    Map<String, Function<IndexableField, String>> fieldsToLoad = new LinkedHashMap<>();
    for (String fld : requestParameters.getFieldsToLoad()) {
      FieldType type = schema.getField(fld).getType();
      fieldsToLoad.put(fld, (fieldValue) -> type.toObject(fieldValue).toString());
    }

    Function<Map<String, String>, String> docLanguage;
    String languageField = requestParameters.languageField();
    if (languageField != null) {
      docLanguage = (doc) -> doc.getOrDefault(languageField, requestParameters.language());
    } else {
      docLanguage = (doc) -> requestParameters.language();
    }

    List<InputDocument> result = new ArrayList<>();
    DocIterator it = responseBuilder.getResults().docList.iterator();
    while (it.hasNext()) {
      int docId = it.nextDoc();

      Map<String, String> docFieldValues = new LinkedHashMap<>();
      for (IndexableField indexableField : indexSearcher.doc(docId, fieldsToLoad.keySet())) {
        String fieldName = indexableField.name();
        Function<IndexableField, String> toString = fieldsToLoad.get(fieldName);
        if (toString != null) {
          String value = toString.apply(indexableField);
          docFieldValues.compute(fieldName, (k, v) -> {
            if (v == null) {
              return value;
            } else {
              return v + " . " + value;
            }
          });
        }
      }

      InputDocument inputDocument = new InputDocument(
          docFieldValues.get(requestParameters.docIdField()),
          docLanguage.apply(docFieldValues));
      result.add(inputDocument);

      Function<String, String> snippetProvider = (field) -> null;
      if (preferQueryContext) {
        DocList docAsList = new DocSlice(0, 1,
            new int[]{docId},
            new float[]{1.0f},
            1,
            1.0f,
            TotalHits.Relation.EQUAL_TO);

        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, query, req, fieldsToCluster);
        if (highlights != null && highlights.size() == 1) {
          @SuppressWarnings("unchecked")
          NamedList<String[]> tmp = (NamedList<String[]>) highlights.getVal(0);
          snippetProvider = (field) -> {
            String[] values = tmp.get(field);
            if (values == null) {
              return null;
            } else {
              return String.join(" . ", Arrays.asList(values));
            }
          };
        }
      }

      Function<String, String> fullValueProvider = docFieldValues::get;

      for (String field : fieldsToCluster) {
        String values = snippetProvider.apply(field);
        if (values == null) {
          values = fullValueProvider.apply(field);
        }
        if (values != null) {
          inputDocument.addClusteredField(field, values);
        }
      }
    }

    if (defaultEngine != null) {
      if (log.isInfoEnabled()) {
        log.info("Default engine for {}: {} [{}]", type, engineName, defaultEngine.getClass().getSimpleName());
    return result;
  }

  private EngineEntry getEngine(ResponseBuilder rb) {
    if (engines.isEmpty()) {
      throw new SolrException(ErrorCode.SERVER_ERROR,
          "No clustering engines are defined or loaded.");
    }

    EngineEntry engine;
    String name = rb.req.getParams().get(REQUEST_PARAM_ENGINE, null);
    if (name != null) {
      engine = engines.get(name);
      if (engine == null) {
        throw new SolrException(ErrorCode.SERVER_ERROR,
            "Clustering engine unknown or not loaded: " + name);
      }
    } else {
      log.warn("No default engine for {}.", type);
      engine = engines.values().iterator().next();
    }
    return engine;
  }

  /**
   * @return A map of initialized clustering engines, exposed for tests only.
   */
  Set<String> getEngineNames() {
    return engines.keySet();
  }

  @Override
  public String getDescription() {
    return "Search results clustering component";
  }
}

@@ -1,41 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;

/**
 * A base class for {@link SearchClusteringEngine} and {@link DocumentClusteringEngine}.
 * @lucene.experimental
 */
public abstract class ClusteringEngine {
  public static final String ENGINE_NAME = "name";
  public static final String DEFAULT_ENGINE_NAME = "default";

  private String name;

  public String init(NamedList<?> config, SolrCore core) {
    name = (String) config.get(ENGINE_NAME);
    return name;
  }

  public String getName() {
    return name;
  }

  public abstract boolean isAvailable();
}

@@ -1,35 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;
/**
 * @lucene.experimental
 */
public interface ClusteringParams {

  public static final String CLUSTERING_PREFIX = "clustering.";

  public static final String ENGINE_NAME = CLUSTERING_PREFIX + "engine";

  public static final String USE_SEARCH_RESULTS = CLUSTERING_PREFIX + "results";

  public static final String USE_COLLECTION = CLUSTERING_PREFIX + "collection";

  /**
   * When clustering full documents, cluster on the Doc Set.
   */
  public static final String USE_DOC_SET = CLUSTERING_PREFIX + "docs.useDocSet";
}

@@ -1,47 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.search.DocSet;

/**
 * @lucene.experimental
 */
public abstract class DocumentClusteringEngine extends ClusteringEngine {

  /**
   * Experimental. Subject to change before the next release.
   *
   * Cluster all the documents in the index. Clustering is often an expensive task that can take a long time.
   * @param solrParams The params controlling clustering
   * @return The clustering results
   */
  public abstract NamedList<?> cluster(SolrParams solrParams);

  /**
   * Experimental. Subject to change before the next release.
   *
   * Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time.
   * @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}
   * @param solrParams The params controlling the clustering
   * @return The results.
   */
  public abstract NamedList<?> cluster(DocSet docs, SolrParams solrParams);

}

@@ -0,0 +1,195 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.core.SolrCore;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.language.LanguageComponents;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.invoke.MethodHandles;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Search results clustering engine based on Carrot2 clustering algorithms.
 *
 * @lucene.experimental
 * @see "https://project.carrot2.org"
 */
final class Engine {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * All resources required for the clustering engine.
   */
  private EngineContext engineContext;

  boolean init(String engineName, SolrCore core, EngineParameters defaultParams) {
    log.info("Initializing clustering engine: {}", engineName);

    this.engineContext = new EngineContext(defaultParams.resources(), core);

    {
      ClusteringAlgorithm defaultAlgorithm = engineContext.getAlgorithm(defaultParams.algorithmName());
      LanguageComponents defaultLanguage = engineContext.getLanguage(defaultParams.language());

      if (defaultAlgorithm == null) {
        log.warn("The default clustering algorithm for engine '{}' is not available: {}",
            engineName, defaultParams.algorithmName());
      }

      if (defaultLanguage == null) {
        log.warn("The default language for engine {} is not available: {}",
            engineName, defaultParams.language());
      }

      return (defaultAlgorithm != null && defaultLanguage != null);
    }
  }

  List<Cluster<InputDocument>> cluster(EngineParameters parameters, Query query, List<InputDocument> documents) {
    try {
      checkParameters(parameters);

      ClusteringAlgorithm algorithm = engineContext.getAlgorithm(parameters.algorithmName());
      populateAlgorithmParameters(query, parameters, algorithm);

      // Sort documents by ID so that results are not order-sensitive.
      documents.sort(Comparator.comparing(a -> a.getId().toString()));

      // Split documents into language groups.
      String defaultLanguage = parameters.language();
      Map<String, List<InputDocument>> documentsByLanguage =
          documents.stream()
              .collect(
                  Collectors.groupingBy(
                      doc -> {
                        String lang = doc.language();
                        return lang == null ? defaultLanguage : lang;
                      }));

      // Cluster documents within each language group.
      HashSet<String> warnOnce = new HashSet<>();
      LinkedHashMap<String, List<Cluster<InputDocument>>> clustersByLanguage =
          new LinkedHashMap<>();
      for (Map.Entry<String, List<InputDocument>> e : documentsByLanguage.entrySet()) {
        String lang = e.getKey();
        if (!engineContext.isLanguageSupported(lang)) {
          if (warnOnce.add(lang)) {
            log.warn(
                "Language '{}' is not supported, documents in this "
                    + "language will not be clustered.", lang);
          }
        } else {
          LanguageComponents langComponents = engineContext.getLanguage(lang);
          if (!algorithm.supports(langComponents)) {
            if (warnOnce.add(lang)) {
              log.warn(
                  "Language '{}' is not supported by algorithm '{}', documents in this "
                      + "language will not be clustered.", lang, parameters.algorithmName());
            }
          } else {
            clustersByLanguage.put(
                lang, algorithm.cluster(e.getValue().stream(), langComponents));
          }
        }
      }

      List<Cluster<InputDocument>> clusters;
      if (clustersByLanguage.size() == 1) {
        clusters = clustersByLanguage.values().iterator().next();
      } else {
        clusters = clustersByLanguage.entrySet().stream()
            .map(e -> {
              Cluster<InputDocument> cluster = new Cluster<>();
              cluster.addLabel(e.getKey());
              e.getValue().forEach(cluster::addCluster);
              return cluster;
            })
            .collect(Collectors.toList());
      }

      return clusters;
    } catch (Exception e) {
      log.error("Clustering request failed.", e);
      throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
    }
  }

  private void populateAlgorithmParameters(Query query, EngineParameters requestParameters, ClusteringAlgorithm algorithm) {
    LinkedHashMap<String, String> attrs = requestParameters.otherParameters();
    // Set the optional query hint. We extract just the terms.
    if (!attrs.containsKey("queryHint")) {
      Set<String> termSet = new LinkedHashSet<>();
      query.visit(new QueryVisitor() {
        @Override
        public void consumeTerms(Query query, Term... terms) {
          for (Term t : terms) {
            termSet.add(t.text());
          }
        }
      });
      attrs.put("queryHint", String.join(" ", termSet));
    }
    algorithm.accept(new FlatKeysAttrVisitor(attrs));
  }
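
  // Illustrative example (hypothetical query): for a parsed query such as
  //   title:(solr clustering)
  // the QueryVisitor above collects the terms "solr" and "clustering", and the
  // "queryHint" attribute handed to the algorithm becomes "solr clustering".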

  private void checkParameters(EngineParameters parameters) {
    ClusteringAlgorithm algorithm = engineContext.getAlgorithm(parameters.algorithmName());
    if (algorithm == null) {
      throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
          "Algorithm '%s' not found.",
          parameters.algorithmName()));
    }

    String defaultLanguage = parameters.language();
    LanguageComponents languageComponents = engineContext.getLanguage(defaultLanguage);
    if (languageComponents == null) {
      throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
          "Language '%s' is not supported.",
          defaultLanguage));
    }

    if (!algorithm.supports(languageComponents)) {
      throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
          "Language '%s' is not supported by algorithm '%s'.",
          defaultLanguage,
          parameters.algorithmName()));
    }

    if (parameters.fields().isEmpty()) {
      throw new SolrException(ErrorCode.BAD_REQUEST, String.format(Locale.ROOT,
          "At least one field name specifying content for clustering is required in parameter '%s'.",
          EngineParameters.PARAM_FIELDS));
    }
  }
}

@@ -0,0 +1,177 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.apache.solr.core.SolrCore;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.ClusteringAlgorithmProvider;
import org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.clustering.stc.STCClusteringAlgorithm;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LanguageComponentsLoader;
import org.carrot2.language.LoadedLanguages;
import org.carrot2.util.ChainedResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.ServiceLoader;
import java.util.function.Supplier;
import java.util.stream.Collectors;

/**
 * Clustering engine context: algorithms, preloaded language
 * resources and initial validation.
 */
final class EngineContext {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  private final LinkedHashMap<String, LanguageComponents> languages;
  private final Map<String, ClusteringAlgorithmProvider> algorithmProviders;

  private final static Map<String, String> aliasedNames;

  static {
    aliasedNames = new HashMap<>();
    aliasedNames.put(LingoClusteringAlgorithm.class.getName(), LingoClusteringAlgorithm.NAME);
    aliasedNames.put(STCClusteringAlgorithm.class.getName(), STCClusteringAlgorithm.NAME);
    aliasedNames.put(BisectingKMeansClusteringAlgorithm.class.getName(), BisectingKMeansClusteringAlgorithm.NAME);
  }

  EngineContext(String resourcesPath, SolrCore core) {
    LanguageComponentsLoader loader = LanguageComponents.loader();

    List<Path> resourceLocations = new ArrayList<>();

    Path configDir = Paths.get(core.getResourceLoader().getConfigDir());
    if (resourcesPath != null && !resourcesPath.trim().isEmpty()) {
      configDir = configDir.resolve(resourcesPath);
      resourceLocations.add(configDir);
    }

    if (!resourceLocations.isEmpty()) {
      log.info(
          "Clustering algorithm resources first looked up relative to: {}", resourceLocations);

      loader.withResourceLookup(
          (provider) ->
              new ChainedResourceLookup(
                  Arrays.asList(
                      new PathResourceLookup(resourceLocations),
                      provider.defaultResourceLookup())));
    } else {
      log.info("Resources read from defaults (JARs).");
    }

    ClassLoader classLoader = getClass().getClassLoader();
    algorithmProviders =
        ServiceLoader.load(ClusteringAlgorithmProvider.class, classLoader)
            .stream()
            .map(ServiceLoader.Provider::get)
            .collect(Collectors.toMap(ClusteringAlgorithmProvider::name, e -> e));

    // Only load the resources of algorithms we're interested in.
    loader.limitToAlgorithms(
        algorithmProviders.values().stream()
            .map(Supplier::get)
            .toArray(ClusteringAlgorithm[]::new));

    languages = new LinkedHashMap<>();
    try {
      LoadedLanguages loadedLanguages = loader.load();
      for (String lang : loadedLanguages.languages()) {
        languages.put(lang, loadedLanguages.language(lang));
      }
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }

    // Debug info about loaded languages.
    for (String lang : languages.keySet()) {
      if (log.isTraceEnabled()) {
        log.trace(
            "Loaded language '{}' with components:\n - {}",
            lang,
            languages.get(lang).components().stream()
                .map(Class::getSimpleName)
                .collect(Collectors.joining("\n - ")));
      }
    }

    // Remove algorithms for which there are no languages that are supported.
    algorithmProviders
        .entrySet()
        .removeIf(e -> !isAlgorithmAvailable(e.getValue(), languages.values()));

    algorithmProviders.forEach(
        (name, prov) -> {
          String supportedLanguages =
              languages.values().stream()
                  .filter(lc -> prov.get().supports(lc))
                  .map(LanguageComponents::language)
                  .collect(Collectors.joining(", "));

          log.info(
              "Clustering algorithm {} loaded with support for the following languages: {}",
              name,
              supportedLanguages);
        });
  }

  ClusteringAlgorithm getAlgorithm(String algorithmName) {
    if (!algorithmProviders.containsKey(algorithmName)
        && aliasedNames.containsKey(algorithmName)) {
      algorithmName = aliasedNames.get(algorithmName);
    }

    ClusteringAlgorithmProvider provider = algorithmProviders.get(algorithmName);
    return provider == null ? null : provider.get();
  }
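
  // Illustrative note: via the alias map above, a legacy configuration that names
  // the algorithm by its fully qualified class, e.g.
  //   "org.carrot2.clustering.lingo.LingoClusteringAlgorithm",
  // resolves to the same provider as the short name in LingoClusteringAlgorithm.NAME,
  // so both spellings select the same algorithm.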

  LanguageComponents getLanguage(String language) {
    return languages.get(language);
  }

  boolean isLanguageSupported(String language) {
    return languages.containsKey(language);
  }

  private boolean isAlgorithmAvailable(
      ClusteringAlgorithmProvider provider, Collection<LanguageComponents> languages) {
    ClusteringAlgorithm algorithm = provider.get();
    Optional<LanguageComponents> first = languages.stream().filter(algorithm::supports).findFirst();
    if (first.isEmpty()) {
      log.warn("Algorithm does not support any of the available languages: {}", provider.name());
      return false;
    } else {
      return true;
    }
  }
}
@ -0,0 +1,80 @@
|
|||
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.SchemaField;

import java.util.function.Supplier;

/**
 * Parses the configuration and initialization parameters of a single
 * clustering engine.
 */
final class EngineEntry implements Supplier<Engine> {
  /**
   * Marks the engine as optional (skipped if unavailable).
   */
  private static final String PARAM_OPTIONAL = "optional";

  /**
   * Unique engine name parameter.
   */
  private static final String PARAM_NAME = "name";

  final boolean optional;
  final String engineName;
  final EngineParameters defaults;

  /**
   * Preinitialized instance of a clustering engine.
   */
  private Engine engine;

  /**
   * {@code true} if the engine has been initialized properly and is available.
   */
  private boolean available;

  EngineEntry(SolrParams params) {
    this.optional = params.getBool(PARAM_OPTIONAL, false);
    this.engineName = params.get(PARAM_NAME, "");

    defaults = new EngineParameters(params);
  }

  boolean initialize(SolrCore core) {
    SchemaField uniqueField = core.getLatestSchema().getUniqueKeyField();
    if (uniqueField == null) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          ClusteringComponent.class.getSimpleName() + " requires the declaration of uniqueKeyField in the schema.");
    }
    String docIdField = uniqueField.getName();
    defaults.setDocIdField(docIdField);

    engine = new Engine();
    available = engine.init(engineName, core, defaults);
    return available;
  }

  @Override
  public Engine get() {
    return engine;
  }
}
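
An EngineEntry is driven entirely by SolrParams. A minimal same-package sketch (illustrative only, not part of this commit; the engine name and field list are assumed values):

package org.apache.solr.handler.clustering;

import org.apache.solr.common.params.ModifiableSolrParams;

public class EngineEntryExample {
  public static void main(String[] args) {
    // Equivalent of an engine definition's init parameters in solrconfig.xml.
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("name", "lingo");      // hypothetical engine name
    params.set("optional", "true");   // tolerate a missing or unavailable algorithm
    params.set("clustering.fields", "title,snippet");

    EngineEntry entry = new EngineEntry(params);
    System.out.println(entry.engineName + ", optional=" + entry.optional);
  }
}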
@@ -0,0 +1,353 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.apache.commons.lang3.StringUtils;
import org.apache.solr.common.params.SolrParams;

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Set;

/**
 * {@link Engine} configuration parameters (and other parameters that
 * may tweak clustering algorithms on a per-request basis).
 *
 * @lucene.experimental
 */
public final class EngineParameters implements Cloneable {
  /**
   * Common prefix for configuration of engine settings.
   */
  private static final String PARAM_PREFIX = "clustering.";

  /**
   * @see #algorithmName()
   */
  public static final String PARAM_ALGORITHM = PARAM_PREFIX + "algorithm";

  /**
   * @see #maxLabels()
   */
  public static final String PARAM_MAX_LABELS = PARAM_PREFIX + "maxLabels";

  /**
   * @see #includeSubclusters()
   */
  public static final String PARAM_INCLUDE_SUBCLUSTERS = PARAM_PREFIX + "includeSubclusters";

  /**
   * @see #includeOtherTopics()
   */
  public static final String PARAM_INCLUDE_OTHER_TOPICS = PARAM_PREFIX + "includeOtherTopics";

  /**
   * @see #language()
   */
  public static final String PARAM_LANGUAGE = PARAM_PREFIX + "language";

  /**
   * @see #languageField()
   */
  public static final String PARAM_LANGUAGE_FIELD = PARAM_PREFIX + "languageField";

  /**
   * @see #resources()
   */
  public static final String PARAM_RESOURCES = PARAM_PREFIX + "resources";

  /**
   * @see #fields()
   */
  public static final String PARAM_FIELDS = PARAM_PREFIX + "fields";

  /**
   * @see #preferQueryContext()
   */
  public static final String PARAM_PREFER_QUERY_CONTEXT = PARAM_PREFIX + "preferQueryContext";

  /**
   * @see #contextSize()
   */
  public static final String PARAM_CONTEXT_SIZE = PARAM_PREFIX + "contextSize";

  /**
   * @see #contextCount()
   */
  public static final String PARAM_CONTEXT_COUNT = PARAM_PREFIX + "contextCount";

  /**
   * @see #PARAM_MAX_LABELS
   */
  private int maxLabels = Integer.MAX_VALUE;

  /**
   * @see #PARAM_INCLUDE_SUBCLUSTERS
   */
  private boolean includeSubclusters = true;

  /**
   * @see #PARAM_INCLUDE_OTHER_TOPICS
   */
  private boolean includeOtherTopics = true;

  /**
   * @see #PARAM_ALGORITHM
   */
  private String algorithmName;

  /**
   * @see #PARAM_RESOURCES
   */
  private String resources;

  /**
   * @see #PARAM_LANGUAGE
   */
  private String language = "English";

  /**
   * @see #PARAM_LANGUAGE_FIELD
   */
  private String languageField;

  /**
   * @see #PARAM_PREFER_QUERY_CONTEXT
   */
  private boolean preferQueryContext;

  /**
   * @see #PARAM_CONTEXT_SIZE
   */
  private int contextSize = 80 * 4;

  /**
   * @see #PARAM_CONTEXT_COUNT
   */
  private int contextCount = 3;

  /**
   * @see #PARAM_FIELDS
   */
  private LinkedHashSet<String> fields = new LinkedHashSet<>();

  /**
   * Non-engine configuration parameters (algorithm parameters).
   */
  private LinkedHashMap<String, String> otherParameters = new LinkedHashMap<>();

  /**
   * Unique-value document identifier field. This is required for clustering since clusters
   * only reference documents by their ID field's value.
   */
  private String docIdField;

  EngineParameters(SolrParams params) {
    extractFrom(params);
  }

  /**
   * Extract parameter values from the given {@link SolrParams}.
   */
  private EngineParameters extractFrom(SolrParams params) {
    params.stream().forEachOrdered(e -> {
      switch (e.getKey()) {
        case PARAM_MAX_LABELS:
          maxLabels = params.getInt(PARAM_MAX_LABELS);
          break;
        case PARAM_INCLUDE_SUBCLUSTERS:
          includeSubclusters = params.getBool(PARAM_INCLUDE_SUBCLUSTERS);
          break;
        case PARAM_INCLUDE_OTHER_TOPICS:
          includeOtherTopics = params.getBool(PARAM_INCLUDE_OTHER_TOPICS);
          break;
        case PARAM_ALGORITHM:
          algorithmName = params.get(PARAM_ALGORITHM);
          break;
        case PARAM_RESOURCES:
          resources = params.get(PARAM_RESOURCES);
          break;
        case PARAM_LANGUAGE:
          language = params.get(PARAM_LANGUAGE);
          break;
        case PARAM_LANGUAGE_FIELD:
          languageField = params.get(PARAM_LANGUAGE_FIELD);
          break;
        case PARAM_PREFER_QUERY_CONTEXT:
          preferQueryContext = params.getBool(PARAM_PREFER_QUERY_CONTEXT);
          break;
        case PARAM_CONTEXT_COUNT:
          contextCount = params.getPrimitiveInt(PARAM_CONTEXT_COUNT);
          break;
        case PARAM_CONTEXT_SIZE:
          contextSize = params.getPrimitiveInt(PARAM_CONTEXT_SIZE);
          break;
        case PARAM_FIELDS:
          fields.addAll(Arrays.asList(params.get(PARAM_FIELDS).split("[,]\\s*")));
          break;
        default:
          // Unrecognized parameter. Preserve it.
          String[] value = e.getValue();
          if (value != null) {
            if (value.length == 1) {
              otherParameters.put(e.getKey(), value[0]);
            } else {
              otherParameters.put(e.getKey(), String.join(", ", value));
            }
          }
          break;
      }
    });
    return this;
  }

  /**
   * @return Maximum number of returned cluster labels (even if the algorithm
   * returns more).
   */
  int maxLabels() {
    return maxLabels;
  }

  /**
   * @return If {@code true}, include subclusters in the response (if the algorithm
   * produces hierarchical clustering).
   */
  boolean includeSubclusters() {
    return includeSubclusters;
  }

  /**
   * @return If {@code true}, include a synthetic cluster called "Other Topics" that
   * consists of all documents not assigned to any other cluster.
   */
  boolean includeOtherTopics() {
    return includeOtherTopics;
  }

  /**
   * @return Name of the clustering algorithm to use (as loaded via the service
   * extension point {@link org.carrot2.clustering.ClusteringAlgorithm}).
   */
  String algorithmName() {
    return algorithmName;
  }

  /**
   * @return Path to additional language resources, relative to the Solr
   * component configuration.
   */
  String resources() {
    return resources;
  }

  /**
   * @return Name of the default language to use for clustering. The corresponding
   * {@link org.carrot2.language.LanguageComponents} must be available (loaded via
   * service provider extension).
   */
  String language() {
    return language;
  }

  /**
   * @return Name of the field that carries each document's language. A {@code null} value
   * means all documents will be clustered according to the default {@link #language()}.
   * If not {@code null} and the document's field has a missing value, it will be clustered
   * using the default {@link #language()} as well.
   */
  String languageField() {
    return languageField;
  }

  /**
   * @return Names of all fields whose textual content will be passed to the clustering
   * engine (comma-separated in the input parameter).
   */
  Set<String> fields() {
    return fields;
  }

  /**
   * @return {@code true} if clustering should try to extract context fragments
   * around the matching query regions rather than use full field content. Such context snippets
   * typically cluster well because they carry more compact, query-related information.
   */
  boolean preferQueryContext() {
    return preferQueryContext;
  }

  /**
   * @return The maximum query context window to use if {@link #preferQueryContext()} is {@code true}.
   */
  int contextSize() {
    return contextSize;
  }

  /**
   * @return The maximum number of different, non-contiguous query context snippets from a single field
   * if {@link #preferQueryContext()} is {@code true}.
   */
  int contextCount() {
    return contextCount;
  }

  LinkedHashMap<String, String> otherParameters() {
    return otherParameters;
  }

  @Override
  protected EngineParameters clone() {
    try {
      EngineParameters clone = (EngineParameters) super.clone();
      // Deep-copy mutable collections; Object.clone() only copies references.
      clone.otherParameters = new LinkedHashMap<>(this.otherParameters);
      clone.fields = new LinkedHashSet<>(this.fields);
      return clone;
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * @return A copy of this object with any parameters present in
   * {@code params} overriding its defaults.
   */
  EngineParameters derivedFrom(SolrParams params) {
    EngineParameters cloned = this.clone();
    cloned.extractFrom(params);
    return cloned;
  }

  String docIdField() {
    return Objects.requireNonNull(docIdField);
  }

  void setDocIdField(String docIdField) {
    this.docIdField = Objects.requireNonNull(docIdField);
  }

  Set<String> getFieldsToLoad() {
    Set<String> fields = new LinkedHashSet<>(fields());
    fields.add(docIdField());
    String languageField = languageField();
    if (StringUtils.isNotBlank(languageField)) {
      fields.add(languageField);
    }
    return fields;
  }
}
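
Engine-level defaults and per-request overrides compose through derivedFrom. A same-package sketch (illustrative only, not part of this commit; package-private constructors and accessors are reachable from this package):

package org.apache.solr.handler.clustering;

import org.apache.solr.common.params.ModifiableSolrParams;

public class EngineParametersExample {
  public static void main(String[] args) {
    // Defaults, e.g. taken from the engine's init parameters.
    ModifiableSolrParams defaults = new ModifiableSolrParams();
    defaults.set(EngineParameters.PARAM_FIELDS, "title,snippet");
    defaults.set(EngineParameters.PARAM_MAX_LABELS, "5");
    EngineParameters engineDefaults = new EngineParameters(defaults);

    // Request-time parameters override the defaults; everything else is kept.
    ModifiableSolrParams request = new ModifiableSolrParams();
    request.set(EngineParameters.PARAM_MAX_LABELS, "3");
    EngineParameters perRequest = engineDefaults.derivedFrom(request);

    System.out.println(perRequest.maxLabels()); // 3
    System.out.println(perRequest.fields());    // [title, snippet]
  }
}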
@@ -0,0 +1,194 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.carrot2.attrs.AcceptingVisitor;
import org.carrot2.attrs.AliasMapper;
import org.carrot2.attrs.AttrBoolean;
import org.carrot2.attrs.AttrDouble;
import org.carrot2.attrs.AttrEnum;
import org.carrot2.attrs.AttrInteger;
import org.carrot2.attrs.AttrObject;
import org.carrot2.attrs.AttrObjectArray;
import org.carrot2.attrs.AttrString;
import org.carrot2.attrs.AttrStringArray;
import org.carrot2.attrs.AttrVisitor;

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Objects;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Function;

/**
 * {@link AttrVisitor} that responds to "flattened" key paths and values, updating
 * corresponding algorithm parameters with values contained in the map.
 */
class FlatKeysAttrVisitor implements AttrVisitor {
  final Function<String, Object> classToInstance = AliasMapper.SPI_DEFAULTS::fromName;
  final ArrayDeque<String> keyPath = new ArrayDeque<>();

  final LinkedHashMap<String, String> attrs;

  /**
   * @param attrs A map of attributes to set. Note the map has ordered keys:
   *              this is required for complex sub-types so that instantiation of
   *              a value precedes setting its attributes.
   */
  FlatKeysAttrVisitor(LinkedHashMap<String, String> attrs) {
    this.attrs = attrs;
  }

  @Override
  public void visit(String key, AttrBoolean attr) {
    ifKeyExists(key, (path, value) -> {
      attr.set(value == null ? null : Boolean.parseBoolean(value));
    });
  }

  @Override
  public void visit(String key, AttrInteger attr) {
    ifKeyExists(key, (path, value) -> {
      attr.set(value == null ? null : Integer.parseInt(value));
    });
  }

  @Override
  public void visit(String key, AttrDouble attr) {
    ifKeyExists(key, (path, value) -> {
      attr.set(value == null ? null : Double.parseDouble(value));
    });
  }

  @Override
  public void visit(String key, AttrString attr) {
    ifKeyExists(key, (path, value) -> {
      attr.set(value);
    });
  }

  @Override
  public void visit(String key, AttrStringArray attr) {
    ifKeyExists(key, (path, value) -> {
      if (value == null) {
        attr.set(new String[0]);
      } else {
        attr.set(value.split(",\\s*"));
      }
    });
  }

  @Override
  public <T extends Enum<T>> void visit(String key, AttrEnum<T> attr) {
    ifKeyExists(key, (path, value) -> {
      try {
        attr.set(Enum.valueOf(attr.enumClass(), value));
      } catch (IllegalArgumentException e) {
        throw new IllegalArgumentException(
            String.format(
                Locale.ROOT,
                "Value at key '%s' should be an enum constant of class '%s', but no such " +
                    "constant exists: '%s' (available constants: %s)",
                key,
                attr.enumClass().getSimpleName(),
                toDebugString(value),
                EnumSet.allOf(attr.enumClass())));
      }
    });
  }

  @Override
  public <T extends AcceptingVisitor> void visit(String key, AttrObject<T> attr) {
    ifKeyExists(key, (path, value) -> {
      if (value == null) {
        attr.set(null);
      } else {
        T t = safeCast(classToInstance.apply(value), key, attr.getInterfaceClass());
        attr.set(t);
      }
    });

    T t = attr.get();
    if (t != null) {
      withKey(key, path -> {
        t.accept(this);
      });
    }
  }

  @Override
  public <T extends AcceptingVisitor> void visit(String key, AttrObjectArray<T> attr) {
    ifKeyExists(key, (path, value) -> {
      throw new RuntimeException("Setting arrays of objects not implemented for attribute: "
          + key + " (" + attr.getDescription() + ")");
    });
  }

  private <T> T safeCast(Object value, String key, Class<T> clazz) {
    if (value == null) {
      return null;
    } else {
      if (!clazz.isInstance(value)) {
        throw new IllegalArgumentException(
            String.format(
                Locale.ROOT,
                "Value at key '%s' should be an instance of '%s', but encountered class '%s': '%s'",
                key,
                clazz.getSimpleName(),
                value.getClass().getSimpleName(),
                toDebugString(value)));
      }
      return clazz.cast(value);
    }
  }

  private String toDebugString(Object value) {
    if (value == null) {
      return "[null]";
    } else if (value instanceof Object[]) {
      return Arrays.deepToString(((Object[]) value));
    } else {
      return Objects.toString(value);
    }
  }

  private void withKey(String key, Consumer<String> pathConsumer) {
    keyPath.addLast(key);
    try {
      String path = String.join(".", keyPath);
      pathConsumer.accept(path);
    } finally {
      keyPath.removeLast();
    }
  }

  private void ifKeyExists(String key, BiConsumer<String, String> pathConsumer) {
    withKey(key, (path) -> {
      if (attrs.containsKey(path)) {
        String value = attrs.get(path);
        if (value.trim().isEmpty()) {
          value = null;
        }
        pathConsumer.accept(path, value);
      }
    });
  }
}
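
The visitor is applied by handing it to an algorithm's attribute tree. A same-package sketch (illustrative only, not part of this commit; the flattened attribute path "preprocessing.wordDfThreshold" is an assumed example whose exact name depends on the chosen algorithm):

package org.apache.solr.handler.clustering;

import java.util.LinkedHashMap;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;

public class FlatKeysExample {
  public static void main(String[] args) {
    LinkedHashMap<String, String> attrs = new LinkedHashMap<>();
    attrs.put("preprocessing.wordDfThreshold", "2");

    // The algorithm walks its attribute tree; the visitor intercepts matching
    // flattened key paths and applies the parsed values.
    LingoClusteringAlgorithm algorithm = new LingoClusteringAlgorithm();
    algorithm.accept(new FlatKeysAttrVisitor(attrs));
  }
}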
@@ -0,0 +1,67 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.carrot2.clustering.Document;

import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.BiConsumer;
import java.util.stream.Collectors;

/**
 * Representation of a single logical "document" for clustering.
 */
final class InputDocument implements Document {
  private final Object id;
  private final Map<String, String> clusteredFields = new LinkedHashMap<>();
  private final String language;

  InputDocument(Object docId, String language) {
    this.id = Objects.requireNonNull(docId);
    this.language = language;
  }

  @Override
  public void visitFields(BiConsumer<String, String> fieldConsumer) {
    clusteredFields.forEach(fieldConsumer);
  }

  Object getId() {
    return id;
  }

  String language() {
    return language;
  }

  void addClusteredField(String fieldName, String fieldValue) {
    assert !clusteredFields.containsKey(fieldName);
    clusteredFields.put(fieldName, fieldValue);
  }

  @Override
  public String toString() {
    return String.format(Locale.ROOT,
        "doc[%s, lang=%s, fields=%s]",
        getId(),
        language,
        clusteredFields.entrySet().stream()
            .map(e -> e.getKey() + ": " + e.getValue())
            .collect(Collectors.joining(", ")));
  }
}
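
A same-package sketch of how such documents are assembled and later consumed by Carrot2 (illustrative only, not part of this commit; the identifier and field values are assumed):

package org.apache.solr.handler.clustering;

public class InputDocumentExample {
  public static void main(String[] args) {
    InputDocument doc = new InputDocument("doc-1", "English");
    doc.addClusteredField("title", "Solr result clustering");
    doc.addClusteredField("snippet", "Groups search results into topics.");

    // Carrot2 algorithms pull field content through this visitor.
    doc.visitFields((field, value) -> System.out.println(field + " => " + value));
  }
}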
@@ -0,0 +1,80 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.carrot2.util.ResourceLookup;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Carrot2 resource provider resolving against the provided list of filesystem paths.
 */
final class PathResourceLookup implements ResourceLookup {
  private final List<Path> locations;

  PathResourceLookup(List<Path> locations) {
    if (locations == null || locations.isEmpty()) {
      throw new RuntimeException("At least one resource location is required.");
    }
    this.locations = locations;
  }

  @Override
  public InputStream open(String resource) throws IOException {
    Path p = locate(resource);
    if (p == null) {
      throw new IOException(
          "Resource "
              + resource
              + " not found relative to: "
              + locations.stream()
                  .map(path -> path.toAbsolutePath().toString())
                  .collect(Collectors.joining(", ")));
    }
    return new BufferedInputStream(Files.newInputStream(p));
  }

  @Override
  public boolean exists(String resource) {
    return locate(resource) != null;
  }

  @Override
  public String pathOf(String resource) {
    return "["
        + locations.stream()
            .map(path -> path.resolve(resource).toAbsolutePath().toString())
            .collect(Collectors.joining(" | "))
        + "]";
  }

  private Path locate(String resource) {
    for (Path base : locations) {
      Path p = base.resolve(resource);
      if (Files.exists(p)) {
        return p;
      }
    }
    return null;
  }
}
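
A same-package sketch of resolving a resource against an ordered list of directories (illustrative only, not part of this commit; the directory and resource names are assumed):

package org.apache.solr.handler.clustering;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Paths;
import java.util.Arrays;

public class PathResourceLookupExample {
  public static void main(String[] args) throws IOException {
    PathResourceLookup lookup =
        new PathResourceLookup(Arrays.asList(Paths.get("conf/clustering/resources")));

    // Resources resolve against each base path, in order; the first hit wins.
    if (lookup.exists("stopwords.en")) {
      try (InputStream is = lookup.open("stopwords.en")) {
        System.out.println("Read " + is.readAllBytes().length + " bytes");
      }
    }
  }
}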
@@ -1,52 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import java.util.Map;
import java.util.Set;

import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;

/**
 * Base class for clustering engines performing cluster analysis on search
 * results.
 *
 * @lucene.experimental
 */
public abstract class SearchClusteringEngine extends ClusteringEngine {
  /**
   * Do the clustering, return a clusters structure to be appended to
   * {@link SolrQueryResponse}.
   */
  public abstract Object cluster(Query query, SolrDocumentList solrDocumentList,
      Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq);

  /**
   * Returns the set of field names to load.
   * Concrete classes can override this method if needed.
   * Default implementation returns null, that is, all stored fields are loaded.
   *
   * @return The set of field names to load.
   */
  protected Set<String> getFieldsToLoad(SolrQueryRequest sreq) {
    return null;
  }
}
@@ -1,565 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.function.Supplier;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TotalHits;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.SuppressForbidden;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.shaded.guava.common.base.MoreObjects;
import org.carrot2.shaded.guava.common.base.Strings;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
import org.carrot2.util.attribute.AttributeValueSet;
import org.carrot2.util.attribute.AttributeValueSets;
import org.carrot2.util.resource.ClassLoaderLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Search results clustering engine based on Carrot2 clustering algorithms.
 *
 * @see "http://project.carrot2.org"
 * @lucene.experimental
 */
public class CarrotClusteringEngine extends SearchClusteringEngine {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /**
   * The subdirectory in Solr config dir to read customized Carrot2 resources from.
   */
  static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";

  /**
   * Name of Carrot2 document's field containing Solr document's identifier.
   */
  private static final String SOLR_DOCUMENT_ID = "solrId";

  /**
   * Name of Solr document's field containing the document's identifier. To avoid
   * repeating the content of documents in clusters on output, each cluster contains
   * identifiers of documents it contains.
   */
  private String idFieldName;

  /**
   * Carrot2 controller that manages instances of clustering algorithms
   */
  private Controller controller = ControllerFactory.createPooling();

  /**
   * {@link IClusteringAlgorithm} class used for actual clustering.
   */
  private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;

  /** Solr core we're bound to. */
  private SolrCore core;

  @Override
  public boolean isAvailable() {
    return clusteringAlgorithmClass != null;
  }

  @Override
  @SuppressWarnings("rawtypes")
  public String init(NamedList config, final SolrCore core) {
    this.core = core;

    String result = super.init(config, core);
    final SolrParams initParams = config.toSolrParams();

    // Initialization attributes for Carrot2 controller.
    HashMap<String, Object> initAttributes = new HashMap<>();

    // Customize Carrot2's resource lookup to first look for resources
    // using Solr's resource loader. If that fails, try loading from the classpath.
    ResourceLookup resourceLookup = new ResourceLookup(
        // Solr-specific resource loading.
        new SolrResourceLocator(core, initParams),
        // Using the class loader directly because this time we want to omit the prefix
        new ClassLoaderLocator(core.getResourceLoader().getClassLoader()));

    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
        .resourceLookup(resourceLookup);

    // Make sure the requested Carrot2 clustering algorithm class is available
    String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
    try {
      this.clusteringAlgorithmClass = core.getResourceLoader().findClass(
          carrotAlgorithmClassName, IClusteringAlgorithm.class);
    } catch (SolrException s) {
      if (!(s.getCause() instanceof ClassNotFoundException)) {
        throw s;
      }
    }

    // Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
    // of this component. This by-name convention lookup is used to simplify configuring algorithms.
    String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
    if (log.isInfoEnabled()) {
      log.info("Initializing Clustering Engine '{}'", MoreObjects.firstNonNull(componentName, "<no 'name' attribute>"));
    }

    if (!Strings.isNullOrEmpty(componentName)) {
      IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
      if (attributeXmls.length > 0) {
        if (attributeXmls.length > 1) {
          log.warn("More than one attribute file found, first one will be used: {}"
              , Arrays.toString(attributeXmls)); // nowarn
        }

        withContextClassLoader(core.getResourceLoader().getClassLoader(), () -> {
          try {
            AttributeValueSets avs = AttributeValueSets.deserialize(attributeXmls[0].open());
            AttributeValueSet defaultSet = avs.getDefaultAttributeValueSet();
            initAttributes.putAll(defaultSet.getAttributeValues());
          } catch (Exception e) {
            throw new SolrException(ErrorCode.SERVER_ERROR,
                "Could not read attributes XML for clustering component: " + componentName, e);
          }
          return null;
        });
      }
    }

    // Extract solrconfig attributes, they take precedence.
    extractCarrotAttributes(initParams, initAttributes);

    // Customize the stemmer and tokenizer factories. The implementations we provide here
    // are included in the code base of Solr, so that it's possible to refactor
    // the Lucene APIs the factories rely on if needed.
    // Additionally, we set a custom lexical resource factory for Carrot2 that
    // will use both Carrot2 default stop words as well as stop words from
    // the StopFilter defined on the field.
    final AttributeBuilder attributeBuilder = BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes);
    attributeBuilder.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
    if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.TOKENIZER_FACTORY)) {
      attributeBuilder.tokenizerFactory(LuceneCarrot2TokenizerFactory.class);
    }
    if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.STEMMER_FACTORY)) {
      attributeBuilder.stemmerFactory(LuceneCarrot2StemmerFactory.class);
    }

    // Pass the schema (via the core) to SolrStopwordsCarrot2LexicalDataFactory.
    initAttributes.put("solrCore", core);

    // Carrot2 uses current thread's context class loader to get
    // certain classes (e.g. custom tokenizer/stemmer) at initialization time.
    // To make sure classes from contrib JARs are available,
    // we swap the context class loader for the time of clustering.
    withContextClassLoader(core.getResourceLoader().getClassLoader(), () -> this.controller.init(initAttributes));

    SchemaField uniqueField = core.getLatestSchema().getUniqueKeyField();
    if (uniqueField == null) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          CarrotClusteringEngine.class.getSimpleName() + " requires the schema to have a uniqueKeyField");
    }
    this.idFieldName = uniqueField.getName();

    return result;
  }

  @Override
  public Object cluster(Query query, SolrDocumentList solrDocList,
      Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
    try {
      // Prepare attributes for Carrot2 clustering call
      Map<String, Object> attributes = new HashMap<>();
      List<Document> documents = getDocuments(solrDocList, docIds, query, sreq);
      attributes.put(AttributeNames.DOCUMENTS, documents);
      attributes.put(AttributeNames.QUERY, query.toString());

      // Pass the fields on which clustering runs.
      attributes.put("solrFieldNames", getFieldsForClustering(sreq));

      // Pass extra overriding attributes from the request, if any
      extractCarrotAttributes(sreq.getParams(), attributes);

      // Perform clustering and convert to an output structure of clusters.
      //
      // Carrot2 uses current thread's context class loader to get
      // certain classes (e.g. custom tokenizer/stemmer) at runtime.
      // To make sure classes from contrib JARs are available,
      // we swap the context class loader for the time of clustering.
      return withContextClassLoader(core.getResourceLoader().getClassLoader(),
          () -> clustersToNamedList(controller.process(attributes,
              clusteringAlgorithmClass).getClusters(), sreq.getParams()));
    } catch (Exception e) {
      log.error("Carrot2 clustering failed", e);
      throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
    }
  }

  @Override
  protected Set<String> getFieldsToLoad(SolrQueryRequest sreq) {
    SolrParams solrParams = sreq.getParams();

    HashSet<String> fields = new HashSet<>(getFieldsForClustering(sreq));
    fields.add(idFieldName);
    fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
    fields.addAll(getCustomFieldsMap(solrParams).keySet());

    String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME);
    if (StringUtils.isNotBlank(languageField)) {
      fields.add(languageField);
    }
    return fields;
  }

  /**
   * Returns the names of fields that will be delivering the actual
   * content for clustering. Currently, there are two such fields: document
   * title and document content.
   */
  private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
    SolrParams solrParams = sreq.getParams();

    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
    if (StringUtils.isBlank(snippetFieldSpec)) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
          + " must not be blank.");
    }

    final Set<String> fields = new HashSet<>();
    fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
    fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
    return fields;
  }

  /**
   * Prepares Carrot2 documents for clustering.
   */
  private List<Document> getDocuments(SolrDocumentList solrDocList, Map<SolrDocument, Integer> docIds,
      Query query, final SolrQueryRequest sreq) throws IOException {
    SolrHighlighter highlighter = null;
    SolrParams solrParams = sreq.getParams();
    SolrCore core = sreq.getCore();

    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
    String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);

    // Maps Solr field names to Carrot2 custom field names
    Map<String, String> customFields = getCustomFieldsMap(solrParams);

    // Parse language code map string into a map
    Map<String, String> languageCodeMap = new HashMap<>();
    if (StringUtils.isNotBlank(languageField)) {
      for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) {
        final String[] split = pair.split(":");
        if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
          languageCodeMap.put(split[0], split[1]);
        } else {
          log.warn("Unsupported format for {}: '{}'. Skipping this mapping."
              , CarrotParams.LANGUAGE_CODE_MAP, pair);
        }
      }
    }

    // Get the documents
    boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);

    SolrQueryRequest req = null;
    String[] snippetFieldAry = null;
    if (produceSummary) {
      highlighter = HighlightComponent.getHighlighter(core);
      if (highlighter != null) {
        Map<String, Object> args = new HashMap<>();
        snippetFieldAry = snippetFieldSpec.split("[, ]");
        args.put(HighlightParams.FIELDS, snippetFieldAry);
        args.put(HighlightParams.HIGHLIGHT, "true");
        args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
        args.put(HighlightParams.SIMPLE_POST, "");
        args.put(HighlightParams.FRAGSIZE, solrParams.getInt(CarrotParams.SUMMARY_FRAGSIZE, solrParams.getInt(HighlightParams.FRAGSIZE, 100)));
        args.put(HighlightParams.SNIPPETS, solrParams.getInt(CarrotParams.SUMMARY_SNIPPETS, solrParams.getInt(HighlightParams.SNIPPETS, 1)));
        req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
          @Override
          public SolrIndexSearcher getSearcher() {
            return sreq.getSearcher();
          }
        };
      } else {
        log.warn("No highlighter configured, cannot produce summary");
        produceSummary = false;
      }
    }

    Iterator<SolrDocument> docsIter = solrDocList.iterator();
    List<Document> result = new ArrayList<>(solrDocList.size());

    float[] scores = {1.0f};
    int[] docsHolder = new int[1];
    Query theQuery = query;

    while (docsIter.hasNext()) {
      SolrDocument sdoc = docsIter.next();
      String snippet = null;

      // TODO: docIds will be null when running distributed search.
      // See comment in ClusteringComponent#finishStage().
      if (produceSummary && docIds != null) {
        docsHolder[0] = docIds.get(sdoc).intValue();
        DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f, TotalHits.Relation.EQUAL_TO);
        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
        if (highlights != null && highlights.size() == 1) {
          // should only be one value given our setup
          // should only be one document
          @SuppressWarnings("unchecked")
          NamedList<String[]> tmp = (NamedList<String[]>) highlights.getVal(0);

          final StringBuilder sb = new StringBuilder();
          for (int j = 0; j < snippetFieldAry.length; j++) {
            // Join fragments with a period, so that Carrot2 does not create
            // cross-fragment phrases, such phrases rarely make sense.
            String[] highlt = tmp.get(snippetFieldAry[j]);
            if (highlt != null && highlt.length > 0) {
              for (int i = 0; i < highlt.length; i++) {
                sb.append(highlt[i]);
                sb.append(" . ");
              }
            }
          }
          snippet = sb.toString();
        }
      }

      // If summaries not enabled or summary generation failed, use full content.
      if (snippet == null) {
        snippet = getConcatenated(sdoc, snippetFieldSpec);
      }

      // Create a Carrot2 document
      Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
          snippet, Objects.toString(sdoc.getFieldValue(urlField), ""));

      // Store Solr id of the document, we need it to map document instances
      // found in clusters back to identifiers.
      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));

      // Set language
      if (StringUtils.isNotBlank(languageField)) {
        Collection<Object> languages = sdoc.getFieldValues(languageField);
        if (languages != null) {

          // Use the first Carrot2-supported language
          for (Object l : languages) {
            String lang = Objects.toString(l, "");

            if (languageCodeMap.containsKey(lang)) {
              lang = languageCodeMap.get(lang);
            }

            // Language detection Library for Java uses dashes to separate
            // language variants, such as 'zh-cn', but Carrot2 uses underscores.
            if (lang.indexOf('-') > 0) {
              lang = lang.replace('-', '_');
            }

            // If the language is supported by Carrot2, we'll get a non-null value
            final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
            if (carrot2Language != null) {
              carrotDocument.setLanguage(carrot2Language);
              break;
            }
          }
        }
      }

      // Add custom fields
      if (customFields != null) {
        for (Entry<String, String> entry : customFields.entrySet()) {
          carrotDocument.setField(entry.getValue(), sdoc.getFieldValue(entry.getKey()));
        }
      }

      result.add(carrotDocument);
    }

    return result;
  }

  /**
   * Expose clustering algorithm class for tests.
   */
  Class<? extends IClusteringAlgorithm> getClusteringAlgorithmClass() {
    return clusteringAlgorithmClass;
  }

  /**
   * Prepares a map of Solr field names (keys) to the corresponding Carrot2
   * custom field names.
   */
  private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
    Map<String, String> customFields = new HashMap<>();
    String[] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
    if (customFieldsSpec != null) {
      customFields = new HashMap<>();
      for (String customFieldSpec : customFieldsSpec) {
        String[] split = customFieldSpec.split(":");
        if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
          customFields.put(split[0], split[1]);
        } else {
          log.warn("Unsupported format for {}: '{}'. Skipping this field definition."
              , CarrotParams.CUSTOM_FIELD_NAME, customFieldSpec);
        }
      }
    }
    return customFields;
  }

  private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
    StringBuilder result = new StringBuilder();
    for (String field : fieldsSpec.split("[, ]")) {
      Collection<Object> vals = sdoc.getFieldValues(field);
      if (vals == null) continue;
      Iterator<Object> ite = vals.iterator();
      while (ite.hasNext()) {
        // Join multiple values with a period so that Carrot2 does not pick up
        // phrases that cross field value boundaries (in most cases it would
        // create useless phrases).
        result.append(Objects.toString(ite.next(), "")).append(" . ");
      }
    }
    return result.toString().trim();
  }

  private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
      SolrParams solrParams) {
    List<NamedList<Object>> result = new ArrayList<>();
    clustersToNamedList(carrotClusters, result, solrParams.getBool(
        CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
        CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
    return result;
  }

  private void clustersToNamedList(List<Cluster> outputClusters,
      List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
    for (Cluster outCluster : outputClusters) {
      NamedList<Object> cluster = new SimpleOrderedMap<>();
      parent.add(cluster);

      // Add labels
      List<String> labels = outCluster.getPhrases();
      if (labels.size() > maxLabels) {
        labels = labels.subList(0, maxLabels);
      }
      cluster.add("labels", labels);

      // Add cluster score
      final Double score = outCluster.getScore();
      if (score != null) {
        cluster.add("score", score);
      }

      // Add other topics marker
      if (outCluster.isOtherTopics()) {
        cluster.add("other-topics", outCluster.isOtherTopics());
      }

      // Add documents
      List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
      List<Object> docList = new ArrayList<>();
      cluster.add("docs", docList);
      for (Document doc : docs) {
        docList.add(doc.getField(SOLR_DOCUMENT_ID));
      }

      // Add subclusters
      if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
        List<NamedList<Object>> subclusters = new ArrayList<>();
        cluster.add("clusters", subclusters);
        clustersToNamedList(outCluster.getSubclusters(), subclusters,
            outputSubClusters, maxLabels);
      }
    }
  }

  /**
   * Extracts parameters that can possibly match some attributes of Carrot2 algorithms.
   */
  private void extractCarrotAttributes(SolrParams solrParams,
      Map<String, Object> attributes) {
    // Extract all non-predefined parameters. This way, we'll be able to set all
    // parameters of Carrot2 algorithms without defining their names as constants.
    for (Iterator<String> paramNames = solrParams.getParameterNamesIterator(); paramNames.hasNext(); ) {
      String paramName = paramNames.next();
      if (!CarrotParams.CARROT_PARAM_NAMES.contains(paramName)) {
        attributes.put(paramName, solrParams.get(paramName));
      }
    }
  }

  @SuppressForbidden(reason = "Uses context class loader as a workaround to inject correct classloader to 3rd party libs")
  private static <T> T withContextClassLoader(ClassLoader loader, Supplier<T> action) {
    Thread ct = Thread.currentThread();
    ClassLoader prev = ct.getContextClassLoader();
    try {
      ct.setContextClassLoader(loader);
      return action.get();
    } finally {
      ct.setContextClassLoader(prev);
    }
  }
}
@@ -1,73 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * Carrot2 parameter mapping (recognized and mapped if passed via Solr configuration).
 *
 * @lucene.experimental
 */
public final class CarrotParams {

  private static String CARROT_PREFIX = "carrot.";

  public static String ALGORITHM = CARROT_PREFIX + "algorithm";

  public static String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
  public static String URL_FIELD_NAME = CARROT_PREFIX + "url";
  public static String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
  public static String LANGUAGE_FIELD_NAME = CARROT_PREFIX + "lang";
  public static String CUSTOM_FIELD_NAME = CARROT_PREFIX + "custom";

  public static String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
  public static String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
  public static String SUMMARY_SNIPPETS = CARROT_PREFIX + "summarySnippets";

  public static String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
  public static String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";

  public static String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";

  /**
   * Points to Carrot<sup>2</sup> resources
   */
  public static String RESOURCES_DIR = CARROT_PREFIX + "resourcesDir";

  static final Set<String> CARROT_PARAM_NAMES = new HashSet<>(Arrays.asList(
      ALGORITHM,

      TITLE_FIELD_NAME,
      URL_FIELD_NAME,
      SNIPPET_FIELD_NAME,
      LANGUAGE_FIELD_NAME,
      CUSTOM_FIELD_NAME,

      PRODUCE_SUMMARY,
      SUMMARY_FRAGSIZE,
      SUMMARY_SNIPPETS,

      NUM_DESCRIPTIONS,
      OUTPUT_SUB_CLUSTERS,
      RESOURCES_DIR,
      LANGUAGE_CODE_MAP));

  /** No instances. */
  private CarrotParams() {}
}
@@ -1,246 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.lang.invoke.MethodHandles;

import java.nio.CharBuffer;
import java.util.HashMap;

import org.apache.lucene.analysis.ar.ArabicNormalizer;
import org.apache.lucene.analysis.ar.ArabicStemmer;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IStemmerFactory;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tartarus.snowball.SnowballStemmer;
import org.tartarus.snowball.ext.DanishStemmer;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.EnglishStemmer;
import org.tartarus.snowball.ext.FinnishStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import org.tartarus.snowball.ext.GermanStemmer;
import org.tartarus.snowball.ext.HungarianStemmer;
import org.tartarus.snowball.ext.ItalianStemmer;
import org.tartarus.snowball.ext.NorwegianStemmer;
import org.tartarus.snowball.ext.PortugueseStemmer;
import org.tartarus.snowball.ext.RomanianStemmer;
import org.tartarus.snowball.ext.RussianStemmer;
import org.tartarus.snowball.ext.SpanishStemmer;
import org.tartarus.snowball.ext.SwedishStemmer;
import org.tartarus.snowball.ext.TurkishStemmer;

/**
 * An implementation of Carrot2's {@link IStemmerFactory} based on Lucene's
 * APIs. Should the relevant Lucene APIs need to change, the changes can be made
 * in this class.
 *
 * @lucene.experimental
 */
public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  public IStemmer getStemmer(LanguageCode language) {
    switch (language) {
      case ARABIC:
        return ArabicStemmerFactory.createStemmer();

      case CHINESE_SIMPLIFIED:
        return IdentityStemmer.INSTANCE;

      default:
        /*
         * For other languages, try to use snowball's stemming.
         */
        return SnowballStemmerFactory.createStemmer(language);
    }
  }

  /**
   * Factory of {@link IStemmer} implementations from the <code>snowball</code>
   * project.
   */
  private final static class SnowballStemmerFactory {
    /**
     * Static hard mapping from language codes to stemmer classes in Snowball.
     * This mapping is not dynamic because we want to keep the possibility to
     * obfuscate these classes.
     */
    private static HashMap<LanguageCode, Class<? extends SnowballStemmer>> snowballStemmerClasses;
    static {
      snowballStemmerClasses = new HashMap<>();
      snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
      snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
      snowballStemmerClasses.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.PORTUGUESE, PortugueseStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
|
||||
snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* An adapter converting Snowball programs into {@link IStemmer} interface.
|
||||
*/
|
||||
private static class SnowballStemmerAdapter implements IStemmer {
|
||||
private final SnowballStemmer snowballStemmer;
|
||||
|
||||
public SnowballStemmerAdapter(SnowballStemmer snowballStemmer) {
|
||||
this.snowballStemmer = snowballStemmer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharSequence stem(CharSequence word) {
|
||||
snowballStemmer.setCurrent(word.toString());
|
||||
if (snowballStemmer.stem()) {
|
||||
return snowballStemmer.getCurrent();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and return an {@link IStemmer} adapter for a
|
||||
* {@link SnowballStemmer} for a given language code. An identity stemmer is
|
||||
* returned for unknown languages.
|
||||
*/
|
||||
public static IStemmer createStemmer(LanguageCode language) {
|
||||
final Class<? extends SnowballStemmer> stemmerClazz = snowballStemmerClasses
|
||||
.get(language);
|
||||
|
||||
if (stemmerClazz == null) {
|
||||
log.warn("No Snowball stemmer class for: {}. "
|
||||
+ "Quality of clustering may be degraded.", language.name());
|
||||
return IdentityStemmer.INSTANCE;
|
||||
}
|
||||
|
||||
try {
|
||||
return new SnowballStemmerAdapter(stemmerClazz.getConstructor().newInstance());
|
||||
} catch (Exception e) {
|
||||
log.warn("Could not instantiate snowball stemmer for language: {}"
|
||||
+ ". Quality of clustering may be degraded."
|
||||
, language.name(), e);
|
||||
|
||||
return IdentityStemmer.INSTANCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory of {@link IStemmer} implementations for the
|
||||
* {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
|
||||
* to be present in classpath, otherwise an empty (identity) stemmer is
|
||||
* returned.
|
||||
*/
|
||||
private static class ArabicStemmerFactory {
|
||||
static {
|
||||
try {
|
||||
ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
|
||||
ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
|
||||
} catch (ClassNotFoundException e) {
|
||||
log
|
||||
.warn(
|
||||
"Could not instantiate Lucene stemmer for Arabic, clustering quality "
|
||||
+ "of Arabic content may be degraded. For best quality clusters, "
|
||||
+ "make sure Lucene's Arabic analyzer JAR is in the classpath",
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adapter to lucene-contrib Arabic analyzers.
|
||||
*/
|
||||
private static class LuceneStemmerAdapter implements IStemmer {
|
||||
private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
|
||||
private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
|
||||
|
||||
private char[] buffer = new char[0];
|
||||
|
||||
private LuceneStemmerAdapter() {
|
||||
delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
|
||||
normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
|
||||
}
|
||||
|
||||
@Override
|
||||
public CharSequence stem(CharSequence word) {
|
||||
if (word.length() > buffer.length) {
|
||||
buffer = new char[word.length()];
|
||||
}
|
||||
|
||||
for (int i = 0; i < word.length(); i++) {
|
||||
buffer[i] = word.charAt(i);
|
||||
}
|
||||
|
||||
int newLen = normalizer.normalize(buffer, word.length());
|
||||
newLen = delegate.stem(buffer, newLen);
|
||||
|
||||
if (newLen != word.length() || !equals(buffer, newLen, word)) {
|
||||
return CharBuffer.wrap(buffer, 0, newLen);
|
||||
}
|
||||
|
||||
// Same-same.
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean equals(char[] buffer, int len, CharSequence word) {
|
||||
assert len == word.length();
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (buffer[i] != word.charAt(i))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
public static IStemmer createStemmer() {
|
||||
try {
|
||||
return new LuceneStemmerAdapter();
|
||||
} catch (Exception e) {
|
||||
return IdentityStemmer.INSTANCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An implementation of {@link IStemmer} that always returns <code>null</code>
|
||||
* which means no stemming.
|
||||
*/
|
||||
private static class IdentityStemmer implements IStemmer {
|
||||
private final static IdentityStemmer INSTANCE = new IdentityStemmer();
|
||||
|
||||
@Override
|
||||
public CharSequence stem(CharSequence word) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
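
A minimal sketch of the raw Snowball API that SnowballStemmerAdapter wraps; it assumes only that the org.tartarus.snowball classes imported above are on the classpath:

import org.tartarus.snowball.ext.EnglishStemmer;

public class SnowballSketch {
  public static void main(String[] args) {
    EnglishStemmer stemmer = new EnglishStemmer();
    // The same setCurrent()/stem()/getCurrent() sequence used in
    // SnowballStemmerAdapter.stem() above.
    stemmer.setCurrent("clustering");
    if (stemmer.stem()) {
      System.out.println(stemmer.getCurrent()); // expected: "cluster"
    }
  }
}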

@@ -1,167 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * An implementation of Carrot2's {@link ITokenizerFactory} based on Lucene's
 * Smart Chinese tokenizer. If the Smart Chinese tokenizer is not available on
 * the classpath at runtime, Carrot2's default tokenizer is used. Should the
 * Lucene APIs need to change, the changes can be made in this class.
 *
 * @lucene.experimental
 */
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  @Override
  public ITokenizer getTokenizer(LanguageCode language) {
    switch (language) {
      case CHINESE_SIMPLIFIED:
        return ChineseTokenizerFactory.createTokenizer();

      /*
       * We use our own analyzer for Arabic. Lucene's version has special
       * support for Nonspacing-Mark characters (see
       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
       * have them included as letters in the parser.
       */
      case ARABIC:
        // Intentional fall-through.

      default:
        return new ExtendedWhitespaceTokenizer();
    }
  }

  /**
   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
   * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
   * factory will fall back to the default white space tokenizer.
   */
  private static final class ChineseTokenizerFactory {
    static {
      try {
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
      } catch (Throwable e) {
        log.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
            + "of Chinese content may be degraded. For best quality clusters, "
            + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
        if (e instanceof Error) {
          throw (Error) e;
        }
      }
    }

    static ITokenizer createTokenizer() {
      try {
        return new ChineseTokenizer();
      } catch (Throwable e) {
        if (e instanceof OutOfMemoryError) {
          throw (OutOfMemoryError) e;
        }
        return new ExtendedWhitespaceTokenizer();
      }
    }

    private final static class ChineseTokenizer implements ITokenizer {
      private final static Pattern numeric = Pattern
          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");

      private Tokenizer sentenceTokenizer;
      private TokenStream wordTokenFilter;
      private CharTermAttribute term = null;

      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;

      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);

        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
            Reader.class).newInstance((Reader) null);
        this.tokenFilterClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
      }

      @Override
      public short nextToken() throws IOException {
        final boolean hasNextToken = wordTokenFilter.incrementToken();
        if (hasNextToken) {
          short flags = 0;
          final char[] image = term.buffer();
          final int length = term.length();
          tempCharSequence.reset(image, 0, length);
          if (length == 1 && image[0] == ',') {
            // ChineseTokenizer seems to convert all punctuation to ','
            // characters.
            flags = ITokenizer.TT_PUNCTUATION;
          } else if (numeric.matcher(tempCharSequence).matches()) {
            flags = ITokenizer.TT_NUMERIC;
          } else {
            flags = ITokenizer.TT_TERM;
          }
          return flags;
        }

        return ITokenizer.TT_EOF;
      }

      @Override
      public void setTermBuffer(MutableCharArray array) {
        array.reset(term.buffer(), 0, term.length());
      }

      @Override
      public void reset(Reader input) {
        try {
          sentenceTokenizer.setReader(input);
          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
              TokenStream.class).newInstance(sentenceTokenizer);
          term = wordTokenFilter.addAttribute(CharTermAttribute.class);
        } catch (Exception e) {
          throw ExceptionUtils.wrapAsRuntimeException(e);
        }
      }
    }
  }
}
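
As an aside, the numeric-token pattern used by ChineseTokenizer above can be exercised standalone with plain java.util.regex; the sample tokens are illustrative:

import java.util.regex.Pattern;

public class NumericPatternSketch {
  public static void main(String[] args) {
    // Same pattern string as the "numeric" field above.
    Pattern numeric = Pattern.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
    System.out.println(numeric.matcher("1,234.56").matches()); // true  -> TT_NUMERIC
    System.out.println(numeric.matcher("12:30").matches());    // true  -> TT_NUMERIC
    System.out.println(numeric.matcher("-42%").matches());     // true  -> TT_NUMERIC
    System.out.println(numeric.matcher("v2").matches());       // false -> TT_TERM
  }
}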

@@ -1,142 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;

import org.apache.commons.io.IOUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.IResourceLocator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link IResourceLocator} that delegates resource searches to {@link SolrCore}.
 *
 * @lucene.experimental
 */
class SolrResourceLocator implements IResourceLocator {
  private final SolrResourceLoader resourceLoader;
  private final String carrot2ResourcesDir;

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public SolrResourceLocator(SolrCore core, SolrParams initParams) {
    resourceLoader = core.getResourceLoader();

    String resourcesDir = initParams.get(CarrotParams.RESOURCES_DIR);
    carrot2ResourcesDir = firstNonNull(resourcesDir, CarrotClusteringEngine.CARROT_RESOURCES_PREFIX);
  }

  @SuppressWarnings("unchecked")
  public static <T> T firstNonNull(T... args) {
    for (T t : args) {
      if (t != null) return t;
    }
    throw new NullPointerException("At least one element has to be non-null.");
  }

  @Override
  public IResource[] getAll(final String resource) {
    final String resourceName = carrot2ResourcesDir + "/" + resource;
    log.debug("Looking for Solr resource: {}", resourceName);

    InputStream resourceStream = null;
    final byte[] asBytes;
    try {
      resourceStream = resourceLoader.openResource(resourceName);
      asBytes = IOUtils.toByteArray(resourceStream);
    } catch (IOException e) {
      log.debug("Resource not found in Solr's config: {}. Using the default {} from Carrot JAR.",
          resourceName, resource);
      return new IResource[] {};
    } finally {
      if (resourceStream != null) {
        try {
          resourceStream.close();
        } catch (IOException e) {
          // ignore.
        }
      }
    }

    log.info("Loaded Solr resource: {}", resourceName);

    final IResource foundResource = new IResource() {
      @Override
      public InputStream open() {
        return new ByteArrayInputStream(asBytes);
      }

      @Override
      public int hashCode() {
        // In case multiple resources are found they will be deduped, but we don't use it in Solr,
        // so simply rely on instance equivalence.
        return super.hashCode();
      }

      @Override
      public boolean equals(Object obj) {
        // In case multiple resources are found they will be deduped, but we don't use it in Solr,
        // so simply rely on instance equivalence.
        return super.equals(obj);
      }

      @Override
      public String toString() {
        return "Solr config resource: " + resourceName;
      }
    };

    return new IResource[] { foundResource };
  }

  @Override
  public int hashCode() {
    // In case multiple locations are used locators will be deduped, but we don't use it in Solr,
    // so simply rely on instance equivalence.
    return super.hashCode();
  }

  @Override
  public boolean equals(Object obj) {
    // In case multiple locations are used locators will be deduped, but we don't use it in Solr,
    // so simply rely on instance equivalence.
    return super.equals(obj);
  }

  @Override
  public String toString() {
    String configDir = "";
    try {
      configDir = "configDir=" + new File(resourceLoader.getConfigDir()).getAbsolutePath() + ", ";
    } catch (Exception ignored) {
      // If we get the exception, the resource loader implementation
      // probably does not support getConfigDir(). Not a big problem.
    }

    return "SolrResourceLocator, " + configDir
        + "Carrot2 relative lexicalResourcesDir=" + carrot2ResourcesDir;
  }
}
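
For context, a hedged sketch of the SolrResourceLoader lookup that getAll(...) above delegates to. The core path and resource name are hypothetical, and the constructor shown assumes the Solr 8.x-era API:

import java.io.InputStream;
import java.nio.file.Paths;

import org.apache.commons.io.IOUtils;
import org.apache.solr.core.SolrResourceLoader;

public class ResourceLookupSketch {
  public static void main(String[] args) throws Exception {
    // "/path/to/core" and "clustering/carrot2/stopwords.en" are placeholders.
    try (SolrResourceLoader loader = new SolrResourceLoader(Paths.get("/path/to/core"));
         InputStream in = loader.openResource("clustering/carrot2/stopwords.en")) {
      byte[] asBytes = IOUtils.toByteArray(in); // the same IOUtils call as in getAll(...)
      System.out.println("Loaded " + asBytes.length + " bytes");
    }
  }
}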

@@ -1,140 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;

/**
 * An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
 * words from a field's StopFilter to the default stop words used in Carrot2,
 * for all languages Carrot2 supports. Completely replacing Carrot2 stop words
 * with Solr's wouldn't make much sense because clustering needs more aggressive
 * stop words removal. In other words, if something is a stop word during
 * indexing, then it should also be a stop word during clustering, but not the
 * other way round.
 *
 * @lucene.experimental
 */
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFactory {

  @Init
  @Input
  @Attribute(key = "solrCore")
  public SolrCore core;

  @Processing
  @Input
  @Attribute(key = "solrFieldNames")
  public Set<String> fieldNames;

  /**
   * A lazily-built cache of stop words per field.
   */
  private HashMap<String, List<CharArraySet>> solrStopWords = new HashMap<>();

  /**
   * Carrot2's default lexical resources to use in addition to Solr's stop
   * words.
   */
  public DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();

  /**
   * Obtains stop words for a field from the associated
   * {@link StopFilterFactory}, if any.
   */
  private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
    // No need to synchronize here, Carrot2 ensures that instances
    // of this class are not used by multiple threads at a time.
    synchronized (solrStopWords) {
      if (!solrStopWords.containsKey(fieldName)) {
        solrStopWords.put(fieldName, new ArrayList<>());

        IndexSchema schema = core.getLatestSchema();
        final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
        if (fieldAnalyzer instanceof TokenizerChain) {
          final TokenFilterFactory[] filterFactories =
              ((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
          for (TokenFilterFactory factory : filterFactories) {
            if (factory instanceof StopFilterFactory) {
              // StopFilterFactory holds the stop words in a CharArraySet
              CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords();
              solrStopWords.get(fieldName).add(stopWords);
            }

            if (factory instanceof CommonGramsFilterFactory) {
              CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords();
              solrStopWords.get(fieldName).add(commonWords);
            }
          }
        }
      }
      return solrStopWords.get(fieldName);
    }
  }

  @Override
  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory.getLexicalData(languageCode);

    return new ILexicalData() {
      @Override
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }

      @Override
      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first
        for (String fieldName : fieldNames) {
          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
            if (stopWords.contains(word)) {
              return true;
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }
}
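
For reference, a small sketch of the CharArraySet lookups this factory relies on (Lucene analyzers-common API; the stop words shown are illustrative):

import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;

public class StopwordLookupSketch {
  public static void main(String[] args) {
    // ignoreCase=true mirrors a typical StopFilterFactory configuration.
    CharArraySet stopWords = new CharArraySet(Arrays.asList("the", "of", "and"), true);
    System.out.println(stopWords.contains("The"));  // true (case-insensitive lookup)
    System.out.println(stopWords.contains("solr")); // false
  }
}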

@@ -14,10 +14,16 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * {@link org.apache.solr.handler.clustering.ClusteringComponent} and common APIs for specific implementations.

/**
 * A {@link org.apache.solr.handler.component.SearchComponent} for dynamic,
 * unsupervised grouping of search results based on the content of their text
 * fields or contextual snippets around query-matching regions.
 *
 * <p>
 * The default implementation uses clustering algorithms from the
 * <a href="https://project.carrot2.org">Carrot<sup>2</sup> project</a>.
 */
package org.apache.solr.handler.clustering;

@@ -0,0 +1,3 @@
org.apache.solr.handler.clustering.MockClusteringAlgorithmProvider
org.apache.solr.handler.clustering.EchoClusteringAlgorithmProvider
org.apache.solr.handler.clustering.ResourceCheckAlgorithmProvider

@@ -1,10 +0,0 @@
<attribute-sets default="overridden-attributes">
  <attribute-set id="overridden-attributes">
    <value-set>
      <label>defaults</label>
      <attribute key="MockClusteringAlgorithm.depth"><value value="1" /></attribute>
      <attribute key="MockClusteringAlgorithm.labels"><value value="3" /></attribute>
      <attribute key="MockClusteringAlgorithm.maxClusters"><value value="13" /></attribute>
    </value-set>
  </attribute-set>
</attribute-sets>

@@ -1 +0,0 @@
customsolrstoplabel

@@ -1 +0,0 @@
customsolrstopword

@@ -1 +0,0 @@
customsolrstoplabelcustomdir

@@ -1 +0,0 @@
customsolrstopwordcustomdir

@@ -1,246 +0,0 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Syntax:
#   "source" => "target"
#     "source".length() > 0 (source cannot be empty.)
#     "target".length() >= 0 (target can be empty.)

# example:
#   "À" => "A"
#   "\u00C0" => "A"
#   "\u00C0" => "\u0041"
#   "ß" => "ss"
#   "\t" => " "
#   "\n" => ""

# À => A
"\u00C0" => "A"

# Á => A
"\u00C1" => "A"

# Â => A
"\u00C2" => "A"

# Ã => A
"\u00C3" => "A"

# Ä => A
"\u00C4" => "A"

# Å => A
"\u00C5" => "A"

# Æ => AE
"\u00C6" => "AE"

# Ç => C
"\u00C7" => "C"

# È => E
"\u00C8" => "E"

# É => E
"\u00C9" => "E"

# Ê => E
"\u00CA" => "E"

# Ë => E
"\u00CB" => "E"

# Ì => I
"\u00CC" => "I"

# Í => I
"\u00CD" => "I"

# Î => I
"\u00CE" => "I"

# Ï => I
"\u00CF" => "I"

# IJ => IJ
"\u0132" => "IJ"

# Ð => D
"\u00D0" => "D"

# Ñ => N
"\u00D1" => "N"

# Ò => O
"\u00D2" => "O"

# Ó => O
"\u00D3" => "O"

# Ô => O
"\u00D4" => "O"

# Õ => O
"\u00D5" => "O"

# Ö => O
"\u00D6" => "O"

# Ø => O
"\u00D8" => "O"

# Œ => OE
"\u0152" => "OE"

# Þ => TH
"\u00DE" => "TH"

# Ù => U
"\u00D9" => "U"

# Ú => U
"\u00DA" => "U"

# Û => U
"\u00DB" => "U"

# Ü => U
"\u00DC" => "U"

# Ý => Y
"\u00DD" => "Y"

# Ÿ => Y
"\u0178" => "Y"

# à => a
"\u00E0" => "a"

# á => a
"\u00E1" => "a"

# â => a
"\u00E2" => "a"

# ã => a
"\u00E3" => "a"

# ä => a
"\u00E4" => "a"

# å => a
"\u00E5" => "a"

# æ => ae
"\u00E6" => "ae"

# ç => c
"\u00E7" => "c"

# è => e
"\u00E8" => "e"

# é => e
"\u00E9" => "e"

# ê => e
"\u00EA" => "e"

# ë => e
"\u00EB" => "e"

# ì => i
"\u00EC" => "i"

# í => i
"\u00ED" => "i"

# î => i
"\u00EE" => "i"

# ï => i
"\u00EF" => "i"

# ij => ij
"\u0133" => "ij"

# ð => d
"\u00F0" => "d"

# ñ => n
"\u00F1" => "n"

# ò => o
"\u00F2" => "o"

# ó => o
"\u00F3" => "o"

# ô => o
"\u00F4" => "o"

# õ => o
"\u00F5" => "o"

# ö => o
"\u00F6" => "o"

# ø => o
"\u00F8" => "o"

# œ => oe
"\u0153" => "oe"

# ß => ss
"\u00DF" => "ss"

# þ => th
"\u00FE" => "th"

# ù => u
"\u00F9" => "u"

# ú => u
"\u00FA" => "u"

# û => u
"\u00FB" => "u"

# ü => u
"\u00FC" => "u"

# ý => y
"\u00FD" => "y"

# ÿ => y
"\u00FF" => "y"

# ff => ff
"\uFB00" => "ff"

# fi => fi
"\uFB01" => "fi"

# fl => fl
"\uFB02" => "fl"

# ffi => ffi
"\uFB03" => "ffi"

# ffl => ffl
"\uFB04" => "ffl"

# ſt => ft
"\uFB05" => "ft"

# st => st
"\uFB06" => "st"

@@ -1,193 +1,27 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<!--
 This is the Solr schema file. This file should be named "schema.xml" and
 should be in the conf directory under the solr home
 (i.e. ./solr/conf/schema.xml by default)
 or located where the classloader for the Solr webapp can find it.

 This example schema is the recommended starting point for users.
 It should be kept correct and concise, usable out-of-the-box.

 For more information on how to customize this file, please see
 http://wiki.apache.org/solr/SchemaXml
-->

<schema name="example" version="1.1">
  <!-- attribute "name" is the name of this schema and is only used for display purposes.
       Applications should change this to reflect the nature of the search collection.
       version="1.1" is Solr's version number for the schema syntax and semantics. It should
       not normally be changed by applications.
       1.0: multiValued attribute did not exist, all fields are multiValued by nature
       1.1: multiValued attribute introduced, false by default -->


  <!-- field type definitions. The "name" attribute is
       just a label to be used by field definitions. The "class"
       attribute and any other attributes determine the real
       behavior of the fieldType.
       Class names starting with "solr" refer to java classes in the
       org.apache.solr.analysis package.
  -->

  <!-- The StrField type is not analyzed, but indexed/stored verbatim.
       - StrField and TextField support an optional compressThreshold which
         limits compression (if enabled in the derived fields) to values which
         exceed a certain size (in characters).
  -->
<schema name="example" version="1.6">
  <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>

  <!-- boolean type: "true" or "false" -->
  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>

  <!-- The optional sortMissingLast and sortMissingFirst attributes are
       currently supported on types that are sorted internally as strings.
       - If sortMissingLast="true", then a sort on this field will cause documents
         without the field to come after documents with the field,
         regardless of the requested sort order (asc or desc).
       - If sortMissingFirst="true", then a sort on this field will cause documents
         without the field to come before documents with the field,
         regardless of the requested sort order.
       - If sortMissingLast="false" and sortMissingFirst="false" (the default),
         then default lucene sorting will be used which places docs without the
         field first in an ascending sort and last in a descending sort.
  -->

  <!--
   Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
  -->
  <fieldType name="int" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="float" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>
  <fieldType name="double" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" positionIncrementGap="0"/>

  <!--
   Numeric field types that index each value at various levels of precision
   to accelerate range queries when the number of values between the range
   endpoints is large. See the javadoc for LegacyNumericRangeQuery for internal
   implementation details.

   Smaller precisionStep values (specified in bits) will lead to more tokens
   indexed per value, slightly larger index size, and faster range queries.
   A precisionStep of 0 disables indexing at different precision levels.
  -->
  <fieldType name="tint" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
  <fieldType name="tfloat" class="${solr.tests.FloatFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
  <fieldType name="tlong" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>
  <fieldType name="tdouble" class="${solr.tests.DoubleFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="8" positionIncrementGap="0"/>


  <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and
       is a more restricted form of the canonical representation of dateTime
       http://www.w3.org/TR/xmlschema-2/#dateTime
       The trailing "Z" designates UTC time and is mandatory.
       Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
       All other components are mandatory.

       Expressions can also be used to denote calculations that should be
       performed relative to "NOW" to determine the value, ie...

       NOW/HOUR
          ... Round to the start of the current hour
       NOW-1DAY
          ... Exactly 1 day prior to now
       NOW/DAY+6MONTHS+3DAYS
          ... 6 months and 3 days in the future from the start of
              the current day

       Consult the TrieDateField javadocs for more information.
  -->
  <fieldType name="date" class="${solr.tests.DateFieldType}" docValues="${solr.tests.numeric.dv}" sortMissingLast="true" omitNorms="true"/>


  <!-- The "RandomSortField" is not used to store or search any
       data. You can declare fields of this type in your schema
       to generate pseudo-random orderings of your docs for sorting
       purposes. The ordering is generated based on the field name
       and the version of the index. As long as the index version
       remains unchanged, and the same field name is reused,
       the ordering of the docs will be consistent.
       If you want different pseudo-random orderings of documents,
       for the same version of the index, use a dynamicField and
       change the name
  -->
  <fieldType name="random" class="solr.RandomSortField" indexed="true"/>

  <!-- solr.TextField allows the specification of custom text analyzers
       specified as a tokenizer and a list of token filters. Different
       analyzers may be specified for indexing and querying.

       The optional positionIncrementGap puts space between multiple fields of
       this type on the same document, with the purpose of preventing false phrase
       matching across fields.

       For more info on customizing your analyzer chain, please see
       http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
  -->

  <!-- One can also specify an existing Analyzer class that has a
       default constructor via the class attribute on the analyzer element
       <fieldType name="text_greek" class="solr.TextField">
         <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/>
       </fieldType>
  -->

  <!-- A text field that only splits on whitespace for exact matching of words -->
  <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
    <analyzer>
      <tokenizer class="solr.MockTokenizerFactory"/>
    </analyzer>
  </fieldType>

  <!-- A text field that uses WordDelimiterGraphFilter to enable splitting and matching of
       words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
       so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
       Synonyms and stopwords are customized by external files, and stemming is enabled.
       Duplicate tokens at the same position (which may result from Stemmed Synonyms or
       WordDelim parts) are removed.
  -->
  <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <tokenizer class="solr.MockTokenizerFactory"/>
      <!-- in this example, we will only use synonyms at query time
      <filter class="solr.SynonymGraphFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
      -->
      <!-- Case insensitive stop word removal.
      -->
      <filter class="solr.StopFilterFactory"
              ignoreCase="true"
              words="stopwords.txt"
      />
      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1"
              catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
              words="stopwords.txt"/>
      <filter class="solr.WordDelimiterGraphFilterFactory"
              generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"
              splitOnCaseChange="1"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
      <filter class="solr.PorterStemFilterFactory"/>
      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      <filter class="solr.FlattenGraphFilterFactory" />
      <filter class="solr.FlattenGraphFilterFactory"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.MockTokenizerFactory"/>
      <!--<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>-->
      <!--<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>-->
      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0"
              catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
      <filter class="solr.WordDelimiterGraphFilterFactory"
              generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0"
              splitOnCaseChange="1"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
      <filter class="solr.PorterStemFilterFactory"/>

@@ -195,156 +29,11 @@
    </analyzer>
  </fieldType>


  <!-- Less flexible matching, but less false matches. Probably not ideal for product names,
       but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
  <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100">
    <analyzer type="index">
      <tokenizer class="solr.MockTokenizerFactory"/>
      <!--<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
      <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>-->
      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
              catenateNumbers="1" catenateAll="0"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
      <filter class="solr.EnglishMinimalStemFilterFactory"/>
      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      <filter class="solr.FlattenGraphFilterFactory" />
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.MockTokenizerFactory"/>
      <!--<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
      <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>-->
      <filter class="solr.WordDelimiterGraphFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1"
              catenateNumbers="1" catenateAll="0"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
      <filter class="solr.EnglishMinimalStemFilterFactory"/>
      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
  </fieldType>

  <!--
   Setup simple analysis for spell checking
  -->
  <fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100">
    <analyzer>
      <tokenizer class="solr.StandardTokenizerFactory"/>
      <filter class="solr.LowerCaseFilterFactory"/>
      <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    </analyzer>
  </fieldType>

  <!-- This is an example of using the KeywordTokenizer along
       with various TokenFilterFactories to produce a sortable field
       that does not include some properties of the source text
  -->
  <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
    <analyzer>
      <!-- KeywordTokenizer does no actual tokenizing, so the entire
           input string is preserved as a single token
      -->
      <tokenizer class="solr.MockTokenizerFactory" pattern="keyword"/>
      <!-- The LowerCase TokenFilter does what you expect, which can be
           useful when you want your sorting to be case insensitive
      -->
      <filter class="solr.LowerCaseFilterFactory"/>
      <!-- The TrimFilter removes any leading or trailing whitespace -->
      <filter class="solr.TrimFilterFactory"/>
      <!-- The PatternReplaceFilter gives you the flexibility to use
           Java Regular expression to replace any sequence of characters
           matching a pattern with an arbitrary replacement string,
           which may include back references to portions of the original
           string matched by the pattern.

           See the Java Regular Expression documentation for more
           information on pattern and replacement string syntax.

           http://docs.oracle.com/javase/8/docs/api/java/util/regex/package-summary.html
      -->
      <filter class="solr.PatternReplaceFilterFactory"
              pattern="([^a-z])" replacement="" replace="all"
      />
    </analyzer>
  </fieldType>

  <!-- since fields of this type are by default not stored or indexed, any data added to
       them will be ignored outright
  -->
  <fieldType name="ignored" stored="false" indexed="false" class="solr.StrField"/>


  <!-- Valid attributes for fields:
       name: mandatory - the name for the field
       type: mandatory - the name of a previously defined type from the <fieldType>s
       indexed: true if this field should be indexed (searchable or sortable)
       stored: true if this field should be retrievable
       multiValued: true if this field may contain multiple values per document
       omitNorms: (expert) set to true to omit the norms associated with
         this field (this disables length normalization and index-time
         boosting for the field, and saves some memory). Only full-text
         fields or fields that need an index-time boost need norms.
       termVectors: [false] set to true to store the term vector for a given field.
         When using MoreLikeThis, fields used for similarity should be stored for
         best performance.
  -->

  <field name="id" type="string" indexed="true" stored="true" required="true"/>
  <field name="url" type="string" indexed="true" stored="true" required="false"/>
  <field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true"/>

  <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
  <field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>
  <field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true"/>
  <field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/>
  <field name="body" type="text" indexed="true" stored="true" multiValued="true"/>
  <!-- catchall field, containing all other searchable text fields (implemented
       via copyField further on in this schema) -->
  <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
  <!-- Dynamic field definitions. If a field name is not found, dynamicFields
       will be used if the name matches any of the patterns.
       RESTRICTION: the glob-like pattern in the name attribute must have
       a "*" only at the start or the end.
       EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i)
       Longer patterns will be matched first. If equal size patterns
       both match, the first appearing in the schema will be used. -->
  <dynamicField name="*_i" type="int" indexed="true" stored="true"/>
  <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
  <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
  <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
  <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
  <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
  <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
  <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
  <field name="testSet" type="string" indexed="true" stored="false" multiValued="false" required="true" />

  <dynamicField name="random*" type="random"/>

  <dynamicField name="*_dynamic" type="string" indexed="true" stored="true"/>
  <dynamicField name="dynamic_*" type="string" indexed="true" stored="true"/>


  <!-- uncomment the following to ignore any fields that don't already match an existing
       field name or dynamic field, rather than reporting them as an error.
       alternately, change the type="ignored" to some other type e.g. "text" if you want
       unknown fields indexed and/or stored by default -->
  <!--dynamicField name="*" type="ignored" /-->


  <!-- Field to use to determine and enforce document uniqueness.
       Unless this field is marked with required="false", it will be a required field
  -->
  <uniqueKey>id</uniqueKey>

  <!-- copyField commands copy one field to another at the time a document
       is added to the index. It's used either to index the same field differently,
       or to add multiple fields to the same field for easier/faster searching. -->
  <copyField source="url" dest="text"/>
  <copyField source="title" dest="text"/>
  <copyField source="body" dest="text"/>
  <copyField source="snippet" dest="text"/>

  <!-- dynamic destination -->
  <copyField source="*_dynamic" dest="dynamic_*"/>

  <copyField source="id" dest="range_facet_l"/>

</schema>
</schema>

@@ -1,4 +1,5 @@
<?xml version="1.0" encoding="UTF-8" ?>

<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
@ -31,410 +32,98 @@
|
|||
<lockType>single</lockType>
|
||||
<useCompoundFile>${useCompoundFile:false}</useCompoundFile>
|
||||
</indexConfig>
|
||||
|
||||
<!-- Enables JMX if and only if an existing MBeanServer is found, use
|
||||
this if you want to configure JMX through JVM parameters. Remove
|
||||
this to disable exposing Solr configuration and statistics to JMX.
|
||||
|
||||
If you want to connect to a particular server, specify the agentId
|
||||
e.g. <jmx agentId="myAgent" />
|
||||
|
||||
If you want to start a new MBeanServer, specify the serviceUrl
|
||||
e.g <jmx serviceurl="service:jmx:rmi:///jndi/rmi://localhost:9999/solr" />
|
||||
|
||||
For more details see http://wiki.apache.org/solr/SolrJmx
|
||||
-->
|
||||
<jmx />
|
||||
|
||||
<!-- the default high-performance update handler -->
|
||||
<updateHandler class="solr.DirectUpdateHandler2">
|
||||
|
||||
<!-- A prefix of "solr." for class names is an alias that
|
||||
causes solr to search appropriate packages, including
|
||||
org.apache.solr.(search|update|request|core|analysis)
|
||||
-->
|
||||
|
||||
<!-- Perform a <commit/> automatically under certain conditions:
|
||||
maxDocs - number of updates since last commit is greater than this
|
||||
maxTime - oldest uncommited update (in ms) is this long ago
|
||||
<autoCommit>
|
||||
<maxDocs>10000</maxDocs>
|
||||
<maxTime>1000</maxTime>
|
||||
</autoCommit>
|
||||
-->
|
||||
|
||||
</updateHandler>
|
||||
|
||||
|
||||
<query>
|
||||
<!-- Maximum number of clauses in a boolean query... can affect
|
||||
range or prefix queries that expand to big boolean
|
||||
queries. An exception is thrown if exceeded. -->
|
||||
<maxBooleanClauses>${solr.max.booleanClauses:1024}</maxBooleanClauses>
|
||||
|
||||
|
||||
<!-- Cache used by SolrIndexSearcher for filters (DocSets),
|
||||
unordered sets of *all* documents that match a query.
|
||||
When a new searcher is opened, its caches may be prepopulated
|
||||
or "autowarmed" using data from caches in the old searcher.
|
||||
autowarmCount is the number of items to prepopulate. For CaffeineCache,
|
||||
the autowarmed items will be the most recently accessed items.
|
||||
Parameters:
|
||||
class - the SolrCache implementation (currently only CaffeineCache)
|
||||
size - the maximum number of entries in the cache
|
||||
initialSize - the initial capacity (number of entries) of
|
||||
the cache. (seel java.util.HashMap)
|
||||
autowarmCount - the number of entries to prepopulate from
|
||||
an old cache.
|
||||
-->
|
||||
<filterCache
|
||||
class="solr.CaffeineCache"
|
||||
size="512"
|
||||
initialSize="512"
|
||||
autowarmCount="128"/>
|
||||
|
||||
<!-- queryResultCache caches results of searches - ordered lists of
|
||||
document ids (DocList) based on a query, a sort, and the range
|
||||
of documents requested. -->
|
||||
<queryResultCache
|
||||
class="solr.CaffeineCache"
|
||||
size="512"
|
||||
initialSize="512"
|
||||
autowarmCount="32"/>
|
||||
|
||||
<!-- documentCache caches Lucene Document objects (the stored fields for each document).
|
||||
Since Lucene internal document ids are transient, this cache will not be autowarmed. -->
|
||||
<documentCache
|
||||
class="solr.CaffeineCache"
|
||||
size="512"
|
||||
initialSize="512"
|
||||
autowarmCount="0"/>
|
||||
|
||||
<!-- If true, stored fields that are not requested will be loaded lazily.
|
||||
|
||||
This can result in a significant speed improvement if the usual case is to
|
||||
not load all stored fields, especially if the skipped fields are large compressed
|
||||
text fields.
|
||||
-->
|
||||
<enableLazyFieldLoading>true</enableLazyFieldLoading>
|
||||
|
||||
<!-- Example of a generic cache. These caches may be accessed by name
|
||||
through SolrIndexSearcher.getCache(),cacheLookup(), and cacheInsert().
|
||||
The purpose is to enable easy caching of user/application level data.
|
||||
The regenerator argument should be specified as an implementation
|
||||
of solr.search.CacheRegenerator if autowarming is desired. -->
|
||||
<!--
|
||||
<cache name="myUserCache"
|
||||
class="solr.CaffeineCache"
|
||||
size="4096"
|
||||
initialSize="1024"
|
||||
autowarmCount="1024"
|
||||
regenerator="org.mycompany.mypackage.MyRegenerator"
|
||||
/>
|
||||
-->
|
||||
|
||||
<!-- An optimization that attempts to use a filter to satisfy a search.
|
||||
If the requested sort does not include score, then the filterCache
|
||||
will be checked for a filter matching the query. If found, the filter
|
||||
will be used as the source of document ids, and then the sort will be
|
||||
applied to that.
|
||||
<useFilterForSortedQuery>true</useFilterForSortedQuery>
|
||||
-->
|
||||
|
||||
<!-- An optimization for use with the queryResultCache. When a search
|
||||
is requested, a superset of the requested number of document ids
|
||||
are collected. For example, if a search for a particular query
|
||||
requests matching documents 10 through 19, and queryWindowSize is 50,
|
||||
then documents 0 through 49 will be collected and cached. Any further
|
||||
requests in that range can be satisfied via the cache. -->
|
||||
<queryResultWindowSize>50</queryResultWindowSize>
|
||||
|
||||
<!-- Maximum number of documents to cache for any entry in the
|
||||
queryResultCache. -->
|
||||
<queryResultMaxDocsCached>200</queryResultMaxDocsCached>
|
||||
|
||||
<!-- a newSearcher event is fired whenever a new searcher is being prepared
|
||||
and there is a current searcher handling requests (aka registered). -->
|
||||
<!-- QuerySenderListener takes an array of NamedList and executes a
|
||||
local query request for each NamedList in sequence. -->
|
||||
<listener event="newSearcher" class="solr.QuerySenderListener">
|
||||
<arr name="queries">
|
||||
<lst> <str name="q">solr</str> <str name="start">0</str> <str name="rows">10</str> </lst>
|
||||
<lst> <str name="q">rocks</str> <str name="start">0</str> <str name="rows">10</str> </lst>
|
||||
<lst><str name="q">static newSearcher warming query from solrconfig.xml</str></lst>
|
||||
</arr>
|
||||
</listener>
|
||||
|
||||
<!-- a firstSearcher event is fired whenever a new searcher is being
|
||||
prepared but there is no current registered searcher to handle
|
||||
requests or to gain autowarming data from. -->
|
||||
<listener event="firstSearcher" class="solr.QuerySenderListener">
|
||||
<arr name="queries">
|
||||
<lst> <str name="q">fast_warm</str> <str name="start">0</str> <str name="rows">10</str> </lst>
|
||||
<lst><str name="q">static firstSearcher warming query from solrconfig.xml</str></lst>
|
||||
</arr>
|
||||
</listener>
|
||||
|
||||
<!-- If a search request comes in and there is no current registered searcher,
|
||||
then immediately register the still warming searcher and use it. If
|
||||
"false" then all requests will block until the first searcher is done
|
||||
warming. -->
|
||||
<useColdSearcher>false</useColdSearcher>
|
||||
|
||||
</query>
|
||||
|
||||
<requestDispatcher>
|
||||
<!--Make sure your system has some authentication before enabling remote streaming!
|
||||
<requestParsers enableRemoteStreaming="false" multipartUploadLimitInKB="-1" />
|
||||
-->
|
||||
|
||||
<!-- Set HTTP caching related parameters (for proxy caches and clients).
|
||||
|
||||
To get the behaviour of Solr 1.2 (ie: no caching related headers)
|
||||
use the never304="true" option and do not specify a value for
|
||||
<cacheControl>
|
||||
-->
|
||||
<!-- <httpCaching never304="true"> -->
|
||||
<httpCaching lastModifiedFrom="openTime"
|
||||
etagSeed="Solr">
|
||||
<!-- lastModFrom="openTime" is the default, the Last-Modified value
|
||||
(and validation against If-Modified-Since requests) will all be
|
||||
relative to when the current Searcher was opened.
|
||||
You can change it to lastModifiedFrom="dirLastMod" if you want the
|
||||
value to correspond exactly to when the physical index was last
|
||||
modified.
|
||||
|
||||
etagSeed="..." is an option you can change to force the ETag
|
||||
header (and validation against If-None-Match requests) to be
|
||||
different even if the index has not changed (ie: when making
|
||||
significant changes to your config file)
|
||||
|
||||
lastModifiedFrom and etagSeed are both ignored if you use the
|
||||
never304="true" option.
|
||||
-->
|
||||
<!-- If you include a <cacheControl> directive, it will be used to
|
||||
generate a Cache-Control header, as well as an Expires header
|
||||
if the value contains "max-age="
|
||||
|
||||
By default, no Cache-Control header is generated.
|
||||
|
||||
You can use the <cacheControl> option even if you have set
|
||||
never304="true"
|
||||
-->
|
||||
<!-- <cacheControl>max-age=30, public</cacheControl> -->
|
||||
</httpCaching>
|
||||
</requestDispatcher>
|
||||
|
||||
<requestHandler name="/select" class="solr.SearchHandler">
|
||||
<!-- default values for query parameters -->
|
||||
<lst name="defaults">
|
||||
<str name="echoParams">explicit</str>
|
||||
<!--
|
||||
<int name="rows">10</int>
|
||||
<str name="fl">*</str>
|
||||
<str name="version">2.1</str>
|
||||
-->
|
||||
</lst>
|
||||
<lst name="defaults">
|
||||
<str name="echoParams">explicit</str>
|
||||
</lst>
|
||||
<arr name="last-components">
|
||||
<str>clustering</str>
|
||||
</arr>
|
||||
</requestHandler>
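<!-- A usage sketch (an assumption for illustration, not part of this
     configuration): with the clustering component attached as a
     last-component above, a request such as

       /select?q=*:*&rows=100&clustering=true&clustering.engine=stc

     would run the named engine over the search results. "clustering" is
     ClusteringComponent.COMPONENT_NAME; "clustering.engine" is assumed to
     be the value of ClusteringComponent.REQUEST_PARAM_ENGINE that the
     tests pass when selecting an engine. -->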
|
||||
|
||||
|
||||
<requestHandler name="docClustering" class="solr.SearchHandler">
|
||||
<!-- default values for query parameters -->
|
||||
<lst name="defaults">
|
||||
<str name="echoParams">explicit</str>
|
||||
<!--
|
||||
<int name="rows">10</int>
|
||||
<str name="fl">*</str>
|
||||
<str name="version">2.1</str>
|
||||
-->
|
||||
</lst>
|
||||
<arr name="last-components">
|
||||
<str>doc-clustering</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
<!-- DisMaxRequestHandler allows easy searching across multiple fields
|
||||
for simple user-entered phrases. Its implementation is now
|
||||
just the standard SearchHandler with a default query parser
|
||||
of "dismax".
|
||||
see http://wiki.apache.org/solr/DisMaxRequestHandler
|
||||
-->
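<!-- A hypothetical declaration of such a handler (a sketch for illustration;
     the handler name and qf fields below are assumptions, not used by these
     tests):

<requestHandler name="/dismax" class="solr.SearchHandler">
  <lst name="defaults">
    <str name="defType">dismax</str>
    <str name="qf">title snippet</str>
  </lst>
</requestHandler>
-->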
|
||||
|
||||
|
||||
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering">
|
||||
<!-- Declare an engine -->
|
||||
<lst name="engine">
|
||||
<!-- The engine name; only one engine can be named "default" -->
|
||||
<str name="name">default</str>
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">stc</str>
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">mock</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">mock-external-attrs</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
|
||||
<!-- takes precedence over external XML -->
|
||||
<int name="MockClusteringAlgorithm.labels">4</int>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">echo</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">lexical-resource-check</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">lexical-resource-check-custom-resource-dir</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
|
||||
<str name="carrot.resourcesDir">clustering/custom</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">custom-duplicating-tokenizer</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoTokensClusteringAlgorithm</str>
|
||||
<str name="PreprocessingPipeline.tokenizerFactory">org.apache.solr.handler.clustering.carrot2.DuplicatingTokenizerFactory</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">custom-duplicating-stemmer</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoStemsClusteringAlgorithm</str>
|
||||
<str name="PreprocessingPipeline.stemmerFactory">org.apache.solr.handler.clustering.carrot2.DuplicatingStemmerFactory</str>
|
||||
</lst>
|
||||
</searchComponent>
|
||||
|
||||
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">
|
||||
<!-- Declare an engine -->
|
||||
<lst name="engine">
|
||||
<!-- The engine name; only one engine can be named "default" -->
|
||||
<str name="name">mock</str>
|
||||
<str name="classname">org.apache.solr.handler.clustering.MockDocumentClusteringEngine</str>
|
||||
</lst>
|
||||
</searchComponent>
|
||||
|
||||
|
||||
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering-name-default">
|
||||
<lst name="engine">
|
||||
<str name="name">stc</str>
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">default</str>
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
|
||||
<str name="clustering.fields">title</str>
|
||||
<str name="clustering.algorithm">MockClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">mock</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
|
||||
</lst>
|
||||
</searchComponent>
|
||||
|
||||
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering-name-decl-order">
|
||||
<lst name="engine">
|
||||
<bool name="optional">true</bool>
|
||||
<str name="name">unavailable</str>
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.lingo.UnavailableAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="name">lingo</str>
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
|
||||
<str name="clustering.fields">title, snippet</str>
|
||||
<str name="clustering.algorithm">Lingo</str>
|
||||
</lst>
|
||||
|
||||
<lst name="engine">
|
||||
<str name="name">stc</str>
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
|
||||
<str name="clustering.fields">title, snippet</str>
|
||||
<str name="clustering.algorithm">STC</str>
|
||||
</lst>
|
||||
|
||||
<lst name="engine">
|
||||
<str name="name">kmeans</str>
|
||||
<str name="clustering.fields">title, snippet</str>
|
||||
<str name="clustering.algorithm">Bisecting K-Means</str>
|
||||
</lst>
|
||||
|
||||
<lst name="engine">
|
||||
<str name="name">mock</str>
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
|
||||
<str name="clustering.fields">title</str>
|
||||
<str name="clustering.algorithm">MockClusteringAlgorithm</str>
|
||||
</lst>
|
||||
|
||||
<lst name="engine">
|
||||
<str name="name">mock-solrconfig-attrs</str>
|
||||
<str name="clustering.fields">title, snippet</str>
|
||||
<str name="clustering.algorithm">MockClusteringAlgorithm</str>
|
||||
<bool name="clustering.includeOtherTopics">false</bool>
|
||||
<int name="maxClusters">2</int>
|
||||
<int name="hierarchyDepth">1</int>
|
||||
</lst>
|
||||
|
||||
<lst name="engine">
|
||||
<str name="name">echo</str>
|
||||
<str name="clustering.algorithm">EchoClusteringAlgorithm</str>
|
||||
<str name="clustering.fields">title, snippet</str>
|
||||
</lst>
|
||||
|
||||
<lst name="engine">
|
||||
<str name="name">testCustomLanguageResources</str>
|
||||
<str name="clustering.algorithm">ResourceCheckAlgorithm</str>
|
||||
<str name="clustering.fields">title</str>
|
||||
<str name="clustering.resources">testCustomLanguageResources</str>
|
||||
<bool name="clustering.includeOtherTopics">false</bool>
|
||||
<str name="text">
|
||||
was
|
||||
bar
|
||||
baz
|
||||
</str>
|
||||
</lst>
|
||||
|
||||
<lst name="engine">
|
||||
<str name="name">testParamDefaultLanguage</str>
|
||||
<str name="clustering.fields">title</str>
|
||||
<str name="clustering.algorithm">ResourceCheckAlgorithm</str>
|
||||
<bool name="clustering.includeOtherTopics">false</bool>
|
||||
<str name="clustering.language">German</str>
|
||||
<str name="text">
|
||||
abc
|
||||
</str>
|
||||
</lst>
|
||||
|
||||
<lst name="engine">
|
||||
<str name="name">testParamLanguageField</str>
|
||||
<str name="clustering.algorithm">ResourceCheckAlgorithm</str>
|
||||
<bool name="clustering.includeOtherTopics">false</bool>
|
||||
<str name="clustering.fields">title</str>
|
||||
<str name="clustering.languageField">lang</str>
|
||||
<str name="clustering.language">Italian</str>
|
||||
<str name="text">test</str>
|
||||
</lst>
|
||||
</searchComponent>
|
||||
|
||||
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering-name-dups">
|
||||
<lst name="engine">
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
|
||||
</lst>
|
||||
<lst name="engine">
|
||||
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
|
||||
</lst>
|
||||
</searchComponent>
|
||||
|
||||
<highlighting>
|
||||
<!-- Configure the standard fragmenter -->
|
||||
<!-- This could most likely be commented out in the "default" case -->
|
||||
<fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
|
||||
<lst name="defaults">
|
||||
<int name="hl.fragsize">100</int>
|
||||
</lst>
|
||||
</fragmenter>
|
||||
|
||||
<!-- A regular-expression-based fragmenter (e.g., for sentence extraction) -->
|
||||
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
|
||||
<lst name="defaults">
|
||||
<!-- slightly smaller fragsizes work better because of slop -->
|
||||
<int name="hl.fragsize">70</int>
|
||||
<!-- allow 50% slop on fragment sizes -->
|
||||
<float name="hl.regex.slop">0.5</float>
|
||||
<!-- a basic sentence pattern -->
|
||||
<str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str>
|
||||
</lst>
|
||||
</fragmenter>
|
||||
|
||||
<!-- Configure the standard formatter -->
|
||||
<formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
|
||||
<lst name="defaults">
|
||||
<str name="hl.simple.pre"><![CDATA[<em>]]></str>
|
||||
<str name="hl.simple.post"><![CDATA[</em>]]></str>
|
||||
</lst>
|
||||
</formatter>
|
||||
</highlighting>
|
||||
|
||||
|
||||
<!-- queryResponseWriter plugins... query responses will be written using the
|
||||
writer specified by the 'wt' request parameter matching the name of a registered
|
||||
writer.
|
||||
The "default" writer is the default and will be used if 'wt' is not specified
|
||||
in the request. XMLResponseWriter will be used if nothing is specified here.
|
||||
The json, python, and ruby writers are also available by default.
|
||||
|
||||
<queryResponseWriter name="xml" class="solr.XMLResponseWriter" default="true"/>
|
||||
<queryResponseWriter name="json" class="solr.JSONResponseWriter"/>
|
||||
<queryResponseWriter name="python" class="solr.PythonResponseWriter"/>
|
||||
<queryResponseWriter name="ruby" class="solr.RubyResponseWriter"/>
|
||||
<queryResponseWriter name="php" class="solr.PHPResponseWriter"/>
|
||||
<queryResponseWriter name="phps" class="solr.PHPSerializedResponseWriter"/>
|
||||
|
||||
<queryResponseWriter name="custom" class="com.example.MyResponseWriter"/>
|
||||
-->
|
||||
|
||||
<!-- XSLT response writer transforms the XML output by any xslt file found
|
||||
in Solr's conf/xslt directory. Changes to xslt files are checked
|
||||
every xsltCacheLifetimeSeconds seconds.
|
||||
-->
|
||||
<queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
|
||||
<int name="xsltCacheLifetimeSeconds">5</int>
|
||||
</queryResponseWriter>
|
||||
|
||||
|
||||
<!-- example of registering a query parser
|
||||
<queryParser name="lucene" class="org.apache.solr.search.LuceneQParserPlugin"/>
|
||||
-->
|
||||
|
||||
<!-- example of registering a custom function parser
|
||||
<valueSourceParser name="myfunc" class="com.mycompany.MyValueSourceParser" />
|
||||
-->
|
||||
|
||||
<!-- config for the admin interface -->
|
||||
<admin>
|
||||
<defaultQuery>solr</defaultQuery>
|
||||
</admin>
|
||||
|
||||
</config>
|
||||
|
|
|
@@ -1,2 +0,0 @@
|
|||
pizza
|
||||
history
|
|
@@ -14,12 +14,12 @@
|
|||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
# a couple of test stopwords to test that the words are really being
|
||||
# A couple of test stopwords to test that the words are really being
|
||||
# configured from this file:
|
||||
stopworda
|
||||
stopwordb
|
||||
|
||||
#Standard english stop words taken from Lucene's StopAnalyzer
|
||||
# Standard english stop words taken from Lucene's StopAnalyzer
|
||||
a
|
||||
an
|
||||
and
|
||||
|
@@ -56,4 +56,3 @@ was
|
|||
will
|
||||
with
|
||||
solrownstopword
|
||||
|
||||
|
|
|
@@ -1,31 +0,0 @@
|
|||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
#some test synonym mappings unlikely to appear in real input text
|
||||
aaa => aaaa
|
||||
bbb => bbbb1 bbbb2
|
||||
ccc => cccc1,cccc2
|
||||
a\=>a => b\=>b
|
||||
a\,a => b\,b
|
||||
fooaaa,baraaa,bazaaa
|
||||
|
||||
# Some synonym groups specific to this example
|
||||
GB,gib,gigabyte,gigabytes
|
||||
MB,mib,megabyte,megabytes
|
||||
Television, Televisions, TV, TVs
|
||||
#notice we use "gib" instead of "GiB" so any WordDelimiterGraphFilter coming
|
||||
#after us won't split it into two words.
|
||||
|
||||
# Synonym mappings can be used for spelling correction too
|
||||
pixima => pixma
|
||||
|
|
@@ -0,0 +1 @@
|
|||
ba.+
|
|
@@ -0,0 +1,2 @@
|
|||
foo
|
||||
bar
|
|
@@ -0,0 +1,43 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<Configuration>
|
||||
<Appenders>
|
||||
<Console name="STDERR" target="SYSTEM_ERR">
|
||||
<PatternLayout>
|
||||
<Pattern>
|
||||
%maxLen{%-4r %-5p (%t) [%X{node_name} %X{collection} %X{shard} %X{replica} %X{core} %X{trace_id}] %c{1.} %m%notEmpty{
|
||||
=>%ex{short}}}{10240}%n
|
||||
</Pattern>
|
||||
</PatternLayout>
|
||||
</Console>
|
||||
</Appenders>
|
||||
|
||||
<Loggers>
|
||||
<Logger name="org.apache.zookeeper" level="WARN"/>
|
||||
<Logger name="org.apache.hadoop" level="WARN"/>
|
||||
<Logger name="org.apache.directory" level="WARN"/>
|
||||
<Logger name="org.apache.solr.hadoop" level="INFO"/>
|
||||
<Logger name="org.eclipse.jetty" level="INFO"/>
|
||||
|
||||
<Root level="INFO">
|
||||
<AppenderRef ref="STDERR"/>
|
||||
</Root>
|
||||
</Loggers>
|
||||
</Configuration>
|
||||
|
|
@@ -0,0 +1,17 @@
|
|||
- Knowledge Discovery [6]
|
||||
- Patterns [6]
|
||||
- Data Mining Applications [5]
|
||||
- Statistical Analysis [4]
|
||||
- Computer [3]
|
||||
- Creating [3]
|
||||
- Data Mining Solutions [3]
|
||||
- Known as Data Mining [3]
|
||||
- Text Mining [3]
|
||||
- Databases KDD [2]
|
||||
- Extraction of Hidden Predictive [2]
|
||||
- Information from Large [2]
|
||||
- Open [2]
|
||||
- Powers [2]
|
||||
- Searching [2]
|
||||
- Tools [2]
|
||||
- Other topics [1]
|
|
@@ -0,0 +1,10 @@
|
|||
- Knowledge Discovery [8]
|
||||
- Databases [6]
|
||||
- Patterns [6]
|
||||
- Analysis [5]
|
||||
- Applications [5]
|
||||
- Software [5]
|
||||
- Businesses [4]
|
||||
- Predictive [4]
|
||||
- Process [4]
|
||||
- Other topics [2]
|
|
@@ -0,0 +1,2 @@
|
|||
- Lang: English
|
||||
- was[-, -] bar[ignoredWord, ignoredLabel] baz[-, ignoredLabel]
|
|
@@ -0,0 +1,2 @@
|
|||
- Cluster 1 [3]
|
||||
- Cluster 2 [3]
|
|
@@ -0,0 +1,8 @@
|
|||
- Foundations; Includes; Tutorials [4]
|
||||
- Institute; DMI; Agencies; Analyzing; Different; Group; June; Knowledge-discovery; Microsoft; Office; Perspectives; Projects; Reported; SourceWatch; Started; Stores; Summarizing; UW-Madison [4]
|
||||
- Integrated; Page; St@tServ [4]
|
||||
- Oracle; Social; Media; Pentaho; Visualization [4]
|
||||
- Patterns; Extraction; Managers [4]
|
||||
- SQL; Server; Techniques [4]
|
||||
- Predictive; Enterprise; Analytics [3]
|
||||
- Text; Searching; Correlations; Discovering; Fuel; Gleaned; Investor; Involves; Iterative; Raw; Relationships; SAS; Smarter; Snooping; Unnoticed [3]
|
|
@@ -0,0 +1,17 @@
|
|||
- Knowledge Discovery [6]
|
||||
- Patterns [6]
|
||||
- Data Mining Applications [5]
|
||||
- Statistical Analysis [4]
|
||||
- Computer [3]
|
||||
- Creating [3]
|
||||
- Data Mining Solutions [3]
|
||||
- Known as Data Mining [3]
|
||||
- Text Mining [3]
|
||||
- Databases KDD [2]
|
||||
- Extraction of Hidden Predictive [2]
|
||||
- Information from Large [2]
|
||||
- Open [2]
|
||||
- Powers [2]
|
||||
- Searching [2]
|
||||
- Tools [2]
|
||||
- Other topics [1]
|
|
@@ -0,0 +1,2 @@
|
|||
- Lang: German
|
||||
- abc[-, -]
|
|
@@ -0,0 +1,9 @@
|
|||
- English
|
||||
- Lang: English
|
||||
- test[-, -]
|
||||
- French
|
||||
- Lang: French
|
||||
- test[-, -]
|
||||
- German
|
||||
- Lang: German
|
||||
- test[-, -]
|
|
@@ -0,0 +1,12 @@
|
|||
- Cluster 1
|
||||
- Cluster 1.1 [3]
|
||||
- Cluster 1.2 [3]
|
||||
- Cluster 1.3 [3]
|
||||
- Cluster 2
|
||||
- Cluster 2.1 [3]
|
||||
- Cluster 2.2 [3]
|
||||
- Cluster 2.3 [3]
|
||||
- Cluster 3
|
||||
- Cluster 3.1 [3]
|
||||
- Cluster 3.2 [3]
|
||||
- Cluster 3.3 [3]
|
|
@@ -0,0 +1,4 @@
|
|||
- Cluster 1 [9]
|
||||
- Cluster 2 [9]
|
||||
- Cluster 3 [9]
|
||||
- Other topics [3]
|
|
@@ -0,0 +1,13 @@
|
|||
- Cluster 1
|
||||
- Cluster 1.1 [3]
|
||||
- Cluster 1.2 [3]
|
||||
- Cluster 1.3 [3]
|
||||
- Cluster 2
|
||||
- Cluster 2.1 [3]
|
||||
- Cluster 2.2 [3]
|
||||
- Cluster 2.3 [3]
|
||||
- Cluster 3
|
||||
- Cluster 3.1 [3]
|
||||
- Cluster 3.2 [3]
|
||||
- Cluster 3.3 [3]
|
||||
- Other topics [3]
|
|
@@ -0,0 +1,2 @@
|
|||
- Cluster 1 [3]
|
||||
- Cluster 2 [3]
|
|
@@ -0,0 +1,2 @@
|
|||
- Cluster 1 [3]
|
||||
- Cluster 2 [3]
|
|
@@ -0,0 +1,10 @@
|
|||
- Knowledge Discovery [8]
|
||||
- Databases [6]
|
||||
- Patterns [6]
|
||||
- Analysis [5]
|
||||
- Applications [5]
|
||||
- Software [5]
|
||||
- Businesses [4]
|
||||
- Predictive [4]
|
||||
- Process [4]
|
||||
- Other topics [2]
|
|
@@ -1,250 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.handler.clustering;
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
|
||||
protected static int numberOfDocs = 0;
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
File testHome = createTempDir().toFile();
|
||||
FileUtils.copyDirectory(getFile("clustering/solr"), testHome);
|
||||
initCore("solrconfig.xml", "schema.xml", testHome.getAbsolutePath());
|
||||
numberOfDocs = 0;
|
||||
for (String[] doc : DOCUMENTS) {
|
||||
assertNull(h.validateUpdate(adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2])));
|
||||
numberOfDocs++;
|
||||
}
|
||||
|
||||
// Add a multi-valued snippet
|
||||
final SolrInputDocument multiValuedSnippet = new SolrInputDocument();
|
||||
multiValuedSnippet.addField("id", numberOfDocs++);
|
||||
multiValuedSnippet.addField("title", "Title");
|
||||
multiValuedSnippet.addField("url", "URL");
|
||||
multiValuedSnippet.addField("snippet", "First value of multi field. Some more text. And still more.");
|
||||
multiValuedSnippet.addField("snippet", "Second value of multi field. Some more text. And still more.");
|
||||
multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
|
||||
assertNull(h.validateUpdate(adoc(multiValuedSnippet)));
|
||||
|
||||
// Add a document with multi-field title and snippet
|
||||
final SolrInputDocument multiFieldDoc = new SolrInputDocument();
|
||||
multiFieldDoc.addField("id", numberOfDocs++);
|
||||
multiFieldDoc.addField("title", "Title field");
|
||||
multiFieldDoc.addField("heading", "Heading field");
|
||||
multiFieldDoc.addField("url", "URL");
|
||||
multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
|
||||
multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
|
||||
assertNull(h.validateUpdate(adoc(multiFieldDoc)));
|
||||
|
||||
// Add a document with one language supported by Carrot2
|
||||
final SolrInputDocument docWithOneSupprtedLanguage = new SolrInputDocument();
|
||||
docWithOneSupprtedLanguage.addField("id", numberOfDocs++);
|
||||
docWithOneSupprtedLanguage.addField("title", "");
|
||||
docWithOneSupprtedLanguage.addField("url", "one_supported_language");
|
||||
docWithOneSupprtedLanguage.addField("lang", "zh-cn");
|
||||
assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguage)));
|
||||
|
||||
// Add a document with more languages, one supported by Carrot2
|
||||
final SolrInputDocument docWithOneSupprtedLanguageOfMany = new SolrInputDocument();
|
||||
docWithOneSupprtedLanguageOfMany.addField("id", numberOfDocs++);
|
||||
docWithOneSupprtedLanguageOfMany.addField("url", "one_supported_language_of_many");
|
||||
docWithOneSupprtedLanguageOfMany.addField("lang", "zh-tw");
|
||||
docWithOneSupprtedLanguageOfMany.addField("lang", "POLISH");
|
||||
docWithOneSupprtedLanguageOfMany.addField("lang", "de");
|
||||
assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguageOfMany)));
|
||||
|
||||
// Add a document with more languages, one supported by Carrot2
|
||||
final SolrInputDocument docWithCustomFields = new SolrInputDocument();
|
||||
docWithCustomFields.addField("id", numberOfDocs++);
|
||||
docWithCustomFields.addField("url", "custom_fields");
|
||||
docWithCustomFields.addField("intfield_i", 10);
|
||||
docWithCustomFields.addField("floatfield_f", 10.5);
|
||||
docWithCustomFields.addField("heading", "first");
|
||||
docWithCustomFields.addField("heading", "second");
|
||||
assertNull(h.validateUpdate(adoc(docWithCustomFields)));
|
||||
assertNull(h.validateUpdate(commit()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Expose package-scope methods from {@link ClusteringComponent} to tests.
|
||||
*/
|
||||
protected final Map<String,SearchClusteringEngine> getSearchClusteringEngines(ClusteringComponent comp) {
|
||||
return comp.getSearchClusteringEngines();
|
||||
}
|
||||
|
||||
final static String[][] DOCUMENTS = new String[][]{
|
||||
{"http://en.wikipedia.org/wiki/Data_mining",
|
||||
"Data Mining - Wikipedia",
|
||||
"Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."},
|
||||
|
||||
|
||||
{"http://en.wikipedia.org/wiki/Datamining",
|
||||
"Data mining - Wikipedia, the free encyclopedia",
|
||||
"Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..."},
|
||||
|
||||
|
||||
{"http://www.statsoft.com/textbook/stdatmin.html",
|
||||
"Electronic Statistics Textbook: Data Mining Techniques",
|
||||
"Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques."},
|
||||
|
||||
|
||||
{"http://www.thearling.com/text/dmwhite/dmwhite.htm",
|
||||
"An Introduction to Data Mining",
|
||||
"Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..."},
|
||||
|
||||
|
||||
{"http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
|
||||
"Data Mining: What is Data Mining?",
|
||||
"Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."},
|
||||
|
||||
|
||||
{"http://www.spss.com/datamine",
|
||||
"Data Mining Software, Data Mining Applications and Data Mining Solutions",
|
||||
"The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..."},
|
||||
|
||||
|
||||
{"http://www.kdnuggets.com/",
|
||||
"KD Nuggets",
|
||||
"Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."},
|
||||
|
||||
|
||||
{"http://www.answers.com/topic/data-mining",
|
||||
"data mining: Definition from Answers.com",
|
||||
"data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..."},
|
||||
|
||||
|
||||
{"http://www.statsoft.com/products/dataminer.htm",
|
||||
"STATISTICA Data Mining and Predictive Modeling Solutions",
|
||||
"GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..."},
|
||||
|
||||
|
||||
{"http://datamining.typepad.com/",
|
||||
"Data Mining: Text Mining, Visualization and Social Media",
|
||||
"Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..."},
|
||||
|
||||
|
||||
{"http://www.twocrows.com/",
|
||||
"Two Crows Corporation",
|
||||
"Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use."},
|
||||
|
||||
|
||||
{"http://www.thearling.com/",
|
||||
"Thearling.com",
|
||||
"Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies."},
|
||||
|
||||
|
||||
{"http://www.ccsu.edu/datamining/",
|
||||
"CCSU - Data Mining",
|
||||
"Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling."},
|
||||
|
||||
|
||||
{"http://www.oracle.com/technology/products/bi/odm",
|
||||
"Oracle Data Mining",
|
||||
"Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..."},
|
||||
|
||||
|
||||
{"http://databases.about.com/od/datamining/a/datamining.htm",
|
||||
"Data Mining: An Introduction",
|
||||
"About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques."},
|
||||
|
||||
|
||||
{"http://www.dmoz.org/Computers/Software/Databases/Data_Mining/",
|
||||
"Open Directory - Computers: Software: Databases: Data Mining",
|
||||
"Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..."},
|
||||
|
||||
|
||||
{"http://www.cs.wisc.edu/dmi/",
|
||||
"DMI:Data Mining Institute",
|
||||
"Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..."},
|
||||
|
||||
|
||||
{"http://www.the-data-mine.com/",
|
||||
"The Data Mine",
|
||||
"Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining."},
|
||||
|
||||
|
||||
{"http://www.statserv.com/datamining.html",
|
||||
"St@tServ - About Data Mining",
|
||||
"St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..."},
|
||||
|
||||
|
||||
{"http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome/index.htm",
|
||||
"MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...",
|
||||
"Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..."},
|
||||
|
||||
|
||||
{"http://www.pentaho.com/products/data_mining/",
|
||||
"Pentaho Commercial Open Source Business Intelligence: Data Mining",
|
||||
"For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..."},
|
||||
|
||||
|
||||
{"http://www.investorhome.com/mining.htm",
|
||||
"Investor Home - Data Mining",
|
||||
"Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..."},
|
||||
|
||||
|
||||
{"http://www.datamining.com/",
|
||||
"Predictive Modeling and Predictive Analytics Solutions | Enterprise ...",
|
||||
"Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics."},
|
||||
|
||||
|
||||
{"http://www.sourcewatch.org/index.php?title=Data_mining",
|
||||
"Data mining - SourceWatch",
|
||||
"These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..."},
|
||||
|
||||
|
||||
{"http://www.autonlab.org/tutorials/",
|
||||
"Statistical Data Mining Tutorials",
|
||||
"Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms."},
|
||||
|
||||
|
||||
{"http://www.microstrategy.com/data-mining/index.asp",
|
||||
"Data Mining",
|
||||
"With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..."},
|
||||
|
||||
|
||||
{"http://www.datamininglab.com/",
|
||||
"Elder Research",
|
||||
"Provides consulting and short courses in data mining and pattern discovery patterns in data."},
|
||||
|
||||
|
||||
{"http://www.sqlserverdatamining.com/",
|
||||
"SQL Server Data Mining > Home",
|
||||
"SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..."},
|
||||
|
||||
|
||||
{"http://databases.about.com/cs/datamining/g/dmining.htm",
|
||||
"Data Mining",
|
||||
"What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..."},
|
||||
|
||||
|
||||
{"http://www.sas.com/technologies/analytics/datamining/index.html",
|
||||
"Data Mining Software and Text Mining | SAS",
|
||||
"... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..."}
|
||||
};
|
||||
}
|
|
@@ -0,0 +1,130 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.handler.clustering;
|
||||
|
||||
import org.apache.solr.BaseDistributedSearchTestCase;
|
||||
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
|
||||
import org.apache.solr.client.solrj.response.Cluster;
|
||||
import org.apache.solr.client.solrj.response.ClusteringResponse;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@SuppressSSL
|
||||
public class ClusteringComponentDistributedTest extends BaseDistributedSearchTestCase {
|
||||
private final static String QUERY_TESTSET_SAMPLE_DOCUMENTS = "testSet:sampleDocs";
|
||||
|
||||
@Override
|
||||
public String getSolrHome() {
|
||||
return getFile("clustering/solr/collection1").getParent();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void indexDocs() throws Exception {
|
||||
del("*:*");
|
||||
|
||||
String[] languages = {
|
||||
"English",
|
||||
"French",
|
||||
"German",
|
||||
"Unknown",
|
||||
};
|
||||
|
||||
int docId = 0;
|
||||
for (String[] doc : SampleData.SAMPLE_DOCUMENTS) {
|
||||
index(
|
||||
"id", Integer.toString(docId),
|
||||
"title", doc[0],
|
||||
"snippet", doc[1],
|
||||
"testSet", "sampleDocs",
|
||||
"lang", languages[docId % languages.length]
|
||||
);
|
||||
docId++;
|
||||
}
|
||||
commit();
|
||||
}
|
||||
|
||||
@Test
|
||||
@ShardsFixed(num = 2)
|
||||
public void testLingoAlgorithm() throws Exception {
|
||||
compareToExpected(clusters(QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
|
||||
params.add(ClusteringComponent.REQUEST_PARAM_ENGINE, "lingo");
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@ShardsFixed(num = 2)
|
||||
public void testStcAlgorithm() throws Exception {
|
||||
compareToExpected(clusters(QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
|
||||
params.add(ClusteringComponent.REQUEST_PARAM_ENGINE, "stc");
|
||||
}));
|
||||
}
|
||||
|
||||
private void compareToExpected(List<Cluster> actual) throws IOException {
|
||||
String resourceSuffix = "";
|
||||
String expected = ClusteringComponentTest.getTestResource(getClass(), resourceSuffix);
|
||||
ClusteringComponentTest.compareWhitespaceNormalized(toString(actual), expected);
|
||||
}
|
||||
|
||||
private List<Cluster> clusters(String query, Consumer<ModifiableSolrParams> paramsConsumer) throws Exception {
|
||||
handle.clear();
|
||||
handle.put("responseHeader", SKIP);
|
||||
handle.put("response", SKIP);
|
||||
|
||||
final ModifiableSolrParams params = new ModifiableSolrParams();
|
||||
params.add(CommonParams.Q, query);
|
||||
params.add(CommonParams.ROWS, "1000");
|
||||
params.add(CommonParams.SORT, id + " desc");
|
||||
params.add(ClusteringComponent.COMPONENT_NAME, "true");
|
||||
paramsConsumer.accept(params);
|
||||
|
||||
QueryResponse response = query(true, params);
|
||||
|
||||
ClusteringResponse clusteringResponse = response.getClusteringResponse();
|
||||
Assert.assertNotNull(clusteringResponse);
|
||||
|
||||
return clusteringResponse.getClusters();
|
||||
}
|
||||
|
||||
private String toString(List<Cluster> clusters) {
|
||||
return toString(clusters, "", new StringBuilder()).toString();
|
||||
}
|
||||
|
||||
private StringBuilder toString(List<Cluster> clusters, String indent, StringBuilder sb) {
|
||||
clusters.forEach(c -> {
|
||||
sb.append(indent);
|
||||
sb.append("- " + c.getLabels().stream().collect(Collectors.joining("; ")));
|
||||
if (!c.getDocs().isEmpty()) {
|
||||
sb.append(" [" + c.getDocs().size() + "]");
|
||||
}
|
||||
sb.append("\n");
|
||||
|
||||
if (!c.getClusters().isEmpty()) {
|
||||
toString(c.getClusters(), indent + " ", sb);
|
||||
}
|
||||
});
|
||||
return sb;
|
||||
}
|
||||
}
|
|
@@ -16,128 +16,380 @@
|
|||
*/
|
||||
package org.apache.solr.handler.clustering;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import com.carrotsearch.randomizedtesting.RandomizedContext;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.client.solrj.response.ClusteringResponse;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.handler.component.QueryComponent;
|
||||
import org.apache.solr.handler.component.SearchComponent;
|
||||
import org.apache.solr.handler.component.SearchHandler;
|
||||
import org.apache.solr.request.LocalSolrQueryRequest;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.request.SolrRequestHandler;
|
||||
import org.apache.solr.response.ResultContext;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.search.DocList;
|
||||
import org.apache.solr.search.QueryCommand;
|
||||
import org.apache.solr.search.QueryResult;
|
||||
import org.junit.Before;
|
||||
import org.carrot2.clustering.Cluster;
|
||||
import org.hamcrest.MatcherAssert;
|
||||
import org.hamcrest.Matchers;
|
||||
import org.junit.Assert;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
**/
|
||||
public class ClusteringComponentTest extends AbstractClusteringTestCase {
|
||||
* Tests {@link Engine}.
|
||||
*/
|
||||
public class ClusteringComponentTest extends SolrTestCaseJ4 {
|
||||
private final static String QUERY_TESTSET_SAMPLE_DOCUMENTS = "testSet:sampleDocs";
|
||||
|
||||
@Before
|
||||
public void doBefore() {
|
||||
clearIndex();
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
File testHome = createTempDir().toFile();
|
||||
FileUtils.copyDirectory(getFile("clustering/solr"), testHome);
|
||||
initCore("solrconfig.xml", "schema.xml", testHome.getAbsolutePath());
|
||||
|
||||
String[] languages = {
|
||||
"English",
|
||||
"French",
|
||||
"German",
|
||||
"Unknown",
|
||||
};
|
||||
|
||||
int docId = 0;
|
||||
for (String[] doc : SampleData.SAMPLE_DOCUMENTS) {
|
||||
assertNull(h.validateUpdate(adoc(
|
||||
"id", Integer.toString(docId),
|
||||
"title", doc[0],
|
||||
"snippet", doc[1],
|
||||
"testSet", "sampleDocs",
|
||||
"lang", languages[docId % languages.length])));
|
||||
docId++;
|
||||
}
|
||||
|
||||
assertNull(h.validateUpdate(commit()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testComponent() throws Exception {
|
||||
SolrCore core = h.getCore();
|
||||
|
||||
SearchComponent sc = core.getSearchComponent("clustering");
|
||||
assertTrue("sc is null and it shouldn't be", sc != null);
|
||||
ModifiableSolrParams params = new ModifiableSolrParams();
|
||||
|
||||
params.add(ClusteringComponent.COMPONENT_NAME, "true");
|
||||
params.add(CommonParams.Q, "*:*");
|
||||
|
||||
params.add(ClusteringParams.USE_SEARCH_RESULTS, "true");
|
||||
|
||||
|
||||
SolrRequestHandler handler = core.getRequestHandler("/select");
|
||||
SolrQueryResponse rsp;
|
||||
rsp = new SolrQueryResponse();
|
||||
rsp.addResponseHeader(new SimpleOrderedMap<>());
|
||||
SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
|
||||
handler.handleRequest(req, rsp);
|
||||
NamedList<?> values = rsp.getValues();
|
||||
Object clusters = values.get("clusters");
|
||||
//System.out.println("Clusters: " + clusters);
|
||||
assertTrue("clusters is null and it shouldn't be", clusters != null);
|
||||
req.close();
|
||||
|
||||
params = new ModifiableSolrParams();
|
||||
params.add(ClusteringComponent.COMPONENT_NAME, "true");
|
||||
params.add(ClusteringParams.ENGINE_NAME, "mock");
|
||||
params.add(ClusteringParams.USE_COLLECTION, "true");
|
||||
params.add(QueryComponent.COMPONENT_NAME, "false");
|
||||
|
||||
handler = core.getRequestHandler("docClustering");
|
||||
|
||||
rsp = new SolrQueryResponse();
|
||||
rsp.addResponseHeader(new SimpleOrderedMap<>());
|
||||
req = new LocalSolrQueryRequest(core, params);
|
||||
handler.handleRequest(req, rsp);
|
||||
values = rsp.getValues();
|
||||
clusters = values.get("clusters");
|
||||
//System.out.println("Clusters: " + clusters);
|
||||
assertTrue("clusters is null and it shouldn't be", clusters != null);
|
||||
req.close();
|
||||
public void testLingoAlgorithm() throws Exception {
|
||||
compareToExpected(clusters("lingo", QUERY_TESTSET_SAMPLE_DOCUMENTS));
|
||||
}
|
||||
|
||||
|
||||
// tests ClusteringComponent.docListToSolrDocumentList
|
||||
@Test
|
||||
public void testDocListConversion() throws Exception {
|
||||
assertU("", adoc("id", "3234", "url", "ignoreme", "val_i", "1",
|
||||
"val_dynamic", "quick red fox"));
|
||||
assertU("", adoc("id", "3235", "url", "ignoreme", "val_i", "1",
|
||||
"val_dynamic", "quick green fox"));
|
||||
assertU("", adoc("id", "3236", "url", "ignoreme", "val_i", "1",
|
||||
"val_dynamic", "quick brown fox"));
|
||||
assertU("", commit());
|
||||
public void testStcAlgorithm() throws Exception {
|
||||
compareToExpected(clusters("stc", QUERY_TESTSET_SAMPLE_DOCUMENTS));
|
||||
}
|
||||
|
||||
h.getCore().withSearcher(srchr -> {
|
||||
QueryResult qr = new QueryResult();
|
||||
QueryCommand cmd = new QueryCommand();
|
||||
cmd.setQuery(new MatchAllDocsQuery());
|
||||
cmd.setLen(10);
|
||||
qr = srchr.search(qr, cmd);
|
||||
@Test
|
||||
public void testKmeansAlgorithm() throws Exception {
|
||||
compareToExpected(clusters("kmeans", QUERY_TESTSET_SAMPLE_DOCUMENTS));
|
||||
}
|
||||
|
||||
DocList docs = qr.getDocList();
|
||||
assertEquals("wrong docs size", 3, docs.size());
|
||||
Set<String> fields = new HashSet<>();
|
||||
fields.add("val_dynamic");
|
||||
fields.add("dynamic_val");
|
||||
fields.add("range_facet_l"); // copied from id
|
||||
@Test
|
||||
public void testParamSubclusters() throws Exception {
|
||||
compareToExpected("off", clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
|
||||
params.set(EngineParameters.PARAM_INCLUDE_SUBCLUSTERS, false);
|
||||
}));
|
||||
compareToExpected("on", clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
|
||||
params.set(EngineParameters.PARAM_INCLUDE_SUBCLUSTERS, true);
|
||||
}));
|
||||
}
|
||||
|
||||
SolrDocumentList list = ClusteringComponent.docListToSolrDocumentList(docs, srchr, fields, null);
|
||||
assertEquals("wrong list Size", docs.size(), list.size());
|
||||
for (SolrDocument document : list) {
|
||||
@Test
|
||||
public void testParamOtherTopics() throws Exception {
|
||||
compareToExpected(clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
|
||||
params.set(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, false);
|
||||
}));
|
||||
}
|
||||
|
||||
assertTrue("unexpected field", ! document.containsKey("val_i"));
|
||||
assertTrue("unexpected id field", ! document.containsKey("id"));
|
||||
/**
|
||||
* We'll make two queries, one with and one without summaries,
|
||||
* and assert that documents are shorter when the highlighter is in use.
|
||||
*/
|
||||
@Test
|
||||
public void testClusteringOnHighlights() throws Exception {
|
||||
String query = "+snippet:mine +" + QUERY_TESTSET_SAMPLE_DOCUMENTS;
|
||||
|
||||
assertTrue("original field", document.containsKey("val_dynamic"));
|
||||
assertTrue("dyn copy field", document.containsKey("dynamic_val"));
|
||||
assertTrue("copy field", document.containsKey("range_facet_l"));
|
||||
Consumer<ModifiableSolrParams> common = params -> {
|
||||
params.add(EngineParameters.PARAM_FIELDS, "title, snippet");
|
||||
params.add(EngineParameters.PARAM_CONTEXT_SIZE, Integer.toString(80));
|
||||
params.add(EngineParameters.PARAM_CONTEXT_COUNT, Integer.toString(1));
|
||||
};
|
||||
|
||||
assertNotNull("original field null", document.get("val_dynamic"));
|
||||
assertNotNull("dyn copy field null", document.get("dynamic_val"));
|
||||
assertNotNull("copy field null", document.get("range_facet_l"));
|
||||
List<Cluster<SolrDocument>> highlighted = clusters("echo", query,
|
||||
common.andThen(params -> {
|
||||
params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "true");
|
||||
}));
|
||||
|
||||
List<Cluster<SolrDocument>> full = clusters("echo", query,
|
||||
common.andThen(params -> {
|
||||
params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "false");
|
||||
}));
|
||||
|
||||
// Echo clustering algorithm just returns document fields as cluster labels
|
||||
// so highlighted snippets should never be longer than full field content.
|
||||
Assert.assertEquals(highlighted.size(), full.size());
|
||||
for (int i = 0; i < highlighted.size(); i++) {
|
||||
List<String> labels1 = highlighted.get(i).getLabels();
|
||||
List<String> labels2 = full.get(i).getLabels();
|
||||
assertEquals(labels1.size(), labels2.size());
|
||||
for (int j = 0; j < labels1.size(); j++) {
|
||||
MatcherAssert.assertThat("Summary shorter than original document?",
|
||||
labels1.get(j).length(),
|
||||
Matchers.lessThanOrEqualTo(labels2.get(j).length()));
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* We'll make two queries, one with short summaries and another with longer
|
||||
* summaries, and check that the results differ.
|
||||
*/
|
||||
@Test
|
||||
public void testSummaryFragSize() throws Exception {
|
||||
String query = "+snippet:mine +" + QUERY_TESTSET_SAMPLE_DOCUMENTS;
|
||||
|
||||
Consumer<ModifiableSolrParams> common = params -> {
|
||||
params.add(EngineParameters.PARAM_PREFER_QUERY_CONTEXT, "true");
|
||||
params.add(EngineParameters.PARAM_FIELDS, "title, snippet");
|
||||
params.add(EngineParameters.PARAM_CONTEXT_COUNT, Integer.toString(1));
|
||||
};
|
||||
|
||||
List<Cluster<SolrDocument>> shortSummaries = clusters("echo", query,
|
||||
common.andThen(params -> {
|
||||
params.add(EngineParameters.PARAM_CONTEXT_SIZE, Integer.toString(30));
|
||||
}));
|
||||
|
||||
List<Cluster<SolrDocument>> longSummaries = clusters("echo", query,
|
||||
common.andThen(params -> {
|
||||
params.add(EngineParameters.PARAM_CONTEXT_SIZE, Integer.toString(80));
|
||||
}));
|
||||
|
||||
    Assert.assertEquals(shortSummaries.size(), longSummaries.size());
    for (int i = 0; i < shortSummaries.size(); i++) {
      List<String> shortLabels = shortSummaries.get(i).getLabels();
      List<String> longLabels = longSummaries.get(i).getLabels();
      assertEquals(shortLabels.size(), longLabels.size());
      for (int j = 0; j < shortLabels.size(); j++) {
        MatcherAssert.assertThat("Shorter summary is longer than longer summary?",
            shortLabels.get(j).length(),
            Matchers.lessThanOrEqualTo(longLabels.get(j).length()));
      }
    }
  }

  /**
   * Test passing algorithm parameters via SolrParams.
   */
  @Test
  public void testPassingAttributes() throws Exception {
    compareToExpected(clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
      params.set("maxClusters", 2);
      params.set("hierarchyDepth", 1);
      params.add(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, "false");
    }));
  }

  /**
   * Test passing algorithm parameters via Solr configuration file.
   */
  @Test
  public void testPassingAttributesViaSolrConfig() throws Exception {
    compareToExpected(clusters("mock-solrconfig-attrs", QUERY_TESTSET_SAMPLE_DOCUMENTS));
  }

  /**
   * Test maximum label truncation.
   */
  @Test
  public void testParamMaxLabels() throws Exception {
    List<Cluster<SolrDocument>> clusters = clusters("mock", QUERY_TESTSET_SAMPLE_DOCUMENTS, params -> {
      params.set("labelsPerCluster", "5");
      params.set(EngineParameters.PARAM_INCLUDE_OTHER_TOPICS, "false");
      params.set(EngineParameters.PARAM_MAX_LABELS, "3");
    });

    clusters.forEach(c -> {
      MatcherAssert.assertThat(c.getLabels(), Matchers.hasSize(3));
    });
  }

  @Test
  public void testCustomLanguageResources() throws Exception {
    compareToExpected(clusters(
        "testCustomLanguageResources",
        QUERY_TESTSET_SAMPLE_DOCUMENTS));
  }

  @Test
  public void testParamDefaultLanguage() throws Exception {
    compareToExpected(clusters(
        "testParamDefaultLanguage",
        QUERY_TESTSET_SAMPLE_DOCUMENTS));
  }

  /**
   * Verify that documents with an explicit language name
   * field are clustered in separate batches.
   *
   * @see EngineParameters#PARAM_LANGUAGE_FIELD
   */
  @Test
  public void testParamLanguageField() throws Exception {
    compareToExpected(clusters(
        "testParamLanguageField",
        QUERY_TESTSET_SAMPLE_DOCUMENTS));
  }

  private void compareToExpected(List<Cluster<SolrDocument>> clusters) throws IOException {
    compareToExpected("", clusters);
  }

  private void compareToExpected(String resourceSuffix,
                                 List<Cluster<SolrDocument>> clusters) throws IOException {
    String actual = toString(clusters);
    String expected = getTestResource(getClass(), resourceSuffix);
    compareWhitespaceNormalized(actual, expected);
  }

  static void compareWhitespaceNormalized(String actual, String expected) {
    Function<String, String> normalize = v -> v.replaceAll("\r", "").replaceAll("[ \t]+", " ").trim();

    if (!normalize.apply(expected).equals(normalize.apply(actual))) {
      throw new AssertionError(String.format(Locale.ROOT,
          "The actual clusters structure differs from the expected one. Expected:\n%s\n\nActual:\n%s",
          expected,
          actual));
    }
  }
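
  /**
   * Loads the expected test output resource. The resource name is derived
   * from the currently running test's class and method name, plus an
   * optional suffix.
   */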
  static String getTestResource(Class<?> clazz, String expectedResourceSuffix) throws IOException {
    RandomizedContext ctx = RandomizedContext.current();
    String resourceName = String.format(Locale.ROOT,
        "%s-%s%s.txt",
        ctx.getTargetClass().getSimpleName(),
        ctx.getTargetMethod().getName(),
        expectedResourceSuffix.isEmpty() ? "" : "-" + expectedResourceSuffix);

    String expected;
    try (InputStream is = clazz.getResourceAsStream(resourceName)) {
      if (is == null) {
        throw new AssertionError("Test resource not found: " + resourceName + " (class-relative to " +
            clazz.getName() + ")");
      }

      expected = new String(is.readAllBytes(), StandardCharsets.UTF_8);
    }
    return expected;
  }

  private String toString(List<Cluster<SolrDocument>> clusters) {
    return toString(clusters, "", new StringBuilder()).toString();
  }

  private StringBuilder toString(List<Cluster<SolrDocument>> clusters, String indent, StringBuilder sb) {
    clusters.forEach(c -> {
      sb.append(indent);
      sb.append("- " + c.getLabels().stream().collect(Collectors.joining("; ")));
      if (!c.getDocuments().isEmpty()) {
        sb.append(" [" + c.getDocuments().size() + "]");
      }
      sb.append("\n");

      if (!c.getClusters().isEmpty()) {
        toString(c.getClusters(), indent + "  ", sb);
      }
    });
    return sb;
  }
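
  /*
   * The helpers below execute a complete search request against the
   * configured handler (with clustering enabled) and convert the clustering
   * component's response back into Cluster objects for easier assertions.
   */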
  private List<Cluster<SolrDocument>> clusters(String engineName, String query, Consumer<ModifiableSolrParams> paramsConsumer) {
    return clusters("/select", engineName, query, paramsConsumer);
  }

  private List<Cluster<SolrDocument>> clusters(String engineName, String query) {
    return clusters("/select", engineName, query, params -> {
    });
  }

  private List<Cluster<SolrDocument>> clusters(String handlerName, String engineName, String query,
                                               Consumer<ModifiableSolrParams> paramsConsumer) {
    SolrCore core = h.getCore();

    ModifiableSolrParams reqParams = new ModifiableSolrParams();
    reqParams.add(ClusteringComponent.COMPONENT_NAME, "true");
    reqParams.add(ClusteringComponent.REQUEST_PARAM_ENGINE, engineName);
    reqParams.add(CommonParams.Q, query);
    reqParams.add(CommonParams.ROWS, "1000");
    paramsConsumer.accept(reqParams);
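
    // Sanity check: the requested engine must be registered with the
    // handler's clustering component.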
    SearchHandler handler = (SearchHandler) core.getRequestHandler(handlerName);
    assertTrue("Clustering engine named '" + engineName + "' exists.", handler.getComponents().stream()
        .filter(c -> c instanceof ClusteringComponent)
        .flatMap(c -> ((ClusteringComponent) c).getEngineNames().stream())
        .anyMatch(localName -> Objects.equals(localName, engineName)));

    SolrQueryResponse rsp = new SolrQueryResponse();
    rsp.addResponseHeader(new SimpleOrderedMap<>());
    try (SolrQueryRequest req = new LocalSolrQueryRequest(core, reqParams)) {
      handler.handleRequest(req, rsp);
      NamedList<?> values = rsp.getValues();
      @SuppressWarnings("unchecked")
      List<NamedList<Object>> clusters = (List<NamedList<Object>>) values.get("clusters");
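
      // Map unique-key values back to their SolrDocument instances so that
      // clusters can reference the full documents.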
      String idField = core.getLatestSchema().getUniqueKeyField().getName();
      Map<String, SolrDocument> idToDoc = new HashMap<>();
      ResultContext resultContext = (ResultContext) rsp.getResponse();
      for (Iterator<SolrDocument> it = resultContext.getProcessedDocuments(); it.hasNext(); ) {
        SolrDocument doc = it.next();
        idToDoc.put(doc.getFirstValue(idField).toString(), doc);
      }

      return clusters.stream().map(c -> toCluster(c, idToDoc)).collect(Collectors.toList());
    }
  }
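
  // Recursively converts a cluster's NamedList representation back into a
  // Cluster object.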
  @SuppressWarnings("unchecked")
  private Cluster<SolrDocument> toCluster(NamedList<Object> v, Map<String, SolrDocument> idToDoc) {
    Cluster<SolrDocument> c = new Cluster<>();
    v.forEach((key, value) -> {
      switch (key) {
        case ClusteringResponse.DOCS_NODE:
          ((List<String>) value).forEach(docId -> c.addDocument(idToDoc.get(docId)));
          break;
        case ClusteringResponse.LABELS_NODE:
          ((List<String>) value).forEach(c::addLabel);
          break;
        case ClusteringResponse.SCORE_NODE:
          c.setScore(((Number) value).doubleValue());
          break;
        case ClusteringResponse.CLUSTERS_NODE:
          ((List<NamedList<Object>>) value).forEach(sub -> {
            c.addCluster(toCluster(sub, idToDoc));
          });
          break;
        case ClusteringResponse.IS_OTHER_TOPICS:
          // Just ignore the attribute.
          break;
        default:
          throw new RuntimeException("Unknown output property " + key + " in cluster: " + v.jsonStr());
      }
    });
    return c;
  }
}

@@ -1,54 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.common.params.CommonParams;
import org.junit.Test;

@SuppressSSL
public class DistributedClusteringComponentTest extends
    BaseDistributedSearchTestCase {

  @Override
  public String getSolrHome() {
    return getFile("clustering/solr/collection1").getParent();
  }

  @Test
  public void test() throws Exception {
    del("*:*");
    int numberOfDocs = 0;
    for (String[] doc : AbstractClusteringTestCase.DOCUMENTS) {
      index(id, Integer.toString(numberOfDocs++), "url", doc[0], "title", doc[1], "snippet", doc[2]);
    }
    commit();
    handle.clear();
    // Only really care about the clusters for this test case, so drop the header and response
    handle.put("responseHeader", SKIP);
    handle.put("response", SKIP);
    query(
        ClusteringComponent.COMPONENT_NAME, "true",
        CommonParams.Q, "*:*",
        CommonParams.SORT, id + " desc",
        ClusteringParams.USE_SEARCH_RESULTS, "true");
    // destroy is not needed because distribTearDown method of base class does it.
    //destroyServers();
  }

}

@@ -0,0 +1,62 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.carrot2.attrs.AttrComposite;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.language.LanguageComponents;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;

/**
 * Test-only pseudo clustering algorithm that creates
 * a cluster for each input document and sets the labels
 * of this cluster to the full content of clustered input
 * fields.
 */
public class EchoClusteringAlgorithm extends AttrComposite implements ClusteringAlgorithm {
  @Override
  public boolean supports(LanguageComponents languageComponents) {
    return true;
  }

  @Override
  public Set<Class<?>> requiredLanguageComponents() {
    return Collections.emptySet();
  }

  @Override
  public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T> documentStream, LanguageComponents languageComponents) {
    List<Cluster<T>> clusters = new ArrayList<>();
    documentStream.forEach(document -> {
      final Cluster<T> cluster = new Cluster<>();
      cluster.addDocument(document);
      document.visitFields((field, value) -> {
        cluster.addLabel(field + ":" + value);
      });
      clusters.add(cluster);
    });

    return clusters;
  }
}

@@ -15,28 +15,20 @@
  * limitations under the License.
  */
 package org.apache.solr.handler.clustering;
-import org.apache.solr.common.params.SolrParams;
-import org.apache.solr.common.util.NamedList;
-import org.apache.solr.search.DocSet;
 
+import org.carrot2.clustering.ClusteringAlgorithmProvider;
 
 /**
- *
- *
- **/
-public class MockDocumentClusteringEngine extends DocumentClusteringEngine {
+ * SPI provider of {@link EchoClusteringAlgorithm}.
+ */
+public class EchoClusteringAlgorithmProvider implements ClusteringAlgorithmProvider {
   @Override
-  public NamedList<?> cluster(DocSet docs, SolrParams solrParams) {
-    return new NamedList<>();
+  public String name() {
+    return EchoClusteringAlgorithm.class.getSimpleName();
   }
 
   @Override
-  public NamedList<?> cluster(SolrParams solrParams) {
-    return new NamedList<>();
-  }
-
-  @Override
-  public boolean isAvailable() {
-    return true;
+  public EchoClusteringAlgorithm get() {
+    return new EchoClusteringAlgorithm();
   }
 }

@@ -0,0 +1,129 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.carrot2.attrs.AttrComposite;
import org.carrot2.attrs.AttrInteger;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.language.LanguageComponents;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Creates a stable set of synthetic clusters based on the provided parameters.
 */
public class MockClusteringAlgorithm extends AttrComposite implements ClusteringAlgorithm {
  public AttrInteger docsInCluster =
      attributes.register(
          "docsInCluster",
          AttrInteger.builder().label("Number of documents in each cluster.")
              .min(1)
              .max(5)
              .defaultValue(3));

  public AttrInteger hierarchyDepth =
      attributes.register(
          "hierarchyDepth",
          AttrInteger.builder().label("Levels of clusters hierarchy.")
              .min(1)
              .max(3)
              .defaultValue(2));

  public AttrInteger maxClusters =
      attributes.register(
          "maxClusters",
          AttrInteger.builder().label("Maximum number of clusters at each hierarchy level.")
              .min(2)
              .max(100)
              .defaultValue(3));

  public AttrInteger labelsPerCluster =
      attributes.register(
          "labelsPerCluster",
          AttrInteger.builder().label("Number of labels generated for each cluster.")
              .min(1)
              .max(5)
              .defaultValue(1));

  @Override
  public boolean supports(LanguageComponents languageComponents) {
    return true;
  }

  @Override
  public Set<Class<?>> requiredLanguageComponents() {
    return Collections.emptySet();
  }

  @Override
  public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T> documentStream,
                                                       LanguageComponents languageComponents) {
    List<T> documents = documentStream.collect(Collectors.toList());
    if (docsInCluster.get() > documents.size()) {
      throw new AssertionError();
    }
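
    // Round-robin over the input documents, restarting from the beginning
    // once the iterator is exhausted.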
    Supplier<T> docSupplier = new Supplier<>() {
      Iterator<T> i = documents.iterator();

      @Override
      public T get() {
        if (!i.hasNext()) {
          i = documents.iterator();
        }
        return i.next();
      }
    };

    return createClusters(hierarchyDepth.get(), "Cluster ", docSupplier);
  }
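
  /**
   * Recursively creates {@code maxClusters} clusters per hierarchy level;
   * leaf-level clusters receive documents, higher levels receive subclusters.
   */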
  private <T extends Document> List<Cluster<T>> createClusters(int level, String prefix,
                                                               Supplier<T> docSupplier) {
    ArrayList<Cluster<T>> clusters = new ArrayList<>();
    for (int count = maxClusters.get(), idx = 1; count > 0; count--, idx++) {
      String label = prefix + (prefix.endsWith(" ") ? "" : ".") + idx;

      Cluster<T> c = new Cluster<>();
      c.addLabel(label);
      for (int cnt = 1, max = labelsPerCluster.get(); cnt < max; cnt++) {
        c.addLabel("Label " + cnt);
      }
      c.setScore(level * count * 0.01);

      if (level == 1) {
        for (int j = docsInCluster.get(); j > 0; j--) {
          c.addDocument(docSupplier.get());
        }
      } else {
        createClusters(level - 1, label, docSupplier).forEach(c::addCluster);
      }

      clusters.add(c);
    }
    return clusters;
  }
}

@@ -14,20 +14,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.solr.handler.clustering.carrot2;
+package org.apache.solr.handler.clustering;
 
-import org.carrot2.core.LanguageCode;
-import org.carrot2.text.linguistic.IStemmer;
-import org.carrot2.text.linguistic.IStemmerFactory;
+import org.carrot2.clustering.ClusteringAlgorithmProvider;
 
-public class DuplicatingStemmerFactory implements IStemmerFactory {
+public class MockClusteringAlgorithmProvider implements ClusteringAlgorithmProvider {
   @Override
-  public IStemmer getStemmer(LanguageCode language) {
-    return new IStemmer() {
-      @Override
-      public CharSequence stem(CharSequence word) {
-        return word.toString() + word.toString();
-      }
-    };
+  public String name() {
+    return MockClusteringAlgorithm.class.getSimpleName();
+  }
+
+  @Override
+  public MockClusteringAlgorithm get() {
+    return new MockClusteringAlgorithm();
   }
 }

@@ -0,0 +1,74 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

import org.carrot2.attrs.AttrComposite;
import org.carrot2.attrs.AttrString;
import org.carrot2.clustering.Cluster;
import org.carrot2.clustering.ClusteringAlgorithm;
import org.carrot2.clustering.Document;
import org.carrot2.language.LanguageComponents;
import org.carrot2.language.LexicalData;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Creates synthetic clusters with diagnostics of
 * {@link LanguageComponents} passed to the clustering method.
 */
class ResourceCheckAlgorithm extends AttrComposite implements ClusteringAlgorithm {
  public AttrString text =
      attributes.register(
          "text",
          AttrString.builder().label("Input text to analyze.")
              .defaultValue(null));

  @Override
  public Set<Class<?>> requiredLanguageComponents() {
    return Set.of(LexicalData.class);
  }

  @Override
  public <T extends Document> List<Cluster<T>> cluster(Stream<? extends T> documentStream,
                                                       LanguageComponents languageComponents) {
    ArrayList<Cluster<T>> clusters = new ArrayList<>();

    Cluster<T> cluster = new Cluster<>();
    cluster.addLabel("Lang: " + languageComponents.language());
    clusters.add(cluster);

    cluster = new Cluster<>();
    clusters.add(cluster);
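
    // The second cluster's label reports, for every input term, whether the
    // language's lexical data marks it as an ignored word or an ignored label.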
    LexicalData lexicalData = languageComponents.get(LexicalData.class);
    cluster.addLabel(Arrays.stream(text.get().trim().split("[\\s]+"))
        .map(term -> String.format(Locale.ROOT,
            "%s[%s, %s]",
            term,
            lexicalData.ignoreWord(term) ? "ignoredWord" : "-",
            lexicalData.ignoreLabel(term) ? "ignoredLabel" : "-"))
        .collect(Collectors.joining(" ")));

    return clusters;
  }
}

@@ -14,12 +14,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-/**
- * {@link org.apache.solr.handler.clustering.carrot2.CarrotClusteringEngine} and related classes for use in the {@link org.apache.solr.handler.clustering.ClusteringComponent}.
- */
-package org.apache.solr.handler.clustering.carrot2;
-
+package org.apache.solr.handler.clustering;
+
+import org.carrot2.clustering.ClusteringAlgorithmProvider;
+
+public class ResourceCheckAlgorithmProvider implements ClusteringAlgorithmProvider {
+  @Override
+  public String name() {
+    return ResourceCheckAlgorithm.class.getSimpleName();
+  }
+
+  @Override
+  public ResourceCheckAlgorithm get() {
+    return new ResourceCheckAlgorithm();
+  }
+}

@@ -0,0 +1,146 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering;

/**
 * Sample data for tests.
 */
final class SampleData {
  static final String[][] SAMPLE_DOCUMENTS =
      new String[][]{
          {
              "Data Mining - Wikipedia",
              "Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."
          },
          {
              "Data mining - Wikipedia, the free encyclopedia",
              "Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..."
          },
          {
              "Electronic Statistics Textbook: Data Mining Techniques",
              "Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques."
          },
          {
              "An Introduction to Data Mining",
              "Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..."
          },
          {
              "Data Mining: What is Data Mining?",
              "Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."
          },
          {
              "Data Mining Software, Data Mining Applications and Data Mining Solutions",
              "The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..."
          },
          {
              "KD Nuggets",
              "Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."
          },
          {
              "data mining: Definition from Answers.com",
              "data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..."
          },
          {
              "STATISTICA Data Mining and Predictive Modeling Solutions",
              "GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..."
          },
          {
              "Data Mining: Text Mining, Visualization and Social Media",
              "Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..."
          },
          {
              "Two Crows Corporation",
              "Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use."
          },
          {
              "Thearling.com",
              "Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies."
          },
          {
              "CCSU - Data Mining",
              "Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling."
          },
          {
              "Oracle Data Mining",
              "Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..."
          },
          {
              "Data Mining: An Introduction",
              "About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques."
          },
          {
              "Open Directory - Computers: Software: Databases: Data Mining",
              "Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..."
          },
          {
              "DMI:Data Mining Institute",
              "Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..."
          },
          {
              "The Data Mine",
              "Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining."
          },
          {
              "St@tServ - About Data Mining",
              "St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..."
          },
          {
              "MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...",
              "Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..."
          },
          {
              "Pentaho Commercial Open Source Business Intelligence: Data Mining",
              "For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..."
          },
          {
              "Investor Home - Data Mining",
              "Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..."
          },
          {
              "Predictive Modeling and Predictive Analytics Solutions | Enterprise ...",
              "Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics."
          },
          {
              "Data mining - SourceWatch",
              "These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..."
          },
          {
              "Statistical Data Mining Tutorials",
              "Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms."
          },
          {
              "Data Mining",
              "With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..."
          },
          {
              "Elder Research",
              "Provides consulting and short courses in data mining and pattern discovery patterns in data."
          },
          {
              "SQL Server Data Mining > Home",
              "SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..."
          },
          {
              "Data Mining",
              "What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..."
          },
          {
              "Data Mining Software and Text Mining | SAS",
              "... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..."
          }
      };
}

@@ -1,542 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.clustering.AbstractClusteringTestCase;
import org.apache.solr.handler.clustering.ClusteringComponent;
import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.search.DocList;
import org.carrot2.clustering.lingo.LingoClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;

/**
 *
 */
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
  @Test
  public void testCarrotLingo() throws Exception {
    // Note: the expected number of clusters may change after upgrading Carrot2
    // due to e.g. internal improvements or tuning of Carrot2 clustering.
    final int expectedNumClusters = 10;
    checkEngine(getClusteringEngine("default"), expectedNumClusters);
  }

  @Test
  public void testProduceSummary() throws Exception {
    // We'll make two queries, one with- and another one without summary
    // and assert that documents are shorter when highlighter is in use.
    final List<NamedList<Object>> noSummaryClusters = clusterWithHighlighting(false, 80);
    final List<NamedList<Object>> summaryClusters = clusterWithHighlighting(true, 80);

    assertEquals("Equal number of clusters", noSummaryClusters.size(), summaryClusters.size());
    for (int i = 0; i < noSummaryClusters.size(); i++) {
      assertTrue("Summary shorter than original document",
          getLabels(noSummaryClusters.get(i)).get(1).length() >
              getLabels(summaryClusters.get(i)).get(1).length());
    }
  }

  @Test
  public void testSummaryFragSize() throws Exception {
    // We'll make two queries, one short summaries and another one with longer
    // summaries and will check that the results differ.
    final List<NamedList<Object>> shortSummaryClusters = clusterWithHighlighting(true, 30);
    final List<NamedList<Object>> longSummaryClusters = clusterWithHighlighting(true, 80);

    assertEquals("Equal number of clusters", shortSummaryClusters.size(), longSummaryClusters.size());
    for (int i = 0; i < shortSummaryClusters.size(); i++) {
      assertTrue("Summary shorter than original document",
          getLabels(shortSummaryClusters.get(i)).get(1).length() <
              getLabels(longSummaryClusters.get(i)).get(1).length());
    }
  }

  private List<NamedList<Object>> clusterWithHighlighting(
      boolean enableHighlighting, int fragSize) throws IOException {
    // Some documents don't have mining in the snippet
    return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 7);
  }

  private List<NamedList<Object>> clusterWithHighlighting(
      boolean enableHighlighting, int fragSize, int summarySnippets,
      String term, int expectedNumDocuments) throws IOException {

    final TermQuery query = new TermQuery(new Term("snippet", term));

    final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
    summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
    summaryParams.add(CarrotParams.PRODUCE_SUMMARY,
        Boolean.toString(enableHighlighting));
    summaryParams
        .add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
    summaryParams
        .add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(summarySnippets));
    final List<NamedList<Object>> summaryClusters = checkEngine(
        getClusteringEngine("echo"), expectedNumDocuments,
        expectedNumDocuments, query, summaryParams);

    return summaryClusters;
  }

  @Test
  public void testCarrotStc() throws Exception {
    checkEngine(getClusteringEngine("stc"), 3);
  }

  @Test
  public void testWithoutSubclusters() throws Exception {
    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
        1, 1, 0);
  }

  @Test
  public void testExternalXmlAttributesFile() throws Exception {
    checkClusters(
        checkEngine(getClusteringEngine("mock-external-attrs"), 13),
        1, 4, 0);
  }

  @Test
  public void testWithSubclusters() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
  }

  @Test
  public void testNumDescriptions() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
    params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
        params), 1, 3, 0);
  }

  @Test
  public void testClusterScores() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
        AbstractClusteringTestCase.numberOfDocs, params);
    int i = 1;
    for (NamedList<Object> cluster : clusters) {
      final Double score = getScore(cluster);
      assertNotNull(score);
      assertEquals(0.25 * i++, score, 0);
    }
  }

  @Test
  public void testOtherTopics() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
        AbstractClusteringTestCase.numberOfDocs, params);
    int i = 1;
    for (NamedList<Object> cluster : clusters) {
      assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
    }
  }

  @Test
  public void testCarrotAttributePassing() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
        params), 1, 3, 0);
  }

  @Test
  public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
    checkLexicalResourcesFromSolrConfig("lexical-resource-check",
        "online,customsolrstopword,customsolrstoplabel");
  }

  @Test
  public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
    checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
        "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
  }

  private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
      throws IOException {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        wordsToCheck);

    // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
    // stoplabels.mt, so we're expecting only one cluster with label "online".
    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine(engineName), 1, params);
    assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online"));
  }

  @Test
  public void testSolrStopWordsUsedInCarrot2Clustering() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        "online,solrownstopword");

    // "solrownstopword" is in stopwords.txt, so we're expecting
    // only one cluster with label "online".
    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine("lexical-resource-check"), 1, params);
    assertEquals(getLabels(clusters.get(0)), Collections.singletonList("online"));
  }

  @Test
  public void testSolrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    // Force string fields to be used for clustering. Does not make sense
    // in the real world, but does the job in the test.
    params.set(CarrotParams.TITLE_FIELD_NAME, "url");
    params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        "online,solrownstopword");

    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine("lexical-resource-check"), 2, params);
    assertEquals(Collections.singletonList("online"), getLabels(clusters.get(0)));
    assertEquals(Collections.singletonList("solrownstopword"), getLabels(clusters.get(1)));
  }

  @Test
  public void testHighlightingOfMultiValueField() throws Exception {
    final String snippetWithoutSummary = getLabels(clusterWithHighlighting(
        false, 30, 3, "multi", 1).get(0)).get(1);
    assertTrue("Snippet contains first value", snippetWithoutSummary.contains("First"));
    assertTrue("Snippet contains second value", snippetWithoutSummary.contains("Second"));
    assertTrue("Snippet contains third value", snippetWithoutSummary.contains("Third"));

    final String snippetWithSummary = getLabels(clusterWithHighlighting(
        true, 30, 3, "multi", 1).get(0)).get(1);
    assertTrue("Snippet with summary shorter than full snippet",
        snippetWithoutSummary.length() > snippetWithSummary.length());
    assertTrue("Summary covers first value", snippetWithSummary.contains("First"));
    assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
    assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
  }

  @Test
  public void testConcatenatingMultipleFields() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("body",
            "snippet")), params).get(0));
    assertTrue("Snippet contains third value", labels.get(0).contains("Title field"));
    assertTrue("Snippet contains third value", labels.get(0).contains("Heading field"));
    assertTrue("Snippet contains third value", labels.get(1).contains("Snippet field"));
    assertTrue("Snippet contains third value", labels.get(1).contains("Body field"));
  }

  @Test
  public void testHighlightingMultipleFields() throws Exception {
    final TermQuery query = new TermQuery(new Term("snippet", "content"));

    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
    params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));

    final String snippetWithoutSummary = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
    assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
    assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));

    params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
    params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
    params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));
    final String snippetWithSummary = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
    assertTrue("Snippet with summary shorter than full snippet",
        snippetWithoutSummary.length() > snippetWithSummary.length());
    assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
    assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
  }

  @Test
  public void testOneCarrot2SupportedLanguage() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
            "one_supported_language")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language", LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
  }

  @Test
  public void testOneCarrot2SupportedLanguageOfMany() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
            "one_supported_language_of_many")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
  }

  @Test
  public void testLanguageCodeMapping() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
    params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
            "one_supported_language_of_many")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
  }

  @Test
  public void testPassingOfCustomFields() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "intfield_i:intfield");
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "floatfield_f:floatfield");
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "heading:multi");

    // Let the echo mock clustering algorithm know which custom field to echo
    params.add("custom-fields", "intfield,floatfield,multi");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
            "custom_fields")), params).get(0));
    assertEquals(5, labels.size());
    assertEquals("Integer field", "10", labels.get(2));
    assertEquals("Float field", "10.5", labels.get(3));
    assertEquals("List field", "[first, second]", labels.get(4));
  }

  @Test
  public void testCustomTokenizer() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("custom-duplicating-tokenizer"), 1, 15, new TermQuery(new Term("title",
            "field")), params).get(0));

    // The custom test tokenizer duplicates each token's text
    assertTrue("First token", labels.get(0).contains("TitleTitle"));
  }

  @Test
  public void testCustomStemmer() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("custom-duplicating-stemmer"), 1, 12, new TermQuery(new Term("title",
            "field")), params).get(0));

    // The custom test stemmer duplicates and lowercases each token's text
    assertTrue("First token", labels.get(0).contains("titletitle"));
  }

  @Test
  public void testDefaultEngineOrder() throws Exception {
    ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-default");
    Map<String, SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
    assertEquals(
        Arrays.asList("stc", "default", "mock"),
        new ArrayList<>(engines.keySet()));
    assertEquals(
        LingoClusteringAlgorithm.class,
        ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
  }

  @Test
  public void testDeclarationEngineOrder() throws Exception {
    ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-decl-order");
    Map<String, SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
    assertEquals(
        Arrays.asList("unavailable", "lingo", "stc", "mock", "default"),
        new ArrayList<>(engines.keySet()));
    assertEquals(
        LingoClusteringAlgorithm.class,
        ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
  }

  @Test
  public void testDeclarationNameDuplicates() throws Exception {
    ClusteringComponent comp = (ClusteringComponent) h.getCore().getSearchComponent("clustering-name-dups");
    Map<String, SearchClusteringEngine> engines = getSearchClusteringEngines(comp);
    assertEquals(
        Arrays.asList("", "default"),
        new ArrayList<>(engines.keySet()));
    assertEquals(
        MockClusteringAlgorithm.class,
        ((CarrotClusteringEngine) engines.get(ClusteringEngine.DEFAULT_ENGINE_NAME)).getClusteringAlgorithmClass());
  }

  private CarrotClusteringEngine getClusteringEngine(String engineName) {
    ClusteringComponent comp = (ClusteringComponent) h.getCore()
        .getSearchComponent("clustering");
    assertNotNull("clustering component should not be null", comp);
    CarrotClusteringEngine engine =
        (CarrotClusteringEngine) getSearchClusteringEngines(comp).get(engineName);
    assertNotNull("clustering engine for name: " + engineName
        + " should not be null", engine);
    return engine;
  }

  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                                              int expectedNumClusters) throws IOException {
    return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
  }

  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
                                              int expectedNumClusters, SolrParams clusteringParams) throws IOException {
    return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
  }

  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
                                              int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
    // Get all documents to cluster
    return h.getCore().withSearcher(searcher -> {
      DocList docList = searcher.getDocList(query, (Query) null, new Sort(), 0,
          numberOfDocs);
      assertEquals("docList size", expectedNumDocs, docList.matches());

      ModifiableSolrParams solrParams = new ModifiableSolrParams();
      solrParams.add(clusteringParams);

      // Perform clustering
      LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
      Map<SolrDocument, Integer> docIds = new HashMap<>(docList.size());
      SolrDocumentList solrDocList = ClusteringComponent.docListToSolrDocumentList(docList, searcher, engine.getFieldsToLoad(req), docIds);

      @SuppressWarnings("unchecked")
      List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
      req.close();
      assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
      checkClusters(results, false);
      return results;
    });
  }

  private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
                             int expectedLabelCount, int expectedSubclusterCount) {
    for (int i = 0; i < results.size(); i++) {
      NamedList<Object> cluster = results.get(i);
      checkCluster(cluster, expectedDocCount, expectedLabelCount,
          expectedSubclusterCount);
    }
  }

  private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
    for (int i = 0; i < results.size(); i++) {
      checkCluster(results.get(i), hasSubclusters);
    }
  }

  private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
    List<Object> docs = getDocs(cluster);
    assertNotNull("docs is null and it shouldn't be", docs);
    for (int j = 0; j < docs.size(); j++) {
      Object id = docs.get(j);
      assertNotNull("id is null and it shouldn't be", id);
    }

    List<String> labels = getLabels(cluster);
    assertNotNull("labels is null but it shouldn't be", labels);

    if (hasSubclusters) {
      List<NamedList<Object>> subclusters = getSubclusters(cluster);
      assertNotNull("subclusters is null but it shouldn't be", subclusters);
    }
  }

  private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
                            int expectedLabelCount, int expectedSubclusterCount) {
    checkCluster(cluster, expectedSubclusterCount > 0);
    assertEquals("number of docs in cluster", expectedDocCount,
        getDocs(cluster).size());
    assertEquals("number of labels in cluster", expectedLabelCount,
        getLabels(cluster).size());

    if (expectedSubclusterCount > 0) {
      List<NamedList<Object>> subclusters = getSubclusters(cluster);
      assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
      assertEquals("number of subclusters in cluster",
          expectedSubclusterCount, subclusters.size());
    }
  }

  @SuppressWarnings("unchecked")
  private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
    return (List<NamedList<Object>>) cluster.get("clusters");
  }

  @SuppressWarnings("unchecked")
  private List<String> getLabels(NamedList<Object> cluster) {
    return (List<String>) cluster.get("labels");
  }

  private Double getScore(NamedList<Object> cluster) {
    return (Double) cluster.get("score");
  }

  private Boolean isOtherTopics(NamedList<Object> cluster) {
    return (Boolean) cluster.get("other-topics");
  }

  @SuppressWarnings("unchecked")
  private List<Object> getDocs(NamedList<Object> cluster) {
    return (List<Object>) cluster.get("docs");
  }
}

@@ -1,51 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.io.IOException;
import java.io.Reader;

import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;

public class DuplicatingTokenizerFactory implements ITokenizerFactory {
  @Override
  public ITokenizer getTokenizer(LanguageCode language) {
    return new ITokenizer() {
      private final ExtendedWhitespaceTokenizer delegate = new ExtendedWhitespaceTokenizer();

      @Override
      public void setTermBuffer(MutableCharArray buffer) {
        delegate.setTermBuffer(buffer);
        buffer.reset(buffer.toString() + buffer.toString());
      }

      @Override
      public void reset(Reader input) {
        delegate.reset(input);
      }

      @Override
      public short nextToken() throws IOException {
        return delegate.nextToken();
      }
    };
  }
}

@@ -1,76 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.util.ArrayList;
import java.util.List;

import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;

/**
 * A mock Carrot2 clustering algorithm that outputs input documents as clusters.
 * Useful only in tests.
 */
@Bindable(prefix = "EchoClusteringAlgorithm")
public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
    IClusteringAlgorithm {
  @Input
  @Processing
  @Attribute(key = AttributeNames.DOCUMENTS)
  public List<Document> documents;

  @Output
  @Processing
  @Attribute(key = AttributeNames.CLUSTERS)
  public List<Cluster> clusters;

  @Input
  @Processing
  @Attribute(key = "custom-fields")
  public String customFields = "";

  @Override
  public void process() throws ProcessingException {
    clusters = new ArrayList<>();

    for (Document document : documents) {
      final Cluster cluster = new Cluster();
      cluster.addPhrases(document.getTitle(), document.getSummary());
      if (document.getLanguage() != null) {
        cluster.addPhrases(document.getLanguage().name());
      }
      for (String field : customFields.split(",")) {
        Object value = document.getField(field);
        if (value != null) {
          cluster.addPhrases(value.toString());
        }
      }
      cluster.addDocuments(document);
      clusters.add(cluster);
    }
  }
}

@ -1,74 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.util.ArrayList;
import java.util.List;

import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.PreprocessingContext.AllStems;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;

/**
 * A mock Carrot2 clustering algorithm that outputs the stem of each token of
 * each document as a separate cluster. Useful only in tests.
 */
@Bindable(prefix = "EchoStemsClusteringAlgorithm")
public class EchoStemsClusteringAlgorithm extends ProcessingComponentBase
    implements IClusteringAlgorithm {
  @Input
  @Processing
  @Attribute(key = AttributeNames.DOCUMENTS)
  public List<Document> documents;

  @Output
  @Processing
  @Attribute(key = AttributeNames.CLUSTERS)
  public List<Cluster> clusters;

  public BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();

  @Override
  public void process() throws ProcessingException {
    final PreprocessingContext preprocessingContext = preprocessing.preprocess(
        documents, "", LanguageCode.ENGLISH);
    final AllTokens allTokens = preprocessingContext.allTokens;
    final AllWords allWords = preprocessingContext.allWords;
    final AllStems allStems = preprocessingContext.allStems;
    clusters = new ArrayList<>();
    for (int i = 0; i < allTokens.image.length; i++) {
      if (allTokens.wordIndex[i] >= 0) {
        clusters.add(new Cluster(new String(
            allStems.image[allWords.stemIndex[allTokens.wordIndex[i]]])));
      }
    }
  }
}
@ -1,68 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.util.ArrayList;
import java.util.List;

import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;

/**
 * A mock Carrot2 clustering algorithm that outputs each token of each document
 * as a separate cluster. Useful only in tests.
 */
@Bindable(prefix = "EchoTokensClusteringAlgorithm")
public class EchoTokensClusteringAlgorithm extends ProcessingComponentBase
    implements IClusteringAlgorithm {
  @Input
  @Processing
  @Attribute(key = AttributeNames.DOCUMENTS)
  public List<Document> documents;

  @Output
  @Processing
  @Attribute(key = AttributeNames.CLUSTERS)
  public List<Cluster> clusters;

  public BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();

  @Override
  public void process() throws ProcessingException {
    final PreprocessingContext preprocessingContext = preprocessing.preprocess(
        documents, "", LanguageCode.ENGLISH);
    clusters = new ArrayList<>();
    for (char[] token : preprocessingContext.allTokens.image) {
      if (token != null) {
        clusters.add(new Cluster(new String(token)));
      }
    }
  }
}
@ -1,79 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import java.util.ArrayList;
import java.util.List;

import org.carrot2.core.Cluster;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;

/**
 * A mock implementation of a Carrot2 clustering algorithm for testing whether
 * the customized lexical resource lookup works correctly. This algorithm
 * ignores the input documents; instead, for each word from {@link #wordsToCheck},
 * it outputs a cluster labeled with the word, but only if the word is neither
 * a stop word nor a stop label.
 */
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
    ProcessingComponentBase implements IClusteringAlgorithm {

  @Output
  @Processing
  @Attribute(key = AttributeNames.CLUSTERS)
  public List<Cluster> clusters;

  @Input
  @Processing
  @Attribute
  public String wordsToCheck;

  public BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();

  @Override
  public void process() throws ProcessingException {
    clusters = new ArrayList<>();
    if (wordsToCheck == null) {
      return;
    }

    // Test with Maltese so that the English clustering performed in other
    // tests is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);

    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }
}
@ -1,103 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.clustering.carrot2;

import org.carrot2.core.*;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;

import java.util.ArrayList;
import java.util.List;

/**
 * A mock Carrot2 clustering algorithm that builds a fixed hierarchy of
 * labeled clusters from the input documents. Useful only in tests.
 */
@Bindable(prefix = "MockClusteringAlgorithm")
public class MockClusteringAlgorithm extends ProcessingComponentBase implements
    IClusteringAlgorithm {
  @Input
  @Processing
  @Attribute(key = AttributeNames.DOCUMENTS)
  public List<Document> documents;

  @Output
  @Processing
  @Attribute(key = AttributeNames.CLUSTERS)
  public List<Cluster> clusters;

  @Input
  @Processing
  @Attribute
  @IntRange(min = 1, max = 5)
  public int depth = 2;

  @Input
  @Processing
  @Attribute
  @IntRange(min = 1, max = 5)
  public int labels = 1;

  @Input
  @Processing
  @Attribute
  @IntRange(min = 0)
  public int maxClusters = 0;

  @Input
  @Processing
  @Attribute
  public int otherTopicsModulo = 0;

  @Override
  public void process() throws ProcessingException {
    clusters = new ArrayList<>();
    if (documents == null) {
      return;
    }

    if (maxClusters > 0) {
      documents = documents.subList(0, maxClusters);
    }

    int documentIndex = 1;
    for (Document document : documents) {
      StringBuilder label = new StringBuilder("Cluster " + documentIndex);
      Cluster cluster = createCluster(label.toString(), documentIndex, document);
      clusters.add(cluster);
      for (int i = 1; i <= depth; i++) {
        label.append(".");
        label.append(i);
        Cluster newCluster = createCluster(label.toString(), documentIndex, document);
        cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
        cluster = newCluster;
      }
      documentIndex++;
    }
  }

  private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
    Cluster cluster = new Cluster();
    cluster.setScore(documentIndex * 0.25);
    if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0) {
      cluster.setOtherTopics(true);
    }
    for (int i = 0; i < labels; i++) {
      cluster.addPhrases(labelBase + "#" + (i + 1));
    }
    cluster.addDocuments(documents);
    return cluster;
  }
}
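(An illustrative sketch, not part of the deleted file: with the defaults
depth=2 and labels=1, the logic above turns a single input document into a
hierarchy like the one below. createCluster() appends "#1" to each label,
and the duplicate sibling labels come from addSubclusters() receiving two
clusters built from the same label base.

    Cluster 1#1
      Cluster 1.1#1
      Cluster 1.1#1
        Cluster 1.1.2#1
        Cluster 1.1.2#1
)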
@ -1 +0,0 @@
7f13f63e2e213f6ea38364836408d2dc11f29804
@ -1,202 +0,0 @@

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   [Standard Apache License 2.0 terms, sections 1 through 9 and appendix,
    reproduced verbatim from http://www.apache.org/licenses/LICENSE-2.0]
@ -1,9 +0,0 @@
=========================================================================
==     Carrot2 Attributes Binder Notice                                ==
=========================================================================
Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
All rights reserved.

This product includes software developed by the Carrot2 Project.

See http://project.carrot2.org/
@ -1,7 +1,7 @@

Carrot2 Project

Copyright (C) 2002-2013, Dawid Weiss, Stanisław Osiński.
Copyright (C) 2002-2020, Dawid Weiss, Stanisław Osiński.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
@ -1,10 +1,10 @@
=========================================================================
==     Carrot2 Notice                                                  ==
=========================================================================
Copyright (C) 2002-2013, Dawid Weiss, Stanislaw Osinski.
Copyright (C) 2002-2020, Dawid Weiss, Stanislaw Osinski.
Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
All rights reserved.

This product includes software developed by the Carrot2 Project.

See http://project.carrot2.org/
See https://project.carrot2.org/
@ -0,0 +1 @@
fb60ab80cfd69abe6cad1939f24bd5210501b177

@ -1 +0,0 @@
539317dc171b8c92cca964e87686602800cf19b0
@ -1,202 +0,0 @@

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   [Standard Apache License 2.0 terms, identical to the copy above.]
@ -1,5 +0,0 @@
This product includes software developed by
Google, Inc. (http://code.google.com/p/guava-libraries/)

Repacked Carrot2 Guava at:
https://github.com/carrot2/lib-repackaged
@ -1 +0,0 @@
decabb42b88a8d40c1894984f4af8adb833f766b

@ -1 +0,0 @@
045fda5ac6087bc82a209d8cdb73f8d0dbdcfc7b
@ -1,202 +0,0 @@

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   [Standard Apache License 2.0 terms, identical to the copies above.]
@ -1,2 +0,0 @@
This product includes software developed by
the SimpleXML project (http://simple.sourceforge.net).
@ -1,11 +0,0 @@
An override location for the clustering algorithm's resources:
attribute definitions and lexical resources.

This is a directory from which algorithm-specific stop words,
stop labels and attribute definition XMLs are loaded.

For an overview of Carrot2 lexical resources, see:
http://download.carrot2.org/head/manual/#chapter.lexical-resources

For an overview of Lingo3G lexical resources, see:
http://download.carrotsearch.com/lingo3g/manual/#chapter.lexical-resources
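(An illustrative sketch, assuming the Carrot2 3.x naming conventions: a
resource directory like the one described above would typically hold files
such as

    clustering/carrot2/stopwords.en
    clustering/carrot2/stoplabels.en
    clustering/carrot2/lingo-attributes.xml

where the per-language stopword and stoplabel file names follow the Carrot2
3.x scheme, and lingo-attributes.xml stands for an algorithm attribute set
like the ones shown below.)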
@ -1,19 +0,0 @@
<!--
  Default configuration for the bisecting k-means clustering algorithm.

  This file can be loaded (and saved) by Carrot2 Workbench.
  http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
  <attribute-set id="attributes">
    <value-set>
      <label>attributes</label>
      <attribute key="MultilingualClustering.defaultLanguage">
        <value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
      </attribute>
      <attribute key="MultilingualClustering.languageAggregationStrategy">
        <value type="org.carrot2.text.clustering.MultilingualClustering$LanguageAggregationStrategy" value="FLATTEN_MAJOR_LANGUAGE"/>
      </attribute>
    </value-set>
  </attribute-set>
</attribute-sets>
@ -1,24 +0,0 @@
<!--
  Default configuration for the Lingo clustering algorithm.

  This file can be loaded (and saved) by Carrot2 Workbench.
  http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
  <attribute-set id="attributes">
    <value-set>
      <label>attributes</label>
      <!--
        The language to assume for clustered documents.
        For a list of allowed values, see:
        http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
      -->
      <attribute key="MultilingualClustering.defaultLanguage">
        <value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
      </attribute>
      <attribute key="LingoClusteringAlgorithm.desiredClusterCountBase">
        <value type="java.lang.Integer" value="20"/>
      </attribute>
    </value-set>
  </attribute-set>
</attribute-sets>
@ -1,19 +0,0 @@
<!--
  Default configuration for the STC clustering algorithm.

  This file can be loaded (and saved) by Carrot2 Workbench.
  http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
  <attribute-set id="attributes">
    <value-set>
      <label>attributes</label>
      <attribute key="MultilingualClustering.defaultLanguage">
        <value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
      </attribute>
      <attribute key="MultilingualClustering.languageAggregationStrategy">
        <value type="org.carrot2.text.clustering.MultilingualClustering$LanguageAggregationStrategy" value="FLATTEN_MAJOR_LANGUAGE"/>
      </attribute>
    </value-set>
  </attribute-set>
</attribute-sets>
@ -1002,7 +1002,7 @@
    </arr>
  </requestHandler>

  <!-- Clustering Component
  <!-- Search results clustering component

       You'll need to set the solr.clustering.enabled system property
       when running Solr to run with clustering enabled:
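       An illustrative invocation, assuming the standard bin/solr scripts
       (shown as an example, not quoted from this file):

         bin/solr start -Dsolr.clustering.enabled=true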
@ -1014,69 +1014,64 @@
                   enable="${solr.clustering.enabled:false}"
                   class="solr.clustering.ClusteringComponent" >
    <!--
      Declaration of "engines" (clustering algorithms).
      Declaration of "engines" (named sets of configuration parameters).

      The open source algorithms from Carrot2.org project:
        * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
        * org.carrot2.clustering.stc.STCClusteringAlgorithm
        * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
      Various algorithms are available (names are loaded via service provider
      extension point). The open source algorithms from Carrot2.org project:
        * Lingo
        * STC
        * Bisecting K-Means

      Commercial algorithm Lingo3G (needs to be installed separately):
        * com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm
      Commercial algorithm Lingo3G from Carrot Search (needs to be installed separately):
        * Lingo3G
    -->

    <lst name="engine">
      <str name="name">lingo3g</str>
      <bool name="optional">true</bool>
      <str name="carrot.algorithm">com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
      <str name="clustering.algorithm">Lingo3G</str>
      <str name="clustering.fields">name, features</str>
      <bool name="clustering.includeOtherTopics">true</bool>
      <str name="clustering.language">English</str>
    </lst>

    <lst name="engine">
      <str name="name">lingo</str>
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
      <str name="clustering.algorithm">Lingo</str>
      <str name="clustering.fields">name, features</str>
      <bool name="clustering.includeOtherTopics">true</bool>
      <str name="clustering.language">English</str>
    </lst>

    <lst name="engine">
      <str name="name">stc</str>
      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
      <str name="clustering.algorithm">STC</str>
      <str name="clustering.fields">name, features</str>
      <bool name="clustering.includeOtherTopics">true</bool>
      <str name="clustering.language">English</str>
    </lst>

    <lst name="engine">
      <str name="name">kmeans</str>
      <str name="carrot.algorithm">org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm</str>
      <str name="carrot.resourcesDir">clustering/carrot2</str>
      <str name="clustering.algorithm">Bisecting K-Means</str>
      <str name="clustering.fields">name, features</str>
      <bool name="clustering.includeOtherTopics">true</bool>
      <str name="clustering.language">English</str>
    </lst>
  </searchComponent>
|
||||
<!-- A request handler for demonstrating the clustering component.
|
||||
This is meant as an example.
|
||||
In reality you will likely want to add the component to your
|
||||
already specified request handlers.
|
||||
This is meant as an example - in reality you will likely want
|
||||
to add the clustering component to your default request handler.
|
||||
-->
|
||||
<requestHandler name="/clustering"
|
||||
startup="lazy"
|
||||
enable="${solr.clustering.enabled:false}"
|
||||
class="solr.SearchHandler">
|
||||
<lst name="defaults">
|
||||
<!-- Enable clustering component by default. -->
|
||||
<bool name="clustering">true</bool>
|
||||
<bool name="clustering.results">true</bool>
|
||||
<!-- Field name with the logical "title" of a each document (optional) -->
|
||||
<str name="carrot.title">name</str>
|
||||
<!-- Field name with the logical "URL" of a each document (optional) -->
|
||||
<str name="carrot.url">id</str>
|
||||
<!-- Field name with the logical "content" of a each document (optional) -->
|
||||
<str name="carrot.snippet">features</str>
|
||||
<!-- Apply highlighter to the title/ content and use this for clustering. -->
|
||||
<bool name="carrot.produceSummary">true</bool>
|
||||
<!-- the maximum number of labels per cluster -->
|
||||
<!--<int name="carrot.numDescriptions">5</int>-->
|
||||
<!-- produce sub clusters -->
|
||||
<bool name="carrot.outputSubClusters">false</bool>
|
||||
|
||||
<!-- Configure the remaining request handler parameters. -->
|
||||
<str name="defType">edismax</str>
|
||||
<str name="qf">
|
||||
text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
|
||||
|
@@ -1085,6 +1080,8 @@
      <str name="rows">100</str>
      <str name="fl">*,score</str>
    </lst>

    <!-- Append the clustering component at the end of the search handler's list of components. -->
    <arr name="last-components">
      <str>clustering</str>
    </arr>
[Binary image files (documentation screenshots) not shown: two files added (130 KiB each), one file removed (206 KiB), one file changed (394 KiB before, 622 KiB after).]
@@ -16,35 +16,102 @@
// specific language governing permissions and limitations
// under the License.

[IMPORTANT]
====
The clustering component implementation and API (parameters) have changed significantly
in version 9.0. Please refer to the Solr Guide version matching your Solr release
exactly.
====

The *clustering* (or *cluster analysis*) plugin attempts to automatically discover groups of related search hits (documents) and assign human-readable labels to these groups.

By default in Solr, the clustering algorithm is applied to the search result of each single query -- this is called an _on-line_ clustering. While Solr contains an extension for full-index clustering (_off-line_ clustering) this section will focus on discussing on-line clustering only.
The clustering algorithm in Solr is applied to the documents included in the search results of each single query -- this is called _on-line_ clustering.

Clusters discovered for a given query can be perceived as _dynamic facets_. This is beneficial when regular faceting is difficult (field values are not known in advance) or when the queries are exploratory in nature. Take a look at the https://search.carrot2.org/#/search/web/solr/treemap[Carrot2] project's demo page to see an example of search results clustering in action (the groups in the visualization have been discovered automatically in search results to the right, there is no external information involved).
Clusters discovered for a given query can be perceived as _dynamic facets_. This is beneficial when regular faceting is difficult (field values are not known in advance) or when the queries are exploratory in nature. Take a look at the https://search.carrot2.org/#/search/web/apache%20solr/treemap[Carrot^2^] project's demo page to see an example of search results clustering in action (the groups in the visualization have been discovered automatically in the search results to the right; there is no external information involved).

image::images/result-clustering/carrot2.png[image,width=900]

The query issued to the system was _Solr_. It seems clear that faceting could not yield a similar set of groups, although the goals of both techniques are similar—to let the user explore the set of search results and either rephrase the query or narrow the focus to a subset of current documents. Clustering is also similar to <<result-grouping.adoc#result-grouping,Result Grouping>> in that it can help to look deeper into search results, beyond the top few hits.
The query issued to the system was _Apache Solr_. It seems clear that faceting could not yield a similar set of groups, although the goals of both techniques are similar—to let the user explore the set of search results and either rephrase the query or narrow the focus to a subset of current documents. Clustering is also similar to <<result-grouping.adoc#result-grouping,Result Grouping>> in that it can help to look deeper into search results, beyond the top few hits.

== Clustering Concepts
== Configuration Quick Start

Each *document* passed to the clustering component is composed of several logical parts:
The clustering extension works as a search component. It needs to be declared and configured in `solrconfig.xml`, for example:

* a unique identifier,
* origin URL,
* the title,
* the main content,
* a language code of the title and content.
[source,xml]
----
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering">
  <lst name="engine">
    <str name="name">lingo</str>
    <str name="clustering.fields">title, content</str>
    <str name="clustering.algorithm">Lingo</str>
  </lst>
</searchComponent>
----

The identifier part is mandatory, everything else is optional but at least one of the text fields (title or content) will be required to make the clustering process reasonable. It is important to remember that logical document parts must be mapped to a particular schema and its fields. The content (text) for clustering can be sourced from either a stored text field or context-filtered using a highlighter, all these options are explained below in the <<Clustering Configuration,configuration>> section.
The above declares the clustering component with a single *engine* -- there may be
multiple engines declared and switched at runtime. We will return to the details of
how to configure engines later.

A *clustering algorithm* is the actual logic (implementation) that discovers relationships among the documents in the search result and forms human-readable cluster labels. Depending on the choice of the algorithm the clusters may (and probably will) vary. Solr comes with several algorithms implemented in the open source http://carrot2.org[Carrot2] project, commercial alternatives also exist.
The clustering component must be attached to a `SearchHandler` and explicitly enabled
via the `clustering` property. It is important to attach it as the *last* component in the handler's pipeline, as shown below:

== Clustering Quick Start Example
[source,xml]
----
<requestHandler name="/select" class="solr.SearchHandler">
  <lst name="defaults">
    <bool name="clustering">true</bool>
    <str name="clustering.engine">lingo</str>
  </lst>

The "```techproducts```" example included with Solr is pre-configured with all the necessary components for result clustering -- but they are disabled by default.
  <arr name="last-components">
    <str>clustering</str>
  </arr>
</requestHandler>
----

To enable the clustering component contrib and a dedicated search handler configured to use it, specify a JVM System Property when running the example:
Once attached, as in the example above, clustering will be performed automatically
on all documents matching the search handler's query. The clustering extension will take into
account all text fields listed in the `clustering.fields` parameter of the engine and will
produce a section of the response called `clusters` containing the discovered structure of
groups, for example (JSON response shown for brevity):

[source,json]
----
{
  "clusters": [
    {
      "labels": ["Memory"],
      "score": 6.80,
      "docs":[ "0579B002",
        "EN7800GTX/2DHTV/256M",
        "TWINX2048-3200PRO",
        "VDBDB1A16",
        "VS1GB400C3"]},
    {
      "labels":["Coins and Notes"],
      "score":28.560285143284457,
      "docs":["EUR",
        "GBP",
        "NOK",
        "USD"]},
    {
      "labels":["TFT LCD"],
      "score":15.355729924203429,
      "docs":["3007WFP",
        "9885A004",
        "MA147LL/A",
        "VA902B"]}
  ]
}
----

The `labels` element of each cluster is a dynamically discovered phrase that describes and applies to all document identifiers under the `docs` element.
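
A response like the one above can be obtained from the pre-configured `techproducts` example (described in the next section) by querying the clustering-enabled handler directly, for example:

[source,text]
----
http://localhost:8983/solr/techproducts/clustering?q=*:*&rows=100&wt=json
----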

== Solr Distribution Example

The `techproducts` example included with Solr is pre-configured with all the necessary components for result clustering -- but they are disabled by default.

To enable the clustering component extension and the dedicated search handler configured to use it, specify a JVM System Property when running the example:

[source,bash]
----

@@ -129,66 +196,73 @@ The output XML should include search hits and an array of automatically discover
</response>
----

There were a few clusters discovered for this query (`\*:*`), separating search hits into various categories: DDR, iPod, Hard Drive, etc. Each cluster has a label and score that indicates the "goodness" of the cluster. The score is algorithm-specific and is meaningful only in relation to the scores of other clusters in the same set. In other words, if cluster _A_ has a higher score than cluster _B_, cluster _A_ should be of better quality (have a better label and/or more coherent document set). Each cluster has an array of identifiers of documents belonging to it. These identifiers correspond to the `uniqueKey` field declared in the schema.
A few clusters discovered for this query (`\*:*`) separate the search hits into various categories: DDR, iPod, Hard Drive, etc. Each cluster has a label and score that indicates the "goodness" of the cluster. The score is algorithm-specific and is meaningful only in relation to the scores of other clusters in the same set. In other words, if cluster _A_ has a higher score than cluster _B_, cluster _A_ should be of better quality (have a better label and/or more coherent document set). Each cluster has an array of identifiers of documents belonging to it. These identifiers correspond to the `uniqueKey` field declared in the schema.

Depending on the quality of input documents, some clusters may not make much sense. Some documents may be left out and not be clustered at all; these will be assigned to the synthetic _Other Topics_ group, marked with the `other-topics` property set to `true` (see the XML dump above for an example). The score of the other topics group is zero.
Sometimes cluster labels may not make much sense (this depends on many factors -- the text in the clustered fields, the number of documents, algorithm parameters). Also, some documents may be left out and not clustered at all; these will be assigned to the synthetic _Other Topics_ group, marked with the `other-topics` property set to `true` (see the XML dump above for an example). The score of the other topics group is zero.

== Installing the Clustering Contrib
== Installation

The clustering contrib extension requires `dist/solr-clustering-*.jar` and all JARs under `contrib/clustering/lib`.

== Clustering Configuration
You can include the required contrib JARs in `solrconfig.xml` as shown below (by default, paths are relative to the Solr core, so they may need adjusting to your configuration, or an explicit specification of `$solr.install.dir`):

=== Declaration of the Clustering Search Component and Request Handler

Clustering extension is a search component and must be declared in `solrconfig.xml`. Such a component can be then appended to a request handler as the last component in the chain (because it requires search results which must be previously fetched by the search component).

An example configuration could look as shown below.

. Include the required contrib JARs. Note that by default paths are relative to the Solr core so they may need adjustments to your configuration, or an explicit specification of the `$solr.install.dir`.
+
[source,xml]
----
<lib dir="${solr.install.dir:../../..}/contrib/clustering/lib/" regex=".*\.jar" />
<lib dir="${solr.install.dir:../../..}/dist/" regex="solr-clustering-\d.*\.jar" />
----
. Declaration of the search component. Each component can also declare multiple clustering pipelines ("engines"), which can be selected at runtime by passing `clustering.engine=(engine name)` URL parameter.
+

== Configuration

=== Component Configuration

The following properties control the `ClusteringComponent` state.

`clustering`::
The component is disabled by default, even if properly declared and attached to a search handler. The `clustering` property must be set to `true` to enable it (this can be done by setting
up default parameters in the search handler -- see below).

`clustering.engine`::
Declares which engine to use. If not present, the first declared active engine is used.
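
For example, assuming the `techproducts` example with the component attached to a `/select` handler and an engine declared under the name `lingo`, both properties could also be supplied with the request itself (a sketch, not a transcript of a real run):

[source,text]
----
http://localhost:8983/solr/techproducts/select?q=memory&rows=100&clustering=true&clustering.engine=lingo
----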

=== Clustering Engines

The declaration of the clustering component in `solrconfig.xml` must include one or more predefined configurations called _engines_. For example, consider the configuration below:

[source,xml]
----
<searchComponent name="clustering" class="solr.clustering.ClusteringComponent">
  <!-- Lingo clustering algorithm -->
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="clustering">
  <lst name="engine">
    <str name="name">lingo</str>
    <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
    <str name="name">lingo</str>
    <str name="clustering.algorithm">Lingo</str>
    <str name="clustering.fields">title, content</str>
  </lst>

  <!-- An example definition for the STC clustering algorithm. -->
  <lst name="engine">
    <str name="name">stc</str>
    <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
    <str name="name">stc</str>
    <str name="clustering.algorithm">STC</str>
    <str name="clustering.fields">title</str>
  </lst>
</searchComponent>
----
. A request handler to which we append the clustering component declared above.
+

This declares two separate engines (`lingo` and `stc`): the two configurations use a
different clustering algorithm and a different set of clustered document fields. The
active engine can be selected by passing the `clustering.engine=_name_` parameter
at runtime (via the URL) or as the default within the search handler's configuration,
as shown below:

[source,xml]
----
<requestHandler name="/clustering"
                class="solr.SearchHandler">
<requestHandler name="/clustering" class="solr.SearchHandler">
  <lst name="defaults">
    <!-- Clustering component enabled. -->
    <bool name="clustering">true</bool>
    <bool name="clustering.results">true</bool>
    <str name="clustering.engine">stc</str>

    <!-- Logical field to physical field mapping. -->
    <str name="carrot.url">id</str>
    <str name="carrot.title">doctitle</str>
    <str name="carrot.snippet">content</str>

    <!-- Configure any other request handler parameters. We will cluster the
         top 100 search results so bump up the 'rows' parameter. -->
    <!-- Cluster the top 100 search results - bump up the 'rows' parameter. -->
    <str name="rows">100</str>
    <str name="fl">*,score</str>
  </lst>

  <!-- Append clustering at the end of the list of search components. -->

@@ -198,149 +272,205 @@ An example configuration could look as shown below.
</requestHandler>
----

=== Configuration Parameters of the Clustering Component
=== Clustering Engine Configuration Parameters

The following parameters of each clustering engine or the entire clustering component (depending where they are declared) are available.
Each declared engine can be configured using a number of parameters, described below.

`clustering`::
When `true`, clustering component is enabled.
`clustering.fields` (_required_)::
A comma- (or space-) separated list of text fields which should contain the text
content for clustering. At least one field must be provided. The fields are separate from the search handler's `fl` parameter, so clustered fields don't have to be included in the response.

`clustering.engine`::
Declares which clustering engine to use. If not present, the first declared engine will become the default one.
`clustering.algorithm` (_required_)::
The clustering algorithm is the actual logic (implementation) that discovers relationships among the documents and forms human-readable cluster labels. This parameter sets the name of the clustering algorithm this engine is going to use. Algorithms are supplied to Solr via a Carrot^2^-defined service extension point. By default, the following open-source algorithms should be available: `Lingo`, `STC`, `Bisecting K-Means`. The commercial clustering algorithm `Lingo3G` plugs into the same extension point and can be used if it is available on the classpath.

`clustering.results`::
When `true`, the component will perform clustering of search results (this should be enabled).

`clustering.collection`::
When `true`, the component will perform clustering of the whole document index (this section does not cover full-index clustering).

At the engine declaration level, the following parameters are supported.

`carrot.algorithm`::
The algorithm class.

`carrot.resourcesDir`::
Algorithm-specific resources and configuration files (stop words, other lexical resources, default settings). By default points to `conf/clustering/carrot2/`

`carrot.outputSubClusters`::
If `true` and the algorithm supports hierarchical clustering, sub-clusters will also be emitted. Default value: true.

`carrot.numDescriptions`::
Maximum number of per-cluster labels to return (if the algorithm assigns more than one label to a cluster).

The `carrot.algorithm` parameter should contain a fully qualified class name of an algorithm supported by the http://project.carrot2.org[Carrot2] framework. Currently, the following algorithms are available:

* `org.carrot2.clustering.lingo.LingoClusteringAlgorithm` (open source)
* `org.carrot2.clustering.stc.STCClusteringAlgorithm` (open source)
* `org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm` (open source)
* `com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm` (commercial)
.How to choose the Clustering Algorithm?
****
Which algorithm to choose depends on the amount of traffic, the expected result, and the input data (each algorithm will cluster the input slightly differently). There is no single answer to which algorithm is "the best": Lingo3G provides hierarchical clusters, while Lingo and STC provide flat clusters; STC is faster than Lingo but arguably produces less intuitive clusters; Lingo3G is the fastest algorithm but is not free or open source. Experiment and pick one that suits your needs.

For a comparison of the characteristics of these algorithms, see the following links:

* http://doc.carrot2.org/#section.advanced-topics.fine-tuning.choosing-algorithm
* http://project.carrot2.org/algorithms.html
* http://carrotsearch.com/lingo3g-comparison.html
* https://carrot2.github.io/release/4.0.4/doc/algorithms/
* https://carrotsearch.com/lingo3g-comparison.html

The question of which algorithm to choose depends on the amount of traffic (STC is faster than Lingo, but arguably produces less intuitive clusters, Lingo3G is the fastest algorithm but is not free or open source), expected result (Lingo3G provides hierarchical clusters, Lingo and STC provide flat clusters), and the input data (each algorithm will cluster the input slightly differently). There is no one answer which algorithm is "the best".
The clustering component lists all available algorithms, languages and algorithm-language compatibility at startup. You can peek at the startup logs to see what's available
in your Solr installation.
****

=== Contextual and Full Field Clustering
`clustering.maxLabels`::
The maximum number of returned cluster labels (if the algorithm returns more labels, the list will
be truncated). By default all labels are returned.

The clustering engine can apply clustering to the full content of (stored) fields or it can run an internal highlighter pass to extract context-snippets before clustering. Highlighting is recommended when the logical snippet field contains a lot of content (this would affect clustering performance). Highlighting can also increase the quality of clustering because the content passed to the algorithm will be more focused around the query (it will be query-specific context). The following parameters control the internal highlighter.
`clustering.includeSubclusters`::
If `true`, sub-clusters are included in the response for algorithms that support hierarchical
clustering. `false` causes only top-level clusters to be returned.

`carrot.produceSummary`::
When `true` the clustering component will run a highlighter pass on the content of logical fields pointed to by `carrot.title` and `carrot.snippet`. Otherwise full content of those fields will be clustered.
`clustering.includeOtherTopics`::
If `true`, a synthetic cluster called _Other Topics_, consisting of all documents not assigned to any other cluster, is formed and returned. The default value of this parameter is `true`, but if
there is no need for this synthetic cluster, it can be set to `false`.

`carrot.fragSize`::
The size, in characters, of the snippets (aka fragments) created by the highlighter. If not specified, the default highlighting fragsize (`hl.fragsize`) will be used.
`clustering.resources`::
The location of algorithm-specific resources and configuration files (stop words, other lexical resources, default settings). This property is `null` by default and all resources are read
from their respective algorithm's default resource pool (JARs). If this property is not empty,
it resolves relative to the Solr core's configuration directory. This parameter can be applied at Solr startup _only_; it cannot be overridden per-request.
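
A minimal engine sketch combining these optional parameters might look as follows (the field names and value types are inferred from the descriptions above, not copied from a shipped configuration):

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">title, content</str>
  <!-- Return at most 3 labels per cluster, skip sub-clusters and
       the synthetic 'Other Topics' group (illustrative values). -->
  <int name="clustering.maxLabels">3</int>
  <bool name="clustering.includeSubclusters">false</bool>
  <bool name="clustering.includeOtherTopics">false</bool>
</lst>
----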

`carrot.summarySnippets`:: The number of summary snippets to generate for clustering. If not specified, the default highlighting snippet count (`hl.snippets`) will be used.
[.text-center]
🙪

=== Logical to Document Field Mapping
There are more properties that apply to engine configuration; we describe them in the functional sections that follow.

As already mentioned in <<Clustering Concepts>>, the clustering component clusters "documents" consisting of logical parts that need to be mapped onto physical schema of data stored in Solr. The field mapping attributes provide a connection between fields and logical document parts. Note that the content of title and snippet fields must be *stored* so that it can be retrieved at search time.
=== Full Field and Query-Context (Snippet) Clustering

`carrot.title`::
The field (alternatively comma- or space-separated list of fields) that should be mapped to the logical document's title. The clustering algorithms typically give more weight to the content of the title field compared to the content (snippet). For best results, the field should contain concise, noise-free content. If there is no clear title in your data, you can leave this parameter blank.
The clustering algorithm can consume the full content of fields or just the left and right context around query-matching regions (so-called _snippets_). Contrary to intuition, using query contexts can increase the quality of clustering even though it feeds less data to the algorithm. This is typically because snippets are more focused on the phrases and terms surrounding the query, so the algorithm works with data that has a better signal-to-noise ratio.

`carrot.snippet`::
The field (alternatively comma- or space-separated list of fields) that should be mapped to the logical document's main content. If this mapping points to very large content fields the performance of clustering may drop significantly. An alternative then is to use query-context snippets for clustering instead of full field content. See the description of the `carrot.produceSummary` parameter for details.
We recommend using query contexts when fields contain a lot of content (which would otherwise affect clustering performance).

`carrot.url`::
The field that should be mapped to the logical document's content URL. Leave blank if not required.
The following three properties control whether the context or the full content is processed, and how snippets are formed for clustering.

=== Clustering Multilingual Content
`clustering.preferQueryContext`::
If `true`, the engine will try to extract context around the query-matching regions and use these contexts as input for the clustering algorithm.

The field mapping specification can include a `carrot.lang` parameter, which defines the field that stores http://www.loc.gov/standards/iso639-2/php/code_list.php[ISO 639-1] code of the language in which the title and content of the document are written. This information can be stored in the index based on apriori knowledge of the documents' source or a language detection filter applied at indexing time. All algorithms inside the Carrot2 framework will accept ISO codes of languages defined in https://github.com/carrot2/carrot2/blob/master/core/carrot2-core/src/org/carrot2/core/LanguageCode.java[LanguageCode enum].
`clustering.contextSize`::
The maximum size, in characters, of each snippet created by the context retrieval algorithm (internal highlighter).

The language hint makes it easier for clustering algorithms to separate documents from different languages on input and to pick the right language resources for clustering. If you do have multi-lingual query results (or query results in a language different than English), it is strongly advised to map the language field appropriately.
`clustering.contextCount`::
The maximum number of different, non-contiguous snippets taken from a single field.
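
As an illustration, here is a minimal engine sketch that clusters query contexts instead of full field content (the field names and snippet limits are example values, inferred from the parameter descriptions above):

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">title, content</str>
  <!-- Cluster query-matching contexts rather than whole fields. -->
  <bool name="clustering.preferQueryContext">true</bool>
  <!-- Up to two snippets of at most 200 characters per field. -->
  <int name="clustering.contextSize">200</int>
  <int name="clustering.contextCount">2</int>
</lst>
----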

`carrot.lang`::
The field that stores ISO 639-1 code of the language of the document's text fields.
=== Default Clustering Language

`carrot.lcmap`::
A mapping of arbitrary strings into ISO 639 two-letter codes used by `carrot.lang`. The syntax of this parameter is the same as `langid.map.lcmap`, for example: `langid.map.lcmap=japanese:ja polish:pl english:en`
The default implementations of clustering algorithms in Carrot^2^ (shipped with Solr)
have built-in support (stemming, stop words) for preprocessing a number of languages. It is important to provide the clustering algorithm with a hint of what language should be used for clustering. This can be done in two ways -- by passing the name of the default language or by providing the language as a field with each document. The following two engine configuration parameters control this:

The default language can also be set using Carrot2-specific algorithm attributes (in this case the http://doc.carrot2.org/#section.attribute.lingo.MultilingualClustering.defaultLanguage[MultilingualClustering.defaultLanguage] attribute).
`clustering.language`::
Name of the default language to use for clustering. The default value of this field is `English`. The provided language must be available and the clustering algorithm must support it.

`clustering.languageField`::
Name of the document field that stores the document's language. If the field does not exist
for a document or the value is blank, the default language is used.
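
For example, assuming documents carry their language in a string field named `lang` (an illustrative field name), an engine could be configured as:

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">title, content</str>
  <!-- Fall back to English when the 'lang' field is missing or blank. -->
  <str name="clustering.language">English</str>
  <str name="clustering.languageField">lang</str>
</lst>
----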

The list of supported languages can change dynamically (languages are loaded via an external service provider extension) and may depend on the selected algorithm (algorithms can support a subset of languages for which resources are available). The clustering component will log all supported algorithm-language pairs at Solr startup, so you can inspect what's supported on your particular Solr instance. For example:

[source,text]
----
2020-10-29 [...] Clustering algorithm Lingo3G loaded with support for the following languages: Dutch, English
2020-10-29 [...] Clustering algorithm Lingo loaded with support for the following languages: Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish, Swedish, Turkish
2020-10-29 [...] Clustering algorithm Bisecting K-Means loaded with support for the following languages: Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish, Swedish, Turkish
----

=== Handling Multilingual Content

It is often the case that the index (and query result) contains documents in _multiple_ languages. Clustering such search results is problematic. Ideally, the engine should translate (or understand) the content of documents and then group relevant information together, regardless of the language it is written in.

In reality, clustering algorithms are typically much simpler -- they infer similarity between documents from statistical properties of terms and phrases that occur in those documents. So texts written in different languages will not cluster well together.

To deal with this situation, the default clustering component implementation in Solr will first try to group all documents by their language and then apply clustering to each sub-group in that language. It is recommended to store the language of each document in a separate field and point to it using the `clustering.languageField` configuration property described above.

== Tweaking Algorithm Settings

The algorithms that come with Solr are using their default settings which may be inadequate for all data sets. All algorithms have lexical resources and resources (stop words, stemmers, parameters) that may require tweaking to get better clusters (and cluster labels). For Carrot2-based algorithms it is probably best to refer to a dedicated tuning application called Carrot2 Workbench (screenshot below). From this application one can export a set of algorithm attributes as an XML file, which can be then placed under the location pointed to by `carrot.resourcesDir`.
The clustering algorithms that come with Solr use their default parameter values and language resources. We highly recommend tuning both for production use. Improving the default language resources to include words and phrases common to a particular document domain will improve clustering quality significantly.

image::images/result-clustering/carrot2-workbench.png[image,scaledwidth=75.0%]
Carrot^2^ algorithms have an extensive set of parameters and language resource tuning options. Please refer to the https://carrot2.github.io/release/latest/[up-to-date project documentation], in particular the language resources section and each algorithm's attributes section.

=== Providing Defaults for Clustering

The default attributes for all engines (algorithms) declared in the clustering component are placed under `carrot.resourcesDir` and with an expected file name of `engineName-attributes.xml`. So for an engine named `lingo` and the default value of `carrot.resourcesDir`, the attributes would be read from a file in `conf/clustering/carrot2/lingo-attributes.xml`.
=== Changing Clustering Algorithm Parameters

An example XML file changing the default language of documents to Polish is shown below.
Clustering algorithm settings can be changed via Solr parameters, either
permanently (in the engine's declaration) or per-request (via Solr URL parameters).

For example, let's assume the following engine configuration:

[source,xml]
----
<attribute-sets default="attributes">
  <attribute-set id="attributes">
    <value-set>
      <label>attributes</label>
      <attribute key="MultilingualClustering.defaultLanguage">
        <value type="org.carrot2.core.LanguageCode" value="POLISH"/>
      </attribute>
    </value-set>
  </attribute-set>
</attribute-sets>
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">name, features</str>
  <str name="clustering.language">English</str>
</lst>
----

=== Tweaking Algorithms at Query-Time
First, locate the configuration parameters for the Lingo algorithm
at the https://carrot2.github.io/release/latest/[Carrot^2^ documentation site]:

The clustering component and Carrot2 clustering algorithms can accept query-time attribute overrides. Note that certain things (for example lexical resources) can only be initialized once (at startup, via the XML configuration files).
image::images/result-clustering/carrot2-docs-attrs1.png[image,scaledwidth=75%]

An example query that changes the `LingoClusteringAlgorithm.desiredClusterCountBase` parameter for the Lingo algorithm:
Then locate the particular setting you'd like to change and note the
REST API path to that setting (in this case the parameter is
`minClusterSize` and its path is `preprocessing.documentAssigner.minClusterSize`):

image::images/result-clustering/carrot2-docs-attrs2.png[image,scaledwidth=75%]

Now add the full path-value pair to the engine's configuration:

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">name, features</str>
  <str name="clustering.language">English</str>

  <int name="preprocessing.documentAssigner.minClusterSize">3</int>
</lst>
----

The following rules apply.

* The type of the parameter must be consistent with the type listed in the Carrot^2^ specification.

* If the parameter is added to the engine's configuration in `solrconfig.xml`, the core must be reloaded for the changes to be picked up. Alternatively, pass the parameter via the request URL to change things dynamically on a per-request basis. For example, if you have the `techproducts` example running, this will cut the clusters to only those containing at least three documents:
`http://localhost:8983/solr/techproducts/clustering?q=\*:*&rows=100&wt=json&preprocessing.documentAssigner.minClusterSize=3`

* For complex types, the parameter key with the name of the instantiated type must precede any of its own parameters.
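
As a sketch of that last rule, a complex sub-component is first named by its key and its own parameters then follow, prefixed with that key. Note that the key and type names below are hypothetical placeholders, not attributes taken from the Carrot^2^ specification:

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">name, features</str>

  <!-- Hypothetical example: first name the type to instantiate
       for the 'labelFormatter' sub-component... -->
  <str name="labelFormatter">CustomLabelFormatter</str>
  <!-- ...then set that instance's own parameter. -->
  <int name="labelFormatter.maxLabelLength">40</int>
</lst>
----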

=== Custom Language Resources

Clustering algorithms rely on language- and domain-specific resources to
improve the quality of clusters (by discarding domain-specific noise and boilerplate language).

By default, language resources are read from the declared algorithm's default JAR. You can pass a custom location for these resources by specifying the `clustering.resources` parameter. The value of this parameter resolves to a location relative to the Solr core's configuration directory. For example, the following definition:

[source,xml]
----
<lst name="engine">
  <str name="name">lingo</str>
  <str name="clustering.algorithm">Lingo</str>
  <str name="clustering.fields">name, features</str>
  <str name="clustering.language">English</str>

  <str name="clustering.resources">lingo-resources</str>
</lst>
----

would result in the following log entry and expected resource location:

[source,text]
http://localhost:8983/solr/techproducts/clustering?q=*:*&rows=100&LingoClusteringAlgorithm.desiredClusterCountBase=20
----
Clustering algorithm resources first looked up relative to: [.../example/techproducts/solr/techproducts/conf/lingo-resources]
----

The clustering engine (the algorithm declared in `solrconfig.xml`) can also be changed at runtime by passing `clustering.engine=name` request attribute:
The best way to start tuning algorithm resources is to copy all the defaults from the algorithm's
corresponding Carrot^2^ JAR file (or the Carrot^2^ distribution).

[source,text]
http://localhost:8983/solr/techproducts/clustering?q=*:*&rows=100&clustering.engine=kmeans
== Performance Considerations

== Performance Considerations with Dynamic Clustering
Clustering of search results comes with some performance considerations:

Dynamic clustering of search results comes with two major performance penalties:

* Increased cost of fetching a larger-than-usual number of search results (50, 100 or more documents),
* The cost of fetching a larger-than-usual number of search results (50, 100 or more documents),
* Additional computational cost of the clustering itself.
* In distributed mode, the content of document fields needed for clustering is collected from the shards, which adds some network overhead.

For simple queries, the clustering time will usually dominate the fetch time. If the document content is very long the retrieval of stored content can become a bottleneck. The performance impact of clustering can be lowered in several ways:
For simple queries, the clustering time will usually dominate everything else. If document fields are very long, the retrieval of stored content can become a bottleneck.

* feed less content to the clustering algorithm by enabling `carrot.produceSummary` attribute,
* perform clustering on selected fields (titles only) to make the input smaller,
* use a faster algorithm (STC instead of Lingo, Lingo3G instead of STC),
* tune the performance attributes related directly to a specific algorithm.
The performance impact of clustering can be lowered in several ways.

* Cluster less data: use query contexts (snippets) instead of full field content (`clustering.preferQueryContext=true`).
* Perform clustering on just a subset of document fields, or curate fields for clustering (add abstracts at indexing time) to make the input smaller.
* Tune the performance attributes related directly to a specific algorithm.
* Try a different, faster algorithm (STC instead of Lingo, Lingo3G instead of STC).

Some of these techniques are described in _Apache SOLR and Carrot2 integration strategies_ document, available at http://carrot2.github.io/solr-integration-strategies. The topic of improving performance is also included in the Carrot2 manual at http://doc.carrot2.org/#section.advanced-topics.fine-tuning.performance.

== Additional Resources

The following resources provide additional information about the clustering component in Solr and its potential applications.

* Apache Solr and Carrot2 integration strategies: http://carrot2.github.io/solr-integration-strategies
* Clustering and Visualization of Solr search results (Berlin BuzzWords conference, 2011): http://2011.berlinbuzzwords.de/sites/2011.berlinbuzzwords.de/files/solr-clustering-visualization.pdf
* Clustering and Visualization of Solr search results (Berlin BuzzWords conference, *2011*): http://2011.berlinbuzzwords.de/sites/2011.berlinbuzzwords.de/files/solr-clustering-visualization.pdf
@@ -26,7 +26,6 @@ import java.util.Objects;
 * It is a direct mapping for the Json object Solr is returning.
 */
public class Cluster {

  private List<String> labels;
  private double score;
  private List<String> docIds;

@@ -43,10 +42,10 @@ public class Cluster {
   * @param docIds the list of document Ids belonging to the cluster
   */
  public Cluster(List<String> labels, double score, List<String> docIds, List<Cluster> subclusters, boolean otherTopics) {
    this.labels = labels;
    this.labels = Objects.requireNonNullElse(labels, Collections.emptyList());
    this.score = score;
    this.docIds = docIds;
    this.subclusters = subclusters;
    this.docIds = Objects.requireNonNullElse(docIds, Collections.emptyList());
    this.subclusters = Objects.requireNonNullElse(subclusters, Collections.emptyList());
    this.otherTopics = otherTopics;
  }

@@ -93,7 +92,7 @@ public class Cluster {
    this.docIds = docIds;
  }

  public List<Cluster> getSubclusters() {
  public List<Cluster> getClusters() {
    return subclusters;
  }
@@ -26,11 +26,12 @@ import org.apache.solr.common.util.NamedList;
 * Encapsulates responses from ClusteringComponent
 */
public class ClusteringResponse {
  private static final String CLUSTERS_NODE = "clusters";
  private static final String LABELS_NODE = "labels";
  private static final String DOCS_NODE = "docs";
  private static final String SCORE_NODE = "score";
  private static final String IS_OTHER_TOPICS = "other-topics";
  public static final String CLUSTERS_NODE = "clusters";
  public static final String LABELS_NODE = "labels";
  public static final String DOCS_NODE = "docs";
  public static final String SCORE_NODE = "score";
  public static final String IS_OTHER_TOPICS = "other-topics";

  private List<Cluster> clusters;

  @SuppressWarnings("unchecked")

@@ -53,7 +54,7 @@ public class ClusteringResponse {
        break;

      case SCORE_NODE:
        score = (Double) e.getValue();
        score = ((Number) e.getValue()).doubleValue();
        break;

      case CLUSTERS_NODE:
@@ -57,7 +57,7 @@ public class TestClusteringResponse extends SolrJettyTestBase {
    checkCluster(clusters.get(2), Arrays.asList("label3"), Arrays.asList("id7", "id8"), 1.26d, false);
    checkCluster(clusters.get(3), Arrays.asList("label4"), Arrays.asList("id9"), 0d, true);

    List<Cluster> sub = clusters.get(0).getSubclusters();
    List<Cluster> sub = clusters.get(0).getClusters();
    checkCluster(sub.get(0), Arrays.asList("label1.sub1"), Arrays.asList("id1", "id2"), 0.0d, false);
    checkCluster(sub.get(1), Arrays.asList("label1.sub2"), Arrays.asList("id2"), 0.0d, false);
    assertEquals(sub.size(), 2);
@@ -1,14 +1,13 @@
# Run ./gradlew --write-locks to regenerate this file
com.adobe.xmp:xmpcore:5.1.3 (1 constraints: 0b050a36)
com.carrotsearch:hppc:0.8.2 (2 constraints: b00ffaa6)
com.carrotsearch:hppc:0.8.2 (2 constraints: ad0fc5a6)
com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.7.6 (1 constraints: 11051036)
com.carrotsearch.thirdparty:simple-xml-safe:2.7.1 (1 constraints: a60a82ca)
com.cybozu.labs:langdetect:1.1-20120112 (1 constraints: 5c066d5e)
com.drewnoakes:metadata-extractor:2.11.0 (1 constraints: 3605323b)
com.epam:parso:2.0.11 (1 constraints: 36052c3b)
com.fasterxml.jackson.core:jackson-annotations:2.10.1 (2 constraints: 331dcd4e)
com.fasterxml.jackson.core:jackson-core:2.10.1 (3 constraints: 633586b7)
com.fasterxml.jackson.core:jackson-databind:2.10.1 (3 constraints: 941aba96)
com.fasterxml.jackson.core:jackson-annotations:2.10.1 (1 constraints: 84122d21)
com.fasterxml.jackson.core:jackson-core:2.10.1 (2 constraints: b42a896b)
com.fasterxml.jackson.core:jackson-databind:2.10.1 (2 constraints: 840f2597)
com.fasterxml.jackson.dataformat:jackson-dataformat-smile:2.10.1 (1 constraints: 3605303b)
com.github.ben-manes.caffeine:caffeine:2.8.4 (1 constraints: 10051136)
com.github.virtuald:curvesapi:1.06 (1 constraints: db04f530)

@@ -78,7 +77,7 @@ org.apache.commons:commons-compress:1.19 (1 constraints: df04fa30)
org.apache.commons:commons-configuration2:2.1.1 (1 constraints: 0605f935)
org.apache.commons:commons-csv:1.7 (1 constraints: ac04212c)
org.apache.commons:commons-exec:1.3 (1 constraints: a8041d2c)
org.apache.commons:commons-lang3:3.9 (4 constraints: 702e84c7)
org.apache.commons:commons-lang3:3.9 (3 constraints: 2b24bbb0)
org.apache.commons:commons-math3:3.6.1 (1 constraints: 0c050d36)
org.apache.commons:commons-text:1.6 (1 constraints: ab04202c)
org.apache.curator:curator-client:2.13.0 (1 constraints: 3805383b)

@@ -127,12 +126,10 @@ org.bouncycastle:bcmail-jdk15on:1.64 (1 constraints: df04ff30)
org.bouncycastle:bcpkix-jdk15on:1.64 (1 constraints: df04ff30)
org.bouncycastle:bcprov-jdk15on:1.64 (1 constraints: df04ff30)
org.brotli:dec:0.1.2 (1 constraints: 0505f035)
org.carrot2:carrot2-mini:3.16.2 (1 constraints: 3e05493b)
org.carrot2:carrot2-core:4.0.4 (1 constraints: 0a050336)
org.carrot2:morfologik-fsa:2.1.5 (1 constraints: d70d9836)
org.carrot2:morfologik-polish:2.1.5 (1 constraints: 0a05fd35)
org.carrot2:morfologik-stemming:2.1.5 (2 constraints: 0b12640c)
org.carrot2.attributes:attributes-binder:1.3.3 (1 constraints: a30a73ca)
org.carrot2.shaded:carrot2-guava:18.0 (2 constraints: b31b3b7b)
org.ccil.cowan.tagsoup:tagsoup:1.2.1 (1 constraints: 0605f735)
org.checkerframework:checker-qual:2.0.0 (1 constraints: 140ae5b4)
org.codehaus.janino:commons-compiler:3.0.9 (2 constraints: d910f7d1)

@@ -172,7 +169,7 @@ org.ow2.asm:asm:7.2 (2 constraints: 900e3e5e)
org.ow2.asm:asm-commons:7.2 (1 constraints: ad042e2c)
org.rrd4j:rrd4j:3.5 (1 constraints: ac04252c)
org.slf4j:jcl-over-slf4j:1.7.24 (1 constraints: 4005473b)
org.slf4j:slf4j-api:1.7.24 (15 constraints: a3ba2a7b)
org.slf4j:slf4j-api:1.7.24 (14 constraints: ccafc13c)
org.tallison:jmatio:1.5 (1 constraints: aa041f2c)
org.tukaani:xz:1.8 (1 constraints: ad04222c)
org.xerial.snappy:snappy-java:1.1.7.6 (1 constraints: 6f05a240)
@@ -76,7 +76,7 @@ org.aspectj:aspectjrt=1.8.0
org.bitbucket.b_c:jose4j=0.6.5
org.bouncycastle:*=1.64
org.brotli:dec=0.1.2
org.carrot2:carrot2-mini=3.16.2
org.carrot2:carrot2-core=4.0.4
org.carrot2:morfologik-*=2.1.5
org.ccil.cowan.tagsoup:tagsoup=1.2.1
org.codehaus.janino:*=3.0.9