SOLR-5202: Support easier overrides of Carrot2 clustering attributes via XML data sets exported from the Workbench. Polished clustering configuration examples.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1520677 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2013-09-06 20:02:57 +00:00
parent e4bcd35ac0
commit 43467403b2
13 changed files with 223 additions and 159 deletions

View File

@ -59,11 +59,13 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
private Map<String, SearchClusteringEngine> searchClusteringEngines = new HashMap<String, SearchClusteringEngine>();
private Map<String, DocumentClusteringEngine> documentClusteringEngines = new HashMap<String, DocumentClusteringEngine>();
/**
* Base name for all spell checker query parameters. This name is also used to
* Base name for all component parameters. This name is also used to
* register this component with SearchHandler.
*/
public static final String COMPONENT_NAME = "clustering";
private NamedList initParams;
@Override

View File

@ -30,7 +30,6 @@ public class ClusteringEngine {
public String init(NamedList config, SolrCore core) {
name = (String) config.get(ENGINE_NAME);
return name;
}

View File

@ -46,6 +46,7 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
@ -66,6 +67,8 @@ import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
import org.carrot2.util.attribute.AttributeValueSet;
import org.carrot2.util.attribute.AttributeValueSets;
import org.carrot2.util.resource.ClassLoaderLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.IResourceLocator;
@ -73,11 +76,11 @@ import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Objects;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import com.google.common.io.Closer;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
@ -122,8 +125,19 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
public SolrResourceLocator(SolrCore core, SolrParams initParams) {
resourceLoader = core.getResourceLoader();
carrot2ResourcesDir = initParams.get(
CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
@SuppressWarnings("deprecation")
String lexicalResourcesDir = initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR);
String resourcesDir = initParams.get(CarrotParams.RESOURCES_DIR);
carrot2ResourcesDir = firstNonNull(resourcesDir, lexicalResourcesDir, CARROT_RESOURCES_PREFIX);
}
@SuppressWarnings("unchecked")
public static <T> T firstNonNull(T... args) {
for (T t : args) {
if (t != null) return t;
}
throw new NullPointerException("At least one element has to be non-null.");
}
@Override
@ -269,8 +283,52 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
String result = super.init(config, core);
final SolrParams initParams = SolrParams.toSolrParams(config);
// Initialize Carrot2 controller. Pass initialization attributes, if any.
// Initialization attributes for Carrot2 controller.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
// Customize Carrot2's resource lookup to first look for resources
// using Solr's resource loader. If that fails, try loading from the classpath.
ResourceLookup resourceLookup = new ResourceLookup(
// Solr-specific resource loading.
new SolrResourceLocator(core, initParams),
// Using the class loader directly because this time we want to omit the prefix
new ClassLoaderLocator(core.getResourceLoader().getClassLoader()));
DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
.resourceLookup(resourceLookup);
// Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
// of this component. This by-name convention lookup is used to simplify configuring algorithms.
String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
log.info("Initializing Clustering Engine '" + Objects.firstNonNull(componentName, "<no 'name' attribute>") + "'");
if (!Strings.isNullOrEmpty(componentName)) {
IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
if (attributeXmls.length > 0) {
if (attributeXmls.length > 1) {
log.warn("More than one attribute file found, first one will be used: "
+ Arrays.toString(attributeXmls));
}
Thread ct = Thread.currentThread();
ClassLoader prev = ct.getContextClassLoader();
try {
ct.setContextClassLoader(core.getResourceLoader().getClassLoader());
AttributeValueSets avs = AttributeValueSets.deserialize(attributeXmls[0].open());
AttributeValueSet defaultSet = avs.getDefaultAttributeValueSet();
initAttributes.putAll(defaultSet.getAttributeValues());
} catch (Exception e) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"Could not read attributes XML for clustering component: "
+ componentName, e);
} finally {
ct.setContextClassLoader(prev);
}
}
}
// Extract solrconfig attributes, they take precedence.
extractCarrotAttributes(initParams, initAttributes);
// Customize the stemmer and tokenizer factories. The implementations we provide here
@ -291,15 +349,6 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Pass the schema (via the core) to SolrStopwordsCarrot2LexicalDataFactory.
initAttributes.put("solrCore", core);
// Customize Carrot2's resource lookup to first look for resources
// using Solr's resource loader. If that fails, try loading from the classpath.
DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes).resourceLookup(
new ResourceLookup(
// Solr-specific resource loading.
new SolrResourceLocator(core, initParams),
// Using the class loader directly because this time we want to omit the prefix
new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
// Carrot2 uses current thread's context class loader to get
// certain classes (e.g. custom tokenizer/stemmer) at initialization time.
// To make sure classes from contrib JARs are available,
@ -322,7 +371,9 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Make sure the requested Carrot2 clustering algorithm class is available
String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
this.clusteringAlgorithmClass = core.getResourceLoader().findClass(carrotAlgorithmClassName, IClusteringAlgorithm.class);
this.clusteringAlgorithmClass = core.getResourceLoader().findClass(
carrotAlgorithmClassName, IClusteringAlgorithm.class);
return result;
}
@ -440,8 +491,9 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
docsHolder[0] = docIds.get(sdoc).intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document
if (highlights != null && highlights.size() == 1) {
// should only be one value given our setup
// should only be one document
@SuppressWarnings("unchecked")
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);

View File

@ -43,9 +43,21 @@ public final class CarrotParams {
public static String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
public static String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
public static String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
public static String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";
/**
* Use {@link #RESOURCES_DIR}.
*/
@Deprecated
public static String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
/**
* A replacement property pointing to Carrot<sup>2</sup> resources
* (a more generic version of the deprecated {@link #LEXICAL_RESOURCES_DIR}).
*/
public static String RESOURCES_DIR = CARROT_PREFIX + "resourcesDir";
static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM,
@ -62,6 +74,7 @@ public final class CarrotParams {
NUM_DESCRIPTIONS,
OUTPUT_SUB_CLUSTERS,
LEXICAL_RESOURCES_DIR,
RESOURCES_DIR,
LANGUAGE_CODE_MAP);
/** No instances. */

View File

@ -0,0 +1,10 @@
<attribute-sets default="overridden-attributes">
<attribute-set id="overridden-attributes">
<value-set>
<label>defaults</label>
<attribute key="MockClusteringAlgorithm.depth"><value value="1" /></attribute>
<attribute key="MockClusteringAlgorithm.labels"><value value="3" /></attribute>
<attribute key="MockClusteringAlgorithm.maxClusters"><value value="13" /></attribute>
</value-set>
</attribute-set>
</attribute-sets>

View File

@ -327,6 +327,12 @@
<str name="name">mock</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">mock-external-attrs</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
<!-- takes precedence over external XML -->
<int name="MockClusteringAlgorithm.labels">4</int>
</lst>
<lst name="engine">
<str name="name">echo</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoClusteringAlgorithm</str>
@ -338,6 +344,11 @@
<lst name="engine">
<str name="name">lexical-resource-check-custom-resource-dir</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
<str name="carrot.resourcesDir">clustering/custom</str>
</lst>
<lst name="engine">
<str name="name">lexical-resource-check-custom-resource-dir-deprecated</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
<str name="carrot.lexicalResourcesDir">clustering/custom</str>
</lst>
<lst name="engine">

View File

@ -122,7 +122,14 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testWithoutSubclusters() throws Exception {
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
1, 1, 0);
1, 1, 0);
}
@Test
public void testExternalXmlAttributesFile() throws Exception {
checkClusters(
checkEngine(getClusteringEngine("mock-external-attrs"), 13),
1, 4, 0);
}
@Test
@ -189,6 +196,12 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
}
@Test
public void testLexicalResourcesFromSolrConfigCustomDirDeprecated() throws Exception {
checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir-deprecated",
"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
}
private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
throws IOException {
ModifiableSolrParams params = new ModifiableSolrParams();

View File

@ -49,6 +49,12 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
@IntRange(min = 1, max = 5)
private int labels = 1;
@Input
@Processing
@Attribute
@IntRange(min = 0)
private int maxClusters = 0;
@Input
@Processing
@Attribute
@ -61,6 +67,10 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
return;
}
if (maxClusters > 0) {
documents = documents.subList(0, maxClusters);
}
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);

View File

@ -1369,113 +1369,7 @@
</arr>
</requestHandler>
<!-- Clustering Component
http://wiki.apache.org/solr/ClusteringComponent
You'll need to set the solr.clustering.enabled system property
when running solr to run with clustering enabled:
java -Dsolr.clustering.enabled=true -jar start.jar
-->
<searchComponent name="clustering"
enable="${solr.clustering.enabled:false}"
class="solr.clustering.ClusteringComponent" >
<!-- Declare an engine -->
<lst name="engine">
<!-- The name, only one can be named "default" -->
<str name="name">default</str>
<!-- Class name of Carrot2 clustering algorithm.
Currently available algorithms are:
* org.carrot2.clustering.lingo.LingoClusteringAlgorithm
* org.carrot2.clustering.stc.STCClusteringAlgorithm
* org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
See http://project.carrot2.org/algorithms.html for the
algorithm's characteristics.
-->
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
<!-- Overriding values for Carrot2 default algorithm attributes.
For a description of all available attributes, see:
http://download.carrot2.org/stable/manual/#chapter.components.
Use attribute key as name attribute of str elements
below. These can be further overridden for individual
requests by specifying attribute key as request parameter
name and attribute value as parameter value.
-->
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
<!-- Location of Carrot2 lexical resources.
A directory from which to load Carrot2-specific stop words
and stop labels. Absolute or relative to Solr config directory.
If a specific resource (e.g. stopwords.en) is present in the
specified dir, it will completely override the corresponding
default one that ships with Carrot2.
For an overview of Carrot2 lexical resources, see:
http://download.carrot2.org/head/manual/#chapter.lexical-resources
-->
<str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
<!-- The language to assume for the documents.
For a list of allowed values, see:
http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
-->
<str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
</lst>
<lst name="engine">
<str name="name">stc</str>
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
</lst>
</searchComponent>
<!-- A request handler for demonstrating the clustering component
This is purely as an example.
In reality you will likely want to add the component to your
already specified request handlers.
-->
<requestHandler name="/clustering"
startup="lazy"
enable="${solr.clustering.enabled:false}"
class="solr.SearchHandler">
<lst name="defaults">
<bool name="clustering">true</bool>
<str name="clustering.engine">default</str>
<bool name="clustering.results">true</bool>
<!-- The title field -->
<str name="carrot.title">name</str>
<str name="carrot.url">id</str>
<!-- The field to cluster on -->
<str name="carrot.snippet">features</str>
<!-- produce summaries -->
<bool name="carrot.produceSummary">true</bool>
<!-- the maximum number of labels per cluster -->
<!--<int name="carrot.numDescriptions">5</int>-->
<!-- produce sub clusters -->
<bool name="carrot.outputSubClusters">false</bool>
<str name="defType">edismax</str>
<str name="qf">
text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
</str>
<str name="q.alt">*:*</str>
<str name="rows">10</str>
<str name="fl">*,score</str>
</lst>
<arr name="last-components">
<str>clustering</str>
</arr>
</requestHandler>
<!-- Clustering Component. (Omitted here. See the default Solr example for a typical configuration.) -->
<!-- Terms Component

View File

@ -0,0 +1,24 @@
<!--
Default configuration for the Lingo clustering algorithm.
This file can be loaded (and saved) by Carrot2 Workbench.
http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
<attribute-set id="attributes">
<value-set>
<label>attributes</label>
<!--
The language to assume for clustered documents.
For a list of allowed values, see:
http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
-->
<attribute key="MultilingualClustering.defaultLanguage">
<value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
</attribute>
<attribute key="LingoClusteringAlgorithm.desiredClusterCountBase">
<value type="java.lang.Integer" value="20"/>
</attribute>
</value-set>
</attribute-set>
</attribute-sets>

View File

@ -0,0 +1,19 @@
<!--
Default configuration for the bisecting k-means clustering algorithm.
This file can be loaded (and saved) by Carrot2 Workbench.
http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
<attribute-set id="attributes">
<value-set>
<label>attributes</label>
<attribute key="MultilingualClustering.defaultLanguage">
<value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
</attribute>
<attribute key="MultilingualClustering.languageAggregationStrategy">
<value type="org.carrot2.text.clustering.MultilingualClustering$LanguageAggregationStrategy" value="FLATTEN_MAJOR_LANGUAGE"/>
</attribute>
</value-set>
</attribute-set>
</attribute-sets>

View File

@ -0,0 +1,19 @@
<!--
Default configuration for the STC clustering algorithm.
This file can be loaded (and saved) by Carrot2 Workbench.
http://project.carrot2.org/download.html
-->
<attribute-sets default="attributes">
<attribute-set id="attributes">
<value-set>
<label>attributes</label>
<attribute key="MultilingualClustering.defaultLanguage">
<value type="org.carrot2.core.LanguageCode" value="ENGLISH"/>
</attribute>
<attribute key="MultilingualClustering.languageAggregationStrategy">
<value type="org.carrot2.text.clustering.MultilingualClustering$LanguageAggregationStrategy" value="FLATTEN_MAJOR_LANGUAGE"/>
</attribute>
</value-set>
</attribute-set>
</attribute-sets>

View File

@ -1382,59 +1382,57 @@
<searchComponent name="clustering"
enable="${solr.clustering.enabled:true}"
class="solr.clustering.ClusteringComponent" >
<!-- Declare an engine -->
<!-- Declare a named clustering engine. Only one engine can be named
"default" (and it becomes the default one for the search component).
-->
<lst name="engine">
<!-- The name, only one can be named "default" -->
<str name="name">default</str>
<!-- Class name of Carrot2 clustering algorithm.
<!-- Class name of a clustering algorithm compatible with the Carrot2
framework.
Currently available algorithms are:
Currently available open source algorithms are:
* org.carrot2.clustering.lingo.LingoClusteringAlgorithm
* org.carrot2.clustering.stc.STCClusteringAlgorithm
* org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
See http://project.carrot2.org/algorithms.html for the
algorithm's characteristics.
See http://project.carrot2.org/algorithms.html for more information.
A commercial algorithm Lingo3G (needs to be installed separately) is defined as:
* com.carrotsearch.lingo3g.Lingo3GClusteringAlgorithm
-->
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
<!-- Overriding values for Carrot2 default algorithm attributes.
<!-- Override location of the clustering algorithm's resources
(attribute definitions and lexical resources).
For a description of all available attributes, see:
http://download.carrot2.org/stable/manual/#chapter.components.
Use attribute key as name attribute of str elements
below. These can be further overridden for individual
requests by specifying attribute key as request parameter
name and attribute value as parameter value.
-->
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
<!-- Location of Carrot2 lexical resources.
A directory from which to load Carrot2-specific stop words
and stop labels. Absolute or relative to Solr config directory.
A directory from which to load algorithm-specific stop words,
stop labels and attribute definition XMLs.
Absolute or relative to Solr config directory.
If a specific resource (e.g. stopwords.en) is present in the
specified dir, it will completely override the corresponding
default one that ships with Carrot2.
default one that typically ships with each algorithm.
For an overview of Carrot2 lexical resources, see:
http://download.carrot2.org/head/manual/#chapter.lexical-resources
-->
<str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
<!-- The language to assume for the documents.
For a list of allowed values, see:
http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
For an overview of Lingo3G lexical resources, see:
http://download.carrotsearch.com/lingo3g/manual/#chapter.lexical-resources
-->
<str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
<!-- <str name="carrot.resourcesDir">clustering/carrot2</str> -->
</lst>
<!-- An example definition for the STC clustering algorithm. -->
<lst name="engine">
<str name="name">stc</str>
<str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
</lst>
<!-- An example definition for the bisecting kmeans clustering algorithm. -->
<lst name="engine">
<str name="name">kmeans</str>
<str name="carrot.algorithm">org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm</str>
</lst>
</searchComponent>
<!-- A request handler for demonstrating the clustering component