SOLR-2448: Upgrade of Carrot2 to version 3.5.0 and a number of related clustering improvements (SOLR-2449, SOLR-2450, SOLR-2505)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1103722 13f79535-47bb-0310-9956-ffa450edef68
Stanisław Osiński 2011-05-16 13:19:46 +00:00
parent 77ac8172af
commit 548806b7f7
22 changed files with 963 additions and 452 deletions
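For context: after this commit, each cluster in the clustering component's output carries "labels" and "docs", plus an optional "score" (SOLR-2505) and an "other-topics" marker. A minimal client-side sketch of walking that structure, assuming a List<NamedList<Object>> taken from the "clusters" section of a response (the class name and printing are illustrative only; key names match those written by clustersToNamedList() below):

import java.util.List;
import org.apache.solr.common.util.NamedList;

class ClusterOutputWalker {
  @SuppressWarnings("unchecked")
  static void walk(List<NamedList<Object>> clusters) {
    for (NamedList<Object> cluster : clusters) {
      List<String> labels = (List<String>) cluster.get("labels");
      Double score = (Double) cluster.get("score");                // may be null if the algorithm assigns no score
      Boolean otherTopics = (Boolean) cluster.get("other-topics"); // present only for "other topics" clusters
      List<Object> docIds = (List<Object>) cluster.get("docs");
      System.out.println(labels + " score=" + score
          + " otherTopics=" + otherTopics + " docs=" + docIds.size());
      List<NamedList<Object>> subclusters =
          (List<NamedList<Object>>) cluster.get("clusters");
      if (subclusters != null) {
        walk(subclusters); // subclusters share the same structure
      }
    }
  }
}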

View File

@@ -106,14 +106,6 @@
</license>
</licenses>
<repositories>
<repository>
<id>carrot2.org</id>
<name>Carrot2 Maven2 repository</name>
<url>http://download.carrot2.org/maven2/</url>
<snapshots>
<updatePolicy>never</updatePolicy>
</snapshots>
</repository>
<repository>
<id>apache.snapshots</id>
<name>Apache Snapshot Repository</name>
@@ -306,7 +298,7 @@
<dependency>
<groupId>org.carrot2</groupId>
<artifactId>carrot2-core</artifactId>
<version>3.4.2</version>
<version>3.5.0</version>
</dependency>
<dependency>
<groupId>org.codehaus.woodstox</groupId>

View File

@@ -26,7 +26,7 @@ Versions of Major Components
---------------------
Apache Lucene trunk
Apache Tika 0.8
Carrot2 3.4.2
Carrot2 3.5.0
Velocity 1.6.4 and Velocity Tools 2.0
Apache UIMA 2.3.1-SNAPSHOT

View File

@@ -9,11 +9,19 @@ CHANGES
$Id$
================== Release 4.0.0-dev ==================
(No Changes)
* SOLR-2448: Search results clustering updates: bisecting k-means
clustering algorithm added, loading of Carrot2 stop words from
<solr.home>/conf/clustering/carrot2 (SOLR-2449), using Solr's stopwords.txt
for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
(Stanislaw Osinski, Dawid Weiss).
================== Release 3.2.0-dev ==================
(No Changes)
* SOLR-2448: Search results clustering updates: bisecting k-means
clustering algorithm added, loading of Carrot2 stop words from
<solr.home>/conf/clustering/carrot2 (SOLR-2449), using Solr's stopwords.txt
for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
(Stanislaw Osinski, Dawid Weiss).
================== Release 3.1.0-dev ==================

View File

@@ -1,2 +0,0 @@
AnyObjectId[f872cbc8eec94f7d5b29a73f99cd13089848a3cd] was removed in git history.
Apache SVN contains full history.

View File

@@ -0,0 +1,2 @@
AnyObjectId[adc127c48137d03e252f526de84a07c8d6bda521] was removed in git history.
Apache SVN contains full history.

View File

@@ -1,2 +0,0 @@
AnyObjectId[05c00b3fbfe234cd33477291432af9d172f13e15] was removed in git history.
Apache SVN contains full history.

View File

@@ -0,0 +1,2 @@
AnyObjectId[0da24b80aab135dc5811731b4e8aa69a77256d8a] was removed in git history.
Apache SVN contains full history.

View File

@@ -18,9 +18,11 @@ package org.apache.solr.handler.clustering.carrot2;
*/
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -37,6 +39,7 @@ import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
@@ -52,9 +55,17 @@ import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.util.resource.ClassLoaderLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.IResourceLocator;
import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
@@ -64,19 +75,33 @@ import com.google.common.collect.Sets;
*
* @link http://project.carrot2.org
*/
@SuppressWarnings("unchecked")
public class CarrotClusteringEngine extends SearchClusteringEngine {
private transient static Logger log = LoggerFactory
private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
/**
* The subdirectory in Solr config dir to read customized Carrot2 resources from.
*/
private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
/**
* Name of Carrot2 document's field containing Solr document's identifier.
*/
private static final String SOLR_DOCUMENT_ID = "solrId";
/**
* Name of Solr document's field containing the document's identifier. To avoid
* repeating the content of documents in clusters on output, each cluster contains
* identifiers of documents it contains.
*/
private String idFieldName;
/**
* Carrot2 controller that manages instances of clustering algorithms
*/
private Controller controller = ControllerFactory.createPooling();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
private String idFieldName;
@Override
@Deprecated
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -101,6 +126,10 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
// Pass the fields on which clustering runs to the
// SolrStopwordsCarrot2LexicalDataFactory
attributes.put("solrFieldNames", getFieldsForClustering(sreq));
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
@@ -113,22 +142,68 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
}
}
@Override
@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
SolrParams initParams = SolrParams.toSolrParams(config);
final SolrParams initParams = SolrParams.toSolrParams(config);
// Initialize Carrot2 controller. Pass initialization attributes, if any.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
extractCarrotAttributes(initParams, initAttributes);
// Customize the language model factory. The implementation we provide here
// is included in the code base of Solr, so that it's possible to refactor
// the Lucene APIs the factory relies on if needed.
initAttributes.put("PreprocessingPipeline.languageModelFactory",
LuceneLanguageModelFactory.class);
this.controller.init(initAttributes);
// Customize the stemmer and tokenizer factories. The implementations we provide here
// are included in the code base of Solr, so that it's possible to refactor
// the Lucene APIs the factories rely on if needed.
// Additionally, we set a custom lexical resource factory for Carrot2 that
// will use both Carrot2 default stop words as well as stop words from
// the StopFilter defined on the field.
BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
.stemmerFactory(LuceneCarrot2StemmerFactory.class)
.tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
// Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
initAttributes.put("solrIndexSchema", core.getSchema());
// Customize Carrot2's resource lookup to first look for resources
// using Solr's resource loader. If that fails, try loading from the classpath.
DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
.resourceLookup(new ResourceLookup(new IResourceLocator() {
@Override
public IResource[] getAll(final String resource) {
final SolrResourceLoader resourceLoader = core.getResourceLoader();
final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+ initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
try {
log.debug("Looking for " + resource + " in "
+ carrot2ResourcesDir);
final InputStream resourceStream = resourceLoader
.openResource(carrot2ResourcesDir + "/" + resource);
log.info(resource + " loaded from " + carrot2ResourcesDir);
final IResource foundResource = new IResource() {
@Override
public InputStream open() throws IOException {
return resourceStream;
}
};
return new IResource[] { foundResource };
} catch (RuntimeException e) {
// No way to distinguish if the resource was found but failed
// to load or wasn't found at all, so we simply fall back
// to Carrot2 defaults here by returning an empty locations array.
log.debug(resource + " not found in " + carrot2ResourcesDir
+ ". Using the default " + resource + " from Carrot JAR.");
return new IResource[] {};
}
}
},
// Using the class loader directly because this time we want to omit the prefix
new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
// Make sure the requested Carrot2 clustering algorithm class is available
@@ -148,17 +223,29 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
SolrParams solrParams = sreq.getParams();
// Names of fields to deliver content for clustering
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
fields.add(idFieldName);
fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
return fields;
}
/**
* Returns the names of fields that will be delivering the actual
* content for clustering. Currently, there are two such fields: document
* title and document content.
*/
private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
SolrParams solrParams = sreq.getParams();
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
}
return Sets.newHashSet(titleField, snippetField);
}
/**
* Prepares Carrot2 documents for clustering.
*/
@@ -180,7 +267,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
if (produceSummary) {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
Map args = new HashMap();
Map<String, Object> args = Maps.newHashMap();
snippetFieldAry = new String[]{snippetField};
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
@@ -214,11 +301,12 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
if (produceSummary && docIds != null) {
docsHolder[0] = docIds.get(sdoc).intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document with one field
NamedList tmp = (NamedList) highlights.getVal(0);
String [] highlt = (String[]) tmp.get(snippetField);
@SuppressWarnings("unchecked")
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
String [] highlt = tmp.get(snippetField);
if (highlt != null && highlt.length == 1) {
snippet = highlt[0];
}
@@ -226,27 +314,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
}
Document carrotDocument = new Document(getValue(sdoc, titleField),
snippet, (String)sdoc.getFieldValue(urlField));
carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
result.add(carrotDocument);
}
return result;
}
@Deprecated
protected String getValue(org.apache.lucene.document.Document doc,
String field) {
StringBuilder result = new StringBuilder();
String[] vals = doc.getValues(field);
for (int i = 0; i < vals.length; i++) {
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append(vals[i]).append(" . ");
}
return result.toString().trim();
}
protected String getValue(SolrDocument sdoc, String field) {
StringBuilder result = new StringBuilder();
Collection<Object> vals = sdoc.getFieldValues(field);
@@ -261,9 +335,9 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
return result.toString().trim();
}
private List clustersToNamedList(List<Cluster> carrotClusters,
private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
SolrParams solrParams) {
List result = new ArrayList();
List<NamedList<Object>> result = Lists.newArrayList();
clustersToNamedList(carrotClusters, result, solrParams.getBool(
CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -271,25 +345,40 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
}
private void clustersToNamedList(List<Cluster> outputClusters,
List parent, boolean outputSubClusters, int maxLabels) {
List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
for (Cluster outCluster : outputClusters) {
NamedList cluster = new SimpleOrderedMap();
NamedList<Object> cluster = new SimpleOrderedMap<Object>();
parent.add(cluster);
// Add labels
List<String> labels = outCluster.getPhrases();
if (labels.size() > maxLabels)
if (labels.size() > maxLabels) {
labels = labels.subList(0, maxLabels);
}
cluster.add("labels", labels);
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
List docList = new ArrayList();
cluster.add("docs", docList);
for (Document doc : docs) {
docList.add(doc.getField("solrId"));
// Add cluster score
final Double score = outCluster.getScore();
if (score != null) {
cluster.add("score", score);
}
if (outputSubClusters) {
List subclusters = new ArrayList();
// Add other topics marker
if (outCluster.isOtherTopics()) {
cluster.add("other-topics", outCluster.isOtherTopics());
}
// Add documents
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
List<Object> docList = Lists.newArrayList();
cluster.add("docs", docList);
for (Document doc : docs) {
docList.add(doc.getField(SOLR_DOCUMENT_ID));
}
// Add subclusters
if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
List<NamedList<Object>> subclusters = Lists.newArrayList();
cluster.add("clusters", subclusters);
clustersToNamedList(outCluster.getSubclusters(), subclusters,
outputSubClusters, maxLabels);

View File

@@ -35,6 +35,8 @@ public interface CarrotParams {
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragsize";
String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
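The carrot.* keys above arrive as ordinary Solr request (or engine-configuration) parameters. A hedged sketch of setting a few of them with ModifiableSolrParams, in the spirit of the tests below (class name, field names and the custom directory are illustrative; same package as CarrotParams assumed):

import org.apache.solr.common.params.ModifiableSolrParams;

class CarrotParamsExample {
  static ModifiableSolrParams clusteringParams() {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(CarrotParams.TITLE_FIELD_NAME, "name");       // field carrying document titles
    params.set(CarrotParams.SNIPPET_FIELD_NAME, "features"); // field carrying text to cluster
    params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
    params.set(CarrotParams.LEXICAL_RESOURCES_DIR, "clustering/custom"); // new in this commit
    return params;
  }
}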

View File

@@ -1,354 +1,241 @@
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.util.HashMap;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizer;
import org.apache.lucene.analysis.ar.ArabicStemmer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.DefaultLanguageModelFactory;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IdentityStemmer;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.ReflectionUtils;
import org.carrot2.util.attribute.Bindable;
import org.slf4j.Logger;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.DanishStemmer;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.EnglishStemmer;
import org.tartarus.snowball.ext.FinnishStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import org.tartarus.snowball.ext.GermanStemmer;
import org.tartarus.snowball.ext.HungarianStemmer;
import org.tartarus.snowball.ext.ItalianStemmer;
import org.tartarus.snowball.ext.NorwegianStemmer;
import org.tartarus.snowball.ext.PortugueseStemmer;
import org.tartarus.snowball.ext.RomanianStemmer;
import org.tartarus.snowball.ext.RussianStemmer;
import org.tartarus.snowball.ext.SpanishStemmer;
import org.tartarus.snowball.ext.SwedishStemmer;
import org.tartarus.snowball.ext.TurkishStemmer;
/**
* A Solr-specific language model factory for Carrot2. This factory is the only
* element in Carrot2 that depends on Lucene APIs, so should the APIs need to
* change, the changes can be made in this class.
*/
@Bindable(prefix = "DefaultLanguageModelFactory")
public class LuceneLanguageModelFactory extends DefaultLanguageModelFactory {
final static Logger logger = org.slf4j.LoggerFactory
.getLogger(LuceneLanguageModelFactory.class);
/**
* Provide an {@link IStemmer} implementation for a given language.
*/
@Override
protected IStemmer createStemmer(LanguageCode language) {
switch (language) {
case ARABIC:
return ArabicStemmerFactory.createStemmer();
case CHINESE_SIMPLIFIED:
return IdentityStemmer.INSTANCE;
default:
/*
* For other languages, try to use snowball's stemming.
*/
return SnowballStemmerFactory.createStemmer(language);
}
}
@Override
protected ITokenizer createTokenizer(LanguageCode language) {
switch (language) {
case CHINESE_SIMPLIFIED:
return ChineseTokenizerFactory.createTokenizer();
/*
* We use our own analyzer for Arabic. Lucene's version has special
* support for Nonspacing-Mark characters (see
* http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
* have them included as letters in the parser.
*/
case ARABIC:
// Intentional fall-through.
default:
return new ExtendedWhitespaceTokenizer();
}
}
/**
* Factory of {@link IStemmer} implementations from the <code>snowball</code>
* project.
*/
private final static class SnowballStemmerFactory {
/**
* Static hard mapping from language codes to stemmer classes in Snowball.
* This mapping is not dynamic because we want to keep the possibility to
* obfuscate these classes.
*/
private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
static {
snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
snowballStemmerClasses
.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
snowballStemmerClasses
.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
PortugueseStemmer.class);
snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
}
/**
* An adapter converting Snowball programs into {@link IStemmer} interface.
*/
private static class SnowballStemmerAdapter implements IStemmer {
private final SnowballProgram snowballStemmer;
public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
this.snowballStemmer = snowballStemmer;
}
public CharSequence stem(CharSequence word) {
snowballStemmer.setCurrent(word.toString());
if (snowballStemmer.stem()) {
return snowballStemmer.getCurrent();
} else {
return null;
}
}
}
/**
* Create and return an {@link IStemmer} adapter for a
* {@link SnowballProgram} for a given language code. An identity stemmer is
* returned for unknown languages.
*/
public static IStemmer createStemmer(LanguageCode language) {
final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
.get(language);
if (stemmerClazz == null) {
logger.warn("No Snowball stemmer class for: " + language.name()
+ ". Quality of clustering may be degraded.");
return IdentityStemmer.INSTANCE;
}
try {
return new SnowballStemmerAdapter(stemmerClazz.newInstance());
} catch (Exception e) {
logger.warn("Could not instantiate snowball stemmer"
+ " for language: " + language.name()
+ ". Quality of clustering may be degraded.", e);
return IdentityStemmer.INSTANCE;
}
}
}
/**
* Factory of {@link IStemmer} implementations for the
* {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
* to be present in classpath, otherwise an empty (identity) stemmer is
* returned.
*/
private static class ArabicStemmerFactory {
static {
try {
ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
} catch (ClassNotFoundException e) {
logger
.warn(
"Could not instantiate Lucene stemmer for Arabic, clustering quality "
+ "of Arabic content may be degraded. For best quality clusters, "
+ "make sure Lucene's Arabic analyzer JAR is in the classpath",
e);
}
}
/**
* Adapter to lucene-contrib Arabic analyzers.
*/
private static class LuceneStemmerAdapter implements IStemmer {
private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
private char[] buffer = new char[0];
private LuceneStemmerAdapter() throws Exception {
delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
}
public CharSequence stem(CharSequence word) {
if (word.length() > buffer.length) {
buffer = new char[word.length()];
}
for (int i = 0; i < word.length(); i++) {
buffer[i] = word.charAt(i);
}
int newLen = normalizer.normalize(buffer, word.length());
newLen = delegate.stem(buffer, newLen);
if (newLen != word.length() || !equals(buffer, newLen, word)) {
return CharBuffer.wrap(buffer, 0, newLen);
}
// Same-same.
return null;
}
private boolean equals(char[] buffer, int len, CharSequence word) {
assert len == word.length();
for (int i = 0; i < len; i++) {
if (buffer[i] != word.charAt(i))
return false;
}
return true;
}
}
public static IStemmer createStemmer() {
try {
return new LuceneStemmerAdapter();
} catch (Throwable e) {
return IdentityStemmer.INSTANCE;
}
}
}
/**
* Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
* {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
* factory will fall back to the default white space tokenizer.
*/
private static final class ChineseTokenizerFactory {
static {
try {
ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
} catch (Throwable e) {
logger
.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+ "of Chinese content may be degraded. For best quality clusters, "
+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
}
}
static ITokenizer createTokenizer() {
try {
return new ChineseTokenizer();
} catch (Throwable e) {
return new ExtendedWhitespaceTokenizer();
}
}
private final static class ChineseTokenizer implements ITokenizer {
private final static Pattern numeric = Pattern
.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
private Tokenizer sentenceTokenizer;
private TokenStream wordTokenFilter;
private CharTermAttribute term = null;
private final MutableCharArray tempCharSequence;
private final Class<?> tokenFilterClass;
private ChineseTokenizer() throws Exception {
this.tempCharSequence = new MutableCharArray(new char[0]);
// As Smart Chinese is not available during compile time,
// we need to resort to reflection.
final Class<?> tokenizerClass = ReflectionUtils
.classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
Reader.class).newInstance((Reader) null);
this.tokenFilterClass = ReflectionUtils
.classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
}
public short nextToken() throws IOException {
final boolean hasNextToken = wordTokenFilter.incrementToken();
if (hasNextToken) {
short flags = 0;
final char[] image = term.buffer();
final int length = term.length();
tempCharSequence.reset(image, 0, length);
if (length == 1 && image[0] == ',') {
// ChineseTokenizer seems to convert all punctuation to ','
// characters
flags = ITokenizer.TT_PUNCTUATION;
} else if (numeric.matcher(tempCharSequence).matches()) {
flags = ITokenizer.TT_NUMERIC;
} else {
flags = ITokenizer.TT_TERM;
}
return flags;
}
return ITokenizer.TT_EOF;
}
public void setTermBuffer(MutableCharArray array) {
array.reset(term.buffer(), 0, term.length());
}
public void reset(Reader input) throws IOException {
try {
sentenceTokenizer.reset(input);
wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
TokenStream.class).newInstance(sentenceTokenizer);
} catch (Exception e) {
throw ExceptionUtils.wrapAsRuntimeException(e);
}
}
}
}
}
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.nio.CharBuffer;
import java.util.HashMap;
import org.apache.lucene.analysis.ar.ArabicNormalizer;
import org.apache.lucene.analysis.ar.ArabicStemmer;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IStemmerFactory;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.DanishStemmer;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.EnglishStemmer;
import org.tartarus.snowball.ext.FinnishStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;
import org.tartarus.snowball.ext.GermanStemmer;
import org.tartarus.snowball.ext.HungarianStemmer;
import org.tartarus.snowball.ext.ItalianStemmer;
import org.tartarus.snowball.ext.NorwegianStemmer;
import org.tartarus.snowball.ext.PortugueseStemmer;
import org.tartarus.snowball.ext.RomanianStemmer;
import org.tartarus.snowball.ext.RussianStemmer;
import org.tartarus.snowball.ext.SpanishStemmer;
import org.tartarus.snowball.ext.SwedishStemmer;
import org.tartarus.snowball.ext.TurkishStemmer;
/**
* An implementation of Carrot2's {@link IStemmerFactory} based on Lucene's
* APIs. Should the relevant Lucene APIs need to change, the changes can be made
* in this class.
*/
public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
final static Logger logger = org.slf4j.LoggerFactory
.getLogger(LuceneCarrot2StemmerFactory.class);
@Override
public IStemmer getStemmer(LanguageCode language) {
switch (language) {
case ARABIC:
return ArabicStemmerFactory.createStemmer();
case CHINESE_SIMPLIFIED:
return IdentityStemmer.INSTANCE;
default:
/*
* For other languages, try to use snowball's stemming.
*/
return SnowballStemmerFactory.createStemmer(language);
}
}
/**
* Factory of {@link IStemmer} implementations from the <code>snowball</code>
* project.
*/
private final static class SnowballStemmerFactory {
/**
* Static hard mapping from language codes to stemmer classes in Snowball.
* This mapping is not dynamic because we want to keep the possibility to
* obfuscate these classes.
*/
private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
static {
snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
snowballStemmerClasses
.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
snowballStemmerClasses
.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
PortugueseStemmer.class);
snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
}
/**
* An adapter converting Snowball programs into {@link IStemmer} interface.
*/
private static class SnowballStemmerAdapter implements IStemmer {
private final SnowballProgram snowballStemmer;
public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
this.snowballStemmer = snowballStemmer;
}
public CharSequence stem(CharSequence word) {
snowballStemmer.setCurrent(word.toString());
if (snowballStemmer.stem()) {
return snowballStemmer.getCurrent();
} else {
return null;
}
}
}
/**
* Create and return an {@link IStemmer} adapter for a
* {@link SnowballProgram} for a given language code. An identity stemmer is
* returned for unknown languages.
*/
public static IStemmer createStemmer(LanguageCode language) {
final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
.get(language);
if (stemmerClazz == null) {
logger.warn("No Snowball stemmer class for: " + language.name()
+ ". Quality of clustering may be degraded.");
return IdentityStemmer.INSTANCE;
}
try {
return new SnowballStemmerAdapter(stemmerClazz.newInstance());
} catch (Exception e) {
logger.warn("Could not instantiate snowball stemmer"
+ " for language: " + language.name()
+ ". Quality of clustering may be degraded.", e);
return IdentityStemmer.INSTANCE;
}
}
}
/**
* Factory of {@link IStemmer} implementations for the
* {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
* to be present in classpath, otherwise an empty (identity) stemmer is
* returned.
*/
private static class ArabicStemmerFactory {
static {
try {
ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
} catch (ClassNotFoundException e) {
logger
.warn(
"Could not instantiate Lucene stemmer for Arabic, clustering quality "
+ "of Arabic content may be degraded. For best quality clusters, "
+ "make sure Lucene's Arabic analyzer JAR is in the classpath",
e);
}
}
/**
* Adapter to lucene-contrib Arabic analyzers.
*/
private static class LuceneStemmerAdapter implements IStemmer {
private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
private char[] buffer = new char[0];
private LuceneStemmerAdapter() throws Exception {
delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
}
public CharSequence stem(CharSequence word) {
if (word.length() > buffer.length) {
buffer = new char[word.length()];
}
for (int i = 0; i < word.length(); i++) {
buffer[i] = word.charAt(i);
}
int newLen = normalizer.normalize(buffer, word.length());
newLen = delegate.stem(buffer, newLen);
if (newLen != word.length() || !equals(buffer, newLen, word)) {
return CharBuffer.wrap(buffer, 0, newLen);
}
// Same-same.
return null;
}
private boolean equals(char[] buffer, int len, CharSequence word) {
assert len == word.length();
for (int i = 0; i < len; i++) {
if (buffer[i] != word.charAt(i))
return false;
}
return true;
}
}
public static IStemmer createStemmer() {
try {
return new LuceneStemmerAdapter();
} catch (Throwable e) {
return IdentityStemmer.INSTANCE;
}
}
}
/**
* An implementation of {@link IStemmer} that always returns <code>null</code>,
* which means no stemming.
*/
private static class IdentityStemmer implements IStemmer {
private final static IdentityStemmer INSTANCE = new IdentityStemmer();
@Override
public CharSequence stem(CharSequence word) {
return null;
}
}
}
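A hedged usage sketch for the factory above (same package assumed; class name illustrative). By the IStemmer contract used here, a null return means the word is left unchanged:

import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.IStemmer;

class StemmerFactoryExample {
  public static void main(String[] args) {
    IStemmer stemmer = new LuceneCarrot2StemmerFactory()
        .getStemmer(LanguageCode.ENGLISH);
    CharSequence stem = stemmer.stem("clusters");
    // The Snowball-backed stemmer returns the stemmed form; null means "no change".
    System.out.println(stem != null ? stem : "clusters");
  }
}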

View File

@@ -0,0 +1,156 @@
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;
/**
* An implementation of Carrot2's {@link ITokenizerFactory} based on Lucene's
* Smart Chinese tokenizer. If the Smart Chinese tokenizer is not available on
* the classpath at runtime, Carrot2's default tokenizer is used. Should the
* Lucene APIs need to change, the changes can be made in this class.
*/
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
final static Logger logger = org.slf4j.LoggerFactory
.getLogger(LuceneCarrot2TokenizerFactory.class);
@Override
public ITokenizer getTokenizer(LanguageCode language) {
switch (language) {
case CHINESE_SIMPLIFIED:
return ChineseTokenizerFactory.createTokenizer();
/*
* We use our own analyzer for Arabic. Lucene's version has special
* support for Nonspacing-Mark characters (see
* http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
* have them included as letters in the parser.
*/
case ARABIC:
// Intentional fall-through.
default:
return new ExtendedWhitespaceTokenizer();
}
}
/**
* Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
* {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
* factory will fall back to the default white space tokenizer.
*/
private static final class ChineseTokenizerFactory {
static {
try {
ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
} catch (Throwable e) {
logger
.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+ "of Chinese content may be degraded. For best quality clusters, "
+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
}
}
static ITokenizer createTokenizer() {
try {
return new ChineseTokenizer();
} catch (Throwable e) {
return new ExtendedWhitespaceTokenizer();
}
}
private final static class ChineseTokenizer implements ITokenizer {
private final static Pattern numeric = Pattern
.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
private Tokenizer sentenceTokenizer;
private TokenStream wordTokenFilter;
private CharTermAttribute term = null;
private final MutableCharArray tempCharSequence;
private final Class<?> tokenFilterClass;
private ChineseTokenizer() throws Exception {
this.tempCharSequence = new MutableCharArray(new char[0]);
// As Smart Chinese is not available during compile time,
// we need to resort to reflection.
final Class<?> tokenizerClass = ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
Reader.class).newInstance((Reader) null);
this.tokenFilterClass = ReflectionUtils.classForName(
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
}
public short nextToken() throws IOException {
final boolean hasNextToken = wordTokenFilter.incrementToken();
if (hasNextToken) {
short flags = 0;
final char[] image = term.buffer();
final int length = term.length();
tempCharSequence.reset(image, 0, length);
if (length == 1 && image[0] == ',') {
// ChineseTokenizer seems to convert all punctuation to ','
// characters
flags = ITokenizer.TT_PUNCTUATION;
} else if (numeric.matcher(tempCharSequence).matches()) {
flags = ITokenizer.TT_NUMERIC;
} else {
flags = ITokenizer.TT_TERM;
}
return flags;
}
return ITokenizer.TT_EOF;
}
public void setTermBuffer(MutableCharArray array) {
array.reset(term.buffer(), 0, term.length());
}
public void reset(Reader input) throws IOException {
try {
sentenceTokenizer.reset(input);
wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
TokenStream.class).newInstance(sentenceTokenizer);
term = wordTokenFilter.addAttribute(CharTermAttribute.class);
} catch (Exception e) {
throw ExceptionUtils.wrapAsRuntimeException(e);
}
}
}
}
}
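A hedged sketch of driving the resulting ITokenizer directly (same package assumed; class name and input string illustrative). For English the factory falls back to Carrot2's ExtendedWhitespaceTokenizer, so no Lucene analyzer JARs are needed on this path:

import java.io.StringReader;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;

class TokenizerFactoryExample {
  public static void main(String[] args) throws Exception {
    ITokenizer tokenizer = new LuceneCarrot2TokenizerFactory()
        .getTokenizer(LanguageCode.ENGLISH);
    tokenizer.reset(new StringReader("Solr clusters search results"));
    MutableCharArray buffer = new MutableCharArray("");
    // nextToken() returns a token-type flag; TT_EOF signals end of input.
    for (short type = tokenizer.nextToken(); type != ITokenizer.TT_EOF;
         type = tokenizer.nextToken()) {
      tokenizer.setTermBuffer(buffer);
      System.out.println(buffer + " (type=" + type + ")");
    }
  }
}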

View File

@@ -0,0 +1,141 @@
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Collection;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.analysis.CommonGramsFilterFactory;
import org.apache.solr.analysis.StopFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.schema.IndexSchema;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.slf4j.Logger;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
/**
* An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
* words from a field's StopFilter to the default stop words used in Carrot2,
* for all languages Carrot2 supports. Completely replacing Carrot2 stop words
* with Solr's wouldn't make much sense because clustering needs more aggressive
* stop word removal. In other words, if something is a stop word during
* indexing, then it should also be a stop word during clustering, but not the
* other way round.
*/
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements
ILexicalDataFactory {
final static Logger logger = org.slf4j.LoggerFactory
.getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
@Init
@Input
@Attribute(key = "solrIndexSchema")
private IndexSchema schema;
@Processing
@Input
@Attribute(key = "solrFieldNames")
private Set<String> fieldNames;
/**
* A lazily-built cache of stop words per field.
*/
private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
/**
* Carrot2's default lexical resources to use in addition to Solr's stop
* words.
*/
private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
/**
* Obtains stop words for a field from the associated
* {@link StopFilterFactory}, if any.
*/
private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
// No need to synchronize here, Carrot2 ensures that instances
// of this class are not used by multiple threads at a time.
if (!solrStopWords.containsKey(fieldName)) {
final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
.getAnalyzer();
if (fieldAnalyzer instanceof TokenizerChain) {
final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
.getTokenFilterFactories();
for (TokenFilterFactory factory : filterFactories) {
if (factory instanceof StopFilterFactory) {
// StopFilterFactory holds the stop words in a CharArraySet, but
// the getStopWords() method returns a Set<?>, so we need to cast.
solrStopWords.put(fieldName,
(CharArraySet) ((StopFilterFactory) factory).getStopWords());
}
if (factory instanceof CommonGramsFilterFactory) {
solrStopWords.put(fieldName,
(CharArraySet) ((CommonGramsFilterFactory) factory)
.getCommonWords());
}
}
}
}
return solrStopWords.get(fieldName);
}
@Override
public ILexicalData getLexicalData(LanguageCode languageCode) {
final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
.getLexicalData(languageCode);
return new ILexicalData() {
@Override
public boolean isStopLabel(CharSequence word) {
// Nothing in Solr maps to the concept of a stop label,
// so return Carrot2's default here.
return carrot2LexicalData.isStopLabel(word);
}
@Override
public boolean isCommonWord(MutableCharArray word) {
// Loop over the fields involved in clustering first
for (String fieldName : fieldNames) {
for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
if (stopWords.contains(word)) {
return true;
}
}
}
// Check default Carrot2 stop words too
return carrot2LexicalData.isCommonWord(word);
}
};
}
}
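The resulting common-word test is effectively a union of the per-field Solr stop words and Carrot2's defaults. A standalone sketch of that decision rule, with hypothetical word sets standing in for the two sources (class and set names are illustrative):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

class UnionStopWordsSketch {
  // Hypothetical stand-ins for a field's Solr stop words and the
  // Carrot2 defaults consulted by isCommonWord() above.
  static final Set<String> SOLR_FIELD_STOP_WORDS =
      new HashSet<String>(Arrays.asList("solrownstopword"));
  static final Set<String> CARROT2_DEFAULT_STOP_WORDS =
      new HashSet<String>(Arrays.asList("the", "of", "and"));

  static boolean isCommonWord(String word) {
    // Solr's stop words extend, never replace, the Carrot2 defaults.
    return SOLR_FIELD_STOP_WORDS.contains(word)
        || CARROT2_DEFAULT_STOP_WORDS.contains(word);
  }
}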

View File

@@ -17,6 +17,11 @@ package org.apache.solr.handler.clustering.carrot2;
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
@@ -37,15 +42,11 @@ import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.common.collect.ImmutableList;
/**
*
*/
@SuppressWarnings("unchecked")
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testCarrotLingo() throws Exception {
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testWithoutSubclusters() throws Exception {
checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
1, 1, 0);
}
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
public void testWithSubclusters() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
}
@Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
@Test
public void testClusterScores() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
AbstractClusteringTestCase.numberOfDocs, params);
int i = 1;
for (NamedList<Object> cluster : clusters) {
final Double score = getScore(cluster);
assertNotNull(score);
assertEquals(0.25 * i++, score, 0);
}
}
@Test
public void testOtherTopics() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
AbstractClusteringTestCase.numberOfDocs, params);
int i = 1;
for (NamedList<Object> cluster : clusters) {
assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
}
}
@Test
public void testCarrotAttributePassing() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
params), 1, 3, 0);
}
@Test
public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
checkLexicalResourcesFromSolrConfig("lexical-resource-check",
"online,customsolrstopword,customsolrstoplabel");
}
@Test
public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
"online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
}
private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
throws IOException {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
wordsToCheck);
// "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
// stoplabels.en, so we're expecting only one cluster with label "online".
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine(engineName), 1, params);
assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
}
@Test
public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
"online,solrownstopword");
// "solrownstopword" is in stopwords.txt, so we're expecting
// only one cluster with label "online".
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 1, params);
assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
}
@Test
public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
// Force string fields to be used for clustering. Does not make sense
// in the real world, but does the job in the test.
params.set(CarrotParams.TITLE_FIELD_NAME, "url");
params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
"online,solrownstopword");
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 2, params);
assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
assertEquals(ImmutableList.of("solrownstopword"),
getLabels(clusters.get(1)));
}
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
.getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
return engine;
}
private List checkEngine(CarrotClusteringEngine engine,
private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
}
private List checkEngine(CarrotClusteringEngine engine,
private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters, SolrParams clusteringParams) throws IOException {
return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
}
private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
// Get all documents to cluster
RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
List results = (List)engine.cluster(query, solrDocList, docIds, req);
@SuppressWarnings("unchecked")
List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
}
}
private void checkClusters(List results, int expectedDocCount,
private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
for (int i = 0; i < results.size(); i++) {
NamedList cluster = (NamedList) results.get(i);
NamedList<Object> cluster = results.get(i);
checkCluster(cluster, expectedDocCount, expectedLabelCount,
expectedSubclusterCount);
}
}
private void checkClusters(List results, boolean hasSubclusters) {
private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
for (int i = 0; i < results.size(); i++) {
checkCluster((NamedList) results.get(i), hasSubclusters);
checkCluster(results.get(i), hasSubclusters);
}
}
private void checkCluster(NamedList cluster, boolean hasSubclusters) {
List docs = (List) cluster.get("docs");
private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
List<Object> docs = getDocs(cluster);
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
List labels = (List) cluster.get("labels");
List<String> labels = getLabels(cluster);
assertNotNull("labels is null but it shouldn't be", labels);
if (hasSubclusters) {
List subclusters = (List) cluster.get("clusters");
List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertNotNull("subclusters is null but it shouldn't be", subclusters);
}
}
private void checkCluster(NamedList cluster, int expectedDocCount,
private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
checkCluster(cluster, expectedSubclusterCount > 0);
assertEquals("number of docs in cluster", expectedDocCount,
((List) cluster.get("docs")).size());
getDocs(cluster).size());
assertEquals("number of labels in cluster", expectedLabelCount,
((List) cluster.get("labels")).size());
getLabels(cluster).size());
if (expectedSubclusterCount > 0) {
List subclusters = (List) cluster.get("clusters");
List<NamedList<Object>> subclusters = getSubclusters(cluster);
assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
assertEquals("number of subclusters in cluster",
expectedSubclusterCount, subclusters.size());
}
}
@SuppressWarnings("unchecked")
private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
return (List<NamedList<Object>>) cluster.get("clusters");
}
@SuppressWarnings("unchecked")
private List<String> getLabels(NamedList<Object> cluster) {
return (List<String>) cluster.get("labels");
}
private Double getScore(NamedList<Object> cluster) {
return (Double) cluster.get("score");
}
private Boolean isOtherTopics(NamedList<Object> cluster) {
return (Boolean)cluster.get("other-topics");
}
@SuppressWarnings("unchecked")
private List<Object> getDocs(NamedList<Object> cluster) {
return (List<Object>) cluster.get("docs");
}
}

View File

@@ -0,0 +1,82 @@
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.List;
import org.carrot2.core.Cluster;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
import com.google.common.collect.Lists;
/**
* A mock implementation of a Carrot2 clustering algorithm for testing whether the
* customized lexical resource lookup works correctly. This algorithm ignores
* the input documents and instead for each word from {@link #wordsToCheck}, it
* outputs a cluster labeled with the word only if the word is neither a stop
* word nor a stop label.
*/
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
ProcessingComponentBase implements IClusteringAlgorithm {
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
@Input
@Processing
@Attribute
private String wordsToCheck;
private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
if (wordsToCheck == null) {
return;
}
// Test with Maltese so that the English clustering performed in other tests
// is not affected by the test stopwords and stoplabels.
ILexicalData lexicalData = preprocessing.lexicalDataFactory
.getLexicalData(LanguageCode.MALTESE);
for (String word : wordsToCheck.split(",")) {
if (!lexicalData.isCommonWord(new MutableCharArray(word))
&& !lexicalData.isStopLabel(word)) {
clusters.add(new Cluster(word));
}
}
}
}

View File

@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
@IntRange(min = 1, max = 5)
private int labels = 1;
@Input
@Processing
@Attribute
private int otherTopicsModulo = 0;
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
@@ -59,21 +64,26 @@
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);
Cluster cluster = createCluster(label.toString(), document);
Cluster cluster = createCluster(label.toString(), documentIndex, document);
clusters.add(cluster);
for (int i = 1; i <= depth; i++) {
label.append(".");
label.append(i);
Cluster newCluster = createCluster(label.toString(), document);
cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
Cluster newCluster = createCluster(label.toString(), documentIndex, document);
cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
cluster = newCluster;
}
documentIndex++;
}
}
private Cluster createCluster(String labelBase, Document... documents) {
private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
Cluster cluster = new Cluster();
cluster.setScore(documentIndex * 0.25);
if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
{
cluster.setOtherTopics(true);
}
for (int i = 0; i < labels; i++) {
cluster.addPhrases(labelBase + "#" + (i + 1));
}

View File

@@ -0,0 +1 @@
customsolrstoplabelcustomdir

View File

@@ -0,0 +1 @@
customsolrstopwordcustomdir

View File

@@ -396,6 +396,15 @@
<str name="name">mock</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">lexical-resource-check</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
</lst>
<lst name="engine">
<str name="name">lexical-resource-check-custom-resource-dir</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
<str name="carrot.lexicalResourcesDir">clustering/custom</str>
</lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">

View File

@@ -55,4 +55,5 @@ to
was
will
with
solrownstopword

View File

@@ -1198,17 +1198,20 @@
<lst name="engine">
<!-- The name, only one can be named "default" -->
<str name="name">default</str>
<!-- Class name of Carrot2 clustering algorithm.
<!-- Class name of Carrot2 clustering algorithm.
Currently available algorithms are:
* org.carrot2.clustering.lingo.LingoClusteringAlgorithm
* org.carrot2.clustering.stc.STCClusteringAlgorithm
* org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
See http://project.carrot2.org/algorithms.html for the
algorithm's characteristics.
-->
<str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
<!-- Overriding values for Carrot2 default algorithm attributes.
For a description of all available attributes, see:
@@ -1219,9 +1222,22 @@
name and attribute value as parameter value.
-->
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
<!-- Location of Carrot2 lexical resources.
A directory from which to load Carrot2-specific stop words
and stop labels. Absolute or relative to Solr config directory.
If a specific resource (e.g. stopwords.en) is present in the
specified dir, it will completely override the corresponding
default one that ships with Carrot2.
For an overview of Carrot2 lexical resources, see:
http://download.carrot2.org/head/manual/#chapter.lexical-resources
-->
<str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
<!-- The language to assume for the documents.
For a list of allowed values, see:
http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
-->