mirror of https://github.com/apache/lucene.git
SOLR-2448: Upgrade of Carrot2 to version 3.5.0 and a number of related clustering improvements (SOLR-2449, SOLR-2450, SOLR-2505)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1103722 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 77ac8172af
commit 548806b7f7
@@ -106,14 +106,6 @@
     </license>
   </licenses>
   <repositories>
-    <repository>
-      <id>carrot2.org</id>
-      <name>Carrot2 Maven2 repository</name>
-      <url>http://download.carrot2.org/maven2/</url>
-      <snapshots>
-        <updatePolicy>never</updatePolicy>
-      </snapshots>
-    </repository>
     <repository>
       <id>apache.snapshots</id>
       <name>Apache Snapshot Repository</name>
@@ -306,7 +298,7 @@
     <dependency>
       <groupId>org.carrot2</groupId>
       <artifactId>carrot2-core</artifactId>
-      <version>3.4.2</version>
+      <version>3.5.0</version>
     </dependency>
     <dependency>
       <groupId>org.codehaus.woodstox</groupId>
@@ -26,7 +26,7 @@ Versions of Major Components
---------------------
Apache Lucene trunk
Apache Tika 0.8
-Carrot2 3.4.2
+Carrot2 3.5.0
Velocity 1.6.4 and Velocity Tools 2.0
Apache UIMA 2.3.1-SNAPSHOT
@@ -9,11 +9,19 @@ CHANGES
$Id$

================== Release 4.0.0-dev ==================

-(No Changes)
+* SOLR-2448: Search results clustering updates: bisecting k-means
+  clustering algorithm added, loading of Carrot2 stop words from
+  <solr.home>/conf/carrot2 (SOLR-2449), using Solr's stopwords.txt
+  for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
+  (Stanislaw Osinski, Dawid Weiss).

================== Release 3.2.0-dev ==================

-(No Changes)
+* SOLR-2448: Search results clustering updates: bisecting k-means
+  clustering algorithm added, loading of Carrot2 stop words from
+  <solr.home>/conf/carrot2 (SOLR-2449), using Solr's stopwords.txt
+  for clustering (SOLR-2450), output of cluster scores (SOLR-2505)
+  (Stanislaw Osinski, Dawid Weiss).

================== Release 3.1.0-dev ==================
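As a rough illustration of the features summarized above (this sketch is not part of the commit), a SolrJ client could request clustering with the newly supported bisecting k-means algorithm. The server URL, core name, and field names are assumptions, and the Builder-style client API shown comes from a much later SolrJ than this commit:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;

public class ClusteringRequestSketch {
  public static void main(String[] args) throws Exception {
    // Assumed URL and core name; adjust to the actual installation.
    HttpSolrClient client = new HttpSolrClient.Builder(
        "http://localhost:8983/solr/collection1").build();

    SolrQuery query = new SolrQuery("solr");
    query.set("clustering", true); // enable the ClusteringComponent
    // Ask for the bisecting k-means algorithm added by this upgrade.
    query.set("carrot.algorithm",
        "org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm");
    query.set("carrot.title", "title");      // assumed field names
    query.set("carrot.snippet", "features");

    QueryResponse rsp = client.query(query);
    // Clusters (labels, optional scores, document ids) come back in the
    // "clusters" section built by clustersToNamedList() in the diff below.
    System.out.println(rsp.getResponse().get("clusters"));
    client.close();
  }
}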
@@ -1,2 +0,0 @@
AnyObjectId[f872cbc8eec94f7d5b29a73f99cd13089848a3cd] was removed in git history.
Apache SVN contains full history.

@@ -0,0 +1,2 @@
AnyObjectId[adc127c48137d03e252f526de84a07c8d6bda521] was removed in git history.
Apache SVN contains full history.

@@ -1,2 +0,0 @@
AnyObjectId[05c00b3fbfe234cd33477291432af9d172f13e15] was removed in git history.
Apache SVN contains full history.

@@ -0,0 +1,2 @@
AnyObjectId[0da24b80aab135dc5811731b4e8aa69a77256d8a] was removed in git history.
Apache SVN contains full history.
@@ -18,9 +18,11 @@ package org.apache.solr.handler.clustering.carrot2;
 */

+import java.io.IOException;
+import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

@@ -37,6 +39,7 @@ import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
+import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;

@@ -52,9 +55,17 @@ import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.util.resource.ClassLoaderLocator;
+import org.carrot2.util.resource.IResource;
+import org.carrot2.util.resource.IResourceLocator;
+import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
import com.google.common.collect.Sets;

/**
@@ -64,19 +75,33 @@ import com.google.common.collect.Sets;
 *
 * @link http://project.carrot2.org
 */
@SuppressWarnings("unchecked")
public class CarrotClusteringEngine extends SearchClusteringEngine {
  private transient static Logger log = LoggerFactory
      .getLogger(CarrotClusteringEngine.class);

+  /**
+   * The subdirectory in Solr config dir to read customized Carrot2 resources from.
+   */
+  private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
+
+  /**
+   * Name of Carrot2 document's field containing Solr document's identifier.
+   */
+  private static final String SOLR_DOCUMENT_ID = "solrId";
+
+  /**
+   * Name of Solr document's field containing the document's identifier. To avoid
+   * repeating the content of documents in clusters on output, each cluster contains
+   * identifiers of documents it contains.
+   */
+  private String idFieldName;
+
  /**
   * Carrot2 controller that manages instances of clustering algorithms
   */
  private Controller controller = ControllerFactory.createPooling();
  private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;

-  private String idFieldName;
-
  @Override
  @Deprecated
  public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
@@ -101,6 +126,10 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
      attributes.put(AttributeNames.DOCUMENTS, documents);
      attributes.put(AttributeNames.QUERY, query.toString());

+      // Pass the fields on which clustering runs to the
+      // SolrStopwordsCarrot2LexicalDataFactory
+      attributes.put("solrFieldNames", getFieldsForClustering(sreq));
+
      // Pass extra overriding attributes from the request, if any
      extractCarrotAttributes(sreq.getParams(), attributes);
@@ -113,22 +142,68 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    }
  }

  @Override
+  @SuppressWarnings({ "unchecked", "rawtypes" })
  public String init(NamedList config, final SolrCore core) {
    String result = super.init(config, core);
-    SolrParams initParams = SolrParams.toSolrParams(config);
+    final SolrParams initParams = SolrParams.toSolrParams(config);

    // Initialize Carrot2 controller. Pass initialization attributes, if any.
    HashMap<String, Object> initAttributes = new HashMap<String, Object>();
    extractCarrotAttributes(initParams, initAttributes);

-    // Customize the language model factory. The implementation we provide here
-    // is included in the code base of Solr, so that it's possible to refactor
-    // the Lucene APIs the factory relies on if needed.
-    initAttributes.put("PreprocessingPipeline.languageModelFactory",
-        LuceneLanguageModelFactory.class);
-    this.controller.init(initAttributes);
+    // Customize the stemmer and tokenizer factories. The implementations we provide here
+    // are included in the code base of Solr, so that it's possible to refactor
+    // the Lucene APIs the factories rely on if needed.
+    // Additionally, we set a custom lexical resource factory for Carrot2 that
+    // will use both Carrot2 default stop words as well as stop words from
+    // the StopFilter defined on the field.
+    BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
+        .stemmerFactory(LuceneCarrot2StemmerFactory.class)
+        .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
+        .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+
+    // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
+    initAttributes.put("solrIndexSchema", core.getSchema());
+
+    // Customize Carrot2's resource lookup to first look for resources
+    // using Solr's resource loader. If that fails, try loading from the classpath.
+    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes)
+        .resourceLookup(new ResourceLookup(new IResourceLocator() {
+          @Override
+          public IResource[] getAll(final String resource) {
+            final SolrResourceLoader resourceLoader = core.getResourceLoader();
+            final String carrot2ResourcesDir = resourceLoader.getConfigDir()
+                + initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR, CARROT_RESOURCES_PREFIX);
+            try {
+              log.debug("Looking for " + resource + " in "
+                  + carrot2ResourcesDir);
+              final InputStream resourceStream = resourceLoader
+                  .openResource(carrot2ResourcesDir + "/" + resource);
+
+              log.info(resource + " loaded from " + carrot2ResourcesDir);
+              final IResource foundResource = new IResource() {
+                @Override
+                public InputStream open() throws IOException {
+                  return resourceStream;
+                }
+              };
+              return new IResource[] { foundResource };
+            } catch (RuntimeException e) {
+              // No way to distinguish if the resource was found but failed
+              // to load or wasn't found at all, so we simply fall back
+              // to Carrot2 defaults here by returning an empty locations array.
+              log.debug(resource + " not found in " + carrot2ResourcesDir
+                  + ". Using the default " + resource + " from Carrot JAR.");
+              return new IResource[] {};
+            }
+          }
+        },
+
+        // Using the class loader directly because this time we want to omit the prefix
+        new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
+
+    this.controller.init(initAttributes);
    this.idFieldName = core.getSchema().getUniqueKeyField().getName();

    // Make sure the requested Carrot2 clustering algorithm class is available
@@ -148,16 +223,28 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
  protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
    SolrParams solrParams = sreq.getParams();

-    // Names of fields to deliver content for clustering
-    String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
+    HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
+    fields.add(idFieldName);
+    fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
+    return fields;
+  }
+
+  /**
+   * Returns the names of fields that will be delivering the actual
+   * content for clustering. Currently, there are two such fields: document
+   * title and document content.
+   */
+  private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
+    SolrParams solrParams = sreq.getParams();
+
    String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
    String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
    if (StringUtils.isBlank(snippetField)) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
          + " must not be blank.");
    }
-    return Sets.newHashSet(urlField, titleField, snippetField, idFieldName);
-  }
+    return Sets.newHashSet(titleField, snippetField);
  }

  /**
   * Prepares Carrot2 documents for clustering.
@@ -180,7 +267,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    if (produceSummary == true) {
      highlighter = HighlightComponent.getHighlighter(core);
      if (highlighter != null){
-        Map args = new HashMap();
+        Map<String, Object> args = Maps.newHashMap();
        snippetFieldAry = new String[]{snippetField};
        args.put(HighlightParams.FIELDS, snippetFieldAry);
        args.put(HighlightParams.HIGHLIGHT, "true");
@@ -214,11 +301,12 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
      if (produceSummary && docIds != null) {
        docsHolder[0] = docIds.get(sdoc).intValue();
        DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
-        NamedList highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
+        NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
        if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
          //should only be one document with one field
-          NamedList tmp = (NamedList) highlights.getVal(0);
-          String [] highlt = (String[]) tmp.get(snippetField);
+          @SuppressWarnings("unchecked")
+          NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+          String [] highlt = tmp.get(snippetField);
          if (highlt != null && highlt.length == 1) {
            snippet = highlt[0];
          }
@@ -226,27 +314,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
      }
      Document carrotDocument = new Document(getValue(sdoc, titleField),
          snippet, (String)sdoc.getFieldValue(urlField));
-      carrotDocument.setField("solrId", sdoc.getFieldValue(idFieldName));
+      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
      result.add(carrotDocument);
    }

    return result;
  }

-  @Deprecated
-  protected String getValue(org.apache.lucene.document.Document doc,
-      String field) {
-    StringBuilder result = new StringBuilder();
-    String[] vals = doc.getValues(field);
-    for (int i = 0; i < vals.length; i++) {
-      // Join multiple values with a period so that Carrot2 does not pick up
-      // phrases that cross field value boundaries (in most cases it would
-      // create useless phrases).
-      result.append(vals[i]).append(" . ");
-    }
-    return result.toString().trim();
-  }
-
  protected String getValue(SolrDocument sdoc, String field) {
    StringBuilder result = new StringBuilder();
    Collection<Object> vals = sdoc.getFieldValues(field);
@@ -261,9 +335,9 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    return result.toString().trim();
  }

-  private List clustersToNamedList(List<Cluster> carrotClusters,
+  private List<NamedList<Object>> clustersToNamedList(List<Cluster> carrotClusters,
      SolrParams solrParams) {
-    List result = new ArrayList();
+    List<NamedList<Object>> result = Lists.newArrayList();
    clustersToNamedList(carrotClusters, result, solrParams.getBool(
        CarrotParams.OUTPUT_SUB_CLUSTERS, true), solrParams.getInt(
        CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
@@ -271,25 +345,40 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
  }

  private void clustersToNamedList(List<Cluster> outputClusters,
-      List parent, boolean outputSubClusters, int maxLabels) {
+      List<NamedList<Object>> parent, boolean outputSubClusters, int maxLabels) {
    for (Cluster outCluster : outputClusters) {
-      NamedList cluster = new SimpleOrderedMap();
+      NamedList<Object> cluster = new SimpleOrderedMap<Object>();
      parent.add(cluster);

      // Add labels
      List<String> labels = outCluster.getPhrases();
-      if (labels.size() > maxLabels)
+      if (labels.size() > maxLabels) {
        labels = labels.subList(0, maxLabels);
+      }
      cluster.add("labels", labels);

-      List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
-      List docList = new ArrayList();
-      cluster.add("docs", docList);
-      for (Document doc : docs) {
-        docList.add(doc.getField("solrId"));
+      // Add cluster score
+      final Double score = outCluster.getScore();
+      if (score != null) {
+        cluster.add("score", score);
      }

-      if (outputSubClusters) {
-        List subclusters = new ArrayList();
+      // Add other topics marker
+      if (outCluster.isOtherTopics()) {
+        cluster.add("other-topics", outCluster.isOtherTopics());
+      }
+
+      // Add documents
+      List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
+      List<Object> docList = Lists.newArrayList();
+      cluster.add("docs", docList);
+      for (Document doc : docs) {
+        docList.add(doc.getField(SOLR_DOCUMENT_ID));
+      }
+
+      // Add subclusters
+      if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
+        List<NamedList<Object>> subclusters = Lists.newArrayList();
        cluster.add("clusters", subclusters);
        clustersToNamedList(outCluster.getSubclusters(), subclusters,
            outputSubClusters, maxLabels);
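For reference (not part of the commit), a consumer of the structure built by clustersToNamedList() above might unpack it like this hedged sketch, where the clusters argument stands for the value of the response's "clusters" section:

import java.util.List;
import org.apache.solr.common.util.NamedList;

public class ClusterOutputReaderSketch {
  @SuppressWarnings("unchecked")
  static void printClusters(List<NamedList<Object>> clusters) {
    for (NamedList<Object> cluster : clusters) {
      List<String> labels = (List<String>) cluster.get("labels");
      Double score = (Double) cluster.get("score"); // null unless the algorithm emits scores
      List<Object> docIds = (List<Object>) cluster.get("docs");
      // Subclusters are present only when carrot.outputSubClusters is on
      // and the algorithm produced any.
      List<NamedList<Object>> subs = (List<NamedList<Object>>) cluster.get("clusters");
      System.out.println(labels + " score=" + score + " docs=" + docIds
          + " subclusters=" + (subs == null ? 0 : subs.size()));
    }
  }
}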
@@ -35,6 +35,8 @@ public interface CarrotParams {
  String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
  String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragzise";

+  String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+
  public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
      ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
      PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
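These constants map one-to-one onto carrot.* request parameters. A hedged sketch of setting them programmatically (the field name "body" is an assumption for illustration):

import org.apache.solr.common.params.ModifiableSolrParams;

public class CarrotParamsSketch {
  static ModifiableSolrParams clusteringParams() {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(CarrotParams.SNIPPET_FIELD_NAME, "body"); // carrot.snippet
    params.set(CarrotParams.PRODUCE_SUMMARY, true);      // carrot.produceSummary
    params.set(CarrotParams.NUM_DESCRIPTIONS, 5);        // carrot.numDescriptions
    return params;
  }
}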
@@ -17,27 +17,15 @@ package org.apache.solr.handler.clustering.carrot2;
 * limitations under the License.
 */

-import java.io.IOException;
-import java.io.Reader;
-import java.nio.CharBuffer;
-import java.util.HashMap;
-import java.util.regex.Pattern;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizer;
import org.apache.lucene.analysis.ar.ArabicStemmer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
-import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
-import org.carrot2.text.analysis.ITokenizer;
-import org.carrot2.text.linguistic.DefaultLanguageModelFactory;
import org.carrot2.text.linguistic.IStemmer;
-import org.carrot2.text.linguistic.IdentityStemmer;
-import org.carrot2.text.util.MutableCharArray;
-import org.carrot2.util.ExceptionUtils;
+import org.carrot2.text.linguistic.IStemmerFactory;
import org.carrot2.util.ReflectionUtils;
-import org.carrot2.util.attribute.Bindable;
import org.slf4j.Logger;
import org.tartarus.snowball.SnowballProgram;
import org.tartarus.snowball.ext.DanishStemmer;

@@ -57,20 +45,16 @@ import org.tartarus.snowball.ext.SwedishStemmer;
import org.tartarus.snowball.ext.TurkishStemmer;

/**
- * A Solr-specific language model factory for Carrot2. This factory is the only
- * element in Carrot2 that depends on Lucene APIs, so should the APIs need to
- * change, the changes can be made in this class.
+ * An implementation of Carrot2's {@link IStemmerFactory} based on Lucene's
+ * APIs. Should the relevant Lucene APIs need to change, the changes can be made
+ * in this class.
 */
-@Bindable(prefix = "DefaultLanguageModelFactory")
-public class LuceneLanguageModelFactory extends DefaultLanguageModelFactory {
+public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
  final static Logger logger = org.slf4j.LoggerFactory
-      .getLogger(LuceneLanguageModelFactory.class);
+      .getLogger(LuceneCarrot2StemmerFactory.class);

  /**
   * Provide an {@link IStemmer} implementation for a given language.
   */
-  @Override
-  protected IStemmer createStemmer(LanguageCode language) {
+  public IStemmer getStemmer(LanguageCode language) {
    switch (language) {
    case ARABIC:
      return ArabicStemmerFactory.createStemmer();

@@ -86,26 +70,6 @@ public class LuceneLanguageModelFactory extends DefaultLanguageModelFactory {
    }
  }

-  @Override
-  protected ITokenizer createTokenizer(LanguageCode language) {
-    switch (language) {
-    case CHINESE_SIMPLIFIED:
-      return ChineseTokenizerFactory.createTokenizer();
-
-      /*
-       * We use our own analyzer for Arabic. Lucene's version has special
-       * support for Nonspacing-Mark characters (see
-       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
-       * have them included as letters in the parser.
-       */
-    case ARABIC:
-      // Intentional fall-through.
-
-    default:
-      return new ExtendedWhitespaceTokenizer();
-    }
-  }
-
  /**
   * Factory of {@link IStemmer} implementations from the <code>snowball</code>
   * project.

@@ -263,92 +227,15 @@ public class LuceneLanguageModelFactory extends DefaultLanguageModelFactory {
  }

  /**
-   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
-   * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
-   * factory will fall back to the default white space tokenizer.
+   * An implementation of {@link IStemmer} that always returns <code>null</code>
+   * which means no stemming.
   */
-  private static final class ChineseTokenizerFactory {
-    static {
-      try {
-        ReflectionUtils.classForName(
-            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
-        ReflectionUtils.classForName(
-            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
-      } catch (Throwable e) {
-        logger
-            .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
-                + "of Chinese content may be degraded. For best quality clusters, "
-                + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
-      }
-    }
-
-    static ITokenizer createTokenizer() {
-      try {
-        return new ChineseTokenizer();
-      } catch (Throwable e) {
-        return new ExtendedWhitespaceTokenizer();
-      }
-    }
-
-    private final static class ChineseTokenizer implements ITokenizer {
-      private final static Pattern numeric = Pattern
-          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
-
-      private Tokenizer sentenceTokenizer;
-      private TokenStream wordTokenFilter;
-      private CharTermAttribute term = null;
-
-      private final MutableCharArray tempCharSequence;
-      private final Class<?> tokenFilterClass;
-
-      private ChineseTokenizer() throws Exception {
-        this.tempCharSequence = new MutableCharArray(new char[0]);
-
-        // As Smart Chinese is not available during compile time,
-        // we need to resort to reflection.
-        final Class<?> tokenizerClass = ReflectionUtils
-            .classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
-        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
-            Reader.class).newInstance((Reader) null);
-        this.tokenFilterClass = ReflectionUtils
-            .classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
-      }
-
-      public short nextToken() throws IOException {
-        final boolean hasNextToken = wordTokenFilter.incrementToken();
-        if (hasNextToken) {
-          short flags = 0;
-          final char[] image = term.buffer();
-          final int length = term.length();
-          tempCharSequence.reset(image, 0, length);
-          if (length == 1 && image[0] == ',') {
-            // ChineseTokenizer seems to convert all punctuation to ','
-            // characters
-            flags = ITokenizer.TT_PUNCTUATION;
-          } else if (numeric.matcher(tempCharSequence).matches()) {
-            flags = ITokenizer.TT_NUMERIC;
-          } else {
-            flags = ITokenizer.TT_TERM;
-          }
-          return flags;
-        }
-
-        return ITokenizer.TT_EOF;
-      }
-
-      public void setTermBuffer(MutableCharArray array) {
-        array.reset(term.buffer(), 0, term.length());
-      }
-
-      public void reset(Reader input) throws IOException {
-        try {
-          sentenceTokenizer.reset(input);
-          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
-              TokenStream.class).newInstance(sentenceTokenizer);
-        } catch (Exception e) {
-          throw ExceptionUtils.wrapAsRuntimeException(e);
-        }
-      }
-    }
-  }
+  private static class IdentityStemmer implements IStemmer {
+    private final static IdentityStemmer INSTANCE = new IdentityStemmer();
+
+    @Override
+    public CharSequence stem(CharSequence word) {
+      return null;
+    }
+  }
}
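A hedged usage sketch for the factory above; Carrot2 normally calls it internally through the attribute binding set up in CarrotClusteringEngine.init(), so direct use is shown only for illustration:

import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IStemmerFactory;

public class StemmerFactorySketch {
  public static void main(String[] args) {
    IStemmerFactory factory = new LuceneCarrot2StemmerFactory();
    IStemmer stemmer = factory.getStemmer(LanguageCode.ENGLISH);
    // Per the IStemmer contract used above, null means "no stemmed form";
    // the IdentityStemmer fallback always returns null.
    CharSequence stem = stemmer.stem("clustering");
    System.out.println(stem != null ? stem : "unstemmed");
  }
}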
@@ -0,0 +1,156 @@
package org.apache.solr.handler.clustering.carrot2;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.ReflectionUtils;
import org.slf4j.Logger;

/**
 * An implementation of Carrot2's {@link ITokenizerFactory} based on Lucene's
 * Smart Chinese tokenizer. If Smart Chinese tokenizer is not available in
 * classpath at runtime, the default Carrot2's tokenizer is used. Should the
 * Lucene APIs need to change, the changes can be made in this class.
 */
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(LuceneCarrot2TokenizerFactory.class);

  @Override
  public ITokenizer getTokenizer(LanguageCode language) {
    switch (language) {
    case CHINESE_SIMPLIFIED:
      return ChineseTokenizerFactory.createTokenizer();

      /*
       * We use our own analyzer for Arabic. Lucene's version has special
       * support for Nonspacing-Mark characters (see
       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
       * have them included as letters in the parser.
       */
    case ARABIC:
      // Intentional fall-through.

    default:
      return new ExtendedWhitespaceTokenizer();
    }
  }

  /**
   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
   * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
   * factory will fall back to the default white space tokenizer.
   */
  private static final class ChineseTokenizerFactory {
    static {
      try {
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
      } catch (Throwable e) {
        logger
            .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
                + "of Chinese content may be degraded. For best quality clusters, "
                + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
      }
    }

    static ITokenizer createTokenizer() {
      try {
        return new ChineseTokenizer();
      } catch (Throwable e) {
        return new ExtendedWhitespaceTokenizer();
      }
    }

    private final static class ChineseTokenizer implements ITokenizer {
      private final static Pattern numeric = Pattern
          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");

      private Tokenizer sentenceTokenizer;
      private TokenStream wordTokenFilter;
      private CharTermAttribute term = null;

      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;

      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);

        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
            Reader.class).newInstance((Reader) null);
        this.tokenFilterClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
      }

      public short nextToken() throws IOException {
        final boolean hasNextToken = wordTokenFilter.incrementToken();
        if (hasNextToken) {
          short flags = 0;
          final char[] image = term.buffer();
          final int length = term.length();
          tempCharSequence.reset(image, 0, length);
          if (length == 1 && image[0] == ',') {
            // ChineseTokenizer seems to convert all punctuation to ','
            // characters
            flags = ITokenizer.TT_PUNCTUATION;
          } else if (numeric.matcher(tempCharSequence).matches()) {
            flags = ITokenizer.TT_NUMERIC;
          } else {
            flags = ITokenizer.TT_TERM;
          }
          return flags;
        }

        return ITokenizer.TT_EOF;
      }

      public void setTermBuffer(MutableCharArray array) {
        array.reset(term.buffer(), 0, term.length());
      }

      public void reset(Reader input) throws IOException {
        try {
          sentenceTokenizer.reset(input);
          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
              TokenStream.class).newInstance(sentenceTokenizer);
          term = wordTokenFilter.addAttribute(CharTermAttribute.class);
        } catch (Exception e) {
          throw ExceptionUtils.wrapAsRuntimeException(e);
        }
      }
    }
  }
}
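The ITokenizer contract exercised by the class above can also be driven directly. A hedged sketch; whether you get the Smart Chinese adapter or the whitespace fallback depends on the JARs present at runtime:

import java.io.StringReader;

import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;

public class TokenizerFactorySketch {
  public static void main(String[] args) throws Exception {
    ITokenizer tokenizer = new LuceneCarrot2TokenizerFactory()
        .getTokenizer(LanguageCode.CHINESE_SIMPLIFIED);
    tokenizer.reset(new StringReader("Lucene and Solr, price 123.45"));

    MutableCharArray image = new MutableCharArray("");
    short type;
    // nextToken() returns a token-type flag, or TT_EOF at end of input.
    while ((type = tokenizer.nextToken()) != ITokenizer.TT_EOF) {
      tokenizer.setTermBuffer(image); // copy the current token image out
      System.out.println(image + " -> type " + type);
    }
  }
}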
@@ -0,0 +1,141 @@
package org.apache.solr.handler.clustering.carrot2;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Collection;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.solr.analysis.CommonGramsFilterFactory;
import org.apache.solr.analysis.StopFilterFactory;
import org.apache.solr.analysis.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.schema.IndexSchema;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.slf4j.Logger;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

/**
 * An implementation of Carrot2's {@link ILexicalDataFactory} that adds stop
 * words from a field's StopFilter to the default stop words used in Carrot2,
 * for all languages Carrot2 supports. Completely replacing Carrot2 stop words
 * with Solr's wouldn't make much sense because clustering needs more aggressive
 * stop words removal. In other words, if something is a stop word during
 * indexing, then it should also be a stop word during clustering, but not the
 * other way round.
 */
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements
    ILexicalDataFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);

  @Init
  @Input
  @Attribute(key = "solrIndexSchema")
  private IndexSchema schema;

  @Processing
  @Input
  @Attribute(key = "solrFieldNames")
  private Set<String> fieldNames;

  /**
   * A lazily-built cache of stop words per field.
   */
  private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();

  /**
   * Carrot2's default lexical resources to use in addition to Solr's stop
   * words.
   */
  private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();

  /**
   * Obtains stop words for a field from the associated
   * {@link StopFilterFactory}, if any.
   */
  private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
    // No need to synchronize here, Carrot2 ensures that instances
    // of this class are not used by multiple threads at a time.
    if (!solrStopWords.containsKey(fieldName)) {
      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
          .getAnalyzer();
      if (fieldAnalyzer instanceof TokenizerChain) {
        final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
            .getTokenFilterFactories();
        for (TokenFilterFactory factory : filterFactories) {
          if (factory instanceof StopFilterFactory) {
            // StopFilterFactory holds the stop words in a CharArraySet, but
            // the getStopWords() method returns a Set<?>, so we need to cast.
            solrStopWords.put(fieldName,
                (CharArraySet) ((StopFilterFactory) factory).getStopWords());
          }

          if (factory instanceof CommonGramsFilterFactory) {
            solrStopWords.put(fieldName,
                (CharArraySet) ((CommonGramsFilterFactory) factory)
                    .getCommonWords());
          }
        }
      }
    }
    return solrStopWords.get(fieldName);
  }

  @Override
  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
        .getLexicalData(languageCode);

    return new ILexicalData() {
      @Override
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }

      @Override
      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first
        for (String fieldName : fieldNames) {
          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
            if (stopWords.contains(word)) {
              return true;
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }
}
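For illustration only: once Carrot2's attribute mechanism has injected the schema and field-name attributes declared above (there are no direct setters), the merged stop-word view behaves like this hedged sketch:

import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.util.MutableCharArray;

public class LexicalDataSketch {
  // "factory" is assumed to be an instance whose solrIndexSchema and
  // solrFieldNames attributes Carrot2 has already bound.
  static boolean isStop(SolrStopwordsCarrot2LexicalDataFactory factory, String word) {
    ILexicalData lexicalData = factory.getLexicalData(LanguageCode.ENGLISH);
    // True if the word appears in a clustered field's StopFilter/CommonGrams
    // set, or in Carrot2's own default stop words.
    return lexicalData.isCommonWord(new MutableCharArray(word));
  }
}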
@@ -17,6 +17,11 @@ package org.apache.solr.handler.clustering.carrot2;
 * limitations under the License.
 */

+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
import org.apache.lucene.index.Term;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;

@@ -37,15 +42,11 @@ import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;

-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import com.google.common.collect.ImmutableList;

/**
 *
 */
-@SuppressWarnings("unchecked")
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
  @Test
  public void testCarrotLingo() throws Exception {
@@ -74,7 +75,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {

  @Test
  public void testWithoutSubclusters() throws Exception {
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs),
        1, 1, 0);
  }
@@ -82,7 +83,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
  public void testWithSubclusters() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), 1, 1, 2);
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs), 1, 1, 2);
  }

  @Test
@@ -90,19 +91,107 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
    params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
        params), 1, 3, 0);
  }

+  @Test
+  public void testClusterScores() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      final Double score = getScore(cluster);
+      assertNotNull(score);
+      assertEquals(0.25 * i++, score, 0);
+    }
+  }
+
+  @Test
+  public void testOtherTopics() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
+    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "otherTopicsModulo"), 2);
+    List<NamedList<Object>> clusters = checkEngine(getClusteringEngine("mock"),
+        AbstractClusteringTestCase.numberOfDocs, params);
+    int i = 1;
+    for (NamedList<Object> cluster : clusters) {
+      assertEquals(i++ % 2 == 0 ? true : null, isOtherTopics(cluster));
+    }
+  }
+
  @Test
  public void testCarrotAttributePassing() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
    params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
-    checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
+    checkClusters(checkEngine(getClusteringEngine("mock"), AbstractClusteringTestCase.numberOfDocs,
        params), 1, 3, 0);
  }

+  @Test
+  public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
+    checkLexicalResourcesFromSolrConfig("lexical-resource-check",
+        "online,customsolrstopword,customsolrstoplabel");
+  }
+
+  @Test
+  public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
+    checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
+        "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
+  }
+
+  private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
+      throws IOException {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+        wordsToCheck);
+
+    // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
+    // stoplabels.en, so we're expecting only one cluster with label "online".
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine(engineName), 1, params);
+    assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+  }
+
+  @Test
+  public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+        "online,solrownstopword");
+
+    // "solrownstopword" is in stopwords.txt, so we're expecting
+    // only one cluster with label "online".
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine("lexical-resource-check"), 1, params);
+    assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
+  }
+
+  @Test
+  public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    // Force string fields to be used for clustering. Does not make sense
+    // in a real world, but does the job in the test.
+    params.set(CarrotParams.TITLE_FIELD_NAME, "url");
+    params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
+    params.set("merge-resources", false);
+    params.set(AttributeUtils.getKey(
+        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
+        "online,solrownstopword");
+
+    final List<NamedList<Object>> clusters = checkEngine(
+        getClusteringEngine("lexical-resource-check"), 2, params);
+    assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
+    assertEquals(ImmutableList.of("solrownstopword"),
+        getLabels(clusters.get(1)));
+  }
+
  private CarrotClusteringEngine getClusteringEngine(String engineName) {
    ClusteringComponent comp = (ClusteringComponent) h.getCore()
        .getSearchComponent("clustering");
@@ -114,18 +203,18 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    return engine;
  }

-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
      int expectedNumClusters) throws IOException {
    return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), new ModifiableSolrParams());
  }

-  private List checkEngine(CarrotClusteringEngine engine,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine,
      int expectedNumClusters, SolrParams clusteringParams) throws IOException {
    return checkEngine(engine, numberOfDocs, expectedNumClusters, new MatchAllDocsQuery(), clusteringParams);
  }


-  private List checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
+  private List<NamedList<Object>> checkEngine(CarrotClusteringEngine engine, int expectedNumDocs,
      int expectedNumClusters, Query query, SolrParams clusteringParams) throws IOException {
    // Get all documents to cluster
    RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
@@ -145,7 +234,9 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
    Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
    SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
-    List results = (List)engine.cluster(query, solrDocList, docIds, req);
+
+    @SuppressWarnings("unchecked")
+    List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
    req.close();
    assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
    checkClusters(results, false);
@@ -155,51 +246,74 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    }
  }

-  private void checkClusters(List results, int expectedDocCount,
+  private void checkClusters(List<NamedList<Object>> results, int expectedDocCount,
      int expectedLabelCount, int expectedSubclusterCount) {
    for (int i = 0; i < results.size(); i++) {
-      NamedList cluster = (NamedList) results.get(i);
+      NamedList<Object> cluster = results.get(i);
      checkCluster(cluster, expectedDocCount, expectedLabelCount,
          expectedSubclusterCount);
    }
  }

-  private void checkClusters(List results, boolean hasSubclusters) {
+  private void checkClusters(List<NamedList<Object>> results, boolean hasSubclusters) {
    for (int i = 0; i < results.size(); i++) {
-      checkCluster((NamedList) results.get(i), hasSubclusters);
+      checkCluster(results.get(i), hasSubclusters);
    }
  }

-  private void checkCluster(NamedList cluster, boolean hasSubclusters) {
-    List docs = (List) cluster.get("docs");
+  private void checkCluster(NamedList<Object> cluster, boolean hasSubclusters) {
+    List<Object> docs = getDocs(cluster);
    assertNotNull("docs is null and it shouldn't be", docs);
    for (int j = 0; j < docs.size(); j++) {
      String id = (String) docs.get(j);
      assertNotNull("id is null and it shouldn't be", id);
    }

-    List labels = (List) cluster.get("labels");
+    List<String> labels = getLabels(cluster);
    assertNotNull("labels is null but it shouldn't be", labels);

    if (hasSubclusters) {
-      List subclusters = (List) cluster.get("clusters");
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
      assertNotNull("subclusters is null but it shouldn't be", subclusters);
    }
  }

-  private void checkCluster(NamedList cluster, int expectedDocCount,
+  private void checkCluster(NamedList<Object> cluster, int expectedDocCount,
      int expectedLabelCount, int expectedSubclusterCount) {
    checkCluster(cluster, expectedSubclusterCount > 0);
    assertEquals("number of docs in cluster", expectedDocCount,
-        ((List) cluster.get("docs")).size());
+        getDocs(cluster).size());
    assertEquals("number of labels in cluster", expectedLabelCount,
-        ((List) cluster.get("labels")).size());
+        getLabels(cluster).size());

    if (expectedSubclusterCount > 0) {
-      List subclusters = (List) cluster.get("clusters");
-      assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
+      List<NamedList<Object>> subclusters = getSubclusters(cluster);
+      assertEquals("number of subclusters in cluster",
+          expectedSubclusterCount, subclusters.size());
    }
  }

+  @SuppressWarnings("unchecked")
+  private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
+    return (List<NamedList<Object>>) cluster.get("clusters");
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<String> getLabels(NamedList<Object> cluster) {
+    return (List<String>) cluster.get("labels");
+  }
+
+  private Double getScore(NamedList<Object> cluster) {
+    return (Double) cluster.get("score");
+  }
+
+  private Boolean isOtherTopics(NamedList<Object> cluster) {
+    return (Boolean) cluster.get("other-topics");
+  }
+
+  @SuppressWarnings("unchecked")
+  private List<Object> getDocs(NamedList<Object> cluster) {
+    return (List<Object>) cluster.get("docs");
+  }
}
@@ -0,0 +1,82 @@
package org.apache.solr.handler.clustering.carrot2;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.List;

import org.carrot2.core.Cluster;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;

import com.google.common.collect.Lists;

/**
 * A mock implementation of Carrot2 clustering algorithm for testing whether the
 * customized lexical resource lookup works correctly. This algorithm ignores
 * the input documents and instead for each word from {@link #wordsToCheck}, it
 * outputs a cluster labeled with the word only if the word is neither a stop
 * word nor a stop label.
 */
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
    ProcessingComponentBase implements IClusteringAlgorithm {

  @Output
  @Processing
  @Attribute(key = AttributeNames.CLUSTERS)
  private List<Cluster> clusters;

  @Input
  @Processing
  @Attribute
  private String wordsToCheck;

  private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();

  @Override
  public void process() throws ProcessingException {
    clusters = Lists.newArrayList();
    if (wordsToCheck == null) {
      return;
    }

    // Test with Maltese so that the English clustering performed in other tests
    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);

    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }
}
@@ -49,6 +49,11 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
  @IntRange(min = 1, max = 5)
  private int labels = 1;

+  @Input
+  @Processing
+  @Attribute
+  private int otherTopicsModulo = 0;
+
  @Override
  public void process() throws ProcessingException {
    clusters = Lists.newArrayList();

@@ -59,21 +64,26 @@ public class MockClusteringAlgorithm extends ProcessingComponentBase implements
    int documentIndex = 1;
    for (Document document : documents) {
      StringBuilder label = new StringBuilder("Cluster " + documentIndex);
-      Cluster cluster = createCluster(label.toString(), document);
+      Cluster cluster = createCluster(label.toString(), documentIndex, document);
      clusters.add(cluster);
      for (int i = 1; i <= depth; i++) {
        label.append(".");
        label.append(i);
-        Cluster newCluster = createCluster(label.toString(), document);
-        cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
+        Cluster newCluster = createCluster(label.toString(), documentIndex, document);
+        cluster.addSubclusters(createCluster(label.toString(), documentIndex, document), newCluster);
        cluster = newCluster;
      }
      documentIndex++;
    }
  }

-  private Cluster createCluster(String labelBase, Document... documents) {
+  private Cluster createCluster(String labelBase, int documentIndex, Document... documents) {
    Cluster cluster = new Cluster();
+    cluster.setScore(documentIndex * 0.25);
+    if (otherTopicsModulo != 0 && documentIndex % otherTopicsModulo == 0)
+    {
+      cluster.setOtherTopics(true);
+    }
    for (int i = 0; i < labels; i++) {
      cluster.addPhrases(labelBase + "#" + (i + 1));
    }
@@ -0,0 +1 @@
customsolrstoplabel

@@ -0,0 +1 @@
customsolrstopword

@@ -0,0 +1 @@
customsolrstoplabelcustomdir

@@ -0,0 +1 @@
customsolrstopwordcustomdir
@@ -396,6 +396,15 @@
      <str name="name">mock</str>
      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.MockClusteringAlgorithm</str>
    </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+    </lst>
+    <lst name="engine">
+      <str name="name">lexical-resource-check-custom-resource-dir</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
+      <str name="carrot.lexicalResourcesDir">clustering/custom</str>
+    </lst>
  </searchComponent>

  <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">
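With several engines registered as above, a request can select one by name. A hedged sketch; "clustering.engine" is the ClusteringComponent's engine-selection parameter as this editor recalls it (verify against ClusteringParams):

import org.apache.solr.common.params.ModifiableSolrParams;

public class EngineSelectionSketch {
  static ModifiableSolrParams params() {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("clustering", true);
    // Pick the engine registered as "lexical-resource-check" above.
    params.set("clustering.engine", "lexical-resource-check");
    return params;
  }
}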
@@ -55,4 +55,5 @@ to
was
will
with
+solrownstopword
@@ -1198,17 +1198,20 @@
    <lst name="engine">
      <!-- The name, only one can be named "default" -->
      <str name="name">default</str>

      <!-- Class name of Carrot2 clustering algorithm.

           Currently available algorithms are:

           * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
           * org.carrot2.clustering.stc.STCClusteringAlgorithm
+           * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm

+           See http://project.carrot2.org/algorithms.html for the
+           algorithm's characteristics.
        -->
      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>

      <!-- Overriding values for Carrot2 default algorithm attributes.

           For a description of all available attributes, see:

@@ -1220,6 +1223,19 @@
        -->
      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>

+      <!-- Location of Carrot2 lexical resources.
+
+           A directory from which to load Carrot2-specific stop words
+           and stop labels. Absolute or relative to Solr config directory.
+           If a specific resource (e.g. stopwords.en) is present in the
+           specified dir, it will completely override the corresponding
+           default one that ships with Carrot2.
+
+           For an overview of Carrot2 lexical resources, see:
+           http://download.carrot2.org/head/manual/#chapter.lexical-resources
+        -->
+      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
+
      <!-- The language to assume for the documents.

           For a list of allowed values, see: