SOLR-5219: follow-up code cleanups.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1522472 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2013-09-12 07:49:23 +00:00
parent 247142e18c
commit 047a650004
12 changed files with 235 additions and 214 deletions

View File

@ -1,4 +1,5 @@
package org.apache.solr.handler.clustering;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -18,7 +19,6 @@ package org.apache.solr.handler.clustering;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
@ -38,6 +38,7 @@ import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -46,13 +47,13 @@ import com.google.common.collect.Maps;
/**
* Provide a plugin for performing cluster analysis. This can either be applied to
* Provides a plugin for performing cluster analysis. This can either be applied to
* search results (e.g., via <a href="http://project.carrot2.org">Carrot<sup>2</sup></a>) or for
* clustering documents (e.g., via <a href="http://mahout.apache.org/">Mahout</a>).
* <p>
* This engine is experimental. Output from this engine is subject to change in future releases.</p>
* <p>
* See Solr example for configuration examples.</p>
*
* @lucene.experimental
*/
public class ClusteringComponent extends SearchComponent implements SolrCoreAware {
private transient static Logger log = LoggerFactory.getLogger(ClusteringComponent.class);
@ -78,6 +79,13 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
*/
private final Map<String, SearchClusteringEngine> searchClusteringEnginesView = Collections.unmodifiableMap(searchClusteringEngines);
/**
* Initialization parameters temporarily saved here, the component
* is initialized in {@link #inform(SolrCore)} because we need to know
* the core's {@link SolrResourceLoader}.
*
* @see #init(NamedList)
*/
private NamedList<Object> initParams;
@Override
@ -150,8 +158,9 @@ public class ClusteringComponent extends SearchComponent implements SolrCoreAwar
SearchClusteringEngine engine = getSearchClusteringEngine(rb);
if (engine != null) {
DocListAndSet results = rb.getResults();
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(results.docList.size());
SolrDocumentList solrDocList = engine.getSolrDocumentList(results.docList, rb.req, docIds);
Map<SolrDocument,Integer> docIds = Maps.newHashMapWithExpectedSize(results.docList.size());
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList(
results.docList, rb.req.getSearcher(), engine.getFieldsToLoad(rb.req), docIds);
Object clusters = engine.cluster(rb.getQuery(), solrDocList, docIds, rb.req);
rb.rsp.add("clusters", clusters);
} else {

View File

@ -18,16 +18,16 @@ package org.apache.solr.handler.clustering;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
/**
*
*
**/
* A base class for {@link SearchClusteringEngine} and {@link DocumentClusteringEngine}.
* @lucene.experimental
*/
public class ClusteringEngine {
private String name;
public static final String ENGINE_NAME = "name";
public static final String DEFAULT_ENGINE_NAME = "default";
private String name;
public String init(NamedList<?> config, SolrCore core) {
name = (String) config.get(ENGINE_NAME);
return name;

View File

@ -16,11 +16,9 @@ package org.apache.solr.handler.clustering;
* limitations under the License.
*/
/**
*
*
**/
* @lucene.experimental
*/
public interface ClusteringParams {
public static final String CLUSTERING_PREFIX = "clustering.";
@ -30,8 +28,9 @@ public interface ClusteringParams {
public static final String USE_SEARCH_RESULTS = CLUSTERING_PREFIX + "results";
public static final String USE_COLLECTION = CLUSTERING_PREFIX + "collection";
/**
* When document clustering, cluster on the Doc Set
* When clustering full documents, cluster on the Doc Set.
*/
public static final String USE_DOC_SET = CLUSTERING_PREFIX + "docs.useDocSet";
}

View File

@ -20,11 +20,9 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.search.DocSet;
/**
* Experimental. Subject to change before the next release.
*
**/
* @lucene.experimental
*/
public abstract class DocumentClusteringEngine extends ClusteringEngine {
/**

View File

@ -1,4 +1,5 @@
package org.apache.solr.handler.clustering;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -16,7 +17,6 @@ package org.apache.solr.handler.clustering;
* limitations under the License.
*/
import java.io.IOException;
import java.util.Map;
import java.util.Set;
@ -24,20 +24,19 @@ import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.DocList;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.response.SolrQueryResponse;
/**
* Base class for clustering engines performing cluster analysis on search
* results.
*
*
**/
* @lucene.experimental
*/
public abstract class SearchClusteringEngine extends ClusteringEngine {
@Deprecated
public abstract Object cluster(Query query, DocList docList, SolrQueryRequest sreq);
// TODO: need DocList, too?
/**
* Do the clustering, return a clusters structure to be appended to
* {@link SolrQueryResponse}.
*/
public abstract Object cluster(Query query, SolrDocumentList solrDocumentList,
Map<SolrDocument,Integer> docIds, SolrQueryRequest sreq);
@ -45,15 +44,10 @@ public abstract class SearchClusteringEngine extends ClusteringEngine {
* Returns the set of field names to load.
* Concrete classes can override this method if needed.
* Default implementation returns null, that is, all stored fields are loaded.
* @return set of field names to load
*
* @return The set of field names to load.
*/
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
return null;
}
public SolrDocumentList getSolrDocumentList(DocList docList, SolrQueryRequest sreq,
Map<SolrDocument, Integer> docIds) throws IOException{
return SolrPluginUtils.docListToSolrDocumentList(
docList, sreq.getSearcher(), getFieldsToLoad(sreq), docIds);
}
}

View File

@ -17,10 +17,7 @@ package org.apache.solr.handler.clustering.carrot2;
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@ -32,7 +29,6 @@ import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.ObjectUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.search.Query;
@ -45,7 +41,6 @@ import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.handler.clustering.ClusteringEngine;
import org.apache.solr.handler.clustering.SearchClusteringEngine;
import org.apache.solr.handler.component.HighlightComponent;
@ -56,7 +51,6 @@ import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.core.Cluster;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
@ -71,7 +65,6 @@ import org.carrot2.util.attribute.AttributeValueSet;
import org.carrot2.util.attribute.AttributeValueSets;
import org.carrot2.util.resource.ClassLoaderLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.IResourceLocator;
import org.carrot2.util.resource.ResourceLookup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -84,19 +77,17 @@ import com.google.common.collect.Sets;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
* <p/>
* Output from this class is subject to change.
*
* @see "http://project.carrot2.org"
* @lucene.experimental
*/
public class CarrotClusteringEngine extends SearchClusteringEngine {
private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
transient static Logger log = LoggerFactory.getLogger(CarrotClusteringEngine.class);
/**
* The subdirectory in Solr config dir to read customized Carrot2 resources from.
*/
private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
/**
* Name of Carrot2 document's field containing Solr document's identifier.
@ -114,167 +105,15 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
* Carrot2 controller that manages instances of clustering algorithms
*/
private Controller controller = ControllerFactory.createPooling();
/**
* {@link IClusteringAlgorithm} class used for actual clustering.
*/
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
/** Solr core we're bound to. */
private SolrCore core;
private static class SolrResourceLocator implements IResourceLocator {
private final SolrResourceLoader resourceLoader;
private final String carrot2ResourcesDir;
public SolrResourceLocator(SolrCore core, SolrParams initParams) {
resourceLoader = core.getResourceLoader();
@SuppressWarnings("deprecation")
String lexicalResourcesDir = initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR);
String resourcesDir = initParams.get(CarrotParams.RESOURCES_DIR);
carrot2ResourcesDir = firstNonNull(resourcesDir, lexicalResourcesDir, CARROT_RESOURCES_PREFIX);
}
@SuppressWarnings("unchecked")
public static <T> T firstNonNull(T... args) {
for (T t : args) {
if (t != null) return t;
}
throw new NullPointerException("At least one element has to be non-null.");
}
@Override
public IResource[] getAll(final String resource) {
final String resourceName = carrot2ResourcesDir + "/" + resource;
log.debug("Looking for Solr resource: " + resourceName);
InputStream resourceStream = null;
final byte [] asBytes;
try {
resourceStream = resourceLoader.openResource(resourceName);
asBytes = IOUtils.toByteArray(resourceStream);
} catch (IOException e) {
log.debug("Resource not found in Solr's config: " + resourceName
+ ". Using the default " + resource + " from Carrot JAR.");
return new IResource[] {};
} finally {
if (resourceStream != null) {
try {
resourceStream.close();
} catch (IOException e) {
// ignore.
}
}
}
log.info("Loaded Solr resource: " + resourceName);
final IResource foundResource = new IResource() {
@Override
public InputStream open() {
return new ByteArrayInputStream(asBytes);
}
@Override
public int hashCode() {
// In case multiple resources are found they will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.hashCode();
}
@Override
public boolean equals(Object obj) {
// In case multiple resources are found they will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.equals(obj);
}
@Override
public String toString() {
return "Solr config resource: " + resourceName;
}
};
return new IResource[] { foundResource };
}
@Override
public int hashCode() {
// In case multiple locations are used locators will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.hashCode();
}
@Override
public boolean equals(Object obj) {
// In case multiple locations are used locators will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.equals(obj);
}
@Override
public String toString() {
String configDir = "";
try {
configDir = "configDir=" + new File(resourceLoader.getConfigDir()).getAbsolutePath() + ", ";
} catch (Exception ignored) {
// If we get the exception, the resource loader implementation
// probably does not support getConfigDir(). Not a big problem.
}
return "SolrResourceLocator, " + configDir
+ "Carrot2 relative lexicalResourcesDir=" + carrot2ResourcesDir;
}
}
@Override
@Deprecated
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
SolrIndexSearcher searcher = sreq.getSearcher();
SolrDocumentList solrDocList;
try {
Map<SolrDocument,Integer> docIds = new HashMap<SolrDocument, Integer>(docList.size());
solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, getFieldsToLoad(sreq), docIds );
return cluster(query, solrDocList, docIds, sreq);
} catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, e);
}
}
@Override
public Object cluster(Query query, SolrDocumentList solrDocList,
Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
try {
// Prepare attributes for Carrot2 clustering call
Map<String, Object> attributes = new HashMap<String, Object>();
List<Document> documents = getDocuments(solrDocList, docIds, query, sreq);
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
// Pass the fields on which clustering runs to the
// SolrStopwordsCarrot2LexicalDataFactory
attributes.put("solrFieldNames", getFieldsForClustering(sreq));
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
// Perform clustering and convert to named list
// Carrot2 uses current thread's context class loader to get
// certain classes (e.g. custom tokenizer/stemmer) at runtime.
// To make sure classes from contrib JARs are available,
// we swap the context class loader for the time of clustering.
Thread ct = Thread.currentThread();
ClassLoader prev = ct.getContextClassLoader();
try {
ct.setContextClassLoader(core.getResourceLoader().getClassLoader());
return clustersToNamedList(controller.process(attributes,
clusteringAlgorithmClass).getClusters(), sreq.getParams());
} finally {
ct.setContextClassLoader(prev);
}
} catch (Exception e) {
log.error("Carrot2 clustering failed", e);
throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
}
}
@Override
@SuppressWarnings("rawtypes")
public String init(NamedList config, final SolrCore core) {
@ -377,6 +216,43 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
return result;
}
@Override
public Object cluster(Query query, SolrDocumentList solrDocList,
Map<SolrDocument, Integer> docIds, SolrQueryRequest sreq) {
try {
// Prepare attributes for Carrot2 clustering call
Map<String, Object> attributes = new HashMap<String, Object>();
List<Document> documents = getDocuments(solrDocList, docIds, query, sreq);
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
// Pass the fields on which clustering runs.
attributes.put("solrFieldNames", getFieldsForClustering(sreq));
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
// Perform clustering and convert to an output structure of clusters.
//
// Carrot2 uses current thread's context class loader to get
// certain classes (e.g. custom tokenizer/stemmer) at runtime.
// To make sure classes from contrib JARs are available,
// we swap the context class loader for the time of clustering.
Thread ct = Thread.currentThread();
ClassLoader prev = ct.getContextClassLoader();
try {
ct.setContextClassLoader(core.getResourceLoader().getClassLoader());
return clustersToNamedList(controller.process(attributes,
clusteringAlgorithmClass).getClusters(), sreq.getParams());
} finally {
ct.setContextClassLoader(prev);
}
} catch (Exception e) {
log.error("Carrot2 clustering failed", e);
throw new SolrException(ErrorCode.SERVER_ERROR, "Carrot2 clustering failed", e);
}
}
@Override
protected Set<String> getFieldsToLoad(SolrQueryRequest sreq){
SolrParams solrParams = sreq.getParams();
@ -434,8 +310,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Parse language code map string into a map
Map<String, String> languageCodeMap = Maps.newHashMap();
if (StringUtils.isNotBlank(languageField)) {
for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "")
.split("[, ]")) {
for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "").split("[, ]")) {
final String[] split = pair.split(":");
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
languageCodeMap.put(split[0], split[1]);

View File

@ -24,6 +24,7 @@ import com.google.common.collect.ImmutableSet;
/**
* Carrot2 parameter mapping (recognized and mapped if passed via Solr configuration).
* @lucene.experimental
*/
public final class CarrotParams {

View File

@ -48,6 +48,8 @@ import org.tartarus.snowball.ext.TurkishStemmer;
* An implementation of Carrot2's {@link IStemmerFactory} based on Lucene's
* APIs. Should the relevant Lucene APIs need to change, the changes can be made
* in this class.
*
* @lucene.experimental
*/
public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
final static Logger logger = org.slf4j.LoggerFactory

View File

@ -38,6 +38,8 @@ import org.slf4j.Logger;
* Smart Chinese tokenizer. If Smart Chinese tokenizer is not available in
* classpath at runtime, the default Carrot2's tokenizer is used. Should the
* Lucene APIs need to change, the changes can be made in this class.
*
* @lucene.experimental
*/
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
final static Logger logger = org.slf4j.LoggerFactory

View File

@ -0,0 +1,140 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.clustering.carrot2;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.IResourceLocator;
/**
* A {@link IResourceLocator} that delegates resource searches to {@link SolrCore}.
*
* @lucene.experimental
*/
class SolrResourceLocator implements IResourceLocator {
private final SolrResourceLoader resourceLoader;
private final String carrot2ResourcesDir;
public SolrResourceLocator(SolrCore core, SolrParams initParams) {
resourceLoader = core.getResourceLoader();
@SuppressWarnings("deprecation")
String lexicalResourcesDir = initParams.get(CarrotParams.LEXICAL_RESOURCES_DIR);
String resourcesDir = initParams.get(CarrotParams.RESOURCES_DIR);
carrot2ResourcesDir = firstNonNull(resourcesDir, lexicalResourcesDir, CarrotClusteringEngine.CARROT_RESOURCES_PREFIX);
}
@SuppressWarnings("unchecked")
public static <T> T firstNonNull(T... args) {
for (T t : args) {
if (t != null) return t;
}
throw new NullPointerException("At least one element has to be non-null.");
}
@Override
public IResource[] getAll(final String resource) {
final String resourceName = carrot2ResourcesDir + "/" + resource;
CarrotClusteringEngine.log.debug("Looking for Solr resource: " + resourceName);
InputStream resourceStream = null;
final byte [] asBytes;
try {
resourceStream = resourceLoader.openResource(resourceName);
asBytes = IOUtils.toByteArray(resourceStream);
} catch (IOException e) {
CarrotClusteringEngine.log.debug("Resource not found in Solr's config: " + resourceName
+ ". Using the default " + resource + " from Carrot JAR.");
return new IResource[] {};
} finally {
if (resourceStream != null) {
try {
resourceStream.close();
} catch (IOException e) {
// ignore.
}
}
}
CarrotClusteringEngine.log.info("Loaded Solr resource: " + resourceName);
final IResource foundResource = new IResource() {
@Override
public InputStream open() {
return new ByteArrayInputStream(asBytes);
}
@Override
public int hashCode() {
// In case multiple resources are found they will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.hashCode();
}
@Override
public boolean equals(Object obj) {
// In case multiple resources are found they will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.equals(obj);
}
@Override
public String toString() {
return "Solr config resource: " + resourceName;
}
};
return new IResource[] { foundResource };
}
@Override
public int hashCode() {
// In case multiple locations are used locators will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.hashCode();
}
@Override
public boolean equals(Object obj) {
// In case multiple locations are used locators will be deduped, but we don't use it in Solr,
// so simply rely on instance equivalence.
return super.equals(obj);
}
@Override
public String toString() {
String configDir = "";
try {
configDir = "configDir=" + new File(resourceLoader.getConfigDir()).getAbsolutePath() + ", ";
} catch (Exception ignored) {
// If we get the exception, the resource loader implementation
// probably does not support getConfigDir(). Not a big problem.
}
return "SolrResourceLocator, " + configDir
+ "Carrot2 relative lexicalResourcesDir=" + carrot2ResourcesDir;
}
}

View File

@ -50,10 +50,11 @@ import com.google.common.collect.Multimap;
* stop words removal. In other words, if something is a stop word during
* indexing, then it should also be a stop word during clustering, but not the
* other way round.
*
* @lucene.experimental
*/
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements
ILexicalDataFactory {
public class SolrStopwordsCarrot2LexicalDataFactory implements ILexicalDataFactory {
final static Logger logger = org.slf4j.LoggerFactory
.getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);

View File

@ -16,6 +16,6 @@
-->
<html>
<body>
Apache Solr Search Server: Clustering contrib
Apache Solr Search Server: text clustering contrib
</body>
</html>