reformat to remove tabs

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@790599 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-07-02 14:08:37 +00:00
parent dabddad667
commit 43eb481c2f
7 changed files with 466 additions and 467 deletions

View File

@@ -45,204 +45,206 @@ import com.google.common.collect.Sets;
/** /**
* Search results clustering engine based on Carrot2 clustering algorithms. * Search results clustering engine based on Carrot2 clustering algorithms.
* * <p/>
* Output from this class is subject to change. * Output from this class is subject to change.
* *
* @link http://project.carrot2.org * @link http://project.carrot2.org
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public class CarrotClusteringEngine extends SearchClusteringEngine { public class CarrotClusteringEngine extends SearchClusteringEngine {
private transient static Logger log = LoggerFactory private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class); .getLogger(CarrotClusteringEngine.class);
/** Carrot2 controller that manages instances of clustering algorithms */ /**
private CachingController controller = new CachingController(); * Carrot2 controller that manages instances of clustering algorithms
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass; */
private CachingController controller = new CachingController();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
private String idFieldName; private String idFieldName;
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) { public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
try { try {
// Prepare attributes for Carrot2 clustering call // Prepare attributes for Carrot2 clustering call
Map<String, Object> attributes = new HashMap<String, Object>(); Map<String, Object> attributes = new HashMap<String, Object>();
List<Document> documents = getDocuments(docList, query, sreq); List<Document> documents = getDocuments(docList, query, sreq);
attributes.put(AttributeNames.DOCUMENTS, documents); attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString()); attributes.put(AttributeNames.QUERY, query.toString());
// Pass extra overriding attributes from the request, if any // Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes); extractCarrotAttributes(sreq.getParams(), attributes);
// Perform clustering and convert to named list // Perform clustering and convert to named list
return clustersToNamedList(controller.process(attributes, return clustersToNamedList(controller.process(attributes,
clusteringAlgorithmClass).getClusters(), sreq.getParams()); clusteringAlgorithmClass).getClusters(), sreq.getParams());
} catch (Exception e) { } catch (Exception e) {
log.error("Carrot2 clustering failed", e); log.error("Carrot2 clustering failed", e);
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
@Override @Override
public String init(NamedList config, final SolrCore core) { public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core); String result = super.init(config, core);
SolrParams initParams = SolrParams.toSolrParams(config); SolrParams initParams = SolrParams.toSolrParams(config);
// Initialize Carrot2 controller. Pass initialization attributes, if any. // Initialize Carrot2 controller. Pass initialization attributes, if any.
HashMap<String, Object> initAttributes = new HashMap<String, Object>(); HashMap<String, Object> initAttributes = new HashMap<String, Object>();
extractCarrotAttributes(initParams, initAttributes); extractCarrotAttributes(initParams, initAttributes);
this.controller.init(initAttributes); this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName(); this.idFieldName = core.getSchema().getUniqueKeyField().getName();
// Make sure the requested Carrot2 clustering algorithm class is available // Make sure the requested Carrot2 clustering algorithm class is available
String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM); String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
try { try {
Class<?> algorithmClass = Thread.currentThread().getContextClassLoader() Class<?> algorithmClass = Thread.currentThread().getContextClassLoader()
.loadClass(carrotAlgorithmClassName); .loadClass(carrotAlgorithmClassName);
if (!IClusteringAlgorithm.class.isAssignableFrom(algorithmClass)) { if (!IClusteringAlgorithm.class.isAssignableFrom(algorithmClass)) {
throw new IllegalArgumentException("Class provided as " throw new IllegalArgumentException("Class provided as "
+ CarrotParams.ALGORITHM + " must implement " + CarrotParams.ALGORITHM + " must implement "
+ IClusteringAlgorithm.class.getName()); + IClusteringAlgorithm.class.getName());
} }
this.clusteringAlgorithmClass = (Class<? extends IClusteringAlgorithm>) algorithmClass; this.clusteringAlgorithmClass = (Class<? extends IClusteringAlgorithm>) algorithmClass;
} catch (ClassNotFoundException e) { } catch (ClassNotFoundException e) {
throw new RuntimeException( throw new RuntimeException(
"Failed to load Carrot clustering algorithm class", e); "Failed to load Carrot clustering algorithm class", e);
} }
return result; return result;
} }
/** /**
* Prepares Carrot2 documents for clustering. * Prepares Carrot2 documents for clustering.
*/ */
private List<Document> getDocuments(DocList docList, private List<Document> getDocuments(DocList docList,
Query query, final SolrQueryRequest sreq) throws IOException { Query query, final SolrQueryRequest sreq) throws IOException {
SolrHighlighter highligher = null; SolrHighlighter highligher = null;
SolrParams solrParams = sreq.getParams(); SolrParams solrParams = sreq.getParams();
SolrCore core = sreq.getCore(); SolrCore core = sreq.getCore();
// Names of fields to deliver content for clustering // Names of fields to deliver content for clustering
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url"); String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title"); String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME,
titleField); titleField);
if (StringUtils.isBlank(snippetField)) { if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank."); + " must not be blank.");
} }
Set<String> fieldsToLoad = Sets.newHashSet(urlField, titleField, Set<String> fieldsToLoad = Sets.newHashSet(urlField, titleField,
snippetField, idFieldName); snippetField, idFieldName);
// Get the documents // Get the documents
DocIterator docsIter = docList.iterator(); DocIterator docsIter = docList.iterator();
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY,
false); false);
SolrQueryRequest req = null; SolrQueryRequest req = null;
String[] snippetFieldAry = null; String[] snippetFieldAry = null;
if (produceSummary == true) { if (produceSummary == true) {
highligher = core.getHighlighter(); highligher = core.getHighlighter();
Map args = new HashMap(); Map args = new HashMap();
snippetFieldAry = new String[] { snippetField }; snippetFieldAry = new String[]{snippetField};
args.put(HighlightParams.FIELDS, snippetFieldAry); args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true"); args.put(HighlightParams.HIGHLIGHT, "true");
req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) { req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
@Override @Override
public SolrIndexSearcher getSearcher() { public SolrIndexSearcher getSearcher() {
return sreq.getSearcher(); return sreq.getSearcher();
} }
}; };
} }
SolrIndexSearcher searcher = sreq.getSearcher(); SolrIndexSearcher searcher = sreq.getSearcher();
List<Document> result = new ArrayList<Document>(docList.size()); List<Document> result = new ArrayList<Document>(docList.size());
FieldSelector fieldSelector = new SetBasedFieldSelector(fieldsToLoad, FieldSelector fieldSelector = new SetBasedFieldSelector(fieldsToLoad,
Collections.emptySet()); Collections.emptySet());
float[] scores = { 1.0f }; float[] scores = {1.0f};
int[] docsHolder = new int[1]; int[] docsHolder = new int[1];
Query theQuery = query; Query theQuery = query;
while (docsIter.hasNext()) { while (docsIter.hasNext()) {
Integer id = docsIter.next(); Integer id = docsIter.next();
org.apache.lucene.document.Document doc = searcher.doc(id, org.apache.lucene.document.Document doc = searcher.doc(id,
fieldSelector); fieldSelector);
String snippet = getValue(doc, snippetField); String snippet = getValue(doc, snippetField);
if (produceSummary == true) { if (produceSummary == true) {
docsHolder[0] = id.intValue(); docsHolder[0] = id.intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f); DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
highligher.doHighlighting(docAsList, theQuery, req, snippetFieldAry); highligher.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
} }
Document carrotDocument = new Document(getValue(doc, titleField), Document carrotDocument = new Document(getValue(doc, titleField),
snippet, doc.get(urlField)); snippet, doc.get(urlField));
carrotDocument.addField("solrId", doc.get(idFieldName)); carrotDocument.addField("solrId", doc.get(idFieldName));
result.add(carrotDocument); result.add(carrotDocument);
} }
return result; return result;
} }
protected String getValue(org.apache.lucene.document.Document doc, protected String getValue(org.apache.lucene.document.Document doc,
String field) { String field) {
StringBuilder result = new StringBuilder(); StringBuilder result = new StringBuilder();
String[] vals = doc.getValues(field); String[] vals = doc.getValues(field);
for (int i = 0; i < vals.length; i++) { for (int i = 0; i < vals.length; i++) {
// Join multiple values with a period so that Carrot2 does not pick up // Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would // phrases that cross field value boundaries (in most cases it would
// create useless phrases). // create useless phrases).
result.append(vals[i]).append(" . "); result.append(vals[i]).append(" . ");
} }
return result.toString().trim(); return result.toString().trim();
} }
private List clustersToNamedList(List<Cluster> carrotClusters, private List clustersToNamedList(List<Cluster> carrotClusters,
SolrParams solrParams) { SolrParams solrParams) {
List result = new ArrayList(); List result = new ArrayList();
clustersToNamedList(carrotClusters, result, solrParams.getBool( clustersToNamedList(carrotClusters, result, solrParams.getBool(
CarrotParams.OUTPUT_SUB_CLUSTERS, false), solrParams.getInt( CarrotParams.OUTPUT_SUB_CLUSTERS, false), solrParams.getInt(
CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE)); CarrotParams.NUM_DESCRIPTIONS, Integer.MAX_VALUE));
return result; return result;
} }
private void clustersToNamedList(List<Cluster> outputClusters, private void clustersToNamedList(List<Cluster> outputClusters,
List parent, boolean outputSubClusters, int maxLabels) { List parent, boolean outputSubClusters, int maxLabels) {
for (Cluster outCluster : outputClusters) { for (Cluster outCluster : outputClusters) {
NamedList cluster = new SimpleOrderedMap(); NamedList cluster = new SimpleOrderedMap();
parent.add(cluster); parent.add(cluster);
List<String> labels = outCluster.getPhrases(); List<String> labels = outCluster.getPhrases();
if (labels.size() > maxLabels) if (labels.size() > maxLabels)
labels = labels.subList(0,maxLabels); labels = labels.subList(0, maxLabels);
cluster.add("labels", labels); cluster.add("labels", labels);
List<Document> docs = outCluster.getDocuments(); List<Document> docs = outCluster.getDocuments();
List docList = new ArrayList(); List docList = new ArrayList();
cluster.add("docs", docList); cluster.add("docs", docList);
for (Document doc : docs) { for (Document doc : docs) {
docList.add(doc.getField("solrId")); docList.add(doc.getField("solrId"));
} }
if (outputSubClusters) { if (outputSubClusters) {
List subclusters = new ArrayList(); List subclusters = new ArrayList();
cluster.add("clusters",subclusters); cluster.add("clusters", subclusters);
clustersToNamedList(outCluster.getSubclusters(), subclusters, clustersToNamedList(outCluster.getSubclusters(), subclusters,
outputSubClusters, maxLabels); outputSubClusters, maxLabels);
} }
} }
} }
/** /**
* Extracts parameters that can possibly match some attributes of Carrot2 algorithms. * Extracts parameters that can possibly match some attributes of Carrot2 algorithms.
*/ */
private void extractCarrotAttributes(SolrParams solrParams, private void extractCarrotAttributes(SolrParams solrParams,
Map<String, Object> attributes) { Map<String, Object> attributes) {
// Extract all non-predefined parameters. This way, we'll be able to set all // Extract all non-predefined parameters. This way, we'll be able to set all
// parameters of Carrot2 algorithms without defining their names as constants. // parameters of Carrot2 algorithms without defining their names as constants.
for (Iterator<String> paramNames = solrParams.getParameterNamesIterator(); paramNames for (Iterator<String> paramNames = solrParams.getParameterNamesIterator(); paramNames
.hasNext();) { .hasNext();) {
String paramName = paramNames.next(); String paramName = paramNames.next();
if (!CarrotParams.CARROT_PARAM_NAMES.contains(paramName)) { if (!CarrotParams.CARROT_PARAM_NAMES.contains(paramName)) {
attributes.put(paramName, solrParams.get(paramName)); attributes.put(paramName, solrParams.get(paramName));
} }
} }
} }
} }

View File

@@ -21,22 +21,20 @@ import com.google.common.collect.ImmutableSet;
* limitations under the License. * limitations under the License.
*/ */
/**
*
*/
public interface CarrotParams { public interface CarrotParams {
String CARROT_PREFIX = "carrot."; String CARROT_PREFIX = "carrot.";
String ALGORITHM = CARROT_PREFIX + "algorithm"; String ALGORITHM = CARROT_PREFIX + "algorithm";
String TITLE_FIELD_NAME = CARROT_PREFIX + "title"; String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
String URL_FIELD_NAME = CARROT_PREFIX + "url"; String URL_FIELD_NAME = CARROT_PREFIX + "url";
String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet"; String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary"; String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions"; String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters"; String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of( public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME, ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS); PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS);
} }

View File

@@ -23,7 +23,7 @@ import org.apache.solr.util.AbstractSolrTestCase;
* *
*/ */
public class AbstractClusteringTest extends AbstractSolrTestCase { public class AbstractClusteringTest extends AbstractSolrTestCase {
protected int numberOfDocs = 0; protected int numberOfDocs = 0;
@Override @Override
public void setUp() throws Exception { public void setUp() throws Exception {
@@ -31,13 +31,13 @@ public class AbstractClusteringTest extends AbstractSolrTestCase {
numberOfDocs = 0; numberOfDocs = 0;
for (String[] doc : DOCUMENTS) { for (String[] doc : DOCUMENTS) {
assertU("add failed", adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2])); assertU("add failed", adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2]));
numberOfDocs++; numberOfDocs++;
} }
assertU("commit", commit()); assertU("commit", commit());
} }
public String getSchemaFile() { public String getSchemaFile() {
return "schema.xml"; return "schema.xml";
} }
@@ -45,154 +45,154 @@ public class AbstractClusteringTest extends AbstractSolrTestCase {
return "solrconfig.xml"; return "solrconfig.xml";
} }
final String [][] DOCUMENTS = new String[][] { final String[][] DOCUMENTS = new String[][]{
{ "http://en.wikipedia.org/wiki/Data_mining", {"http://en.wikipedia.org/wiki/Data_mining",
"Data Mining - Wikipedia", "Data Mining - Wikipedia",
"Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns." }, "Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."},
{ "http://en.wikipedia.org/wiki/Datamining", {"http://en.wikipedia.org/wiki/Datamining",
"Data mining - Wikipedia, the free encyclopedia", "Data mining - Wikipedia, the free encyclopedia",
"Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..." }, "Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..."},
{ "http://www.statsoft.com/textbook/stdatmin.html", {"http://www.statsoft.com/textbook/stdatmin.html",
"Electronic Statistics Textbook: Data Mining Techniques", "Electronic Statistics Textbook: Data Mining Techniques",
"Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques." }, "Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques."},
{ "http://www.thearling.com/text/dmwhite/dmwhite.htm", {"http://www.thearling.com/text/dmwhite/dmwhite.htm",
"An Introduction to Data Mining", "An Introduction to Data Mining",
"Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..." }, "Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..."},
{ "http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm", {"http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
"Data Mining: What is Data Mining?", "Data Mining: What is Data Mining?",
"Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works." }, "Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."},
{ "http://www.spss.com/datamine", {"http://www.spss.com/datamine",
"Data Mining Software, Data Mining Applications and Data Mining Solutions", "Data Mining Software, Data Mining Applications and Data Mining Solutions",
"The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..." }, "The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..."},
{ "http://www.kdnuggets.com/", {"http://www.kdnuggets.com/",
"KD Nuggets", "KD Nuggets",
"Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings." }, "Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."},
{ "http://www.answers.com/topic/data-mining", {"http://www.answers.com/topic/data-mining",
"data mining: Definition from Answers.com", "data mining: Definition from Answers.com",
"data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..." }, "data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..."},
{ "http://www.statsoft.com/products/dataminer.htm", {"http://www.statsoft.com/products/dataminer.htm",
"STATISTICA Data Mining and Predictive Modeling Solutions", "STATISTICA Data Mining and Predictive Modeling Solutions",
"GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..." }, "GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..."},
{ "http://datamining.typepad.com/", {"http://datamining.typepad.com/",
"Data Mining: Text Mining, Visualization and Social Media", "Data Mining: Text Mining, Visualization and Social Media",
"Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..." }, "Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..."},
{ "http://www.twocrows.com/", {"http://www.twocrows.com/",
"Two Crows Corporation", "Two Crows Corporation",
"Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use." }, "Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use."},
{ "http://www.thearling.com/", {"http://www.thearling.com/",
"Thearling.com", "Thearling.com",
"Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies." }, "Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies."},
{ "http://www.ccsu.edu/datamining/", {"http://www.ccsu.edu/datamining/",
"CCSU - Data Mining", "CCSU - Data Mining",
"Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling." }, "Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling."},
{ "http://www.oracle.com/technology/products/bi/odm", {"http://www.oracle.com/technology/products/bi/odm",
"Oracle Data Mining", "Oracle Data Mining",
"Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..." }, "Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..."},
{ "http://databases.about.com/od/datamining/a/datamining.htm", {"http://databases.about.com/od/datamining/a/datamining.htm",
"Data Mining: An Introduction", "Data Mining: An Introduction",
"About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques." }, "About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques."},
{ "http://www.dmoz.org/Computers/Software/Databases/Data_Mining/", {"http://www.dmoz.org/Computers/Software/Databases/Data_Mining/",
"Open Directory - Computers: Software: Databases: Data Mining", "Open Directory - Computers: Software: Databases: Data Mining",
"Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..." }, "Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..."},
{ "http://www.cs.wisc.edu/dmi/", {"http://www.cs.wisc.edu/dmi/",
"DMI:Data Mining Institute", "DMI:Data Mining Institute",
"Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..." }, "Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..."},
{ "http://www.the-data-mine.com/", {"http://www.the-data-mine.com/",
"The Data Mine", "The Data Mine",
"Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining." }, "Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining."},
{ "http://www.statserv.com/datamining.html", {"http://www.statserv.com/datamining.html",
"St@tServ - About Data Mining", "St@tServ - About Data Mining",
"St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..." }, "St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..."},
{ "http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome/index.htm", {"http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome/index.htm",
"MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...", "MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...",
"Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..." }, "Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..."},
{ "http://www.pentaho.com/products/data_mining/", {"http://www.pentaho.com/products/data_mining/",
"Pentaho Commercial Open Source Business Intelligence: Data Mining", "Pentaho Commercial Open Source Business Intelligence: Data Mining",
"For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..." }, "For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..."},
{ "http://www.investorhome.com/mining.htm", {"http://www.investorhome.com/mining.htm",
"Investor Home - Data Mining", "Investor Home - Data Mining",
"Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..." }, "Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..."},
{ "http://www.datamining.com/", {"http://www.datamining.com/",
"Predictive Modeling and Predictive Analytics Solutions | Enterprise ...", "Predictive Modeling and Predictive Analytics Solutions | Enterprise ...",
"Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics." }, "Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics."},
{ "http://www.sourcewatch.org/index.php?title=Data_mining", {"http://www.sourcewatch.org/index.php?title=Data_mining",
"Data mining - SourceWatch", "Data mining - SourceWatch",
"These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..." }, "These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..."},
{ "http://www.autonlab.org/tutorials/", {"http://www.autonlab.org/tutorials/",
"Statistical Data Mining Tutorials", "Statistical Data Mining Tutorials",
"Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms." }, "Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms."},
{ "http://www.microstrategy.com/data-mining/index.asp", {"http://www.microstrategy.com/data-mining/index.asp",
"Data Mining", "Data Mining",
"With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..." }, "With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..."},
{ "http://www.datamininglab.com/", {"http://www.datamininglab.com/",
"Elder Research", "Elder Research",
"Provides consulting and short courses in data mining and pattern discovery patterns in data." }, "Provides consulting and short courses in data mining and pattern discovery patterns in data."},
{ "http://www.sqlserverdatamining.com/", {"http://www.sqlserverdatamining.com/",
"SQL Server Data Mining > Home", "SQL Server Data Mining > Home",
"SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..." }, "SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..."},
{ "http://databases.about.com/cs/datamining/g/dmining.htm", {"http://databases.about.com/cs/datamining/g/dmining.htm",
"Data Mining", "Data Mining",
"What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..." }, "What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..."},
{ "http://www.sas.com/technologies/analytics/datamining/index.html", {"http://www.sas.com/technologies/analytics/datamining/index.html",
"Data Mining Software and Text Mining | SAS", "Data Mining Software and Text Mining | SAS",
"... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..." } "... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..."}
}; };
} }

View File

@ -16,18 +16,16 @@ package org.apache.solr.handler.clustering;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.SpellCheckComponent;
import org.apache.solr.handler.component.QueryComponent;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrRequestHandler; import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryResponse; import org.apache.solr.handler.component.QueryComponent;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.request.SolrRequestHandler;
/** /**

View File

@ -1,7 +1,7 @@
package org.apache.solr.handler.clustering; package org.apache.solr.handler.clustering;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.search.DocSet; import org.apache.solr.search.DocSet;
@ -9,7 +9,7 @@ import org.apache.solr.search.DocSet;
* *
* *
**/ **/
public class MockDocumentClusteringEngine extends DocumentClusteringEngine{ public class MockDocumentClusteringEngine extends DocumentClusteringEngine {
public NamedList cluster(DocSet docs, SolrParams solrParams) { public NamedList cluster(DocSet docs, SolrParams solrParams) {
NamedList result = new NamedList(); NamedList result = new NamedList();
return result; return result;

View File

@ -17,151 +17,153 @@ package org.apache.solr.handler.clustering.carrot2;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException; import org.apache.lucene.search.MatchAllDocsQuery;
import java.util.List; import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.*;
import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.clustering.AbstractClusteringTest; import org.apache.solr.handler.clustering.AbstractClusteringTest;
import org.apache.solr.handler.clustering.ClusteringComponent; import org.apache.solr.handler.clustering.ClusteringComponent;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.search.DocList; import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted; import org.apache.solr.util.RefCounted;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.carrot2.util.attribute.AttributeUtils; import org.carrot2.util.attribute.AttributeUtils;
import java.io.IOException;
import java.util.List;
/** /**
* *
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
public class CarrotClusteringEngineTest extends AbstractClusteringTest { public class CarrotClusteringEngineTest extends AbstractClusteringTest {
public void testCarrotLingo() throws Exception { public void testCarrotLingo() throws Exception {
checkEngine(getClusteringEngine("default"), 9); checkEngine(getClusteringEngine("default"), 9);
} }
public void testCarrotStc() throws Exception { public void testCarrotStc() throws Exception {
checkEngine(getClusteringEngine("stc"), 2); checkEngine(getClusteringEngine("stc"), 2);
} }
public void testWithoutSubclusters() throws Exception { public void testWithoutSubclusters() throws Exception {
checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs), checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs),
1, 1, 0); 1, 1, 0);
} }
public void testWithSubclusters() throws Exception { public void testWithSubclusters() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams(); ModifiableSolrParams params = new ModifiableSolrParams();
params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true); params.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);
checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs, checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
params), 1, 1, 2); params), 1, 1, 2);
} }
public void testNumDescriptions() throws Exception { public void testNumDescriptions() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams(); ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5); params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 5);
params.set(CarrotParams.NUM_DESCRIPTIONS, 3); params.set(CarrotParams.NUM_DESCRIPTIONS, 3);
checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs, checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
params), 1, 3, 0); params), 1, 3, 0);
} }
public void testCarrotAttributePassing() throws Exception { public void testCarrotAttributePassing() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams(); ModifiableSolrParams params = new ModifiableSolrParams();
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1); params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth"), 1);
params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3); params.set(AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels"), 3);
checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs, checkClusters(checkEngine(getClusteringEngine("mock"), this.numberOfDocs,
params), 1, 3, 0); params), 1, 3, 0);
} }
private CarrotClusteringEngine getClusteringEngine(String engineName) { private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore() ClusteringComponent comp = (ClusteringComponent) h.getCore()
.getSearchComponent("clustering"); .getSearchComponent("clustering");
assertNotNull("clustering component should not be null", comp); assertNotNull("clustering component should not be null", comp);
CarrotClusteringEngine engine = (CarrotClusteringEngine) comp CarrotClusteringEngine engine = (CarrotClusteringEngine) comp
.getSearchClusteringEngines().get(engineName); .getSearchClusteringEngines().get(engineName);
assertNotNull("clustering engine for name: " + engineName assertNotNull("clustering engine for name: " + engineName
+ " should not be null", engine); + " should not be null", engine);
return engine; return engine;
} }
private List checkEngine(CarrotClusteringEngine engine, private List checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters) throws IOException { int expectedNumClusters) throws IOException {
return checkEngine(engine, expectedNumClusters, new ModifiableSolrParams()); return checkEngine(engine, expectedNumClusters, new ModifiableSolrParams());
} }
private List checkEngine(CarrotClusteringEngine engine, private List checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters, SolrParams clusteringParams) throws IOException { int expectedNumClusters, SolrParams clusteringParams) throws IOException {
// Get all documents to cluster // Get all documents to cluster
RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher(); RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
MatchAllDocsQuery query = new MatchAllDocsQuery(); MatchAllDocsQuery query = new MatchAllDocsQuery();
DocList docList; DocList docList;
try { try {
SolrIndexSearcher searcher = ref.get(); SolrIndexSearcher searcher = ref.get();
docList = searcher.getDocList(query, (Query) null, new Sort(), 0, docList = searcher.getDocList(query, (Query) null, new Sort(), 0,
numberOfDocs); numberOfDocs);
assertEquals("docList size", this.numberOfDocs, docList.matches()); assertEquals("docList size", this.numberOfDocs, docList.matches());
} finally { } finally {
ref.decref(); ref.decref();
} }
ModifiableSolrParams solrParams = new ModifiableSolrParams(); ModifiableSolrParams solrParams = new ModifiableSolrParams();
solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true"); solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
solrParams.add(clusteringParams); solrParams.add(clusteringParams);
// Perform clustering // Perform clustering
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams); LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
List results = (List)engine.cluster(query, docList, req); List results = (List) engine.cluster(query, docList, req);
req.close(); req.close();
assertEquals("number of clusters", expectedNumClusters, results.size()); assertEquals("number of clusters", expectedNumClusters, results.size());
checkClusters(results, false); checkClusters(results, false);
return results; return results;
} }
private void checkClusters(List results, int expectedDocCount, private void checkClusters(List results, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) { int expectedLabelCount, int expectedSubclusterCount) {
for (int i = 0; i < results.size(); i++) { for (int i = 0; i < results.size(); i++) {
NamedList cluster = (NamedList) results.get(i); NamedList cluster = (NamedList) results.get(i);
checkCluster(cluster, expectedDocCount, expectedLabelCount, checkCluster(cluster, expectedDocCount, expectedLabelCount,
expectedSubclusterCount); expectedSubclusterCount);
} }
} }
private void checkClusters(List results, boolean hasSubclusters) { private void checkClusters(List results, boolean hasSubclusters) {
for (int i = 0; i < results.size(); i++) { for (int i = 0; i < results.size(); i++) {
checkCluster((NamedList)results.get(i), hasSubclusters ); checkCluster((NamedList) results.get(i), hasSubclusters);
} }
} }
private void checkCluster(NamedList cluster, boolean hasSubclusters) { private void checkCluster(NamedList cluster, boolean hasSubclusters) {
List docs = (List)cluster.get("docs"); List docs = (List) cluster.get("docs");
assertNotNull("docs is null and it shouldn't be", docs); assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) { for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j); String id = (String) docs.get(j);
assertNotNull("id is null and it shouldn't be", id); assertNotNull("id is null and it shouldn't be", id);
} }
List labels = (List) cluster.get("labels"); List labels = (List) cluster.get("labels");
assertNotNull("labels is null but it shouldn't be", labels); assertNotNull("labels is null but it shouldn't be", labels);
if (hasSubclusters) { if (hasSubclusters) {
List subclusters = (List) cluster.get("clusters"); List subclusters = (List) cluster.get("clusters");
assertNotNull("subclusters is null but it shouldn't be", subclusters); assertNotNull("subclusters is null but it shouldn't be", subclusters);
} }
} }
private void checkCluster(NamedList cluster, int expectedDocCount, private void checkCluster(NamedList cluster, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) { int expectedLabelCount, int expectedSubclusterCount) {
checkCluster(cluster, expectedSubclusterCount > 0); checkCluster(cluster, expectedSubclusterCount > 0);
assertEquals("number of docs in cluster", expectedDocCount, assertEquals("number of docs in cluster", expectedDocCount,
((List) cluster.get("docs")).size()); ((List) cluster.get("docs")).size());
assertEquals("number of labels in cluster", expectedLabelCount, assertEquals("number of labels in cluster", expectedLabelCount,
((List) cluster.get("labels")).size()); ((List) cluster.get("labels")).size());
if (expectedSubclusterCount > 0) { if (expectedSubclusterCount > 0) {
List subclusters = (List) cluster.get("clusters"); List subclusters = (List) cluster.get("clusters");
assertEquals("numClusters", expectedSubclusterCount, subclusters.size()); assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
assertEquals("number of subclusters in cluster", assertEquals("number of subclusters in cluster",
expectedSubclusterCount, subclusters.size()); expectedSubclusterCount, subclusters.size());
} }
} }
} }

View File

@ -1,69 +1,68 @@
package org.apache.solr.handler.clustering.carrot2; package org.apache.solr.handler.clustering.carrot2;
import java.util.List; import com.google.common.collect.Lists;
import org.carrot2.core.*; import org.carrot2.core.*;
import org.carrot2.core.attribute.AttributeNames; import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing; import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*; import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange; import org.carrot2.util.attribute.constraint.IntRange;
import com.google.common.collect.Lists; import java.util.List;
@Bindable(prefix = "MockClusteringAlgorithm") @Bindable(prefix = "MockClusteringAlgorithm")
public class MockClusteringAlgorithm extends ProcessingComponentBase implements public class MockClusteringAlgorithm extends ProcessingComponentBase implements
IClusteringAlgorithm { IClusteringAlgorithm {
@Input @Input
@Processing @Processing
@Attribute(key = AttributeNames.DOCUMENTS) @Attribute(key = AttributeNames.DOCUMENTS)
private List<Document> documents; private List<Document> documents;
@Output @Output
@Processing @Processing
@Attribute(key = AttributeNames.CLUSTERS) @Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters; private List<Cluster> clusters;
@Input @Input
@Processing @Processing
@Attribute @Attribute
@IntRange(min = 1, max = 5) @IntRange(min = 1, max = 5)
private int depth = 2; private int depth = 2;
@Input @Input
@Processing @Processing
@Attribute @Attribute
@IntRange(min = 1, max = 5) @IntRange(min = 1, max = 5)
private int labels = 1; private int labels = 1;
@Override @Override
public void process() throws ProcessingException { public void process() throws ProcessingException {
clusters = Lists.newArrayList(); clusters = Lists.newArrayList();
if (documents == null) { if (documents == null) {
return; return;
} }
int documentIndex = 1; int documentIndex = 1;
for (Document document : documents) { for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex); StringBuilder label = new StringBuilder("Cluster " + documentIndex);
Cluster cluster = createCluster(label.toString(), document); Cluster cluster = createCluster(label.toString(), document);
clusters.add(cluster); clusters.add(cluster);
for (int i = 1; i <= depth; i++) { for (int i = 1; i <= depth; i++) {
label.append("."); label.append(".");
label.append(i); label.append(i);
Cluster newCluster = createCluster(label.toString(), document); Cluster newCluster = createCluster(label.toString(), document);
cluster.addSubclusters(createCluster(label.toString(), document), newCluster); cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
cluster = newCluster; cluster = newCluster;
} }
documentIndex++; documentIndex++;
} }
} }
private Cluster createCluster(String labelBase, Document... documents) { private Cluster createCluster(String labelBase, Document... documents) {
Cluster cluster = new Cluster(); Cluster cluster = new Cluster();
for (int i = 0; i < labels; i++) { for (int i = 0; i < labels; i++) {
cluster.addPhrases(labelBase + "#" + (i + 1)); cluster.addPhrases(labelBase + "#" + (i + 1));
} }
cluster.addDocuments(documents); cluster.addDocuments(documents);
return cluster; return cluster;
} }
} }