reformat to remove tabs

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@790599 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-07-02 14:08:37 +00:00
parent dabddad667
commit 43eb481c2f
7 changed files with 466 additions and 467 deletions

View File

@ -45,204 +45,206 @@ import com.google.common.collect.Sets;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
*
* <p/>
* Output from this class is subject to change.
*
*
* @see <a href="http://project.carrot2.org">Carrot2 project</a>
*/
@SuppressWarnings("unchecked")
public class CarrotClusteringEngine extends SearchClusteringEngine {
private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
/** Carrot2 controller that manages instances of clustering algorithms */
private CachingController controller = new CachingController();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
private String idFieldName;
/**
* Carrot2 controller that manages instances of clustering algorithms
*/
private CachingController controller = new CachingController();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
try {
// Prepare attributes for Carrot2 clustering call
Map<String, Object> attributes = new HashMap<String, Object>();
List<Document> documents = getDocuments(docList, query, sreq);
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
private String idFieldName;
// Perform clustering and convert to named list
return clustersToNamedList(controller.process(attributes,
clusteringAlgorithmClass).getClusters(), sreq.getParams());
} catch (Exception e) {
log.error("Carrot2 clustering failed", e);
throw new RuntimeException(e);
}
}
public Object cluster(Query query, DocList docList, SolrQueryRequest sreq) {
try {
// Prepare attributes for Carrot2 clustering call
Map<String, Object> attributes = new HashMap<String, Object>();
List<Document> documents = getDocuments(docList, query, sreq);
attributes.put(AttributeNames.DOCUMENTS, documents);
attributes.put(AttributeNames.QUERY, query.toString());
@Override
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
SolrParams initParams = SolrParams.toSolrParams(config);
// Initialize Carrot2 controller. Pass initialization attributes, if any.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
extractCarrotAttributes(initParams, initAttributes);
this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
// Pass extra overriding attributes from the request, if any
extractCarrotAttributes(sreq.getParams(), attributes);
// Make sure the requested Carrot2 clustering algorithm class is available
String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
try {
Class<?> algorithmClass = Thread.currentThread().getContextClassLoader()
.loadClass(carrotAlgorithmClassName);
if (!IClusteringAlgorithm.class.isAssignableFrom(algorithmClass)) {
throw new IllegalArgumentException("Class provided as "
+ CarrotParams.ALGORITHM + " must implement "
+ IClusteringAlgorithm.class.getName());
}
this.clusteringAlgorithmClass = (Class<? extends IClusteringAlgorithm>) algorithmClass;
} catch (ClassNotFoundException e) {
throw new RuntimeException(
"Failed to load Carrot clustering algorithm class", e);
}
// Perform clustering and convert to named list
return clustersToNamedList(controller.process(attributes,
clusteringAlgorithmClass).getClusters(), sreq.getParams());
} catch (Exception e) {
log.error("Carrot2 clustering failed", e);
throw new RuntimeException(e);
}
}
return result;
}
@Override
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
SolrParams initParams = SolrParams.toSolrParams(config);
/**
* Prepares Carrot2 documents for clustering.
*/
private List<Document> getDocuments(DocList docList,
Query query, final SolrQueryRequest sreq) throws IOException {
SolrHighlighter highligher = null;
SolrParams solrParams = sreq.getParams();
SolrCore core = sreq.getCore();
// Initialize Carrot2 controller. Pass initialization attributes, if any.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
extractCarrotAttributes(initParams, initAttributes);
this.controller.init(initAttributes);
// Names of fields to deliver content for clustering
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME,
titleField);
if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
Set<String> fieldsToLoad = Sets.newHashSet(urlField, titleField,
snippetField, idFieldName);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
// Get the documents
DocIterator docsIter = docList.iterator();
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY,
false);
// Make sure the requested Carrot2 clustering algorithm class is available
String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
try {
Class<?> algorithmClass = Thread.currentThread().getContextClassLoader()
.loadClass(carrotAlgorithmClassName);
if (!IClusteringAlgorithm.class.isAssignableFrom(algorithmClass)) {
throw new IllegalArgumentException("Class provided as "
+ CarrotParams.ALGORITHM + " must implement "
+ IClusteringAlgorithm.class.getName());
}
this.clusteringAlgorithmClass = (Class<? extends IClusteringAlgorithm>) algorithmClass;
} catch (ClassNotFoundException e) {
throw new RuntimeException(
"Failed to load Carrot clustering algorithm class", e);
}
SolrQueryRequest req = null;
String[] snippetFieldAry = null;
if (produceSummary == true) {
highligher = core.getHighlighter();
Map args = new HashMap();
snippetFieldAry = new String[] { snippetField };
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
@Override
public SolrIndexSearcher getSearcher() {
return sreq.getSearcher();
}
};
}
return result;
}
SolrIndexSearcher searcher = sreq.getSearcher();
List<Document> result = new ArrayList<Document>(docList.size());
FieldSelector fieldSelector = new SetBasedFieldSelector(fieldsToLoad,
Collections.emptySet());
float[] scores = { 1.0f };
int[] docsHolder = new int[1];
Query theQuery = query;
/**
* Prepares Carrot2 documents for clustering.
*/
private List<Document> getDocuments(DocList docList,
Query query, final SolrQueryRequest sreq) throws IOException {
SolrHighlighter highligher = null;
SolrParams solrParams = sreq.getParams();
SolrCore core = sreq.getCore();
while (docsIter.hasNext()) {
Integer id = docsIter.next();
org.apache.lucene.document.Document doc = searcher.doc(id,
fieldSelector);
String snippet = getValue(doc, snippetField);
if (produceSummary == true) {
docsHolder[0] = id.intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
highligher.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
}
Document carrotDocument = new Document(getValue(doc, titleField),
snippet, doc.get(urlField));
carrotDocument.addField("solrId", doc.get(idFieldName));
result.add(carrotDocument);
}
// Names of fields to deliver content for clustering
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME,
titleField);
if (StringUtils.isBlank(snippetField)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
Set<String> fieldsToLoad = Sets.newHashSet(urlField, titleField,
snippetField, idFieldName);
return result;
}
// Get the documents
DocIterator docsIter = docList.iterator();
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY,
false);
/**
 * Concatenates every value of the given field into one string. Each value is
 * followed by " . " and the combined text is trimmed before it is returned,
 * so an empty array of values yields an empty string.
 *
 * @param doc the Lucene document to read the values from
 * @param field the name of the field whose values are joined
 * @return the joined and trimmed field content
 */
protected String getValue(org.apache.lucene.document.Document doc,
    String field) {
  StringBuilder joined = new StringBuilder();
  for (String value : doc.getValues(field)) {
    // A period between values keeps Carrot2 from building phrases that span
    // two separate field values; such phrases are useless in most cases.
    joined.append(value);
    joined.append(" . ");
  }
  return joined.toString().trim();
}
SolrQueryRequest req = null;
String[] snippetFieldAry = null;
if (produceSummary == true) {
highligher = core.getHighlighter();
Map args = new HashMap();
snippetFieldAry = new String[]{snippetField};
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
@Override
public SolrIndexSearcher getSearcher() {
return sreq.getSearcher();
}
};
}
/**
 * Converts the given Carrot2 clusters into a list of named lists. Whether
 * sub-clusters are emitted and how many labels are kept per cluster is read
 * from the supplied parameters (defaults: no sub-clusters, no label limit).
 *
 * @param carrotClusters clusters to convert
 * @param solrParams parameters holding the output options
 * @return a new list with one entry per converted cluster
 */
private List clustersToNamedList(List<Cluster> carrotClusters,
    SolrParams solrParams) {
  boolean includeSubClusters = solrParams.getBool(
      CarrotParams.OUTPUT_SUB_CLUSTERS, false);
  int labelLimit = solrParams.getInt(CarrotParams.NUM_DESCRIPTIONS,
      Integer.MAX_VALUE);

  List converted = new ArrayList();
  clustersToNamedList(carrotClusters, converted, includeSubClusters,
      labelLimit);
  return converted;
}
SolrIndexSearcher searcher = sreq.getSearcher();
List<Document> result = new ArrayList<Document>(docList.size());
FieldSelector fieldSelector = new SetBasedFieldSelector(fieldsToLoad,
Collections.emptySet());
float[] scores = {1.0f};
int[] docsHolder = new int[1];
Query theQuery = query;
private void clustersToNamedList(List<Cluster> outputClusters,
List parent, boolean outputSubClusters, int maxLabels) {
for (Cluster outCluster : outputClusters) {
NamedList cluster = new SimpleOrderedMap();
parent.add(cluster);
while (docsIter.hasNext()) {
Integer id = docsIter.next();
org.apache.lucene.document.Document doc = searcher.doc(id,
fieldSelector);
String snippet = getValue(doc, snippetField);
if (produceSummary == true) {
docsHolder[0] = id.intValue();
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
highligher.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
}
Document carrotDocument = new Document(getValue(doc, titleField),
snippet, doc.get(urlField));
carrotDocument.addField("solrId", doc.get(idFieldName));
result.add(carrotDocument);
}
List<String> labels = outCluster.getPhrases();
if (labels.size() > maxLabels)
labels = labels.subList(0,maxLabels);
cluster.add("labels", labels);
return result;
}
List<Document> docs = outCluster.getDocuments();
List docList = new ArrayList();
cluster.add("docs", docList);
for (Document doc : docs) {
docList.add(doc.getField("solrId"));
}
/**
 * Concatenates every value of the given field into one string. Each value is
 * followed by " . " and the combined text is trimmed before it is returned,
 * so an empty array of values yields an empty string.
 *
 * @param doc the Lucene document to read the values from
 * @param field the name of the field whose values are joined
 * @return the joined and trimmed field content
 */
protected String getValue(org.apache.lucene.document.Document doc,
    String field) {
  StringBuilder joined = new StringBuilder();
  for (String value : doc.getValues(field)) {
    // A period between values keeps Carrot2 from building phrases that span
    // two separate field values; such phrases are useless in most cases.
    joined.append(value);
    joined.append(" . ");
  }
  return joined.toString().trim();
}
if (outputSubClusters) {
List subclusters = new ArrayList();
cluster.add("clusters",subclusters);
clustersToNamedList(outCluster.getSubclusters(), subclusters,
outputSubClusters, maxLabels);
}
}
}
/**
 * Converts the given Carrot2 clusters into a list of named lists. Whether
 * sub-clusters are emitted and how many labels are kept per cluster is read
 * from the supplied parameters (defaults: no sub-clusters, no label limit).
 *
 * @param carrotClusters clusters to convert
 * @param solrParams parameters holding the output options
 * @return a new list with one entry per converted cluster
 */
private List clustersToNamedList(List<Cluster> carrotClusters,
    SolrParams solrParams) {
  boolean includeSubClusters = solrParams.getBool(
      CarrotParams.OUTPUT_SUB_CLUSTERS, false);
  int labelLimit = solrParams.getInt(CarrotParams.NUM_DESCRIPTIONS,
      Integer.MAX_VALUE);

  List converted = new ArrayList();
  clustersToNamedList(carrotClusters, converted, includeSubClusters,
      labelLimit);
  return converted;
}
/**
 * Copies every parameter whose name is not one of the predefined
 * {@link CarrotParams#CARROT_PARAM_NAMES} into the given attribute map, so
 * that such parameters can possibly match attributes of Carrot2 algorithms.
 *
 * @param solrParams parameters to scan
 * @param attributes map receiving the name/value pair of each copied parameter
 */
private void extractCarrotAttributes(SolrParams solrParams,
    Map<String, Object> attributes) {
  // Passing through everything that is not predefined lets callers set any
  // Carrot2 algorithm parameter without its name being declared as a constant.
  Iterator<String> names = solrParams.getParameterNamesIterator();
  while (names.hasNext()) {
    String name = names.next();
    if (CarrotParams.CARROT_PARAM_NAMES.contains(name)) {
      continue;
    }
    attributes.put(name, solrParams.get(name));
  }
}
/**
 * Appends one entry per cluster to {@code parent}. Each entry carries the
 * cluster's "labels" (cut to at most {@code maxLabels} phrases), its "docs"
 * (the "solrId" field of every member document) and, when
 * {@code outputSubClusters} is set, a "clusters" list filled recursively
 * from the cluster's sub-clusters.
 *
 * @param outputClusters clusters to convert
 * @param parent list receiving the converted entries
 * @param outputSubClusters whether sub-clusters are converted as well
 * @param maxLabels maximum number of labels kept per cluster
 */
private void clustersToNamedList(List<Cluster> outputClusters,
    List parent, boolean outputSubClusters, int maxLabels) {
  for (Cluster outCluster : outputClusters) {
    NamedList entry = new SimpleOrderedMap();
    parent.add(entry);

    // Keep only the leading phrases when the cluster has more than allowed.
    List<String> phrases = outCluster.getPhrases();
    boolean tooManyLabels = phrases.size() > maxLabels;
    entry.add("labels",
        tooManyLabels ? phrases.subList(0, maxLabels) : phrases);

    List<Document> members = outCluster.getDocuments();
    List solrIds = new ArrayList();
    entry.add("docs", solrIds);
    for (Document member : members) {
      solrIds.add(member.getField("solrId"));
    }

    if (!outputSubClusters) {
      continue;
    }
    List nested = new ArrayList();
    entry.add("clusters", nested);
    clustersToNamedList(outCluster.getSubclusters(), nested,
        outputSubClusters, maxLabels);
  }
}
/**
 * Copies every parameter whose name is not one of the predefined
 * {@link CarrotParams#CARROT_PARAM_NAMES} into the given attribute map, so
 * that such parameters can possibly match attributes of Carrot2 algorithms.
 *
 * @param solrParams parameters to scan
 * @param attributes map receiving the name/value pair of each copied parameter
 */
private void extractCarrotAttributes(SolrParams solrParams,
    Map<String, Object> attributes) {
  // Passing through everything that is not predefined lets callers set any
  // Carrot2 algorithm parameter without its name being declared as a constant.
  Iterator<String> names = solrParams.getParameterNamesIterator();
  while (names.hasNext()) {
    String name = names.next();
    if (CarrotParams.CARROT_PARAM_NAMES.contains(name)) {
      continue;
    }
    attributes.put(name, solrParams.get(name));
  }
}
}

View File

@ -21,22 +21,20 @@ import com.google.common.collect.ImmutableSet;
* limitations under the License.
*/
/**
 * Names of the parameters that configure the Carrot2 clustering integration.
 * Every name is formed by appending a suffix to {@link #CARROT_PREFIX}.
 * {@link #CARROT_PARAM_NAMES} collects all of the names declared here into
 * one immutable set.
 */
public interface CarrotParams {
String CARROT_PREFIX = "carrot.";
String CARROT_PREFIX = "carrot.";
String ALGORITHM = CARROT_PREFIX + "algorithm";
String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
String URL_FIELD_NAME = CARROT_PREFIX + "url";
String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String ALGORITHM = CARROT_PREFIX + "algorithm";
String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
String URL_FIELD_NAME = CARROT_PREFIX + "url";
String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS);
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS);
}

View File

@ -23,21 +23,21 @@ import org.apache.solr.util.AbstractSolrTestCase;
*
*/
public class AbstractClusteringTest extends AbstractSolrTestCase {
protected int numberOfDocs = 0;
protected int numberOfDocs = 0;
@Override
public void setUp() throws Exception {
super.setUp();
numberOfDocs = 0;
for (String[] doc : DOCUMENTS) {
assertU("add failed", adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2]));
numberOfDocs++;
}
assertU("add failed", adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2]));
numberOfDocs++;
}
assertU("commit", commit());
}
public String getSchemaFile() {
public String getSchemaFile() {
return "schema.xml";
}
@ -45,154 +45,154 @@ public class AbstractClusteringTest extends AbstractSolrTestCase {
return "solrconfig.xml";
}
final String [][] DOCUMENTS = new String[][] {
{ "http://en.wikipedia.org/wiki/Data_mining",
"Data Mining - Wikipedia",
"Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns." },
final String[][] DOCUMENTS = new String[][]{
{"http://en.wikipedia.org/wiki/Data_mining",
"Data Mining - Wikipedia",
"Article about knowledge-discovery in databases (KDD), the practice of automatically searching large stores of data for patterns."},
{ "http://en.wikipedia.org/wiki/Datamining",
"Data mining - Wikipedia, the free encyclopedia",
"Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..." },
{"http://en.wikipedia.org/wiki/Datamining",
"Data mining - Wikipedia, the free encyclopedia",
"Data mining is the entire process of applying computer-based methodology, ... Moreover, some data-mining systems such as neural networks are inherently geared ..."},
{ "http://www.statsoft.com/textbook/stdatmin.html",
"Electronic Statistics Textbook: Data Mining Techniques",
"Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques." },
{"http://www.statsoft.com/textbook/stdatmin.html",
"Electronic Statistics Textbook: Data Mining Techniques",
"Outlines the crucial concepts in data mining, defines the data warehousing process, and offers examples of computational and graphical exploratory data analysis techniques."},
{ "http://www.thearling.com/text/dmwhite/dmwhite.htm",
"An Introduction to Data Mining",
"Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..." },
{"http://www.thearling.com/text/dmwhite/dmwhite.htm",
"An Introduction to Data Mining",
"Data mining, the extraction of hidden predictive information from large ... Data mining tools predict future trends and behaviors, allowing businesses to ..."},
{ "http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
"Data Mining: What is Data Mining?",
"Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works." },
{"http://www.anderson.ucla.edu/faculty/jason.frand/teacher/technologies/palace/datamining.htm",
"Data Mining: What is Data Mining?",
"Outlines what knowledge discovery, the process of analyzing data from different perspectives and summarizing it into useful information, can do and how it works."},
{ "http://www.spss.com/datamine",
"Data Mining Software, Data Mining Applications and Data Mining Solutions",
"The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..." },
{"http://www.spss.com/datamine",
"Data Mining Software, Data Mining Applications and Data Mining Solutions",
"The patterns uncovered using data mining help organizations make better and ... data mining customer ... Data mining applications, on the other hand, embed ..."},
{ "http://www.kdnuggets.com/",
"KD Nuggets",
"Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings." },
{"http://www.kdnuggets.com/",
"KD Nuggets",
"Newsletter on the data mining and knowledge industries, offering information on data mining, knowledge discovery, text mining, and web mining software, courses, jobs, publications, and meetings."},
{ "http://www.answers.com/topic/data-mining",
"data mining: Definition from Answers.com",
"data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..." },
{"http://www.answers.com/topic/data-mining",
"data mining: Definition from Answers.com",
"data mining n. The automatic extraction of useful, often previously unknown information from large databases or data ... Data Mining For Investing ..."},
{ "http://www.statsoft.com/products/dataminer.htm",
"STATISTICA Data Mining and Predictive Modeling Solutions",
"GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..." },
{"http://www.statsoft.com/products/dataminer.htm",
"STATISTICA Data Mining and Predictive Modeling Solutions",
"GRC site-wide menuing system research and development. ... Contact a Data Mining Solutions Consultant. News and Success Stories. Events ..."},
{ "http://datamining.typepad.com/",
"Data Mining: Text Mining, Visualization and Social Media",
"Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..." },
{"http://datamining.typepad.com/",
"Data Mining: Text Mining, Visualization and Social Media",
"Commentary on text mining, data mining, social media and data visualization. ... While mining Twitter data for business and marketing intelligence (trend/buzz ..."},
{ "http://www.twocrows.com/",
"Two Crows Corporation",
"Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use." },
{"http://www.twocrows.com/",
"Two Crows Corporation",
"Dedicated to the development, marketing, sales and support of tools for knowledge discovery to make data mining accessible and easy to use."},
{ "http://www.thearling.com/",
"Thearling.com",
"Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies." },
{"http://www.thearling.com/",
"Thearling.com",
"Kurt Thearling's site dedicated to sharing information about data mining, the automated extraction of hidden predictive information from databases, and other analytic technologies."},
{ "http://www.ccsu.edu/datamining/",
"CCSU - Data Mining",
"Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling." },
{"http://www.ccsu.edu/datamining/",
"CCSU - Data Mining",
"Offers degrees and certificates in data mining. Allows students to explore cutting-edge data mining techniques and applications: market basket analysis, decision trees, neural networks, machine learning, web mining, and data modeling."},
{ "http://www.oracle.com/technology/products/bi/odm",
"Oracle Data Mining",
"Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..." },
{"http://www.oracle.com/technology/products/bi/odm",
"Oracle Data Mining",
"Oracle Data Mining Product Center ... New Oracle Data Mining Powers New Social CRM Application (more information ... Mining High-Dimensional Data for ..."},
{ "http://databases.about.com/od/datamining/a/datamining.htm",
"Data Mining: An Introduction",
"About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques." },
{"http://databases.about.com/od/datamining/a/datamining.htm",
"Data Mining: An Introduction",
"About.com article on how businesses are discovering new trends and patterns of behavior that previously went unnoticed through data mining, automated statistical analysis techniques."},
{ "http://www.dmoz.org/Computers/Software/Databases/Data_Mining/",
"Open Directory - Computers: Software: Databases: Data Mining",
"Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..." },
{"http://www.dmoz.org/Computers/Software/Databases/Data_Mining/",
"Open Directory - Computers: Software: Databases: Data Mining",
"Data Mining and Knowledge Discovery - A peer-reviewed journal publishing ... Data mining creates information assets that an organization can leverage to ..."},
{ "http://www.cs.wisc.edu/dmi/",
"DMI:Data Mining Institute",
"Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..." },
{"http://www.cs.wisc.edu/dmi/",
"DMI:Data Mining Institute",
"Data Mining Institute at UW-Madison ... The Data Mining Institute (DMI) was started on June 1, 1999 at the Computer ... of the Data Mining Group of Microsoft ..."},
{ "http://www.the-data-mine.com/",
"The Data Mine",
"Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining." },
{"http://www.the-data-mine.com/",
"The Data Mine",
"Provides information about data mining also known as knowledge discovery in databases (KDD) or simply knowledge discovery. List software, events, organizations, and people working in data mining."},
{ "http://www.statserv.com/datamining.html",
"St@tServ - About Data Mining",
"St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..." },
{"http://www.statserv.com/datamining.html",
"St@tServ - About Data Mining",
"St@tServ Data Mining page ... Data mining in molecular biology, by Alvis Brazma. Graham Williams page. Knowledge Discovery and Data Mining Resources, ..."},
{ "http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome/index.htm",
"MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...",
"Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..." },
{"http://ocw.mit.edu/OcwWeb/Sloan-School-of-Management/15-062Data-MiningSpring2003/CourseHome/index.htm",
"MIT OpenCourseWare | Sloan School of Management | 15.062 Data Mining ...",
"Introduces students to a class of methods known as data mining that assists managers in recognizing patterns and making intelligent use of massive amounts of ..."},
{ "http://www.pentaho.com/products/data_mining/",
"Pentaho Commercial Open Source Business Intelligence: Data Mining",
"For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..." },
{"http://www.pentaho.com/products/data_mining/",
"Pentaho Commercial Open Source Business Intelligence: Data Mining",
"For example, data mining can warn you there's a high probability a specific ... Pentaho Data Mining is differentiated by its open, standards-compliant nature, ..."},
{ "http://www.investorhome.com/mining.htm",
"Investor Home - Data Mining",
"Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..." },
{"http://www.investorhome.com/mining.htm",
"Investor Home - Data Mining",
"Data Mining or Data Snooping is the practice of searching for relationships and ... Data mining involves searching through databases for correlations and patterns ..."},
{ "http://www.datamining.com/",
"Predictive Modeling and Predictive Analytics Solutions | Enterprise ...",
"Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics." },
{"http://www.datamining.com/",
"Predictive Modeling and Predictive Analytics Solutions | Enterprise ...",
"Insightful Enterprise Miner - Enterprise data mining for predictive modeling and predictive analytics."},
{ "http://www.sourcewatch.org/index.php?title=Data_mining",
"Data mining - SourceWatch",
"These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..." },
{"http://www.sourcewatch.org/index.php?title=Data_mining",
"Data mining - SourceWatch",
"These agencies reported 199 data mining projects, of which 68 ... Office, \"DATA MINING. ... powerful technology known as data mining -- and how, in the ..."},
{ "http://www.autonlab.org/tutorials/",
"Statistical Data Mining Tutorials",
"Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms." },
{"http://www.autonlab.org/tutorials/",
"Statistical Data Mining Tutorials",
"Includes a set of tutorials on many aspects of statistical data mining, including the foundations of probability, the foundations of statistical data analysis, and most of the classic machine learning and data mining algorithms."},
{ "http://www.microstrategy.com/data-mining/index.asp",
"Data Mining",
"With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..." },
{"http://www.microstrategy.com/data-mining/index.asp",
"Data Mining",
"With MicroStrategy, data mining scoring is fully integrated into mainstream ... The integration of data mining models from other applications is accomplished by ..."},
{ "http://www.datamininglab.com/",
"Elder Research",
"Provides consulting and short courses in data mining and pattern discovery patterns in data." },
{"http://www.datamininglab.com/",
"Elder Research",
"Provides consulting and short courses in data mining and pattern discovery patterns in data."},
{ "http://www.sqlserverdatamining.com/",
"SQL Server Data Mining > Home",
"SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..." },
{"http://www.sqlserverdatamining.com/",
"SQL Server Data Mining > Home",
"SQL Server Data Mining Portal ... Data Mining as an Application Platform (Whitepaper) Creating a Web Cross-sell Application with SQL Server 2005 Data Mining (Article) ..."},
{ "http://databases.about.com/cs/datamining/g/dmining.htm",
"Data Mining",
"What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..." },
{"http://databases.about.com/cs/datamining/g/dmining.htm",
"Data Mining",
"What is data mining? Find out here! ... Book Review: Data Mining and Statistical Analysis Using SQL. What is Data Mining, and What Does it Have to Do with ..."},
{ "http://www.sas.com/technologies/analytics/datamining/index.html",
"Data Mining Software and Text Mining | SAS",
"... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..." }
{"http://www.sas.com/technologies/analytics/datamining/index.html",
"Data Mining Software and Text Mining | SAS",
"... raw data to smarter ... Data Mining is an iterative process of creating ... The knowledge gleaned from data and text mining can be used to fuel ..."}
};
}

View File

@ -16,18 +16,16 @@ package org.apache.solr.handler.clustering;
* limitations under the License.
*/
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.SpellCheckComponent;
import org.apache.solr.handler.component.QueryComponent;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.QueryComponent;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.request.SolrRequestHandler;
/**
@ -45,7 +43,7 @@ public class ClusteringComponentTest extends AbstractClusteringTest {
params.add(ClusteringComponent.COMPONENT_NAME, "true");
params.add(CommonParams.Q, "*:*");
params.add(ClusteringParams.USE_SEARCH_RESULTS, "true");
@ -76,7 +74,7 @@ public class ClusteringComponentTest extends AbstractClusteringTest {
//System.out.println("Clusters: " + clusters);
assertTrue("clusters is null and it shouldn't be", clusters != null);
}
}

View File

@ -1,7 +1,7 @@
package org.apache.solr.handler.clustering;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.search.DocSet;
@ -9,7 +9,7 @@ import org.apache.solr.search.DocSet;
*
*
**/
public class MockDocumentClusteringEngine extends DocumentClusteringEngine{
public class MockDocumentClusteringEngine extends DocumentClusteringEngine {
public NamedList cluster(DocSet docs, SolrParams solrParams) {
NamedList result = new NamedList();
return result;

View File

@ -17,151 +17,153 @@ package org.apache.solr.handler.clustering.carrot2;
* limitations under the License.
*/
import java.io.IOException;
import java.util.List;
import org.apache.lucene.search.*;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.clustering.AbstractClusteringTest;
import org.apache.solr.handler.clustering.ClusteringComponent;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.carrot2.util.attribute.AttributeUtils;
import java.io.IOException;
import java.util.List;
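/**
* Tests {@link CarrotClusteringEngine} through the "default", "stc" and "mock" engines
* registered on the test core's "clustering" search component.
*/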
@SuppressWarnings("unchecked")
public class CarrotClusteringEngineTest extends AbstractClusteringTest {
/**
 * Clusters all indexed documents with the engine registered as "default"
 * and expects exactly 9 top-level clusters.
 */
public void testCarrotLingo() throws Exception {
  final CarrotClusteringEngine defaultEngine = getClusteringEngine("default");
  checkEngine(defaultEngine, 9);
}
/**
 * Clusters all indexed documents with the engine registered as "default"
 * and expects exactly 9 top-level clusters.
 */
public void testCarrotLingo() throws Exception {
  final CarrotClusteringEngine defaultEngine = getClusteringEngine("default");
  checkEngine(defaultEngine, 9);
}
/**
 * Clusters all indexed documents with the engine registered as "stc"
 * and expects exactly 2 top-level clusters.
 */
public void testCarrotStc() throws Exception {
  final CarrotClusteringEngine stcEngine = getClusteringEngine("stc");
  checkEngine(stcEngine, 2);
}
/**
 * Clusters all indexed documents with the engine registered as "stc"
 * and expects exactly 2 top-level clusters.
 */
public void testCarrotStc() throws Exception {
  final CarrotClusteringEngine stcEngine = getClusteringEngine("stc");
  checkEngine(stcEngine, 2);
}
/**
 * Runs the "mock" engine with no extra parameters: expects one cluster per
 * indexed document, each holding 1 document and 1 label. Sub-clusters are
 * not inspected because the expected sub-cluster count is 0.
 */
public void testWithoutSubclusters() throws Exception {
  final CarrotClusteringEngine mockEngine = getClusteringEngine("mock");
  final List clusters = checkEngine(mockEngine, this.numberOfDocs);
  checkClusters(clusters, 1, 1, 0);
}
/**
 * Runs the "mock" engine with no extra parameters: expects one cluster per
 * indexed document, each holding 1 document and 1 label. Sub-clusters are
 * not inspected because the expected sub-cluster count is 0.
 */
public void testWithoutSubclusters() throws Exception {
  final CarrotClusteringEngine mockEngine = getClusteringEngine("mock");
  final List clusters = checkEngine(mockEngine, this.numberOfDocs);
  checkClusters(clusters, 1, 1, 0);
}
/**
 * Runs the "mock" engine with {@code CarrotParams.OUTPUT_SUB_CLUSTERS} set to
 * true: expects one cluster per indexed document, each holding 1 document,
 * 1 label and 2 sub-clusters.
 */
public void testWithSubclusters() throws Exception {
  final ModifiableSolrParams subclusterParams = new ModifiableSolrParams();
  subclusterParams.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);

  final CarrotClusteringEngine mockEngine = getClusteringEngine("mock");
  final List clusters = checkEngine(mockEngine, this.numberOfDocs, subclusterParams);
  checkClusters(clusters, 1, 1, 2);
}
/**
 * Sets the mock algorithm's "labels" attribute to 5 and
 * {@code CarrotParams.NUM_DESCRIPTIONS} to 3, then expects every returned
 * cluster to hold 1 document and 3 labels.
 */
public void testNumDescriptions() throws Exception {
  final String labelsKey = AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels");

  final ModifiableSolrParams requestParams = new ModifiableSolrParams();
  requestParams.set(labelsKey, 5);
  requestParams.set(CarrotParams.NUM_DESCRIPTIONS, 3);

  final CarrotClusteringEngine mockEngine = getClusteringEngine("mock");
  final List clusters = checkEngine(mockEngine, this.numberOfDocs, requestParams);
  checkClusters(clusters, 1, 3, 0);
}
/**
 * Passes the mock algorithm's "depth" (1) and "labels" (3) attributes as
 * request parameters and expects every returned cluster to hold 1 document
 * and 3 labels, i.e. the "labels" override reached the algorithm.
 */
public void testCarrotAttributePassing() throws Exception {
  final String depthKey = AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth");
  final String labelsKey = AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels");

  final ModifiableSolrParams requestParams = new ModifiableSolrParams();
  requestParams.set(depthKey, 1);
  requestParams.set(labelsKey, 3);

  final CarrotClusteringEngine mockEngine = getClusteringEngine("mock");
  final List clusters = checkEngine(mockEngine, this.numberOfDocs, requestParams);
  checkClusters(clusters, 1, 3, 0);
}
/**
 * Looks up a clustering engine by name on the test core's "clustering"
 * search component.
 *
 * @param engineName name the engine is registered under
 * @return the engine; the test fails if the component or the engine is missing
 */
private CarrotClusteringEngine getClusteringEngine(String engineName) {
  final ClusteringComponent clustering =
      (ClusteringComponent) h.getCore().getSearchComponent("clustering");
  assertNotNull("clustering component should not be null", clustering);

  final String missingEngineMessage =
      "clustering engine for name: " + engineName + " should not be null";
  final CarrotClusteringEngine found =
      (CarrotClusteringEngine) clustering.getSearchClusteringEngines().get(engineName);
  assertNotNull(missingEngineMessage, found);
  return found;
}
/**
 * Runs the "mock" engine with {@code CarrotParams.OUTPUT_SUB_CLUSTERS} set to
 * true: expects one cluster per indexed document, each holding 1 document,
 * 1 label and 2 sub-clusters.
 */
public void testWithSubclusters() throws Exception {
  final ModifiableSolrParams subclusterParams = new ModifiableSolrParams();
  subclusterParams.set(CarrotParams.OUTPUT_SUB_CLUSTERS, true);

  final CarrotClusteringEngine mockEngine = getClusteringEngine("mock");
  final List clusters = checkEngine(mockEngine, this.numberOfDocs, subclusterParams);
  checkClusters(clusters, 1, 1, 2);
}
/**
 * Convenience overload: runs the engine check with an empty set of extra
 * clustering parameters.
 *
 * @param engine engine under test
 * @param expectedNumClusters number of top-level clusters the engine must return
 * @return the clusters produced by the engine
 */
private List checkEngine(CarrotClusteringEngine engine, int expectedNumClusters)
    throws IOException {
  final ModifiableSolrParams noExtraParams = new ModifiableSolrParams();
  return checkEngine(engine, expectedNumClusters, noExtraParams);
}
/**
 * Sets the mock algorithm's "labels" attribute to 5 and
 * {@code CarrotParams.NUM_DESCRIPTIONS} to 3, then expects every returned
 * cluster to hold 1 document and 3 labels.
 */
public void testNumDescriptions() throws Exception {
  final String labelsKey = AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels");

  final ModifiableSolrParams requestParams = new ModifiableSolrParams();
  requestParams.set(labelsKey, 5);
  requestParams.set(CarrotParams.NUM_DESCRIPTIONS, 3);

  final CarrotClusteringEngine mockEngine = getClusteringEngine("mock");
  final List clusters = checkEngine(mockEngine, this.numberOfDocs, requestParams);
  checkClusters(clusters, 1, 3, 0);
}
private List checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters, SolrParams clusteringParams) throws IOException {
// Get all documents to cluster
RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
MatchAllDocsQuery query = new MatchAllDocsQuery();
DocList docList;
try {
SolrIndexSearcher searcher = ref.get();
docList = searcher.getDocList(query, (Query) null, new Sort(), 0,
numberOfDocs);
assertEquals("docList size", this.numberOfDocs, docList.matches());
} finally {
ref.decref();
}
/**
 * Passes the mock algorithm's "depth" (1) and "labels" (3) attributes as
 * request parameters and expects every returned cluster to hold 1 document
 * and 3 labels, i.e. the "labels" override reached the algorithm.
 */
public void testCarrotAttributePassing() throws Exception {
  final String depthKey = AttributeUtils.getKey(MockClusteringAlgorithm.class, "depth");
  final String labelsKey = AttributeUtils.getKey(MockClusteringAlgorithm.class, "labels");

  final ModifiableSolrParams requestParams = new ModifiableSolrParams();
  requestParams.set(depthKey, 1);
  requestParams.set(labelsKey, 3);

  final CarrotClusteringEngine mockEngine = getClusteringEngine("mock");
  final List clusters = checkEngine(mockEngine, this.numberOfDocs, requestParams);
  checkClusters(clusters, 1, 3, 0);
}
ModifiableSolrParams solrParams = new ModifiableSolrParams();
solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
solrParams.add(clusteringParams);
/**
 * Looks up a clustering engine by name on the test core's "clustering"
 * search component.
 *
 * @param engineName name the engine is registered under
 * @return the engine; the test fails if the component or the engine is missing
 */
private CarrotClusteringEngine getClusteringEngine(String engineName) {
  final ClusteringComponent clustering =
      (ClusteringComponent) h.getCore().getSearchComponent("clustering");
  assertNotNull("clustering component should not be null", clustering);

  final String missingEngineMessage =
      "clustering engine for name: " + engineName + " should not be null";
  final CarrotClusteringEngine found =
      (CarrotClusteringEngine) clustering.getSearchClusteringEngines().get(engineName);
  assertNotNull(missingEngineMessage, found);
  return found;
}
// Perform clustering
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
List results = (List)engine.cluster(query, docList, req);
req.close();
assertEquals("number of clusters", expectedNumClusters, results.size());
checkClusters(results, false);
return results;
}
/**
 * Convenience overload: runs the engine check with an empty set of extra
 * clustering parameters.
 *
 * @param engine engine under test
 * @param expectedNumClusters number of top-level clusters the engine must return
 * @return the clusters produced by the engine
 */
private List checkEngine(CarrotClusteringEngine engine, int expectedNumClusters)
    throws IOException {
  final ModifiableSolrParams noExtraParams = new ModifiableSolrParams();
  return checkEngine(engine, expectedNumClusters, noExtraParams);
}
/**
 * Applies the count-based cluster check to every entry of {@code results}.
 *
 * @param results clusters to verify; each entry is cast to {@link NamedList}
 * @param expectedDocCount documents every cluster must contain
 * @param expectedLabelCount labels every cluster must contain
 * @param expectedSubclusterCount sub-clusters every cluster must contain
 */
private void checkClusters(List results, int expectedDocCount,
    int expectedLabelCount, int expectedSubclusterCount) {
  for (Object entry : results) {
    checkCluster((NamedList) entry, expectedDocCount, expectedLabelCount,
        expectedSubclusterCount);
  }
}
private List checkEngine(CarrotClusteringEngine engine,
int expectedNumClusters, SolrParams clusteringParams) throws IOException {
// Get all documents to cluster
RefCounted<SolrIndexSearcher> ref = h.getCore().getSearcher();
MatchAllDocsQuery query = new MatchAllDocsQuery();
DocList docList;
try {
SolrIndexSearcher searcher = ref.get();
docList = searcher.getDocList(query, (Query) null, new Sort(), 0,
numberOfDocs);
assertEquals("docList size", this.numberOfDocs, docList.matches());
} finally {
ref.decref();
}
/**
 * Applies the structural cluster check to every entry of {@code results}.
 *
 * @param results clusters to verify; each entry is cast to {@link NamedList}
 * @param hasSubclusters whether each cluster is required to carry sub-clusters
 */
private void checkClusters(List results, boolean hasSubclusters) {
  for (Object entry : results) {
    final NamedList cluster = (NamedList) entry;
    checkCluster(cluster, hasSubclusters);
  }
}
ModifiableSolrParams solrParams = new ModifiableSolrParams();
solrParams.add(CarrotParams.PRODUCE_SUMMARY, "true");
solrParams.add(clusteringParams);
private void checkCluster(NamedList cluster, boolean hasSubclusters) {
List docs = (List)cluster.get("docs");
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
// Perform clustering
LocalSolrQueryRequest req = new LocalSolrQueryRequest(h.getCore(), solrParams);
List results = (List) engine.cluster(query, docList, req);
req.close();
assertEquals("number of clusters", expectedNumClusters, results.size());
checkClusters(results, false);
return results;
}
List labels = (List) cluster.get("labels");
assertNotNull("labels is null but it shouldn't be", labels);
/**
 * Applies the count-based cluster check to every entry of {@code results}.
 *
 * @param results clusters to verify; each entry is cast to {@link NamedList}
 * @param expectedDocCount documents every cluster must contain
 * @param expectedLabelCount labels every cluster must contain
 * @param expectedSubclusterCount sub-clusters every cluster must contain
 */
private void checkClusters(List results, int expectedDocCount,
    int expectedLabelCount, int expectedSubclusterCount) {
  for (Object entry : results) {
    checkCluster((NamedList) entry, expectedDocCount, expectedLabelCount,
        expectedSubclusterCount);
  }
}
if (hasSubclusters) {
List subclusters = (List) cluster.get("clusters");
assertNotNull("subclusters is null but it shouldn't be", subclusters);
}
}
/**
 * Applies the structural cluster check to every entry of {@code results}.
 *
 * @param results clusters to verify; each entry is cast to {@link NamedList}
 * @param hasSubclusters whether each cluster is required to carry sub-clusters
 */
private void checkClusters(List results, boolean hasSubclusters) {
  for (Object entry : results) {
    final NamedList cluster = (NamedList) entry;
    checkCluster(cluster, hasSubclusters);
  }
}
private void checkCluster(NamedList cluster, int expectedDocCount,
int expectedLabelCount, int expectedSubclusterCount) {
checkCluster(cluster, expectedSubclusterCount > 0);
assertEquals("number of docs in cluster", expectedDocCount,
((List) cluster.get("docs")).size());
assertEquals("number of labels in cluster", expectedLabelCount,
((List) cluster.get("labels")).size());
private void checkCluster(NamedList cluster, boolean hasSubclusters) {
List docs = (List) cluster.get("docs");
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
if (expectedSubclusterCount > 0) {
List subclusters = (List) cluster.get("clusters");
assertEquals("numClusters", expectedSubclusterCount, subclusters.size());
assertEquals("number of subclusters in cluster",
expectedSubclusterCount, subclusters.size());
}
}
List labels = (List) cluster.get("labels");
assertNotNull("labels is null but it shouldn't be", labels);
if (hasSubclusters) {
List subclusters = (List) cluster.get("clusters");
assertNotNull("subclusters is null but it shouldn't be", subclusters);
}
}
/**
 * Verifies a single cluster's structure and its exact element counts.
 *
 * <p>The structural overload runs first, so "docs" and "labels" (and
 * "clusters" when sub-clusters are expected) are asserted non-null before
 * their sizes are read. When {@code expectedSubclusterCount} is 0 the
 * "clusters" entry is not inspected at all.
 *
 * @param cluster cluster to verify
 * @param expectedDocCount required size of the "docs" list
 * @param expectedLabelCount required size of the "labels" list
 * @param expectedSubclusterCount required size of the "clusters" list; values
 *        of 0 or less skip the sub-cluster check
 */
private void checkCluster(NamedList cluster, int expectedDocCount,
    int expectedLabelCount, int expectedSubclusterCount) {
  checkCluster(cluster, expectedSubclusterCount > 0);

  assertEquals("number of docs in cluster", expectedDocCount,
      ((List) cluster.get("docs")).size());
  assertEquals("number of labels in cluster", expectedLabelCount,
      ((List) cluster.get("labels")).size());

  if (expectedSubclusterCount > 0) {
    List subclusters = (List) cluster.get("clusters");
    // One assertion is enough: a second check on the same two values can
    // never fail on its own and only hides the more descriptive message.
    assertEquals("number of subclusters in cluster",
        expectedSubclusterCount, subclusters.size());
  }
}
}

View File

@ -1,69 +1,68 @@
package org.apache.solr.handler.clustering.carrot2;
import java.util.List;
import com.google.common.collect.Lists;
import org.carrot2.core.*;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.util.attribute.*;
import org.carrot2.util.attribute.constraint.IntRange;
import com.google.common.collect.Lists;
import java.util.List;
@Bindable(prefix = "MockClusteringAlgorithm")
public class MockClusteringAlgorithm extends ProcessingComponentBase implements
IClusteringAlgorithm {
@Input
@Processing
@Attribute(key = AttributeNames.DOCUMENTS)
private List<Document> documents;
IClusteringAlgorithm {
@Input
@Processing
@Attribute(key = AttributeNames.DOCUMENTS)
private List<Document> documents;
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
@Input
@Processing
@Attribute
@IntRange(min = 1, max = 5)
private int depth = 2;
@Input
@Processing
@Attribute
@IntRange(min = 1, max = 5)
private int depth = 2;
@Input
@Processing
@Attribute
@IntRange(min = 1, max = 5)
private int labels = 1;
@Input
@Processing
@Attribute
@IntRange(min = 1, max = 5)
private int labels = 1;
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
if (documents == null) {
return;
}
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
if (documents == null) {
return;
}
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);
Cluster cluster = createCluster(label.toString(), document);
clusters.add(cluster);
for (int i = 1; i <= depth; i++) {
label.append(".");
label.append(i);
Cluster newCluster = createCluster(label.toString(), document);
cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
cluster = newCluster;
}
documentIndex++;
}
}
int documentIndex = 1;
for (Document document : documents) {
StringBuilder label = new StringBuilder("Cluster " + documentIndex);
Cluster cluster = createCluster(label.toString(), document);
clusters.add(cluster);
for (int i = 1; i <= depth; i++) {
label.append(".");
label.append(i);
Cluster newCluster = createCluster(label.toString(), document);
cluster.addSubclusters(createCluster(label.toString(), document), newCluster);
cluster = newCluster;
}
documentIndex++;
}
}
/**
 * Builds a cluster holding the given documents and {@code labels} phrases
 * of the form {@code labelBase#1}, {@code labelBase#2}, ...
 *
 * @param labelBase prefix shared by all generated phrases
 * @param documents documents to add to the cluster
 * @return the newly created cluster
 */
private Cluster createCluster(String labelBase, Document... documents) {
  final Cluster result = new Cluster();
  for (int index = 0; index < labels; index++) {
    // Phrase numbering is 1-based.
    final String phrase = labelBase + "#" + (index + 1);
    result.addPhrases(phrase);
  }
  result.addDocuments(documents);
  return result;
}
/**
 * Builds a cluster holding the given documents and {@code labels} phrases
 * of the form {@code labelBase#1}, {@code labelBase#2}, ...
 *
 * @param labelBase prefix shared by all generated phrases
 * @param documents documents to add to the cluster
 * @return the newly created cluster
 */
private Cluster createCluster(String labelBase, Document... documents) {
  final Cluster result = new Cluster();
  for (int index = 0; index < labels; index++) {
    // Phrase numbering is 1-based.
    final String phrase = labelBase + "#" + (index + 1);
    result.addPhrases(phrase);
  }
  result.addDocuments(documents);
  return result;
}
}