LUCENE-3622: merge trunk (1212397:1212829)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3622@1212830 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-12-10 18:05:08 +00:00
commit 5627b52e70
22 changed files with 962 additions and 519 deletions

View File

@ -217,7 +217,11 @@ Changes in backwards compatibility policy
instances. Furthermore, IndexReader.setNorm() was removed. If you need
customized norm values, the recommended way to do this is by modifying
SimilarityProvider to use an external byte[] or one of the new DocValues
fields (LUCENE-3108). (Uwe Schindler, Robert Muir)
fields (LUCENE-3108). Alternatively, to dynamically change norms (boost
*and* length norm) at query time, wrap your IndexReader using
FilterIndexReader, overriding FilterIndexReader.norms(). To persist the
changes on disk, copy the FilteredIndexReader to a new index using
IndexWriter.addIndexes(). (Uwe Schindler, Robert Muir)
Changes in Runtime Behavior
@ -676,6 +680,22 @@ Security fixes
prevents this as best as it can by throwing AlreadyClosedException
also on clones. (Uwe Schindler, Robert Muir)
API Changes
* LUCENE-3606: IndexReader will be made read-only in Lucene 4.0, so all
methods that allow deleting or undeleting documents via IndexReader were
deprecated; you should use IndexWriter now. Consequently
IndexReader.commit() and all open(), openIfChanged(), clone() methods
taking readOnly booleans (or IndexDeletionPolicy instances) were
deprecated. IndexReader.setNorm() is superfluous and was deprecated.
If you have to change per-document boosts, use CustomScoreQuery.
If you want to dynamically change norms (boost *and* length norm) at
query time, wrap your IndexReader using FilterIndexReader, overriding
FilterIndexReader.norms(). To persist the changes on disk, copy the
FilteredIndexReader to a new index using IndexWriter.addIndexes().
In Lucene 4.0, SimilarityProvider will allow you to customize scoring
using external norms, too. (Uwe Schindler, Robert Muir)
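For illustration, a minimal sketch of the recommended query-time approach (the field name "content" and the source of the custom norms array are assumptions for this example, not part of this change):

import java.io.IOException;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;

// Wraps a reader to serve externally computed norms for one field.
public class CustomNormsReader extends FilterIndexReader {
  private final byte[] customNorms; // one norm byte per document, assumed precomputed

  public CustomNormsReader(IndexReader in, byte[] customNorms) {
    super(in);
    this.customNorms = customNorms;
  }

  @Override
  public byte[] norms(String field) throws IOException {
    // Serve custom norms for the "content" field; delegate for all others.
    return "content".equals(field) ? customNorms : super.norms(field);
  }
}

To persist the modified norms, copy the wrapped view to a new index, e.g. writer.addIndexes(new CustomNormsReader(reader, myNorms)). Note that IndexReader also exposes a norms(String, byte[], int) overload; depending on usage it may need the same treatment.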
New Features
* LUCENE-3593: Added a FieldValueFilter that accepts all documents that either
@ -691,6 +711,12 @@ Bug fixes
* LUCENE-3627: Don't let an errant 0-byte segments_N file corrupt the index.
(Ken McCracken via Mike McCandless)
* LUCENE-3630: The internal method MultiReader.doOpenIfChanged(boolean doClone)
was overriding IndexReader.doOpenIfChanged(boolean readOnly), thereby changing
the contract of the overridden method. This method was renamed and made private.
ParallelReader did not have this bug, but its implementation method was also
made private. (Uwe Schindler)
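The underlying pitfall is plain Java method resolution: parameter names are not part of a method's signature, so a subclass method with the same name and parameter types silegally overrides the superclass method even when the boolean means something different. A minimal illustration (class and method names are only for demonstration):

class Base {
  // The boolean here means "readOnly".
  protected String doOpenIfChanged(boolean readOnly) {
    return "base, readOnly=" + readOnly;
  }
}

class Sub extends Base {
  // Intended as an unrelated internal method whose boolean means "doClone",
  // but Java resolves it as an override of Base.doOpenIfChanged(boolean).
  @Override
  protected String doOpenIfChanged(boolean doClone) {
    return "sub, doClone=" + doClone;
  }
}

Renaming the subclass method (to doReopen) and making it private, as the diff below does, removes the accidental override.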
Documentation
* LUCENE-3597: Fixed incorrect grouping documentation. (Martijn van Groningen, Robert Muir)

View File

@ -76,6 +76,11 @@ API Changes
* LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.
* LUCENE-3606: FieldNormModifier was deprecated, because IndexReader's
setNorm() was deprecated. Furthermore, this class is broken, as it does
not take position overlaps into account while recalculating norms.
(Uwe Schindler, Robert Muir)
Bug Fixes
* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was

View File

@ -108,7 +108,7 @@ public class MultiReader extends IndexReader implements Cloneable {
*/
@Override
protected synchronized IndexReader doOpenIfChanged() throws CorruptIndexException, IOException {
return doOpenIfChanged(false);
return doReopen(false);
}
/**
@ -123,7 +123,7 @@ public class MultiReader extends IndexReader implements Cloneable {
@Override
public synchronized Object clone() {
try {
return doOpenIfChanged(true);
return doReopen(true);
} catch (Exception ex) {
throw new RuntimeException(ex);
}
@ -141,7 +141,7 @@ public class MultiReader extends IndexReader implements Cloneable {
* @throws CorruptIndexException
* @throws IOException
*/
protected IndexReader doOpenIfChanged(boolean doClone) throws CorruptIndexException, IOException {
private IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
ensureOpen();
boolean changed = false;

View File

@ -243,7 +243,7 @@ public class ParallelReader extends IndexReader {
return doReopen(false);
}
protected IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
private IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
ensureOpen();
boolean reopened = false;

View File

@ -30,11 +30,37 @@ import org.apache.lucene.util.LuceneTestCase;
*/
public class TestIndexReaderClone extends LuceneTestCase {
private void assertDelDocsRefCountEquals(int refCount, SegmentReader reader) {
assertEquals(refCount, reader.liveDocsRef.get());
public void testDirectoryReader() throws Exception {
final Directory dir = createIndex(0);
performDefaultTests(IndexReader.open(dir));
dir.close();
}
public void testCloseStoredFields() throws Exception {
public void testMultiReader() throws Exception {
final Directory dir1 = createIndex(0);
final IndexReader r1 = IndexReader.open(dir1);
final Directory dir2 = createIndex(0);
final IndexReader r2 = IndexReader.open(dir2);
final MultiReader mr = new MultiReader(r1, r2);
performDefaultTests(mr);
dir1.close();
dir2.close();
}
public void testParallelReader() throws Exception {
final Directory dir1 = createIndex(0);
final IndexReader r1 = IndexReader.open(dir1);
final Directory dir2 = createIndex(1);
final IndexReader r2 = IndexReader.open(dir2);
final ParallelReader pr = new ParallelReader();
pr.add(r1);
pr.add(r2);
performDefaultTests(pr);
dir1.close();
dir2.close();
}
private Directory createIndex(int no) throws Exception {
final Directory dir = newDirectory();
IndexWriter w = new IndexWriter(
dir,
@ -42,13 +68,19 @@ public class TestIndexReaderClone extends LuceneTestCase {
setMergePolicy(newLogMergePolicy(false))
);
Document doc = new Document();
doc.add(newField("field", "yes it's stored", TextField.TYPE_STORED));
doc.add(newField("field"+no, "yes it's stored", TextField.TYPE_STORED));
w.addDocument(doc);
w.close();
IndexReader r1 = IndexReader.open(dir);
return dir;
}
private void performDefaultTests(IndexReader r1) throws Exception {
IndexReader r2 = (IndexReader) r1.clone();
assertTrue(r1 != r2);
TestIndexReader.assertIndexEquals(r1, r2);
r1.close();
r2.close();
dir.close();
TestIndexReaderReopen.assertReaderClosed(r1, true, true);
TestIndexReaderReopen.assertReaderClosed(r2, true, true);
}
}

View File

@ -757,7 +757,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
}
}
private void assertReaderClosed(IndexReader reader, boolean checkSubReaders, boolean checkNormsClosed) {
static void assertReaderClosed(IndexReader reader, boolean checkSubReaders, boolean checkNormsClosed) {
assertEquals(0, reader.getRefCount());
if (checkNormsClosed && reader instanceof SegmentReader) {

View File

@ -195,6 +195,8 @@ New Features
"multiterm" analyzer in our schema.xml, but Solr should "do the right thing" if you don't
specify <fieldType="multiterm"> (Pete Sturge Erick Erickson, Mentoring from Seeley and Muir)
* SOLR-2481: Add support for commitWithin in DataImportHandler (Sami Siren via yonik)
Optimizations
----------------------
@ -409,6 +411,7 @@ New Features
* SOLR-2919: Added support for localized range queries when the analysis chain uses
CollationKeyFilter or ICUCollationKeyFilter. (Michael Sokolov, rmuir)
Bug Fixes
----------------------
* SOLR-2912: Fixed File descriptor leak in ShowFileRequestHandler (Michael Ryan, shalin)
@ -418,6 +421,9 @@ Bug Fixes
* SOLR-2509: StringIndexOutOfBoundsException in the spellchecker collate when the term contains
a hyphen. (Thomas Gambier caught the bug, Steffen Godskesen did the patch, via Erick Erickson)
* SOLR-2955: Fixed IllegalStateException when querying with group.sort=score desc in sharded
environment. (Steffen Elberg Godskesen, Martijn van Groningen)
Other Changes
----------------------

View File

@ -13,7 +13,27 @@ $Id$
================== Release 3.6.0 ==================
(No Changes)
* SOLR-2937: Configuring the number of contextual snippets used for
search results clustering. The hl.snippets parameter is now respected
by the clustering plugin and can be overridden by carrot.summarySnippets
if needed (Stanislaw Osinski).
* SOLR-2938: Clustering on multiple fields. The carrot.title and
carrot.snippet parameters can now take comma- or space-separated lists of
field names to cluster (Stanislaw Osinski).
* SOLR-2939: Clustering of multilingual search results. The document's
language field can be passed in the carrot.lang parameter; the carrot.lcmap
parameter enables mapping of language codes to ISO 639 (Stanislaw Osinski).
* SOLR-2940: Passing values for custom Carrot2 fields. The custom field
mappings are defined using the carrot.custom parameter (Stanislaw Osinski).
* SOLR-2941: NullPointerException on clustering component initialization
when schema does not have a unique key field (Stanislaw Osinski).
* SOLR-2942: ClassCastException when passing non-textual fields for
clustering (Stanislaw Osinski).
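As a rough sketch of how these new options combine on the query side (field names and mapping values below are illustrative, not taken from this commit):

import org.apache.solr.common.params.ModifiableSolrParams;

ModifiableSolrParams params = new ModifiableSolrParams();
params.set("clustering", "true");
params.set("carrot.title", "title,heading");        // SOLR-2938: multiple title fields
params.set("carrot.snippet", "snippet,body");       // SOLR-2938: multiple content fields
params.set("carrot.lang", "lang");                  // SOLR-2939: per-document language field
params.set("carrot.lcmap", "chinese:zh_cn");        // SOLR-2939: map custom codes to ISO 639
params.add("carrot.custom", "intfield_i:intField"); // SOLR-2940: solrField:carrot2Field
params.set("carrot.summarySnippets", "2");          // SOLR-2937: snippets per summary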
================== Release 3.5.0 ==================
@ -21,10 +41,10 @@ $Id$
================== Release 3.4.0 ==================
* SOLR-2706: The carrot.lexicalResourcesDir parameter now works
with absolute directories (Stanislaw Osinski)

* SOLR-2692: Typo in param name fixed: "carrot.fragzise" changed to
"carrot.fragSize" (Stanislaw Osinski).
================== Release 3.3.0 ==================

View File

@ -19,15 +19,18 @@ package org.apache.solr.handler.clustering.carrot2;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.ObjectUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrDocument;
@ -45,6 +48,7 @@ import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.highlight.SolrHighlighter;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocSlice;
import org.apache.solr.search.SolrIndexSearcher;
@ -54,6 +58,7 @@ import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
@ -77,13 +82,13 @@ import com.google.common.io.Closeables;
* @see "http://project.carrot2.org"
*/
public class CarrotClusteringEngine extends SearchClusteringEngine {
private transient static Logger log = LoggerFactory
.getLogger(CarrotClusteringEngine.class);
/**
* The subdirectory in Solr config dir to read customized Carrot2 resources from.
*/
private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";
/**
* Name of Carrot2 document's field containing Solr document's identifier.
@ -102,7 +107,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
*/
private Controller controller = ControllerFactory.createPooling();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
private static class SolrResourceLocator implements IResourceLocator {
private final SolrResourceLoader resourceLoader;
private final String carrot2ResourcesDir;
@ -227,8 +232,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
}
}
@Override
@SuppressWarnings({ "unchecked", "rawtypes" })
public String init(NamedList config, final SolrCore core) {
String result = super.init(config, core);
final SolrParams initParams = SolrParams.toSolrParams(config);
@ -243,13 +248,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Additionally, we set a custom lexical resource factory for Carrot2 that
// will use both Carrot2 default stop words as well as stop words from
// the StopFilter defined on the field.
BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
.stemmerFactory(LuceneCarrot2StemmerFactory.class)
.tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
// Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
initAttributes.put("solrIndexSchema", core.getSchema());
// Customize Carrot2's resource lookup to first look for resources
// using Solr's resource loader. If that fails, try loading from the classpath.
@ -261,7 +266,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));
this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
SchemaField uniqueField = core.getSchema().getUniqueKeyField();
if (uniqueField == null) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
CarrotClusteringEngine.class.getSimpleName() + " requires the schema to have a uniqueKeyField");
}
this.idFieldName = uniqueField.getName();
// Make sure the requested Carrot2 clustering algorithm class is available
String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
@ -283,25 +294,35 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
fields.add(idFieldName);
fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
return fields;
fields.addAll(getCustomFieldsMap(solrParams).keySet());
String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME);
if (StringUtils.isNotBlank(languageField)) {
fields.add(languageField);
}
return fields;
}
/**
* Returns the names of fields that will be delivering the actual
* content for clustering. Currently, there are two such fields: document
* title and document content.
*/
private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
SolrParams solrParams = sreq.getParams();
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
if (StringUtils.isBlank(snippetField)) {
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
if (StringUtils.isBlank(snippetFieldSpec)) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
+ " must not be blank.");
}
return Sets.newHashSet(titleField, snippetField);
}
final Set<String> fields = Sets.newHashSet();
fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
return fields;
}
/**
* Prepares Carrot2 documents for clustering.
@ -313,8 +334,27 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
SolrCore core = sreq.getCore();
String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);
// Maps Solr field names to Carrot2 custom field names
Map<String, String> customFields = getCustomFieldsMap(solrParams);
// Parse language code map string into a map
Map<String, String> languageCodeMap = Maps.newHashMap();
if (StringUtils.isNotBlank(languageField)) {
for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "")
.split("[, ]")) {
final String[] split = pair.split(":");
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
languageCodeMap.put(split[0], split[1]);
} else {
log.warn("Unsupported format for " + CarrotParams.LANGUAGE_CODE_MAP
+ ": '" + pair + "'. Skipping this mapping.");
}
}
}
// Get the documents
boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);
@ -325,12 +365,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
highlighter = HighlightComponent.getHighlighter(core);
if (highlighter != null){
Map<String, Object> args = Maps.newHashMap();
snippetFieldAry = new String[]{snippetField};
snippetFieldAry = snippetFieldSpec.split("[, ]");
args.put(HighlightParams.FIELDS, snippetFieldAry);
args.put(HighlightParams.HIGHLIGHT, "true");
args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
args.put(HighlightParams.SIMPLE_POST, "");
args.put(HighlightParams.FRAGSIZE, solrParams.getInt(CarrotParams.SUMMARY_FRAGSIZE, solrParams.getInt(HighlightParams.FRAGSIZE, 100)));
args.put(HighlightParams.SNIPPETS, solrParams.getInt(CarrotParams.SUMMARY_SNIPPETS, solrParams.getInt(HighlightParams.SNIPPETS, 1)));
req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
@Override
public SolrIndexSearcher getSearcher() {
@ -352,7 +393,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
while (docsIter.hasNext()) {
SolrDocument sdoc = docsIter.next();
String snippet = getValue(sdoc, snippetField);
String snippet = null;
// TODO: docIds will be null when running distributed search.
// See comment in ClusteringComponent#finishStage().
if (produceSummary && docIds != null) {
@ -360,34 +402,115 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
//should only be one document with one field
//should only be one document
@SuppressWarnings("unchecked")
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
String [] highlt = tmp.get(snippetField);
if (highlt != null && highlt.length == 1) {
snippet = highlt[0];
NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
final StringBuilder sb = new StringBuilder();
for (int j = 0; j < snippetFieldAry.length; j++) {
// Join fragments with a period, so that Carrot2 does not create
// cross-fragment phrases; such phrases rarely make sense.
String [] highlt = tmp.get(snippetFieldAry[j]);
if (highlt != null && highlt.length > 0) {
for (int i = 0; i < highlt.length; i++) {
sb.append(highlt[i]);
sb.append(" . ");
}
}
}
snippet = sb.toString();
}
}
// If summaries not enabled or summary generation failed, use full content.
if (snippet == null) {
snippet = getConcatenated(sdoc, snippetFieldSpec);
}
// Create a Carrot2 document
Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
snippet, ObjectUtils.toString(sdoc.getFieldValue(urlField), ""));
// Store Solr id of the document, we need it to map document instances
// found in clusters back to identifiers.
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
// Set language
if (StringUtils.isNotBlank(languageField)) {
Collection<Object> languages = sdoc.getFieldValues(languageField);
if (languages != null) {
// Use the first Carrot2-supported language
for (Object l : languages) {
String lang = ObjectUtils.toString(l, "");
if (languageCodeMap.containsKey(lang)) {
lang = languageCodeMap.get(lang);
}
// Language detection Library for Java uses dashes to separate
// language variants, such as 'zh-cn', but Carrot2 uses underscores.
if (lang.indexOf('-') > 0) {
lang = lang.replace('-', '_');
}
// If the language is supported by Carrot2, we'll get a non-null value
final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
if (carrot2Language != null) {
carrotDocument.setLanguage(carrot2Language);
break;
}
}
}
}
Document carrotDocument = new Document(getValue(sdoc, titleField),
snippet, (String)sdoc.getFieldValue(urlField));
carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
// Add custom fields
if (customFields != null) {
for (Entry<String, String> entry : customFields.entrySet()) {
carrotDocument.setField(entry.getValue(), sdoc.getFieldValue(entry.getKey()));
}
}
result.add(carrotDocument);
}
return result;
}
protected String getValue(SolrDocument sdoc, String field) {
/**
* Prepares a map of Solr field names (keys) to the corresponding Carrot2
* custom field names.
*/
private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
Map<String, String> customFields = Maps.newHashMap();
String [] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
if (customFieldsSpec != null) {
customFields = Maps.newHashMap();
for (String customFieldSpec : customFieldsSpec) {
String [] split = customFieldSpec.split(":");
if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
customFields.put(split[0], split[1]);
} else {
log.warn("Unsupported format for " + CarrotParams.CUSTOM_FIELD_NAME
+ ": '" + customFieldSpec + "'. Skipping this field definition.");
}
}
}
return customFields;
}
private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
StringBuilder result = new StringBuilder();
Collection<Object> vals = sdoc.getFieldValues(field);
if(vals == null) return "";
Iterator<Object> ite = vals.iterator();
while(ite.hasNext()){
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append((String)ite.next()).append(" . ");
for (String field : fieldsSpec.split("[, ]")) {
Collection<Object> vals = sdoc.getFieldValues(field);
if (vals == null) continue;
Iterator<Object> ite = vals.iterator();
while(ite.hasNext()){
// Join multiple values with a period so that Carrot2 does not pick up
// phrases that cross field value boundaries (in most cases it would
// create useless phrases).
result.append(ObjectUtils.toString(ite.next())).append(" . ");
}
}
return result.toString().trim();
}

View File

@ -27,17 +27,24 @@ public interface CarrotParams {
String CARROT_PREFIX = "carrot.";
String ALGORITHM = CARROT_PREFIX + "algorithm";
String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
String URL_FIELD_NAME = CARROT_PREFIX + "url";
String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
String LANGUAGE_FIELD_NAME = CARROT_PREFIX + "lang";
String CUSTOM_FIELD_NAME = CARROT_PREFIX + "custom";
String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
String SUMMARY_SNIPPETS = CARROT_PREFIX + "summarySnippets";
String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";
public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME, LANGUAGE_FIELD_NAME,
PRODUCE_SUMMARY, SUMMARY_FRAGSIZE, SUMMARY_SNIPPETS, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS,
LEXICAL_RESOURCES_DIR);
}

View File

@ -50,192 +50,192 @@ import org.tartarus.snowball.ext.TurkishStemmer;
* in this class.
*/
public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(LuceneCarrot2StemmerFactory.class);

  @Override
  public IStemmer getStemmer(LanguageCode language) {
    switch (language) {
      case ARABIC:
        return ArabicStemmerFactory.createStemmer();

      case CHINESE_SIMPLIFIED:
        return IdentityStemmer.INSTANCE;

      default:
        /*
         * For other languages, try to use snowball's stemming.
         */
        return SnowballStemmerFactory.createStemmer(language);
    }
  }

  /**
   * Factory of {@link IStemmer} implementations from the <code>snowball</code>
   * project.
   */
  private final static class SnowballStemmerFactory {
    /**
     * Static hard mapping from language codes to stemmer classes in Snowball.
     * This mapping is not dynamic because we want to keep the possibility to
     * obfuscate these classes.
     */
    private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
    static {
      snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
      snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
      snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
      snowballStemmerClasses.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.PORTUGUESE, PortugueseStemmer.class);
      snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
      snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
      snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
    }

    /**
     * An adapter converting Snowball programs into {@link IStemmer} interface.
     */
    private static class SnowballStemmerAdapter implements IStemmer {
      private final SnowballProgram snowballStemmer;

      public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
        this.snowballStemmer = snowballStemmer;
      }

      public CharSequence stem(CharSequence word) {
        snowballStemmer.setCurrent(word.toString());
        if (snowballStemmer.stem()) {
          return snowballStemmer.getCurrent();
        } else {
          return null;
        }
      }
    }

    /**
     * Create and return an {@link IStemmer} adapter for a
     * {@link SnowballProgram} for a given language code. An identity stemmer is
     * returned for unknown languages.
     */
    public static IStemmer createStemmer(LanguageCode language) {
      final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
          .get(language);

      if (stemmerClazz == null) {
        logger.warn("No Snowball stemmer class for: " + language.name()
            + ". Quality of clustering may be degraded.");
        return IdentityStemmer.INSTANCE;
      }

      try {
        return new SnowballStemmerAdapter(stemmerClazz.newInstance());
      } catch (Exception e) {
        logger.warn("Could not instantiate snowball stemmer"
            + " for language: " + language.name()
            + ". Quality of clustering may be degraded.", e);

        return IdentityStemmer.INSTANCE;
      }
    }
  }

  /**
   * Factory of {@link IStemmer} implementations for the
   * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
   * to be present in classpath, otherwise an empty (identity) stemmer is
   * returned.
   */
  private static class ArabicStemmerFactory {
    static {
      try {
        ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
        ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
      } catch (ClassNotFoundException e) {
        logger.warn(
            "Could not instantiate Lucene stemmer for Arabic, clustering quality "
                + "of Arabic content may be degraded. For best quality clusters, "
                + "make sure Lucene's Arabic analyzer JAR is in the classpath",
            e);
      }
    }

    /**
     * Adapter to lucene-contrib Arabic analyzers.
     */
    private static class LuceneStemmerAdapter implements IStemmer {
      private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
      private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;

      private char[] buffer = new char[0];

      private LuceneStemmerAdapter() throws Exception {
        delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
        normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
      }

      public CharSequence stem(CharSequence word) {
        if (word.length() > buffer.length) {
          buffer = new char[word.length()];
        }

        for (int i = 0; i < word.length(); i++) {
          buffer[i] = word.charAt(i);
        }

        int newLen = normalizer.normalize(buffer, word.length());
        newLen = delegate.stem(buffer, newLen);

        if (newLen != word.length() || !equals(buffer, newLen, word)) {
          return CharBuffer.wrap(buffer, 0, newLen);
        }

        // Same-same.
        return null;
      }

      private boolean equals(char[] buffer, int len, CharSequence word) {
        assert len == word.length();

        for (int i = 0; i < len; i++) {
          if (buffer[i] != word.charAt(i))
            return false;
        }

        return true;
      }
    }

    public static IStemmer createStemmer() {
      try {
        return new LuceneStemmerAdapter();
      } catch (Throwable e) {
        return IdentityStemmer.INSTANCE;
      }
    }
  }

  /**
   * An implementation of {@link IStemmer} that always returns <code>null</code>
   * which means no stemming.
   */
  private static class IdentityStemmer implements IStemmer {
    private final static IdentityStemmer INSTANCE = new IdentityStemmer();

    @Override
    public CharSequence stem(CharSequence word) {
      return null;
    }
  }
}

View File

@ -40,117 +40,117 @@ import org.slf4j.Logger;
* Lucene APIs need to change, the changes can be made in this class.
*/
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(LuceneCarrot2TokenizerFactory.class);

  @Override
  public ITokenizer getTokenizer(LanguageCode language) {
    switch (language) {
      case CHINESE_SIMPLIFIED:
        return ChineseTokenizerFactory.createTokenizer();

      /*
       * We use our own analyzer for Arabic. Lucene's version has special
       * support for Nonspacing-Mark characters (see
       * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
       * have them included as letters in the parser.
       */
      case ARABIC:
        // Intentional fall-through.

      default:
        return new ExtendedWhitespaceTokenizer();
    }
  }

  /**
   * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
   * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
   * factory will fall back to the default white space tokenizer.
   */
  private static final class ChineseTokenizerFactory {
    static {
      try {
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
        ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
      } catch (Throwable e) {
        logger.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
            + "of Chinese content may be degraded. For best quality clusters, "
            + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
      }
    }

    static ITokenizer createTokenizer() {
      try {
        return new ChineseTokenizer();
      } catch (Throwable e) {
        return new ExtendedWhitespaceTokenizer();
      }
    }

    private final static class ChineseTokenizer implements ITokenizer {
      private final static Pattern numeric = Pattern
          .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");

      private Tokenizer sentenceTokenizer;
      private TokenStream wordTokenFilter;
      private CharTermAttribute term = null;

      private final MutableCharArray tempCharSequence;
      private final Class<?> tokenFilterClass;

      private ChineseTokenizer() throws Exception {
        this.tempCharSequence = new MutableCharArray(new char[0]);

        // As Smart Chinese is not available during compile time,
        // we need to resort to reflection.
        final Class<?> tokenizerClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
        this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
            Reader.class).newInstance((Reader) null);
        this.tokenFilterClass = ReflectionUtils.classForName(
            "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
      }

      public short nextToken() throws IOException {
        final boolean hasNextToken = wordTokenFilter.incrementToken();
        if (hasNextToken) {
          short flags = 0;
          final char[] image = term.buffer();
          final int length = term.length();
          tempCharSequence.reset(image, 0, length);
          if (length == 1 && image[0] == ',') {
            // ChineseTokenizer seems to convert all punctuation to ','
            // characters
            flags = ITokenizer.TT_PUNCTUATION;
          } else if (numeric.matcher(tempCharSequence).matches()) {
            flags = ITokenizer.TT_NUMERIC;
          } else {
            flags = ITokenizer.TT_TERM;
          }
          return flags;
        }

        return ITokenizer.TT_EOF;
      }

      public void setTermBuffer(MutableCharArray array) {
        array.reset(term.buffer(), 0, term.length());
      }

      public void reset(Reader input) throws IOException {
        try {
          sentenceTokenizer.reset(input);
          wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
              TokenStream.class).newInstance(sentenceTokenizer);
          term = wordTokenFilter.addAttribute(CharTermAttribute.class);
        } catch (Exception e) {
          throw ExceptionUtils.wrapAsRuntimeException(e);
        }
      }
    }
  }
}

View File

@ -53,89 +53,89 @@ import com.google.common.collect.Multimap;
*/
@Bindable
public class SolrStopwordsCarrot2LexicalDataFactory implements
    ILexicalDataFactory {
  final static Logger logger = org.slf4j.LoggerFactory
      .getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);

  @Init
  @Input
  @Attribute(key = "solrIndexSchema")
  private IndexSchema schema;

  @Processing
  @Input
  @Attribute(key = "solrFieldNames")
  private Set<String> fieldNames;

  /**
   * A lazily-built cache of stop words per field.
   */
  private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();

  /**
   * Carrot2's default lexical resources to use in addition to Solr's stop
   * words.
   */
  private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();

  /**
   * Obtains stop words for a field from the associated
   * {@link StopFilterFactory}, if any.
   */
  private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
    // No need to synchronize here, Carrot2 ensures that instances
    // of this class are not used by multiple threads at a time.
    if (!solrStopWords.containsKey(fieldName)) {
      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
          .getAnalyzer();
      if (fieldAnalyzer instanceof TokenizerChain) {
        final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
            .getTokenFilterFactories();
        for (TokenFilterFactory factory : filterFactories) {
          if (factory instanceof StopFilterFactory) {
            // StopFilterFactory holds the stop words in a CharArraySet, but
            // the getStopWords() method returns a Set<?>, so we need to cast.
            solrStopWords.put(fieldName,
                (CharArraySet) ((StopFilterFactory) factory).getStopWords());
          }

          if (factory instanceof CommonGramsFilterFactory) {
            solrStopWords.put(fieldName,
                (CharArraySet) ((CommonGramsFilterFactory) factory)
                    .getCommonWords());
          }
        }
      }
    }
    return solrStopWords.get(fieldName);
  }

  @Override
  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
        .getLexicalData(languageCode);

    return new ILexicalData() {
      @Override
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }

      @Override
      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first
        for (String fieldName : fieldNames) {
          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
            if (stopWords.contains(word)) {
              return true;
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }
}

View File

@ -280,8 +280,10 @@
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="url" type="string" indexed="true" stored="true" required="true" />
<field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true" />
<field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="body" type="text" indexed="true" stored="true" multiValued="true"/>
<!-- catchall field, containing all other searchable text fields (implemented

View File

@ -17,6 +17,7 @@ package org.apache.solr.handler.clustering;
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.junit.BeforeClass;
@ -34,6 +35,54 @@ public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
assertNull(h.validateUpdate(adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2])));
numberOfDocs++;
}
// Add a multi-valued snippet
final SolrInputDocument multiValuedSnippet = new SolrInputDocument();
multiValuedSnippet.addField("id", numberOfDocs++);
multiValuedSnippet.addField("title", "Title");
multiValuedSnippet.addField("url", "URL");
multiValuedSnippet.addField("snippet", "First value of multi field. Some more text. And still more.");
multiValuedSnippet.addField("snippet", "Second value of multi field. Some more text. And still more.");
multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
assertNull(h.validateUpdate(adoc(multiValuedSnippet)));
// Add a document with multi-field title and snippet
final SolrInputDocument multiFieldDoc = new SolrInputDocument();
multiFieldDoc.addField("id", numberOfDocs++);
multiFieldDoc.addField("title", "Title field");
multiFieldDoc.addField("heading", "Heading field");
multiFieldDoc.addField("url", "URL");
multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
assertNull(h.validateUpdate(adoc(multiFieldDoc)));
// Add a document with one language supported by Carrot2
final SolrInputDocument docWithOneSupportedLanguage = new SolrInputDocument();
docWithOneSupportedLanguage.addField("id", numberOfDocs++);
docWithOneSupportedLanguage.addField("title", "");
docWithOneSupportedLanguage.addField("url", "one_supported_language");
docWithOneSupportedLanguage.addField("lang", "zh-cn");
assertNull(h.validateUpdate(adoc(docWithOneSupportedLanguage)));
// Add a document with more languages, one supported by Carrot2
final SolrInputDocument docWithOneSupportedLanguageOfMany = new SolrInputDocument();
docWithOneSupportedLanguageOfMany.addField("id", numberOfDocs++);
docWithOneSupportedLanguageOfMany.addField("url", "one_supported_language_of_many");
docWithOneSupportedLanguageOfMany.addField("lang", "zh-tw");
docWithOneSupportedLanguageOfMany.addField("lang", "POLISH");
docWithOneSupportedLanguageOfMany.addField("lang", "de");
assertNull(h.validateUpdate(adoc(docWithOneSupportedLanguageOfMany)));
// Add a document with custom fields
final SolrInputDocument docWithCustomFields = new SolrInputDocument();
docWithCustomFields.addField("id", numberOfDocs++);
docWithCustomFields.addField("url", "custom_fields");
docWithCustomFields.addField("intfield_i", 10);
docWithCustomFields.addField("floatfield_f", 10.5);
docWithCustomFields.addField("heading", "first");
docWithCustomFields.addField("heading", "second");
assertNull(h.validateUpdate(adoc(docWithCustomFields)));
assertNull(h.validateUpdate(commit()));
}

View File

@ -39,6 +39,7 @@ import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.core.LanguageCode;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
@ -50,10 +51,10 @@ import com.google.common.collect.ImmutableList;
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
@Test
public void testCarrotLingo() throws Exception {
// Note: the expected number of clusters may change after upgrading Carrot2
// due to e.g. internal improvements or tuning of Carrot2 clustering.
final int expectedNumClusters = 10;
checkEngine(getClusteringEngine("default"), expectedNumClusters);
}
@Test
@ -88,10 +89,15 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
private List<NamedList<Object>> clusterWithHighlighting(
boolean enableHighlighting, int fragSize) throws IOException {
// Some documents don't have mining in the snippet
return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 7);
}
private List<NamedList<Object>> clusterWithHighlighting(
boolean enableHighlighting, int fragSize, int summarySnippets,
String term, int expectedNumDocuments) throws IOException {
final TermQuery query = new TermQuery(new Term("snippet", "mine"));
// Two documents don't have mining in the snippet
int expectedNumDocuments = numberOfDocs - 2;
final TermQuery query = new TermQuery(new Term("snippet", term));
final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
@ -99,6 +105,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
Boolean.toString(enableHighlighting));
summaryParams
.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
summaryParams
.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(summarySnippets));
final List<NamedList<Object>> summaryClusters = checkEngine(
getClusteringEngine("echo"), expectedNumDocuments,
expectedNumDocuments, query, summaryParams);
@ -169,66 +177,180 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
params), 1, 3, 0);
}
@Test
public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
  checkLexicalResourcesFromSolrConfig("lexical-resource-check",
      "online,customsolrstopword,customsolrstoplabel");
}

@Test
public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
  checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
      "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
}
private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
    throws IOException {
  ModifiableSolrParams params = new ModifiableSolrParams();
  params.set("merge-resources", false);
  params.set(AttributeUtils.getKey(
      LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
      wordsToCheck);

  // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
  // stoplabels.mt, so we're expecting only one cluster with label "online".
  final List<NamedList<Object>> clusters = checkEngine(
      getClusteringEngine(engineName), 1, params);
  assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
}
@Test
public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
"online,solrownstopword");
// "solrownstopword" is in stopwords.txt, so we're expecting
// only one cluster with label "online".
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 1, params);
assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
}
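For reference, a minimal sketch of the lexical-data lookup these stop-word tests ultimately exercise, reusing the Carrot2 types that appear in LexicalResourcesCheckClusteringAlgorithm further down (the word and language here are illustrative):

// Ask Carrot2 whether a merged Solr stop word now counts as a common word.
ILexicalData lexicalData = new BasicPreprocessingPipeline().lexicalDataFactory
    .getLexicalData(LanguageCode.ENGLISH);
boolean stopped = lexicalData.isCommonWord(new MutableCharArray("solrownstopword"));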
@Test
public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
ModifiableSolrParams params = new ModifiableSolrParams();
// Force string fields to be used for clustering. Does not make sense
// in the real world, but it does the job in the test.
params.set(CarrotParams.TITLE_FIELD_NAME, "url");
params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
params.set("merge-resources", false);
params.set(AttributeUtils.getKey(
LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
"online,solrownstopword");
final List<NamedList<Object>> clusters = checkEngine(
getClusteringEngine("lexical-resource-check"), 2, params);
assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
assertEquals(ImmutableList.of("solrownstopword"),
getLabels(clusters.get(1)));
}
@Test
public void highlightingOfMultiValueField() throws Exception {
final String snippetWithoutSummary = getLabels(clusterWithHighlighting(
false, 30, 3, "multi", 1).get(0)).get(1);
assertTrue("Snippet contains first value", snippetWithoutSummary.contains("First"));
assertTrue("Snippet contains second value", snippetWithoutSummary.contains("Second"));
assertTrue("Snippet contains third value", snippetWithoutSummary.contains("Third"));
final String snippetWithSummary = getLabels(clusterWithHighlighting(
true, 30, 3, "multi", 1).get(0)).get(1);
assertTrue("Snippet with summary shorter than full snippet",
snippetWithoutSummary.length() > snippetWithSummary.length());
assertTrue("Summary covers first value", snippetWithSummary.contains("First"));
assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
}
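The multi-value assertions above presuppose an indexed document shaped roughly like this (a sketch; the field values are hypothetical, only the three-valued "snippet" field matters):

// Three values in a single multi-valued field; each must survive summarization.
SolrInputDocument doc = new SolrInputDocument();
doc.addField("snippet", "First value, which also mentions multi ...");
doc.addField("snippet", "Second value, which also mentions multi ...");
doc.addField("snippet", "Third value, which also mentions multi ...");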
@Test
public void concatenatingMultipleFields() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("body",
"snippet")), params).get(0));
assertTrue("Label contains title field", labels.get(0).contains("Title field"));
assertTrue("Label contains heading field", labels.get(0).contains("Heading field"));
assertTrue("Label contains snippet field", labels.get(1).contains("Snippet field"));
assertTrue("Label contains body field", labels.get(1).contains("Body field"));
}
@Test
public void highlightingMultipleFields() throws Exception {
final TermQuery query = new TermQuery(new Term("snippet", "content"));
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));
final String snippetWithoutSummary = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));
params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));
final String snippetWithSummary = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
assertTrue("Snippet with summary shorter than full snippet",
snippetWithoutSummary.length() > snippetWithSummary.length());
assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
}
@Test
public void oneCarrot2SupportedLanguage() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
}
@Test
public void oneCarrot2SupportedLanguageOfMany() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language_of_many")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
}
@Test
public void languageCodeMapping() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"one_supported_language_of_many")), params).get(0));
assertEquals(3, labels.size());
assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
}
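CarrotParams.LANGUAGE_CODE_MAP takes NAME:code pairs; the translation it implies can be sketched as follows (the splitting logic is an assumption for illustration, not the engine's actual code):

// Map a field value such as "pl" onto a Carrot2 LanguageCode.
Map<String, LanguageCode> langMap = new HashMap<String, LanguageCode>();
for (String pair : "POLISH:pl".split(",")) {
  String[] kv = pair.split(":");
  langMap.put(kv[1], LanguageCode.valueOf(kv[0])); // "pl" -> LanguageCode.POLISH
}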
@Test
public void passingOfCustomFields() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.CUSTOM_FIELD_NAME, "intfield_i:intfield");
params.add(CarrotParams.CUSTOM_FIELD_NAME, "floatfield_f:floatfield");
params.add(CarrotParams.CUSTOM_FIELD_NAME, "heading:multi");
// Let the echo mock clustering algorithm know which custom field to echo
params.add("custom-fields", "intfield,floatfield,multi");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
"custom_fields")), params).get(0));
assertEquals(5, labels.size());
assertEquals("Integer field", "10", labels.get(2));
assertEquals("Float field", "10.5", labels.get(3));
assertEquals("List field", "[first, second]", labels.get(4));
}
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
@ -273,7 +395,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );
@SuppressWarnings("unchecked")
List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
req.close();
assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
checkClusters(results, false);
@ -302,7 +424,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
List<Object> docs = getDocs(cluster);
assertNotNull("docs is null and it shouldn't be", docs);
for (int j = 0; j < docs.size(); j++) {
String id = (String) docs.get(j);
Object id = docs.get(j);
assertNotNull("id is null and it shouldn't be", id);
}
@ -331,26 +453,26 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
}
}
@SuppressWarnings("unchecked")
private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
return (List<NamedList<Object>>) cluster.get("clusters");
}
@SuppressWarnings("unchecked")
private List<String> getLabels(NamedList<Object> cluster) {
return (List<String>) cluster.get("labels");
}
private Double getScore(NamedList<Object> cluster) {
return (Double) cluster.get("score");
}
private Boolean isOtherTopics(NamedList<Object> cluster) {
return (Boolean)cluster.get("other-topics");
}
@SuppressWarnings("unchecked")
private List<Object> getDocs(NamedList<Object> cluster) {
return (List<Object>) cluster.get("docs");
}
}

View File

@ -15,6 +15,7 @@ package org.apache.solr.handler.clustering.carrot2;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Collections;
import java.util.List;
import org.carrot2.core.Cluster;
@ -48,6 +49,12 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
@Input
@Processing
@Attribute(key = "custom-fields")
private String customFields = "";
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayListWithCapacity(documents.size());
@ -55,6 +62,15 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
for (Document document : documents) {
final Cluster cluster = new Cluster();
cluster.addPhrases(document.getTitle(), document.getSummary());
if (document.getLanguage() != null) {
cluster.addPhrases(document.getLanguage().name());
}
for (String field : customFields.split(",")) {
Object value = document.getField(field);
if (value != null) {
cluster.addPhrases(value.toString());
}
}
cluster.addDocuments(document);
clusters.add(cluster);
}
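Given the loop above, a document like the following would be echoed as a cluster whose phrases are its title, its summary, and the custom field's string value (the two-argument constructor and setField are assumptions based on the Carrot2 Document API used here):

Document doc = new Document("Title field", "Snippet field");
doc.setField("intfield", 10); // echoed because "custom-fields" lists "intfield"
// process() then emits a cluster with phrases:
// ["Title field", "Snippet field", "10"]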

View File

@ -25,9 +25,7 @@ import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
@ -46,37 +44,37 @@ import com.google.common.collect.Lists;
*/
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
ProcessingComponentBase implements IClusteringAlgorithm {
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
@Input
@Processing
@Attribute
private String wordsToCheck;
private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
@Override
public void process() throws ProcessingException {
clusters = Lists.newArrayList();
if (wordsToCheck == null) {
return;
}
// Test with Maltese so that the English clustering performed in other tests
// is not affected by the test stopwords and stoplabels.
ILexicalData lexicalData = preprocessing.lexicalDataFactory
.getLexicalData(LanguageCode.MALTESE);
for (String word : wordsToCheck.split(",")) {
if (!lexicalData.isCommonWord(new MutableCharArray(word))
&& !lexicalData.isStopLabel(word)) {
clusters.add(new Cluster(word));
}
}
}
}

View File

@ -17,6 +17,7 @@
package org.apache.solr.handler.dataimport;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.CommitUpdateCommand;
@ -27,8 +28,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.Map;
import java.util.Set;
/**
* <p> Writes documents to SOLR. </p>
@ -43,12 +42,14 @@ public class SolrWriter extends DIHWriterBase implements DIHWriter {
static final String LAST_INDEX_KEY = "last_index_time";
private final UpdateRequestProcessor processor;
private final int commitWithin;
SolrQueryRequest req;
public SolrWriter(UpdateRequestProcessor processor, SolrQueryRequest req) {
this.processor = processor;
this.req = req;
commitWithin = (req != null) ? req.getParams().getInt(UpdateParams.COMMIT_WITHIN, -1): -1;
}
@Override
@ -65,6 +66,7 @@ public class SolrWriter extends DIHWriterBase implements DIHWriter {
try {
AddUpdateCommand command = new AddUpdateCommand(req);
command.solrDoc = d;
command.commitWithin = commitWithin;
processor.processAdd(command);
} catch (Exception e) {
log.warn("Error creating document : " + d, e);
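The net effect, shown with an illustrative value: every add command issued during an import now carries the request's commitWithin deadline, so imported documents become visible without an explicit commit once the window elapses.

AddUpdateCommand command = new AddUpdateCommand(req);
command.solrDoc = d;
command.commitWithin = 1000; // propagated from UpdateParams.COMMIT_WITHIN on the request
processor.processAdd(command);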

View File

@ -24,6 +24,7 @@ import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.junit.After;
import org.junit.Before;
@ -80,6 +81,33 @@ public class TestContentStreamDataSource extends AbstractDataImportHandlerTestCa
assertEquals("Hello C1", ((List)doc.getFieldValue("desc")).get(0));
}
@Test
public void testCommitWithin() throws Exception {
DirectXmlRequest req = new DirectXmlRequest("/dataimport", xml);
ModifiableSolrParams params = params("command", "full-import",
"clean", "false", UpdateParams.COMMIT, "false",
UpdateParams.COMMIT_WITHIN, "1000");
req.setParams(params);
String url = "http://localhost:" + jetty.getLocalPort() + "/solr";
CommonsHttpSolrServer solrServer = new CommonsHttpSolrServer(url);
solrServer.request(req);
Thread.sleep(100);
ModifiableSolrParams queryAll = params("q", "*");
QueryResponse qres = solrServer.query(queryAll);
SolrDocumentList results = qres.getResults();
assertEquals(0, results.getNumFound());
Thread.sleep(1000);
for (int i = 0; i < 10; i++) {
qres = solrServer.query(queryAll);
results = qres.getResults();
if (2 == results.getNumFound()) {
return;
}
Thread.sleep(500);
}
fail("Commit should have occurred but it did not");
}
private class SolrInstance {
String name;
Integer port;

View File

@ -162,13 +162,18 @@ public class QueryComponent extends SearchComponent
//TODO: move weighting of sort
Sort groupSort = searcher.weightSort(cmd.getSort());
if (groupSort == null) {
groupSort = Sort.RELEVANCE;
}
// groupSort defaults to sort
String groupSortStr = params.get(GroupParams.GROUP_SORT);
if (groupSort == null) {
groupSort = new Sort();
}
//TODO: move weighting of sort
Sort sortWithinGroup = groupSortStr == null ? groupSort : searcher.weightSort(QueryParsing.parseSort(groupSortStr, req));
if (sortWithinGroup == null) {
sortWithinGroup = Sort.RELEVANCE;
}
groupingSpec.setSortWithinGroup(sortWithinGroup);
groupingSpec.setGroupSort(groupSort);
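Because old and new lines are interleaved in the hunk above, the resulting logic reads more clearly in one piece (a sketch of the post-change flow, with comments added):

Sort groupSort = searcher.weightSort(cmd.getSort()); // null when the request has no sort
if (groupSort == null) {
  groupSort = Sort.RELEVANCE; // previously defaulted to an empty new Sort()
}
String groupSortStr = params.get(GroupParams.GROUP_SORT);
// group.sort defaults to the (possibly defaulted) top-level sort.
Sort sortWithinGroup = groupSortStr == null
    ? groupSort
    : searcher.weightSort(QueryParsing.parseSort(groupSortStr, req));
if (sortWithinGroup == null) {
  sortWithinGroup = Sort.RELEVANCE;
}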

View File

@ -140,6 +140,7 @@ public class TestDistributedGrouping extends BaseDistributedSearchTestCase {
query("q", "*:*", "fq", s1 + ":a", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", i1 + " asc, id asc", "group.truncate", "true", "facet", "true", "facet.field", t1);
// We cannot validate distributed grouping with scoring as the first sort, since there is no global idf. We can only check that no errors occur.
simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", i1 + " desc", "group.sort", "score desc"); // SOLR-2955
simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", "score desc, _docid_ asc, id asc");
simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10);
}
@ -149,6 +150,7 @@ public class TestDistributedGrouping extends BaseDistributedSearchTestCase {
for (int i = 0; i < queryParams.length; i += 2) {
params.add(queryParams[i].toString(), queryParams[i + 1].toString());
}
params.set("shards", shards);
queryServer(params);
}