mirror of https://github.com/apache/lucene.git

commit 5627b52e70

LUCENE-3622: merge trunk (1212397:1212829)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3622@1212830 13f79535-47bb-0310-9956-ffa450edef68

@@ -217,7 +217,11 @@ Changes in backwards compatibility policy
   instances. Furthermore, IndexReader.setNorm() was removed. If you need
   customized norm values, the recommended way to do this is by modifying
   SimilarityProvider to use an external byte[] or one of the new DocValues
-  fields (LUCENE-3108). (Uwe Schindler, Robert Muir)
+  fields (LUCENE-3108). Alternatively, to dynamically change norms (boost
+  *and* length norm) at query time, wrap your IndexReader using
+  FilterIndexReader, overriding FilterIndexReader.norms(). To persist the
+  changes on disk, copy the FilteredIndexReader to a new index using
+  IndexWriter.addIndexes(). (Uwe Schindler, Robert Muir)

 Changes in Runtime Behavior
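
For readers migrating off IndexReader.setNorm(), a minimal sketch of the
query-time approach described in the entry above: wrap the reader in a
FilterIndexReader subclass and override norms(). The class name, field name,
and source of the custom norms are illustrative assumptions, not part of this
commit.

    import java.io.IOException;
    import org.apache.lucene.index.FilterIndexReader;
    import org.apache.lucene.index.IndexReader;

    // Sketch only: serves externally computed norms for one field and
    // delegates everything else to the wrapped reader.
    class CustomNormsReader extends FilterIndexReader {
      private final byte[] customNorms; // one encoded norm byte per document

      CustomNormsReader(IndexReader in, byte[] customNorms) {
        super(in);
        this.customNorms = customNorms;
      }

      @Override
      public byte[] norms(String field) throws IOException {
        if ("body".equals(field)) { // hypothetical field name
          return customNorms;
        }
        return in.norms(field);
      }
    }

To persist the modified norms, copy through IndexWriter.addIndexes(), e.g.
writer.addIndexes(new CustomNormsReader(reader, norms)), as the entry
suggests.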

@@ -676,6 +680,22 @@ Security fixes
   prevents this as best as it can by throwing AlreadyClosedException
   also on clones. (Uwe Schindler, Robert Muir)

 API Changes

+* LUCENE-3606: IndexReader will be made read-only in Lucene 4.0, so all
+  methods allowing to delete or undelete documents using IndexReader were
+  deprecated; you should use IndexWriter now. Consequently
+  IndexReader.commit() and all open(), openIfChanged(), clone() methods
+  taking readOnly booleans (or IndexDeletionPolicy instances) were
+  deprecated. IndexReader.setNorm() is superfluous and was deprecated.
+  If you have to change per-document boost use CustomScoreQuery.
+  If you want to dynamically change norms (boost *and* length norm) at
+  query time, wrap your IndexReader using FilterIndexReader, overriding
+  FilterIndexReader.norms(). To persist the changes on disk, copy the
+  FilteredIndexReader to a new index using IndexWriter.addIndexes().
+  In Lucene 4.0, SimilarityProvider will allow you to customize scoring
+  using external norms, too. (Uwe Schindler, Robert Muir)
+
 New Features

 * LUCENE-3593: Added a FieldValueFilter that accepts all documents that either
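
A hedged sketch of the replacement APIs this entry points to (not code from
the commit): deletes go through IndexWriter, and query-time per-document
boosts go through CustomScoreQuery with a CustomScoreProvider. The field
names, values, and the flat 2.0f boost are assumptions standing in for an
external per-document boost source.

    import java.io.IOException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.function.CustomScoreProvider;
    import org.apache.lucene.search.function.CustomScoreQuery;

    class MigrationSketch {
      // Instead of the deprecated IndexReader.deleteDocument(int):
      static void deleteById(IndexWriter writer) throws IOException {
        writer.deleteDocuments(new Term("id", "42")); // hypothetical field/value
      }

      // Instead of IndexReader.setNorm() for per-document boosts:
      static Query boosted() {
        Query base = new TermQuery(new Term("body", "lucene")); // assumed term
        return new CustomScoreQuery(base) {
          @Override
          protected CustomScoreProvider getCustomScoreProvider(IndexReader r) {
            return new CustomScoreProvider(r) {
              @Override
              public float customScore(int doc, float subScore, float valSrcScore) {
                return subScore * 2.0f; // substitute an external per-doc lookup
              }
            };
          }
        };
      }
    }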

@@ -691,6 +711,12 @@ Bug fixes
 * LUCENE-3627: Don't let an errant 0-byte segments_N file corrupt the index.
   (Ken McCracken via Mike McCandless)

+* LUCENE-3630: The internal method MultiReader.doOpenIfChanged(boolean doClone)
+  was overriding IndexReader.doOpenIfChanged(boolean readOnly), thereby changing
+  the contract of the overridden method. This method was renamed and made private.
+  In ParallelReader the bug was not present, but the implementation method
+  was also made private. (Uwe Schindler)
+
 Documentation

 * LUCENE-3597: Fixed incorrect grouping documentation. (Martijn van Groningen, Robert Muir)
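
The accidental-override hazard behind LUCENE-3630 is easy to reproduce; a
minimal self-contained sketch (class and method names are hypothetical, not
Lucene's):

    class Base {
      // Contract: the flag means "open read-only".
      String open(boolean readOnly) {
        return readOnly ? "read-only reader" : "writable reader";
      }
    }

    class Sub extends Base {
      // Intended as an internal helper whose flag means "clone", but the
      // identical signature makes it an override: callers holding a Base
      // reference get "clone" semantics when they asked for "readOnly".
      @Override
      String open(boolean doClone) {
        return doClone ? "cloned reader" : "reopened reader";
      }
    }

    // Base b = new Sub();
    // b.open(true) returns "cloned reader", silently breaking Base's contract.
    // The fix mirrors the commit: rename the helper and make it private.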

@@ -76,6 +76,11 @@ API Changes
 * LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
   enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.

+* LUCENE-3606: FieldNormModifier was deprecated, because IndexReader's
+  setNorm() was deprecated. Furthermore, this class is broken, as it does
+  not take position overlaps into account while recalculating norms.
+  (Uwe Schindler, Robert Muir)
+
 Bug Fixes

 * LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was

@@ -108,7 +108,7 @@ public class MultiReader extends IndexReader implements Cloneable {
    */
   @Override
   protected synchronized IndexReader doOpenIfChanged() throws CorruptIndexException, IOException {
-    return doOpenIfChanged(false);
+    return doReopen(false);
   }

   /**

@@ -123,7 +123,7 @@ public class MultiReader extends IndexReader implements Cloneable {
   @Override
   public synchronized Object clone() {
     try {
-      return doOpenIfChanged(true);
+      return doReopen(true);
     } catch (Exception ex) {
       throw new RuntimeException(ex);
     }

@@ -141,7 +141,7 @@ public class MultiReader extends IndexReader implements Cloneable {
    * @throws CorruptIndexException
    * @throws IOException
    */
-  protected IndexReader doOpenIfChanged(boolean doClone) throws CorruptIndexException, IOException {
+  private IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
     ensureOpen();

     boolean changed = false;

@@ -243,7 +243,7 @@ public class ParallelReader extends IndexReader {
     return doReopen(false);
   }

-  protected IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
+  private IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException {
     ensureOpen();

     boolean reopened = false;

@@ -30,11 +30,37 @@ import org.apache.lucene.util.LuceneTestCase;
  */
 public class TestIndexReaderClone extends LuceneTestCase {

-  private void assertDelDocsRefCountEquals(int refCount, SegmentReader reader) {
-    assertEquals(refCount, reader.liveDocsRef.get());
+  public void testDirectoryReader() throws Exception {
+    final Directory dir = createIndex(0);
+    performDefaultTests(IndexReader.open(dir));
+    dir.close();
   }

-  public void testCloseStoredFields() throws Exception {
+  public void testMultiReader() throws Exception {
+    final Directory dir1 = createIndex(0);
+    final IndexReader r1 = IndexReader.open(dir1);
+    final Directory dir2 = createIndex(0);
+    final IndexReader r2 = IndexReader.open(dir2);
+    final MultiReader mr = new MultiReader(r1, r2);
+    performDefaultTests(mr);
+    dir1.close();
+    dir2.close();
+  }
+
+  public void testParallelReader() throws Exception {
+    final Directory dir1 = createIndex(0);
+    final IndexReader r1 = IndexReader.open(dir1);
+    final Directory dir2 = createIndex(1);
+    final IndexReader r2 = IndexReader.open(dir2);
+    final ParallelReader pr = new ParallelReader();
+    pr.add(r1);
+    pr.add(r2);
+    performDefaultTests(pr);
+    dir1.close();
+    dir2.close();
+  }
+
+  private Directory createIndex(int no) throws Exception {
+    final Directory dir = newDirectory();
+    IndexWriter w = new IndexWriter(
+        dir,

@@ -42,13 +68,19 @@ public class TestIndexReaderClone extends LuceneTestCase {
         setMergePolicy(newLogMergePolicy(false))
     );
     Document doc = new Document();
-    doc.add(newField("field", "yes it's stored", TextField.TYPE_STORED));
+    doc.add(newField("field"+no, "yes it's stored", TextField.TYPE_STORED));
     w.addDocument(doc);
     w.close();
-    IndexReader r1 = IndexReader.open(dir);
+    return dir;
+  }
+
+  private void performDefaultTests(IndexReader r1) throws Exception {
     IndexReader r2 = (IndexReader) r1.clone();
     assertTrue(r1 != r2);
     TestIndexReader.assertIndexEquals(r1, r2);
     r1.close();
     r2.close();
-    dir.close();
+    TestIndexReaderReopen.assertReaderClosed(r1, true, true);
+    TestIndexReaderReopen.assertReaderClosed(r2, true, true);
   }
 }

@@ -757,7 +757,7 @@ public class TestIndexReaderReopen extends LuceneTestCase {
     }
   }

-  private void assertReaderClosed(IndexReader reader, boolean checkSubReaders, boolean checkNormsClosed) {
+  static void assertReaderClosed(IndexReader reader, boolean checkSubReaders, boolean checkNormsClosed) {
     assertEquals(0, reader.getRefCount());

     if (checkNormsClosed && reader instanceof SegmentReader) {

@@ -195,6 +195,8 @@ New Features
   "multiterm" analyzer in our schema.xml, but Solr should "do the right thing" if you don't
   specify <fieldType="multiterm"> (Pete Sturge, Erick Erickson, Mentoring from Seeley and Muir)

+* SOLR-2481: Add support for commitWithin in DataImportHandler (Sami Siren via yonik)
+

 Optimizations
 ----------------------
|
|||
* SOLR-2919: Added support for localized range queries when the analysis chain uses
|
||||
CollationKeyFilter or ICUCollationKeyFilter. (Michael Sokolov, rmuir)
|
||||
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
* SOLR-2912: Fixed File descriptor leak in ShowFileRequestHandler (Michael Ryan, shalin)
|
||||
|

@@ -418,6 +421,9 @@ Bug Fixes

 * SOLR-2509: StringIndexOutOfBoundsException in the spellchecker collate when the term contains
   a hyphen. (Thomas Gambier caught the bug, Steffen Godskesen did the patch, via Erick Erickson)

+* SOLR-2955: Fixed IllegalStateException when querying with group.sort=score desc in a sharded
+  environment. (Steffen Elberg Godskesen, Martijn van Groningen)
+
 Other Changes
 ----------------------

@@ -13,7 +13,27 @@ $Id$

 ================== Release 3.6.0 ==================

-(No Changes)
+* SOLR-2937: Configuring the number of contextual snippets used for
+  search results clustering. The hl.snippets parameter is now respected
+  by the clustering plugin and can be overridden by carrot.summarySnippets
+  if needed (Stanislaw Osinski).
+
+* SOLR-2938: Clustering on multiple fields. The carrot.title and
+  carrot.snippet can now take comma- or space-separated lists of
+  field names to cluster (Stanislaw Osinski).
+
+* SOLR-2939: Clustering of multilingual search results. The document's
+  language field can be passed in the carrot.lang parameter; the carrot.lcmap
+  parameter enables mapping of language codes to ISO 639 (Stanislaw Osinski).
+
+* SOLR-2940: Passing values for custom Carrot2 fields. The custom field
+  mappings are defined using the carrot.custom parameter (Stanislaw Osinski).
+
+* SOLR-2941: NullPointerException on clustering component initialization
+  when schema does not have a unique key field (Stanislaw Osinski).
+
+* SOLR-2942: ClassCastException when passing non-textual fields for
+  clustering (Stanislaw Osinski).
+
 ================== Release 3.5.0 ==================
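
Putting the new parameters together, a clustering request might look like the
following sketch. The host, core, and field values are assumptions; the
parameter names come from the entries above and from CarrotParams later in
this commit:

    http://localhost:8983/solr/select?q=*:*&clustering=true
      &carrot.title=title,heading            <- comma- or space-separated (SOLR-2938)
      &carrot.snippet=snippet,body
      &carrot.lang=lang                      <- field holding the document language (SOLR-2939)
      &carrot.lcmap=chinese:zh_cn            <- maps field values to ISO 639 codes (SOLR-2939)
      &carrot.custom=category:docCategory    <- Solr field to Carrot2 field (SOLR-2940)
      &carrot.produceSummary=true
      &carrot.summarySnippets=2              <- overrides hl.snippets (SOLR-2937)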

@@ -21,10 +41,10 @@ $Id$

 ================== Release 3.4.0 ==================

-SOLR-2706: The carrot.lexicalResourcesDir parameter now works
-with absolute directories (Stanislaw Osinski)
+* SOLR-2706: The carrot.lexicalResourcesDir parameter now works
+  with absolute directories (Stanislaw Osinski)

-SOLR-2692: Typo in param name fixed: "carrot.fragzise" changed to
+* SOLR-2692: Typo in param name fixed: "carrot.fragzise" changed to
   "carrot.fragSize" (Stanislaw Osinski).

 ================== Release 3.3.0 ==================

@@ -19,15 +19,18 @@ package org.apache.solr.handler.clustering.carrot2;

 import java.io.*;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;

 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.ObjectUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.lucene.search.Query;
 import org.apache.solr.common.SolrDocument;

@@ -45,6 +48,7 @@ import org.apache.solr.handler.component.HighlightComponent;
 import org.apache.solr.highlight.SolrHighlighter;
 import org.apache.solr.request.LocalSolrQueryRequest;
 import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.schema.SchemaField;
 import org.apache.solr.search.DocList;
 import org.apache.solr.search.DocSlice;
 import org.apache.solr.search.SolrIndexSearcher;

@@ -54,6 +58,7 @@ import org.carrot2.core.Controller;
 import org.carrot2.core.ControllerFactory;
 import org.carrot2.core.Document;
 import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.LanguageCode;
 import org.carrot2.core.attribute.AttributeNames;
 import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
 import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;

@@ -77,13 +82,13 @@ import com.google.common.io.Closeables;
  * @see "http://project.carrot2.org"
  */
 public class CarrotClusteringEngine extends SearchClusteringEngine {
   private transient static Logger log = LoggerFactory
       .getLogger(CarrotClusteringEngine.class);

   /**
    * The subdirectory in Solr config dir to read customized Carrot2 resources from.
    */
   private static final String CARROT_RESOURCES_PREFIX = "clustering/carrot2";

   /**
    * Name of Carrot2 document's field containing Solr document's identifier.

@@ -102,7 +107,7 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    */
   private Controller controller = ControllerFactory.createPooling();
   private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;

   private static class SolrResourceLocator implements IResourceLocator {
     private final SolrResourceLoader resourceLoader;
     private final String carrot2ResourcesDir;

@@ -227,8 +232,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
     }
   }

   @Override
   @SuppressWarnings({ "unchecked", "rawtypes" })
   public String init(NamedList config, final SolrCore core) {
     String result = super.init(config, core);
     final SolrParams initParams = SolrParams.toSolrParams(config);

@@ -243,13 +248,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
     // Additionally, we set a custom lexical resource factory for Carrot2 that
     // will use both Carrot2 default stop words as well as stop words from
     // the StopFilter defined on the field.
     BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
         .stemmerFactory(LuceneCarrot2StemmerFactory.class)
         .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
         .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);

     // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
     initAttributes.put("solrIndexSchema", core.getSchema());

     // Customize Carrot2's resource lookup to first look for resources
     // using Solr's resource loader. If that fails, try loading from the classpath.

@@ -261,7 +266,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
         new ClassLoaderLocator(core.getResourceLoader().getClassLoader())));

     this.controller.init(initAttributes);
-    this.idFieldName = core.getSchema().getUniqueKeyField().getName();
+
+    SchemaField uniqueField = core.getSchema().getUniqueKeyField();
+    if (uniqueField == null) {
+      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
+          CarrotClusteringEngine.class.getSimpleName() + " requires the schema to have a uniqueKeyField");
+    }
+    this.idFieldName = uniqueField.getName();

     // Make sure the requested Carrot2 clustering algorithm class is available
     String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);

@@ -283,25 +294,35 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
     HashSet<String> fields = Sets.newHashSet(getFieldsForClustering(sreq));
     fields.add(idFieldName);
     fields.add(solrParams.get(CarrotParams.URL_FIELD_NAME, "url"));
-    return fields;
+    fields.addAll(getCustomFieldsMap(solrParams).keySet());
+
+    String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME);
+    if (StringUtils.isNotBlank(languageField)) {
+      fields.add(languageField);
+    }
+    return fields;
   }

   /**
    * Returns the names of fields that will be delivering the actual
    * content for clustering. Currently, there are two such fields: document
    * title and document content.
    */
   private Set<String> getFieldsForClustering(SolrQueryRequest sreq) {
     SolrParams solrParams = sreq.getParams();

-    String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
-    String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
-    if (StringUtils.isBlank(snippetField)) {
+    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
+    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
+    if (StringUtils.isBlank(snippetFieldSpec)) {
       throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotParams.SNIPPET_FIELD_NAME
           + " must not be blank.");
     }
-    return Sets.newHashSet(titleField, snippetField);
-  }
+
+    final Set<String> fields = Sets.newHashSet();
+    fields.addAll(Arrays.asList(titleFieldSpec.split("[, ]")));
+    fields.addAll(Arrays.asList(snippetFieldSpec.split("[, ]")));
+    return fields;
+  }

   /**
    * Prepares Carrot2 documents for clustering.

@@ -313,8 +334,27 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
     SolrCore core = sreq.getCore();

     String urlField = solrParams.get(CarrotParams.URL_FIELD_NAME, "url");
-    String titleField = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
-    String snippetField = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleField);
+    String titleFieldSpec = solrParams.get(CarrotParams.TITLE_FIELD_NAME, "title");
+    String snippetFieldSpec = solrParams.get(CarrotParams.SNIPPET_FIELD_NAME, titleFieldSpec);
+    String languageField = solrParams.get(CarrotParams.LANGUAGE_FIELD_NAME, null);
+
+    // Maps Solr field names to Carrot2 custom field names
+    Map<String, String> customFields = getCustomFieldsMap(solrParams);
+
+    // Parse language code map string into a map
+    Map<String, String> languageCodeMap = Maps.newHashMap();
+    if (StringUtils.isNotBlank(languageField)) {
+      for (String pair : solrParams.get(CarrotParams.LANGUAGE_CODE_MAP, "")
+          .split("[, ]")) {
+        final String[] split = pair.split(":");
+        if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
+          languageCodeMap.put(split[0], split[1]);
+        } else {
+          log.warn("Unsupported format for " + CarrotParams.LANGUAGE_CODE_MAP
+              + ": '" + pair + "'. Skipping this mapping.");
+        }
+      }
+    }

     // Get the documents
     boolean produceSummary = solrParams.getBool(CarrotParams.PRODUCE_SUMMARY, false);

@@ -325,12 +365,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
       highlighter = HighlightComponent.getHighlighter(core);
       if (highlighter != null){
         Map<String, Object> args = Maps.newHashMap();
-        snippetFieldAry = new String[]{snippetField};
+        snippetFieldAry = snippetFieldSpec.split("[, ]");
         args.put(HighlightParams.FIELDS, snippetFieldAry);
         args.put(HighlightParams.HIGHLIGHT, "true");
         args.put(HighlightParams.SIMPLE_PRE, ""); //we don't care about actually highlighting the area
         args.put(HighlightParams.SIMPLE_POST, "");
         args.put(HighlightParams.FRAGSIZE, solrParams.getInt(CarrotParams.SUMMARY_FRAGSIZE, solrParams.getInt(HighlightParams.FRAGSIZE, 100)));
+        args.put(HighlightParams.SNIPPETS, solrParams.getInt(CarrotParams.SUMMARY_SNIPPETS, solrParams.getInt(HighlightParams.SNIPPETS, 1)));
         req = new LocalSolrQueryRequest(core, query.toString(), "", 0, 1, args) {
           @Override
           public SolrIndexSearcher getSearcher() {

@@ -352,7 +393,8 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {

     while (docsIter.hasNext()) {
       SolrDocument sdoc = docsIter.next();
-      String snippet = getValue(sdoc, snippetField);
+      String snippet = null;
+
       // TODO: docIds will be null when running distributed search.
       // See comment in ClusteringComponent#finishStage().
       if (produceSummary && docIds != null) {

@@ -360,34 +402,115 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
         DocList docAsList = new DocSlice(0, 1, docsHolder, scores, 1, 1.0f);
         NamedList<Object> highlights = highlighter.doHighlighting(docAsList, theQuery, req, snippetFieldAry);
         if (highlights != null && highlights.size() == 1) {//should only be one value given our setup
-          //should only be one document with one field
+          //should only be one document
           @SuppressWarnings("unchecked")
-          NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
-          String [] highlt = tmp.get(snippetField);
-          if (highlt != null && highlt.length == 1) {
-            snippet = highlt[0];
+          NamedList<String []> tmp = (NamedList<String[]>) highlights.getVal(0);
+
+          final StringBuilder sb = new StringBuilder();
+          for (int j = 0; j < snippetFieldAry.length; j++) {
+            // Join fragments with a period, so that Carrot2 does not create
+            // cross-fragment phrases, such phrases rarely make sense.
+            String [] highlt = tmp.get(snippetFieldAry[j]);
+            if (highlt != null && highlt.length > 0) {
+              for (int i = 0; i < highlt.length; i++) {
+                sb.append(highlt[i]);
+                sb.append(" . ");
+              }
+            }
+          }
+          snippet = sb.toString();
         }
       }
+
+      // If summaries not enabled or summary generation failed, use full content.
+      if (snippet == null) {
+        snippet = getConcatenated(sdoc, snippetFieldSpec);
+      }
+
+      // Create a Carrot2 document
+      Document carrotDocument = new Document(getConcatenated(sdoc, titleFieldSpec),
+          snippet, ObjectUtils.toString(sdoc.getFieldValue(urlField), ""));
+
+      // Store Solr id of the document, we need it to map document instances
+      // found in clusters back to identifiers.
+      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
+
+      // Set language
+      if (StringUtils.isNotBlank(languageField)) {
+        Collection<Object> languages = sdoc.getFieldValues(languageField);
+        if (languages != null) {
+
+          // Use the first Carrot2-supported language
+          for (Object l : languages) {
+            String lang = ObjectUtils.toString(l, "");
+
+            if (languageCodeMap.containsKey(lang)) {
+              lang = languageCodeMap.get(lang);
+            }
+
+            // Language detection Library for Java uses dashes to separate
+            // language variants, such as 'zh-cn', but Carrot2 uses underscores.
+            if (lang.indexOf('-') > 0) {
+              lang = lang.replace('-', '_');
+            }
+
+            // If the language is supported by Carrot2, we'll get a non-null value
+            final LanguageCode carrot2Language = LanguageCode.forISOCode(lang);
+            if (carrot2Language != null) {
+              carrotDocument.setLanguage(carrot2Language);
+              break;
+            }
+          }
+        }
+      }
-      Document carrotDocument = new Document(getValue(sdoc, titleField),
-          snippet, (String)sdoc.getFieldValue(urlField));
-      carrotDocument.setField(SOLR_DOCUMENT_ID, sdoc.getFieldValue(idFieldName));
+
+      // Add custom fields
+      if (customFields != null) {
+        for (Entry<String, String> entry : customFields.entrySet()) {
+          carrotDocument.setField(entry.getValue(), sdoc.getFieldValue(entry.getKey()));
+        }
+      }
+
       result.add(carrotDocument);
     }

     return result;
   }

-  protected String getValue(SolrDocument sdoc, String field) {
+  /**
+   * Prepares a map of Solr field names (keys) to the corresponding Carrot2
+   * custom field names.
+   */
+  private Map<String, String> getCustomFieldsMap(SolrParams solrParams) {
+    Map<String, String> customFields = Maps.newHashMap();
+    String [] customFieldsSpec = solrParams.getParams(CarrotParams.CUSTOM_FIELD_NAME);
+    if (customFieldsSpec != null) {
+      customFields = Maps.newHashMap();
+      for (String customFieldSpec : customFieldsSpec) {
+        String [] split = customFieldSpec.split(":");
+        if (split.length == 2 && StringUtils.isNotBlank(split[0]) && StringUtils.isNotBlank(split[1])) {
+          customFields.put(split[0], split[1]);
+        } else {
+          log.warn("Unsupported format for " + CarrotParams.CUSTOM_FIELD_NAME
+              + ": '" + customFieldSpec + "'. Skipping this field definition.");
+        }
+      }
+    }
+    return customFields;
+  }
+
+  private String getConcatenated(SolrDocument sdoc, String fieldsSpec) {
     StringBuilder result = new StringBuilder();
-    Collection<Object> vals = sdoc.getFieldValues(field);
-    if(vals == null) return "";
-    Iterator<Object> ite = vals.iterator();
-    while(ite.hasNext()){
-      // Join multiple values with a period so that Carrot2 does not pick up
-      // phrases that cross field value boundaries (in most cases it would
-      // create useless phrases).
-      result.append((String)ite.next()).append(" . ");
+    for (String field : fieldsSpec.split("[, ]")) {
+      Collection<Object> vals = sdoc.getFieldValues(field);
+      if (vals == null) continue;
+      Iterator<Object> ite = vals.iterator();
+      while(ite.hasNext()){
+        // Join multiple values with a period so that Carrot2 does not pick up
+        // phrases that cross field value boundaries (in most cases it would
+        // create useless phrases).
+        result.append(ObjectUtils.toString(ite.next())).append(" . ");
+      }
     }
     return result.toString().trim();
   }

@@ -27,17 +27,24 @@ public interface CarrotParams {
   String CARROT_PREFIX = "carrot.";

   String ALGORITHM = CARROT_PREFIX + "algorithm";

   String TITLE_FIELD_NAME = CARROT_PREFIX + "title";
   String URL_FIELD_NAME = CARROT_PREFIX + "url";
   String SNIPPET_FIELD_NAME = CARROT_PREFIX + "snippet";
+  String LANGUAGE_FIELD_NAME = CARROT_PREFIX + "lang";
+  String CUSTOM_FIELD_NAME = CARROT_PREFIX + "custom";
+
   String PRODUCE_SUMMARY = CARROT_PREFIX + "produceSummary";
+  String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
+  String SUMMARY_SNIPPETS = CARROT_PREFIX + "summarySnippets";
+
   String NUM_DESCRIPTIONS = CARROT_PREFIX + "numDescriptions";
   String OUTPUT_SUB_CLUSTERS = CARROT_PREFIX + "outputSubClusters";
-  String SUMMARY_FRAGSIZE = CARROT_PREFIX + "fragSize";
+
   String LEXICAL_RESOURCES_DIR = CARROT_PREFIX + "lexicalResourcesDir";
+  String LANGUAGE_CODE_MAP = CARROT_PREFIX + "lcmap";

   public static final Set<String> CARROT_PARAM_NAMES = ImmutableSet.of(
-      ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME,
-      PRODUCE_SUMMARY, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS, SUMMARY_FRAGSIZE);
+      ALGORITHM, TITLE_FIELD_NAME, URL_FIELD_NAME, SNIPPET_FIELD_NAME, LANGUAGE_FIELD_NAME,
+      PRODUCE_SUMMARY, SUMMARY_FRAGSIZE, SUMMARY_SNIPPETS, NUM_DESCRIPTIONS, OUTPUT_SUB_CLUSTERS,
+      LEXICAL_RESOURCES_DIR);
 }

@@ -50,192 +50,192 @@ import org.tartarus.snowball.ext.TurkishStemmer;
  * in this class.
  */
 public class LuceneCarrot2StemmerFactory implements IStemmerFactory {
   final static Logger logger = org.slf4j.LoggerFactory
       .getLogger(LuceneCarrot2StemmerFactory.class);

   @Override
   public IStemmer getStemmer(LanguageCode language) {
     switch (language) {
       case ARABIC:
         return ArabicStemmerFactory.createStemmer();

       case CHINESE_SIMPLIFIED:
         return IdentityStemmer.INSTANCE;

       default:
         /*
          * For other languages, try to use snowball's stemming.
          */
         return SnowballStemmerFactory.createStemmer(language);
     }
   }

   /**
    * Factory of {@link IStemmer} implementations from the <code>snowball</code>
    * project.
    */
   private final static class SnowballStemmerFactory {
     /**
      * Static hard mapping from language codes to stemmer classes in Snowball.
      * This mapping is not dynamic because we want to keep the possibility to
      * obfuscate these classes.
      */
     private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
     static {
       snowballStemmerClasses = new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
       snowballStemmerClasses.put(LanguageCode.DANISH, DanishStemmer.class);
       snowballStemmerClasses.put(LanguageCode.DUTCH, DutchStemmer.class);
       snowballStemmerClasses.put(LanguageCode.ENGLISH, EnglishStemmer.class);
       snowballStemmerClasses.put(LanguageCode.FINNISH, FinnishStemmer.class);
       snowballStemmerClasses.put(LanguageCode.FRENCH, FrenchStemmer.class);
       snowballStemmerClasses.put(LanguageCode.GERMAN, GermanStemmer.class);
       snowballStemmerClasses
           .put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
       snowballStemmerClasses.put(LanguageCode.ITALIAN, ItalianStemmer.class);
       snowballStemmerClasses
           .put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
       snowballStemmerClasses.put(LanguageCode.PORTUGUESE,
           PortugueseStemmer.class);
       snowballStemmerClasses.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
       snowballStemmerClasses.put(LanguageCode.RUSSIAN, RussianStemmer.class);
       snowballStemmerClasses.put(LanguageCode.SPANISH, SpanishStemmer.class);
       snowballStemmerClasses.put(LanguageCode.SWEDISH, SwedishStemmer.class);
       snowballStemmerClasses.put(LanguageCode.TURKISH, TurkishStemmer.class);
     }

     /**
      * An adapter converting Snowball programs into {@link IStemmer} interface.
      */
     private static class SnowballStemmerAdapter implements IStemmer {
       private final SnowballProgram snowballStemmer;

       public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
         this.snowballStemmer = snowballStemmer;
       }

       public CharSequence stem(CharSequence word) {
         snowballStemmer.setCurrent(word.toString());
         if (snowballStemmer.stem()) {
           return snowballStemmer.getCurrent();
         } else {
           return null;
         }
       }
     }

     /**
      * Create and return an {@link IStemmer} adapter for a
      * {@link SnowballProgram} for a given language code. An identity stemmer is
      * returned for unknown languages.
      */
     public static IStemmer createStemmer(LanguageCode language) {
       final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
           .get(language);

       if (stemmerClazz == null) {
         logger.warn("No Snowball stemmer class for: " + language.name()
             + ". Quality of clustering may be degraded.");
         return IdentityStemmer.INSTANCE;
       }

       try {
         return new SnowballStemmerAdapter(stemmerClazz.newInstance());
       } catch (Exception e) {
         logger.warn("Could not instantiate snowball stemmer"
             + " for language: " + language.name()
             + ". Quality of clustering may be degraded.", e);

         return IdentityStemmer.INSTANCE;
       }
     }
   }

   /**
    * Factory of {@link IStemmer} implementations for the
    * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
    * to be present in classpath, otherwise an empty (identity) stemmer is
    * returned.
    */
   private static class ArabicStemmerFactory {
     static {
       try {
         ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
         ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
       } catch (ClassNotFoundException e) {
         logger
             .warn(
                 "Could not instantiate Lucene stemmer for Arabic, clustering quality "
                     + "of Arabic content may be degraded. For best quality clusters, "
                     + "make sure Lucene's Arabic analyzer JAR is in the classpath",
                 e);
       }
     }

     /**
      * Adapter to lucene-contrib Arabic analyzers.
      */
     private static class LuceneStemmerAdapter implements IStemmer {
       private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
       private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;

       private char[] buffer = new char[0];

       private LuceneStemmerAdapter() throws Exception {
         delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
         normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
       }

       public CharSequence stem(CharSequence word) {
         if (word.length() > buffer.length) {
           buffer = new char[word.length()];
         }

         for (int i = 0; i < word.length(); i++) {
           buffer[i] = word.charAt(i);
         }

         int newLen = normalizer.normalize(buffer, word.length());
         newLen = delegate.stem(buffer, newLen);

         if (newLen != word.length() || !equals(buffer, newLen, word)) {
           return CharBuffer.wrap(buffer, 0, newLen);
         }

         // Same-same.
         return null;
       }

       private boolean equals(char[] buffer, int len, CharSequence word) {
         assert len == word.length();

         for (int i = 0; i < len; i++) {
           if (buffer[i] != word.charAt(i))
             return false;
         }

         return true;
       }
     }

     public static IStemmer createStemmer() {
       try {
         return new LuceneStemmerAdapter();
       } catch (Throwable e) {
         return IdentityStemmer.INSTANCE;
       }
     }
   }

   /**
    * An implementation of {@link IStemmer} that always returns <code>null</code>
    * which means no stemming.
    */
   private static class IdentityStemmer implements IStemmer {
     private final static IdentityStemmer INSTANCE = new IdentityStemmer();

     @Override
     public CharSequence stem(CharSequence word) {
       return null;
     }
   }
 }
|
@ -40,117 +40,117 @@ import org.slf4j.Logger;
|
|||
* Lucene APIs need to change, the changes can be made in this class.
|
||||
*/
|
||||
public class LuceneCarrot2TokenizerFactory implements ITokenizerFactory {
|
||||
final static Logger logger = org.slf4j.LoggerFactory
|
||||
.getLogger(LuceneCarrot2TokenizerFactory.class);
|
||||
final static Logger logger = org.slf4j.LoggerFactory
|
||||
.getLogger(LuceneCarrot2TokenizerFactory.class);
|
||||
|
||||
@Override
|
||||
public ITokenizer getTokenizer(LanguageCode language) {
|
||||
switch (language) {
|
||||
case CHINESE_SIMPLIFIED:
|
||||
return ChineseTokenizerFactory.createTokenizer();
|
||||
@Override
|
||||
public ITokenizer getTokenizer(LanguageCode language) {
|
||||
switch (language) {
|
||||
case CHINESE_SIMPLIFIED:
|
||||
return ChineseTokenizerFactory.createTokenizer();
|
||||
|
||||
/*
|
||||
* We use our own analyzer for Arabic. Lucene's version has special
|
||||
* support for Nonspacing-Mark characters (see
|
||||
* http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
|
||||
* have them included as letters in the parser.
|
||||
*/
|
||||
case ARABIC:
|
||||
// Intentional fall-through.
|
||||
/*
|
||||
* We use our own analyzer for Arabic. Lucene's version has special
|
||||
* support for Nonspacing-Mark characters (see
|
||||
* http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
|
||||
* have them included as letters in the parser.
|
||||
*/
|
||||
case ARABIC:
|
||||
// Intentional fall-through.
|
||||
|
||||
default:
|
||||
return new ExtendedWhitespaceTokenizer();
|
||||
}
|
||||
}
|
||||
default:
|
||||
return new ExtendedWhitespaceTokenizer();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
|
||||
* {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
|
||||
* factory will fall back to the default white space tokenizer.
|
||||
*/
|
||||
private static final class ChineseTokenizerFactory {
|
||||
static {
|
||||
try {
|
||||
ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
|
||||
ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
|
||||
} catch (Throwable e) {
|
||||
logger
|
||||
.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
|
||||
+ "of Chinese content may be degraded. For best quality clusters, "
|
||||
+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
|
||||
* {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
|
||||
* factory will fall back to the default white space tokenizer.
|
||||
*/
|
||||
private static final class ChineseTokenizerFactory {
|
||||
static {
|
||||
try {
|
||||
ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
|
||||
ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
|
||||
} catch (Throwable e) {
|
||||
logger
|
||||
.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
|
||||
+ "of Chinese content may be degraded. For best quality clusters, "
|
||||
+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
|
||||
}
|
||||
}
|
||||
|
||||
static ITokenizer createTokenizer() {
|
||||
try {
|
||||
return new ChineseTokenizer();
|
||||
} catch (Throwable e) {
|
||||
return new ExtendedWhitespaceTokenizer();
|
||||
}
|
||||
}
|
||||
static ITokenizer createTokenizer() {
|
||||
try {
|
||||
return new ChineseTokenizer();
|
||||
} catch (Throwable e) {
|
||||
return new ExtendedWhitespaceTokenizer();
|
||||
}
|
||||
}
|
||||
|
||||
private final static class ChineseTokenizer implements ITokenizer {
|
||||
private final static Pattern numeric = Pattern
|
||||
.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
|
||||
private final static class ChineseTokenizer implements ITokenizer {
|
||||
private final static Pattern numeric = Pattern
|
||||
.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
|
||||
|
||||
private Tokenizer sentenceTokenizer;
|
||||
private TokenStream wordTokenFilter;
|
||||
private CharTermAttribute term = null;
|
||||
private Tokenizer sentenceTokenizer;
|
||||
private TokenStream wordTokenFilter;
|
||||
private CharTermAttribute term = null;
|
||||
|
||||
private final MutableCharArray tempCharSequence;
|
||||
private final Class<?> tokenFilterClass;
|
||||
private final MutableCharArray tempCharSequence;
|
||||
private final Class<?> tokenFilterClass;
|
||||
|
||||
private ChineseTokenizer() throws Exception {
|
||||
this.tempCharSequence = new MutableCharArray(new char[0]);
|
||||
private ChineseTokenizer() throws Exception {
|
||||
this.tempCharSequence = new MutableCharArray(new char[0]);
|
||||
|
||||
// As Smart Chinese is not available during compile time,
|
||||
// we need to resort to reflection.
|
||||
final Class<?> tokenizerClass = ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
|
||||
this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
|
||||
Reader.class).newInstance((Reader) null);
|
||||
this.tokenFilterClass = ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
|
||||
}
|
||||
// As Smart Chinese is not available during compile time,
|
||||
// we need to resort to reflection.
|
||||
final Class<?> tokenizerClass = ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
|
||||
this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
|
||||
Reader.class).newInstance((Reader) null);
|
||||
this.tokenFilterClass = ReflectionUtils.classForName(
|
||||
"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
|
||||
}
|
||||
|
||||
public short nextToken() throws IOException {
|
||||
final boolean hasNextToken = wordTokenFilter.incrementToken();
|
||||
if (hasNextToken) {
|
||||
short flags = 0;
|
||||
final char[] image = term.buffer();
|
||||
final int length = term.length();
|
||||
tempCharSequence.reset(image, 0, length);
|
||||
if (length == 1 && image[0] == ',') {
|
||||
// ChineseTokenizer seems to convert all punctuation to ','
|
||||
// characters
|
||||
flags = ITokenizer.TT_PUNCTUATION;
|
||||
} else if (numeric.matcher(tempCharSequence).matches()) {
|
||||
flags = ITokenizer.TT_NUMERIC;
|
||||
} else {
|
||||
flags = ITokenizer.TT_TERM;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
public short nextToken() throws IOException {
|
||||
final boolean hasNextToken = wordTokenFilter.incrementToken();
|
||||
if (hasNextToken) {
|
||||
short flags = 0;
|
||||
final char[] image = term.buffer();
|
||||
final int length = term.length();
|
||||
tempCharSequence.reset(image, 0, length);
|
||||
if (length == 1 && image[0] == ',') {
|
||||
// ChineseTokenizer seems to convert all punctuation to ','
|
||||
// characters
|
||||
flags = ITokenizer.TT_PUNCTUATION;
|
||||
} else if (numeric.matcher(tempCharSequence).matches()) {
|
||||
flags = ITokenizer.TT_NUMERIC;
|
||||
} else {
|
||||
flags = ITokenizer.TT_TERM;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
return ITokenizer.TT_EOF;
|
||||
}
|
||||
return ITokenizer.TT_EOF;
|
||||
}
|
||||
|
||||
public void setTermBuffer(MutableCharArray array) {
|
||||
array.reset(term.buffer(), 0, term.length());
|
||||
}
|
||||
public void setTermBuffer(MutableCharArray array) {
|
||||
array.reset(term.buffer(), 0, term.length());
|
||||
}
|
||||
|
||||
public void reset(Reader input) throws IOException {
|
||||
try {
|
||||
sentenceTokenizer.reset(input);
|
||||
wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
|
||||
TokenStream.class).newInstance(sentenceTokenizer);
|
||||
public void reset(Reader input) throws IOException {
|
||||
try {
|
||||
sentenceTokenizer.reset(input);
|
||||
wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
|
||||
TokenStream.class).newInstance(sentenceTokenizer);
|
||||
term = wordTokenFilter.addAttribute(CharTermAttribute.class);
|
||||
} catch (Exception e) {
|
||||
throw ExceptionUtils.wrapAsRuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw ExceptionUtils.wrapAsRuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,89 +53,89 @@ import com.google.common.collect.Multimap;
|
|||
*/
|
||||
@Bindable
|
||||
public class SolrStopwordsCarrot2LexicalDataFactory implements
|
||||
ILexicalDataFactory {
|
||||
final static Logger logger = org.slf4j.LoggerFactory
|
||||
.getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
|
||||
ILexicalDataFactory {
|
||||
final static Logger logger = org.slf4j.LoggerFactory
|
||||
.getLogger(SolrStopwordsCarrot2LexicalDataFactory.class);
|
||||
|
||||
@Init
|
||||
@Input
|
||||
@Attribute(key = "solrIndexSchema")
|
||||
private IndexSchema schema;
|
||||
@Init
|
||||
@Input
|
||||
@Attribute(key = "solrIndexSchema")
|
||||
private IndexSchema schema;
|
||||
|
||||
@Processing
|
||||
@Input
|
||||
@Attribute(key = "solrFieldNames")
|
||||
private Set<String> fieldNames;
|
||||
@Processing
|
||||
@Input
|
||||
@Attribute(key = "solrFieldNames")
|
||||
private Set<String> fieldNames;
|
||||
|
||||
/**
|
||||
* A lazily-built cache of stop words per field.
|
||||
*/
|
||||
private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
|
||||
/**
|
||||
* A lazily-built cache of stop words per field.
|
||||
*/
|
||||
private Multimap<String, CharArraySet> solrStopWords = HashMultimap.create();
|
||||
|
||||
/**
|
||||
* Carrot2's default lexical resources to use in addition to Solr's stop
|
||||
* words.
|
||||
*/
|
||||
private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
|
||||
/**
|
||||
* Carrot2's default lexical resources to use in addition to Solr's stop
|
||||
* words.
|
||||
*/
|
||||
private DefaultLexicalDataFactory carrot2LexicalDataFactory = new DefaultLexicalDataFactory();
|
||||
|
||||
/**
|
||||
* Obtains stop words for a field from the associated
|
||||
* {@link StopFilterFactory}, if any.
|
||||
*/
|
||||
private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
|
||||
// No need to synchronize here, Carrot2 ensures that instances
|
||||
// of this class are not used by multiple threads at a time.
|
||||
if (!solrStopWords.containsKey(fieldName)) {
|
||||
  /**
   * Obtains stop words for a field from the associated
   * {@link StopFilterFactory}, if any.
   */
  private Collection<CharArraySet> getSolrStopWordsForField(String fieldName) {
    // No need to synchronize here, Carrot2 ensures that instances
    // of this class are not used by multiple threads at a time.
    if (!solrStopWords.containsKey(fieldName)) {
      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName)
          .getAnalyzer();
      if (fieldAnalyzer instanceof TokenizerChain) {
        final TokenFilterFactory[] filterFactories = ((TokenizerChain) fieldAnalyzer)
            .getTokenFilterFactories();
        for (TokenFilterFactory factory : filterFactories) {
          if (factory instanceof StopFilterFactory) {
            // StopFilterFactory holds the stop words in a CharArraySet, but
            // the getStopWords() method returns a Set<?>, so we need to cast.
            solrStopWords.put(fieldName,
                (CharArraySet) ((StopFilterFactory) factory).getStopWords());
          }

          if (factory instanceof CommonGramsFilterFactory) {
            solrStopWords.put(fieldName,
                (CharArraySet) ((CommonGramsFilterFactory) factory)
                    .getCommonWords());
          }
        }
      }
    }
    return solrStopWords.get(fieldName);
  }

  @Override
  public ILexicalData getLexicalData(LanguageCode languageCode) {
    final ILexicalData carrot2LexicalData = carrot2LexicalDataFactory
        .getLexicalData(languageCode);

    return new ILexicalData() {
      @Override
      public boolean isStopLabel(CharSequence word) {
        // Nothing in Solr maps to the concept of a stop label,
        // so return Carrot2's default here.
        return carrot2LexicalData.isStopLabel(word);
      }

      @Override
      public boolean isCommonWord(MutableCharArray word) {
        // Loop over the fields involved in clustering first
        for (String fieldName : fieldNames) {
          for (CharArraySet stopWords : getSolrStopWordsForField(fieldName)) {
            if (stopWords.contains(word)) {
              return true;
            }
          }
        }
        // Check default Carrot2 stop words too
        return carrot2LexicalData.isCommonWord(word);
      }
    };
  }
}

@@ -280,8 +280,10 @@
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="url" type="string" indexed="true" stored="true" required="true" />
<field name="lang" type="string" indexed="true" stored="true" required="false" multiValued="true" />

<field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="heading" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="snippet" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="body" type="text" indexed="true" stored="true" multiValued="true"/>
<!-- catchall field, containing all other searchable text fields (implemented
@@ -17,6 +17,7 @@ package org.apache.solr.handler.clustering;
 */

import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.junit.BeforeClass;
@@ -34,6 +35,54 @@ public abstract class AbstractClusteringTestCase extends SolrTestCaseJ4 {
      assertNull(h.validateUpdate(adoc("id", Integer.toString(numberOfDocs), "url", doc[0], "title", doc[1], "snippet", doc[2])));
      numberOfDocs++;
    }

    // Add a multi-valued snippet
    final SolrInputDocument multiValuedSnippet = new SolrInputDocument();
    multiValuedSnippet.addField("id", numberOfDocs++);
    multiValuedSnippet.addField("title", "Title");
    multiValuedSnippet.addField("url", "URL");
    multiValuedSnippet.addField("snippet", "First value of multi field. Some more text. And still more.");
    multiValuedSnippet.addField("snippet", "Second value of multi field. Some more text. And still more.");
    multiValuedSnippet.addField("snippet", "Third value of multi field. Some more text. And still more.");
    assertNull(h.validateUpdate(adoc(multiValuedSnippet)));

    // Add a document with multi-field title and snippet
    final SolrInputDocument multiFieldDoc = new SolrInputDocument();
    multiFieldDoc.addField("id", numberOfDocs++);
    multiFieldDoc.addField("title", "Title field");
    multiFieldDoc.addField("heading", "Heading field");
    multiFieldDoc.addField("url", "URL");
    multiFieldDoc.addField("snippet", "Snippet field: this is the contents of the snippet field.");
    multiFieldDoc.addField("body", "Body field: this is the contents of the body field that will get clustered together with snippet.");
    assertNull(h.validateUpdate(adoc(multiFieldDoc)));

    // Add a document with one language supported by Carrot2
    final SolrInputDocument docWithOneSupprtedLanguage = new SolrInputDocument();
    docWithOneSupprtedLanguage.addField("id", numberOfDocs++);
    docWithOneSupprtedLanguage.addField("title", "");
    docWithOneSupprtedLanguage.addField("url", "one_supported_language");
    docWithOneSupprtedLanguage.addField("lang", "zh-cn");
    assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguage)));

    // Add a document with more languages, one supported by Carrot2
    final SolrInputDocument docWithOneSupprtedLanguageOfMany = new SolrInputDocument();
    docWithOneSupprtedLanguageOfMany.addField("id", numberOfDocs++);
    docWithOneSupprtedLanguageOfMany.addField("url", "one_supported_language_of_many");
    docWithOneSupprtedLanguageOfMany.addField("lang", "zh-tw");
    docWithOneSupprtedLanguageOfMany.addField("lang", "POLISH");
    docWithOneSupprtedLanguageOfMany.addField("lang", "de");
    assertNull(h.validateUpdate(adoc(docWithOneSupprtedLanguageOfMany)));

    // Add a document with custom fields
    final SolrInputDocument docWithCustomFields = new SolrInputDocument();
    docWithCustomFields.addField("id", numberOfDocs++);
    docWithCustomFields.addField("url", "custom_fields");
    docWithCustomFields.addField("intfield_i", 10);
    docWithCustomFields.addField("floatfield_f", 10.5);
    docWithCustomFields.addField("heading", "first");
    docWithCustomFields.addField("heading", "second");
    assertNull(h.validateUpdate(adoc(docWithCustomFields)));

    assertNull(h.validateUpdate(commit()));
  }

@@ -39,6 +39,7 @@ import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.SolrPluginUtils;
import org.carrot2.core.LanguageCode;
import org.carrot2.util.attribute.AttributeUtils;
import org.junit.Test;
@@ -50,10 +51,10 @@ import com.google.common.collect.ImmutableList;
public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
  @Test
  public void testCarrotLingo() throws Exception {
    // Note: the expected number of clusters may change after upgrading Carrot2
    // due to e.g. internal improvements or tuning of Carrot2 clustering.
    final int expectedNumClusters = 10;
    checkEngine(getClusteringEngine("default"), expectedNumClusters);
  }

  @Test
@@ -88,10 +89,15 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {

  private List<NamedList<Object>> clusterWithHighlighting(
      boolean enableHighlighting, int fragSize) throws IOException {
    // Some documents don't have mining in the snippet
    return clusterWithHighlighting(enableHighlighting, fragSize, 1, "mine", numberOfDocs - 7);
  }

  private List<NamedList<Object>> clusterWithHighlighting(
      boolean enableHighlighting, int fragSize, int summarySnippets,
      String term, int expectedNumDocuments) throws IOException {

    final TermQuery query = new TermQuery(new Term("snippet", term));

    final ModifiableSolrParams summaryParams = new ModifiableSolrParams();
    summaryParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
@@ -99,6 +105,8 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
        Boolean.toString(enableHighlighting));
    summaryParams
        .add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(fragSize));
    summaryParams
        .add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(summarySnippets));
    final List<NamedList<Object>> summaryClusters = checkEngine(
        getClusteringEngine("echo"), expectedNumDocuments,
        expectedNumDocuments, query, summaryParams);
@@ -169,66 +177,180 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
        params), 1, 3, 0);
  }

  @Test
  public void testLexicalResourcesFromSolrConfigDefaultDir() throws Exception {
    checkLexicalResourcesFromSolrConfig("lexical-resource-check",
        "online,customsolrstopword,customsolrstoplabel");
  }

  @Test
  public void testLexicalResourcesFromSolrConfigCustomDir() throws Exception {
    checkLexicalResourcesFromSolrConfig("lexical-resource-check-custom-resource-dir",
        "online,customsolrstopwordcustomdir,customsolrstoplabelcustomdir");
  }

  private void checkLexicalResourcesFromSolrConfig(String engineName, String wordsToCheck)
      throws IOException {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        wordsToCheck);

    // "customsolrstopword" is in stopwords.en, "customsolrstoplabel" is in
    // stoplabels.mt, so we're expecting only one cluster with label "online".
    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine(engineName), 1, params);
    assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
  }

  @Test
  public void solrStopWordsUsedInCarrot2Clustering() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        "online,solrownstopword");

    // "solrownstopword" is in stopwords.txt, so we're expecting
    // only one cluster with label "online".
    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine("lexical-resource-check"), 1, params);
    assertEquals(getLabels(clusters.get(0)), ImmutableList.of("online"));
  }

  @Test
  public void solrStopWordsNotDefinedOnAFieldForClustering() throws Exception {
    ModifiableSolrParams params = new ModifiableSolrParams();
    // Force string fields to be used for clustering. Does not make sense
    // in the real world, but does the job in the test.
    params.set(CarrotParams.TITLE_FIELD_NAME, "url");
    params.set(CarrotParams.SNIPPET_FIELD_NAME, "url");
    params.set("merge-resources", false);
    params.set(AttributeUtils.getKey(
        LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck"),
        "online,solrownstopword");

    final List<NamedList<Object>> clusters = checkEngine(
        getClusteringEngine("lexical-resource-check"), 2, params);
    assertEquals(ImmutableList.of("online"), getLabels(clusters.get(0)));
    assertEquals(ImmutableList.of("solrownstopword"),
        getLabels(clusters.get(1)));
  }

  @Test
  public void highlightingOfMultiValueField() throws Exception {
    final String snippetWithoutSummary = getLabels(clusterWithHighlighting(
        false, 30, 3, "multi", 1).get(0)).get(1);
    assertTrue("Snippet contains first value", snippetWithoutSummary.contains("First"));
    assertTrue("Snippet contains second value", snippetWithoutSummary.contains("Second"));
    assertTrue("Snippet contains third value", snippetWithoutSummary.contains("Third"));

    final String snippetWithSummary = getLabels(clusterWithHighlighting(
        true, 30, 3, "multi", 1).get(0)).get(1);
    assertTrue("Snippet with summary shorter than full snippet",
        snippetWithoutSummary.length() > snippetWithSummary.length());
    assertTrue("Summary covers first value", snippetWithSummary.contains("First"));
    assertTrue("Summary covers second value", snippetWithSummary.contains("Second"));
    assertTrue("Summary covers third value", snippetWithSummary.contains("Third"));
  }

  @Test
  public void concatenatingMultipleFields() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("body",
            "snippet")), params).get(0));
    assertTrue("Label contains title field", labels.get(0).contains("Title field"));
    assertTrue("Label contains heading field", labels.get(0).contains("Heading field"));
    assertTrue("Label contains snippet field", labels.get(1).contains("Snippet field"));
    assertTrue("Label contains body field", labels.get(1).contains("Body field"));
  }

  @Test
  public void highlightingMultipleFields() throws Exception {
    final TermQuery query = new TermQuery(new Term("snippet", "content"));

    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.TITLE_FIELD_NAME, "title,heading");
    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet,body");
    params.add(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(false));

    final String snippetWithoutSummary = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
    assertTrue("Snippet covers snippet field", snippetWithoutSummary.contains("snippet field"));
    assertTrue("Snippet covers body field", snippetWithoutSummary.contains("body field"));

    params.set(CarrotParams.PRODUCE_SUMMARY, Boolean.toString(true));
    params.add(CarrotParams.SUMMARY_FRAGSIZE, Integer.toString(30));
    params.add(CarrotParams.SUMMARY_SNIPPETS, Integer.toString(2));
    final String snippetWithSummary = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, query, params).get(0)).get(1);
    assertTrue("Snippet with summary shorter than full snippet",
        snippetWithoutSummary.length() > snippetWithSummary.length());
    assertTrue("Snippet covers snippet field", snippetWithSummary.contains("snippet field"));
    assertTrue("Snippet covers body field", snippetWithSummary.contains("body field"));
  }

  @Test
  public void oneCarrot2SupportedLanguage() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
            "one_supported_language")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language", LanguageCode.CHINESE_SIMPLIFIED.name(), labels.get(2));
  }

  @Test
  public void oneCarrot2SupportedLanguageOfMany() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
            "one_supported_language_of_many")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language", LanguageCode.GERMAN.name(), labels.get(2));
  }

  @Test
  public void languageCodeMapping() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.LANGUAGE_FIELD_NAME, "lang");
    params.add(CarrotParams.LANGUAGE_CODE_MAP, "POLISH:pl");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
            "one_supported_language_of_many")), params).get(0));
    assertEquals(3, labels.size());
    assertEquals("Correct Carrot2 language", LanguageCode.POLISH.name(), labels.get(2));
  }

  @Test
  public void passingOfCustomFields() throws Exception {
    final ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "intfield_i:intfield");
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "floatfield_f:floatfield");
    params.add(CarrotParams.CUSTOM_FIELD_NAME, "heading:multi");

    // Let the echo mock clustering algorithm know which custom field to echo
    params.add("custom-fields", "intfield,floatfield,multi");

    final List<String> labels = getLabels(checkEngine(
        getClusteringEngine("echo"), 1, 1, new TermQuery(new Term("url",
            "custom_fields")), params).get(0));
    assertEquals(5, labels.size());
    assertEquals("Integer field", "10", labels.get(2));
    assertEquals("Float field", "10.5", labels.get(3));
    assertEquals("List field", "[first, second]", labels.get(4));
  }

  private CarrotClusteringEngine getClusteringEngine(String engineName) {
    ClusteringComponent comp = (ClusteringComponent) h.getCore()
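The CarrotParams.CUSTOM_FIELD_NAME values in passingOfCustomFields() use a "solrField:carrotField" syntax to route extra Solr fields into Carrot2 documents. A hypothetical sketch of parsing such mapping specs (the parseFieldMapping helper below is illustrative only, not part of the patch):

    import java.util.HashMap;
    import java.util.Map;

    public class FieldMappingSketch {
      // "intfield_i:intfield" maps Solr field intfield_i to Carrot2 field
      // intfield; a spec without a colon maps a field to itself.
      static Map<String, String> parseFieldMapping(String... specs) {
        Map<String, String> mapping = new HashMap<String, String>();
        for (String spec : specs) {
          int colon = spec.indexOf(':');
          mapping.put(colon < 0 ? spec : spec.substring(0, colon),
              colon < 0 ? spec : spec.substring(colon + 1));
        }
        return mapping;
      }

      public static void main(String[] args) {
        System.out.println(parseFieldMapping(
            "intfield_i:intfield", "floatfield_f:floatfield", "heading:multi"));
        // e.g. {intfield_i=intfield, floatfield_f=floatfield, heading=multi}
        // (HashMap iteration order may vary)
      }
    }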
@@ -273,7 +395,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    SolrDocumentList solrDocList = SolrPluginUtils.docListToSolrDocumentList( docList, searcher, engine.getFieldsToLoad(req), docIds );

    @SuppressWarnings("unchecked")
    List<NamedList<Object>> results = (List<NamedList<Object>>) engine.cluster(query, solrDocList, docIds, req);
    req.close();
    assertEquals("number of clusters: " + results, expectedNumClusters, results.size());
    checkClusters(results, false);
@@ -302,7 +424,7 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
      List<Object> docs = getDocs(cluster);
      assertNotNull("docs is null and it shouldn't be", docs);
      for (int j = 0; j < docs.size(); j++) {
        Object id = docs.get(j);
        assertNotNull("id is null and it shouldn't be", id);
      }
@@ -331,26 +453,26 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    }
  }

  @SuppressWarnings("unchecked")
  private List<NamedList<Object>> getSubclusters(NamedList<Object> cluster) {
    return (List<NamedList<Object>>) cluster.get("clusters");
  }

  @SuppressWarnings("unchecked")
  private List<String> getLabels(NamedList<Object> cluster) {
    return (List<String>) cluster.get("labels");
  }

  private Double getScore(NamedList<Object> cluster) {
    return (Double) cluster.get("score");
  }

  private Boolean isOtherTopics(NamedList<Object> cluster) {
    return (Boolean) cluster.get("other-topics");
  }

  @SuppressWarnings("unchecked")
  private List<Object> getDocs(NamedList<Object> cluster) {
    return (List<Object>) cluster.get("docs");
  }
}
@@ -15,6 +15,7 @@ package org.apache.solr.handler.clustering.carrot2;
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.util.Collections;
import java.util.List;

import org.carrot2.core.Cluster;
@@ -48,6 +49,12 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
  @Attribute(key = AttributeNames.CLUSTERS)
  private List<Cluster> clusters;

  @Input
  @Processing
  @Attribute(key = "custom-fields")
  private String customFields = "";

  @Override
  public void process() throws ProcessingException {
    clusters = Lists.newArrayListWithCapacity(documents.size());
@@ -55,6 +62,15 @@ public class EchoClusteringAlgorithm extends ProcessingComponentBase implements
    for (Document document : documents) {
      final Cluster cluster = new Cluster();
      cluster.addPhrases(document.getTitle(), document.getSummary());
      if (document.getLanguage() != null) {
        cluster.addPhrases(document.getLanguage().name());
      }
      for (String field : customFields.split(",")) {
        Object value = document.getField(field);
        if (value != null) {
          cluster.addPhrases(value.toString());
        }
      }
      cluster.addDocuments(document);
      clusters.add(cluster);
    }
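One subtlety in the loop above: customFields defaults to the empty string, and in Java "".split(",") returns a single-element array containing "", so the inner loop still runs once, looks up the empty field name, gets null and skips it. A minimal demonstration of that String.split behavior:

    public class SplitSketch {
      public static void main(String[] args) {
        // An empty input yields [""], not an empty array.
        System.out.println("".split(",").length);    // 1
        System.out.println("a,b".split(",").length); // 2
      }
    }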
@@ -25,9 +25,7 @@ import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.attribute.Attribute;
@@ -46,37 +44,37 @@ import com.google.common.collect.Lists;
 */
@Bindable(prefix = "LexicalResourcesCheckClusteringAlgorithm")
public class LexicalResourcesCheckClusteringAlgorithm extends
    ProcessingComponentBase implements IClusteringAlgorithm {

  @Output
  @Processing
  @Attribute(key = AttributeNames.CLUSTERS)
  private List<Cluster> clusters;

  @Input
  @Processing
  @Attribute
  private String wordsToCheck;

  private BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();

  @Override
  public void process() throws ProcessingException {
    clusters = Lists.newArrayList();
    if (wordsToCheck == null) {
      return;
    }

    // Test with Maltese so that the English clustering performed in other tests
    // is not affected by the test stopwords and stoplabels.
    ILexicalData lexicalData = preprocessing.lexicalDataFactory
        .getLexicalData(LanguageCode.MALTESE);

    for (String word : wordsToCheck.split(",")) {
      if (!lexicalData.isCommonWord(new MutableCharArray(word))
          && !lexicalData.isStopLabel(word)) {
        clusters.add(new Cluster(word));
      }
    }
  }
}
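Worth noting: the tests drive this class through its wordsToCheck attribute and build the attribute key with AttributeUtils.getKey(LexicalResourcesCheckClusteringAlgorithm.class, "wordsToCheck") rather than hard-coding the key string, so the binding stays correct if the class is renamed; this is consistent with the @Bindable(prefix = ...) declaration on the class.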
@@ -17,6 +17,7 @@
package org.apache.solr.handler.dataimport;

import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.CommitUpdateCommand;
@@ -27,8 +28,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.Map;
import java.util.Set;

/**
 * <p> Writes documents to SOLR. </p>
@@ -43,12 +42,14 @@ public class SolrWriter extends DIHWriterBase implements DIHWriter {
  static final String LAST_INDEX_KEY = "last_index_time";

  private final UpdateRequestProcessor processor;

  private final int commitWithin;

  SolrQueryRequest req;

  public SolrWriter(UpdateRequestProcessor processor, SolrQueryRequest req) {
    this.processor = processor;
    this.req = req;
    commitWithin = (req != null) ? req.getParams().getInt(UpdateParams.COMMIT_WITHIN, -1) : -1;
  }

  @Override
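With the constructor change above, commitWithin is taken straight from the request via the standard UpdateParams.COMMIT_WITHIN ("commitWithin") parameter. For illustration (host, port and core path assumed), a DataImportHandler call such as

    http://localhost:8983/solr/dataimport?command=full-import&commit=false&commitWithin=1000

makes every AddUpdateCommand issued by SolrWriter carry commitWithin=1000, so the imported documents become visible within roughly one second even though no explicit commit was requested.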
@@ -65,6 +66,7 @@ public class SolrWriter extends DIHWriterBase implements DIHWriter {
    try {
      AddUpdateCommand command = new AddUpdateCommand(req);
      command.solrDoc = d;
      command.commitWithin = commitWithin;
      processor.processAdd(command);
    } catch (Exception e) {
      log.warn("Error creating document : " + d, e);
@@ -24,6 +24,7 @@ import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.UpdateParams;

import org.junit.After;
import org.junit.Before;
@@ -80,6 +81,33 @@ public class TestContentStreamDataSource extends AbstractDataImportHandlerTestCase {
    assertEquals("Hello C1", ((List)doc.getFieldValue("desc")).get(0));
  }

  @Test
  public void testCommitWithin() throws Exception {
    DirectXmlRequest req = new DirectXmlRequest("/dataimport", xml);
    ModifiableSolrParams params = params("command", "full-import",
        "clean", "false", UpdateParams.COMMIT, "false",
        UpdateParams.COMMIT_WITHIN, "1000");
    req.setParams(params);
    String url = "http://localhost:" + jetty.getLocalPort() + "/solr";
    CommonsHttpSolrServer solrServer = new CommonsHttpSolrServer(url);
    solrServer.request(req);
    Thread.sleep(100);
    ModifiableSolrParams queryAll = params("q", "*");
    QueryResponse qres = solrServer.query(queryAll);
    SolrDocumentList results = qres.getResults();
    assertEquals(0, results.getNumFound());
    Thread.sleep(1000);
    for (int i = 0; i < 10; i++) {
      qres = solrServer.query(queryAll);
      results = qres.getResults();
      if (2 == results.getNumFound()) {
        return;
      }
      Thread.sleep(500);
    }
    fail("Commit should have occurred but it did not");
  }

  private class SolrInstance {
    String name;
    Integer port;
@@ -162,13 +162,18 @@ public class QueryComponent extends SearchComponent

    //TODO: move weighting of sort
    Sort groupSort = searcher.weightSort(cmd.getSort());
    if (groupSort == null) {
      groupSort = Sort.RELEVANCE;
    }

    // groupSort defaults to sort
    String groupSortStr = params.get(GroupParams.GROUP_SORT);
    //TODO: move weighting of sort
    Sort sortWithinGroup = groupSortStr == null ? groupSort : searcher.weightSort(QueryParsing.parseSort(groupSortStr, req));
    if (sortWithinGroup == null) {
      sortWithinGroup = Sort.RELEVANCE;
    }

    groupingSpec.setSortWithinGroup(sortWithinGroup);
    groupingSpec.setGroupSort(groupSort);
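A sketch of how the two sorts interact at request time (the parameter names are Solr's standard grouping parameters; the field names and values are illustrative):

    # Groups are ordered by the main sort; documents inside each group are
    # ordered by group.sort, which falls back to sort when absent.
    q=*:*&group=true&group.field=cat&sort=price+asc&group.sort=score+desc

The hunk above replaces the empty new Sort() fallback with Sort.RELEVANCE and applies the same fallback to the new within-group sort.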

@@ -140,6 +140,7 @@ public class TestDistributedGrouping extends BaseDistributedSearchTestCase {
    query("q", "*:*", "fq", s1 + ":a", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", i1 + " asc, id asc", "group.truncate", "true", "facet", "true", "facet.field", t1);

    // We cannot validate distributed grouping with scoring as the first sort, since there is no global idf; we can only check that no errors occur.
    simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", i1 + " desc", "group.sort", "score desc"); // SOLR-2955
    simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10, "sort", "score desc, _docid_ asc, id asc");
    simpleQuery("q", "*:*", "rows", 100, "fl", "id," + i1, "group", "true", "group.field", i1, "group.limit", 10);
  }
@@ -149,6 +150,7 @@ public class TestDistributedGrouping extends BaseDistributedSearchTestCase {
    for (int i = 0; i < queryParams.length; i += 2) {
      params.add(queryParams[i].toString(), queryParams[i + 1].toString());
    }
    params.set("shards", shards);
    queryServer(params);
  }