SOLR-3470: Custom Carrot2 tokenizer and stemmer factories overwritten by defaults: fixed

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1340686 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Stanisław Osiński 2012-05-20 11:53:51 +00:00
parent d722b19748
commit d65f086baf
7 changed files with 277 additions and 4 deletions

View File

@ -62,6 +62,7 @@ import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
import org.carrot2.util.resource.ClassLoaderLocator;
import org.carrot2.util.resource.IResource;
import org.carrot2.util.resource.IResourceLocator;
@ -255,10 +256,14 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
// Additionally, we set a custom lexical resource factory for Carrot2 that
// will use both Carrot2 default stop words as well as stop words from
// the StopFilter defined on the field.
BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
.stemmerFactory(LuceneCarrot2StemmerFactory.class)
.tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
final AttributeBuilder attributeBuilder = BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes);
attributeBuilder.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.TOKENIZER_FACTORY)) {
attributeBuilder.tokenizerFactory(LuceneCarrot2TokenizerFactory.class);
}
if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.STEMMER_FACTORY)) {
attributeBuilder.stemmerFactory(LuceneCarrot2StemmerFactory.class);
}
// Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
initAttributes.put("solrIndexSchema", core.getSchema());

View File

@ -339,6 +339,16 @@
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
<str name="carrot.lexicalResourcesDir">clustering/custom</str>
</lst>
<lst name="engine">
<str name="name">custom-duplicating-tokenizer</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoTokensClusteringAlgorithm</str>
<str name="PreprocessingPipeline.tokenizerFactory">org.apache.solr.handler.clustering.carrot2.DuplicatingTokenizerFactory</str>
</lst>
<lst name="engine">
<str name="name">custom-duplicating-stemmer</str>
<str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoStemsClusteringAlgorithm</str>
<str name="PreprocessingPipeline.stemmerFactory">org.apache.solr.handler.clustering.carrot2.DuplicatingStemmerFactory</str>
</lst>
</searchComponent>
<searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">

View File

@ -352,6 +352,34 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
assertEquals("List field", "[first, second]", labels.get(4));
}
@Test
public void customTokenizer() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("custom-duplicating-tokenizer"), 1, 16, new TermQuery(new Term("title",
"field")), params).get(0));
// The custom test tokenizer duplicates each token's text
assertTrue("First token", labels.get(0).contains("TitleTitle"));
}
@Test
public void customStemmer() throws Exception {
final ModifiableSolrParams params = new ModifiableSolrParams();
params.add(CarrotParams.TITLE_FIELD_NAME, "title");
params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
final List<String> labels = getLabels(checkEngine(
getClusteringEngine("custom-duplicating-stemmer"), 1, 12, new TermQuery(new Term("title",
"field")), params).get(0));
// The custom test stemmer duplicates and lowercases each token's text
assertTrue("First token", labels.get(0).contains("titletitle"));
}
private CarrotClusteringEngine getClusteringEngine(String engineName) {
ClusteringComponent comp = (ClusteringComponent) h.getCore()
.getSearchComponent("clustering");

View File

@ -0,0 +1,34 @@
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.IStemmerFactory;
public class DuplicatingStemmerFactory implements IStemmerFactory {
@Override
public IStemmer getStemmer(LanguageCode language) {
return new IStemmer() {
@Override
public CharSequence stem(CharSequence word) {
return word.toString() + word.toString();
}
};
}
}

View File

@ -0,0 +1,52 @@
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
public class DuplicatingTokenizerFactory implements ITokenizerFactory {
@Override
public ITokenizer getTokenizer(LanguageCode language) {
return new ITokenizer() {
private final ExtendedWhitespaceTokenizer delegate = new ExtendedWhitespaceTokenizer();
@Override
public void setTermBuffer(MutableCharArray buffer) {
delegate.setTermBuffer(buffer);
buffer.reset(buffer.toString() + buffer.toString());
}
@Override
public void reset(Reader input) throws IOException {
delegate.reset(input);
}
@Override
public short nextToken() throws IOException {
return delegate.nextToken();
}
};
}
}

View File

@ -0,0 +1,75 @@
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.List;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.PreprocessingContext.AllStems;
import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
import com.google.common.collect.Lists;
/**
* A mock Carrot2 clustering algorithm that outputs stem of each token of each
* document as a separate cluster. Useful only in tests.
*/
@Bindable(prefix = "EchoTokensClusteringAlgorithm")
public class EchoStemsClusteringAlgorithm extends ProcessingComponentBase
implements IClusteringAlgorithm {
@Input
@Processing
@Attribute(key = AttributeNames.DOCUMENTS)
private List<Document> documents;
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
@Override
public void process() throws ProcessingException {
final PreprocessingContext preprocessingContext = preprocessing.preprocess(
documents, "", LanguageCode.ENGLISH);
final AllTokens allTokens = preprocessingContext.allTokens;
final AllWords allWords = preprocessingContext.allWords;
final AllStems allStems = preprocessingContext.allStems;
clusters = Lists.newArrayListWithCapacity(allTokens.image.length);
for (int i = 0; i < allTokens.image.length; i++) {
if (allTokens.wordIndex[i] >= 0) {
clusters.add(new Cluster(new String(
allStems.image[allWords.stemIndex[allTokens.wordIndex[i]]])));
}
}
}
}

View File

@ -0,0 +1,69 @@
package org.apache.solr.handler.clustering.carrot2;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.List;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Output;
import com.google.common.collect.Lists;
/**
* A mock Carrot2 clustering algorithm that outputs each token of each document
* as a separate cluster. Useful only in tests.
*/
@Bindable(prefix = "EchoTokensClusteringAlgorithm")
public class EchoTokensClusteringAlgorithm extends ProcessingComponentBase
implements IClusteringAlgorithm {
@Input
@Processing
@Attribute(key = AttributeNames.DOCUMENTS)
private List<Document> documents;
@Output
@Processing
@Attribute(key = AttributeNames.CLUSTERS)
private List<Cluster> clusters;
BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
@Override
public void process() throws ProcessingException {
final PreprocessingContext preprocessingContext = preprocessing.preprocess(
documents, "", LanguageCode.ENGLISH);
clusters = Lists
.newArrayListWithCapacity(preprocessingContext.allTokens.image.length);
for (char[] token : preprocessingContext.allTokens.image) {
if (token != null) {
clusters.add(new Cluster(new String(token)));
}
}
}
}