SOLR-3470: Custom Carrot2 tokenizer and stemmer factories overwritten by defaults: fixed

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1340686 13f79535-47bb-0310-9956-ffa450edef68
2025-02-21 09:36:46 +00:00 · 2012-05-20 11:53:51 +00:00 · 2012-05-20 11:53:51 +00:00 · d65f086baf
commit d65f086baf
parent d722b19748
7 changed files with 277 additions and 4 deletions
--- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
+++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
@ -62,6 +62,7 @@ import org.carrot2.core.LanguageCode;
 import org.carrot2.core.attribute.AttributeNames;
 import org.carrot2.text.linguistic.DefaultLexicalDataFactoryDescriptor;
 import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder;
 import org.carrot2.util.resource.ClassLoaderLocator;
 import org.carrot2.util.resource.IResource;
 import org.carrot2.util.resource.IResourceLocator;
@ -255,10 +256,14 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
    // Additionally, we set a custom lexical resource factory for Carrot2 that
    // will use both Carrot2 default stop words as well as stop words from
    // the StopFilter defined on the field.
-    BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes)
-        .stemmerFactory(LuceneCarrot2StemmerFactory.class)
-        .tokenizerFactory(LuceneCarrot2TokenizerFactory.class)
-        .lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+    final AttributeBuilder attributeBuilder = BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes);
+    attributeBuilder.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
+    if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.TOKENIZER_FACTORY)) {
+      attributeBuilder.tokenizerFactory(LuceneCarrot2TokenizerFactory.class);
+    }
+    if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.STEMMER_FACTORY)) {
+      attributeBuilder.stemmerFactory(LuceneCarrot2StemmerFactory.class);
+    }

    // Pass the schema to SolrStopwordsCarrot2LexicalDataFactory.
    initAttributes.put("solrIndexSchema", core.getSchema());
--- a/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
+++ b/solr/contrib/clustering/src/test-files/clustering/solr/conf/solrconfig.xml
@ -339,6 +339,16 @@
      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.LexicalResourcesCheckClusteringAlgorithm</str>
      <str name="carrot.lexicalResourcesDir">clustering/custom</str>
    </lst>
+    <lst name="engine">
+      <str name="name">custom-duplicating-tokenizer</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoTokensClusteringAlgorithm</str>
+      <str name="PreprocessingPipeline.tokenizerFactory">org.apache.solr.handler.clustering.carrot2.DuplicatingTokenizerFactory</str>
+    </lst>
+    <lst name="engine">
+      <str name="name">custom-duplicating-stemmer</str>
+      <str name="carrot.algorithm">org.apache.solr.handler.clustering.carrot2.EchoStemsClusteringAlgorithm</str>
+      <str name="PreprocessingPipeline.stemmerFactory">org.apache.solr.handler.clustering.carrot2.DuplicatingStemmerFactory</str>
+    </lst>
  </searchComponent>

  <searchComponent class="org.apache.solr.handler.clustering.ClusteringComponent" name="doc-clustering">
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
@ -352,6 +352,34 @@ public class CarrotClusteringEngineTest extends AbstractClusteringTestCase {
    assertEquals("List field", "[first, second]", labels.get(4));
  }

+  @Test
+  public void customTokenizer() throws Exception {
+    final ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(CarrotParams.TITLE_FIELD_NAME, "title");
+    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
+
+    final List<String> labels = getLabels(checkEngine(
+        getClusteringEngine("custom-duplicating-tokenizer"), 1, 16, new TermQuery(new Term("title",
+            "field")), params).get(0));
+    
+    // The custom test tokenizer duplicates each token's text
+    assertTrue("First token", labels.get(0).contains("TitleTitle"));
+  }
+  
+  @Test
+  public void customStemmer() throws Exception {
+    final ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(CarrotParams.TITLE_FIELD_NAME, "title");
+    params.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
+    
+    final List<String> labels = getLabels(checkEngine(
+        getClusteringEngine("custom-duplicating-stemmer"), 1, 12, new TermQuery(new Term("title",
+            "field")), params).get(0));
+    
+    // The custom test stemmer duplicates and lowercases each token's text
+    assertTrue("First token", labels.get(0).contains("titletitle"));
+  }
+
  private CarrotClusteringEngine getClusteringEngine(String engineName) {
    ClusteringComponent comp = (ClusteringComponent) h.getCore()
            .getSearchComponent("clustering");
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/DuplicatingStemmerFactory.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/DuplicatingStemmerFactory.java
@ -0,0 +1,34 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.linguistic.IStemmer;
+import org.carrot2.text.linguistic.IStemmerFactory;
+
+public class DuplicatingStemmerFactory implements IStemmerFactory {
+  @Override
+  public IStemmer getStemmer(LanguageCode language) {
+    return new IStemmer() {
+      @Override
+      public CharSequence stem(CharSequence word) {
+        return word.toString() + word.toString();
+      }
+    };
+  }
+}
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/DuplicatingTokenizerFactory.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/DuplicatingTokenizerFactory.java
@ -0,0 +1,52 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
+import org.carrot2.text.analysis.ITokenizer;
+import org.carrot2.text.linguistic.ITokenizerFactory;
+import org.carrot2.text.util.MutableCharArray;
+
+public class DuplicatingTokenizerFactory implements ITokenizerFactory {
+  @Override
+  public ITokenizer getTokenizer(LanguageCode language) {
+    return new ITokenizer() {
+      private final ExtendedWhitespaceTokenizer delegate = new ExtendedWhitespaceTokenizer();
+      
+      @Override
+      public void setTermBuffer(MutableCharArray buffer) {
+        delegate.setTermBuffer(buffer);
+        buffer.reset(buffer.toString() + buffer.toString());
+      }
+      
+      @Override
+      public void reset(Reader input) throws IOException {
+        delegate.reset(input);
+      }
+      
+      @Override
+      public short nextToken() throws IOException {
+        return delegate.nextToken();
+      }
+    };
+  }
+}
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoStemsClusteringAlgorithm.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoStemsClusteringAlgorithm.java
@ -0,0 +1,75 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.List;
+
+import org.carrot2.core.Cluster;
+import org.carrot2.core.Document;
+import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.core.ProcessingComponentBase;
+import org.carrot2.core.ProcessingException;
+import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.core.attribute.Processing;
+import org.carrot2.text.preprocessing.PreprocessingContext;
+import org.carrot2.text.preprocessing.PreprocessingContext.AllStems;
+import org.carrot2.text.preprocessing.PreprocessingContext.AllTokens;
+import org.carrot2.text.preprocessing.PreprocessingContext.AllWords;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
+import org.carrot2.util.attribute.Attribute;
+import org.carrot2.util.attribute.Bindable;
+import org.carrot2.util.attribute.Input;
+import org.carrot2.util.attribute.Output;
+
+import com.google.common.collect.Lists;
+
+/**
+ * A mock Carrot2 clustering algorithm that outputs stem of each token of each
+ * document as a separate cluster. Useful only in tests.
+ */
+@Bindable(prefix = "EchoTokensClusteringAlgorithm")
+public class EchoStemsClusteringAlgorithm extends ProcessingComponentBase
+    implements IClusteringAlgorithm {
+  @Input
+  @Processing
+  @Attribute(key = AttributeNames.DOCUMENTS)
+  private List<Document> documents;
+  
+  @Output
+  @Processing
+  @Attribute(key = AttributeNames.CLUSTERS)
+  private List<Cluster> clusters;
+  
+  BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
+  
+  @Override
+  public void process() throws ProcessingException {
+    final PreprocessingContext preprocessingContext = preprocessing.preprocess(
+        documents, "", LanguageCode.ENGLISH);
+    final AllTokens allTokens = preprocessingContext.allTokens;
+    final AllWords allWords = preprocessingContext.allWords;
+    final AllStems allStems = preprocessingContext.allStems;
+    clusters = Lists.newArrayListWithCapacity(allTokens.image.length);
+    for (int i = 0; i < allTokens.image.length; i++) {
+      if (allTokens.wordIndex[i] >= 0) {
+        clusters.add(new Cluster(new String(
+            allStems.image[allWords.stemIndex[allTokens.wordIndex[i]]])));
+      }
+    }
+  }
+}
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoTokensClusteringAlgorithm.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoTokensClusteringAlgorithm.java
@ -0,0 +1,69 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.List;
+
+import org.carrot2.core.Cluster;
+import org.carrot2.core.Document;
+import org.carrot2.core.IClusteringAlgorithm;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.core.ProcessingComponentBase;
+import org.carrot2.core.ProcessingException;
+import org.carrot2.core.attribute.AttributeNames;
+import org.carrot2.core.attribute.Processing;
+import org.carrot2.text.preprocessing.PreprocessingContext;
+import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
+import org.carrot2.util.attribute.Attribute;
+import org.carrot2.util.attribute.Bindable;
+import org.carrot2.util.attribute.Input;
+import org.carrot2.util.attribute.Output;
+
+import com.google.common.collect.Lists;
+
+/**
+ * A mock Carrot2 clustering algorithm that outputs each token of each document
+ * as a separate cluster. Useful only in tests.
+ */
+@Bindable(prefix = "EchoTokensClusteringAlgorithm")
+public class EchoTokensClusteringAlgorithm extends ProcessingComponentBase
+    implements IClusteringAlgorithm {
+  @Input
+  @Processing
+  @Attribute(key = AttributeNames.DOCUMENTS)
+  private List<Document> documents;
+  
+  @Output
+  @Processing
+  @Attribute(key = AttributeNames.CLUSTERS)
+  private List<Cluster> clusters;
+  
+  BasicPreprocessingPipeline preprocessing = new BasicPreprocessingPipeline();
+  
+  @Override
+  public void process() throws ProcessingException {
+    final PreprocessingContext preprocessingContext = preprocessing.preprocess(
+        documents, "", LanguageCode.ENGLISH);
+    clusters = Lists
+        .newArrayListWithCapacity(preprocessingContext.allTokens.image.length);
+    for (char[] token : preprocessingContext.allTokens.image) {
+      if (token != null) {
+        clusters.add(new Cluster(new String(token)));
+      }
+    }
+  }
+}