SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik to version 1.7.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512203 13f79535-47bb-0310-9956-ffa450edef68
2013-08-09 08:39:21 +00:00 · 2013-08-09 08:39:21 +00:00 · b40f603f46
parent 8177498c86
commit b40f603f46
31 changed files with 72 additions and 98 deletions
--- a/dev-tools/maven/pom.xml.template
+++ b/dev-tools/maven/pom.xml.template
@ -407,12 +407,12 @@
      <dependency>
        <groupId>org.carrot2</groupId>
        <artifactId>carrot2-mini</artifactId>
-        <version>3.6.2</version>
+        <version>3.8.0</version>
      </dependency>
      <dependency>
        <groupId>org.carrot2</groupId>
        <artifactId>morfologik-polish</artifactId>
-        <version>1.6.0</version>
+        <version>1.7.1</version>
      </dependency>
      <dependency>
        <groupId>org.codehaus.woodstox</groupId>
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -23,10 +23,6 @@ Changes in backwards compatibility policy
  not positioned. This change affects all classes that inherit from
  DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand)

-* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter 
-  no longer support multiple "dictionaries" as there is only one dictionary available.
-  (Dawid Weiss)
-
 * LUCENE-5127: Reduce RAM usage of FixedGapTermsIndex. Remove 
  IndexWriterConfig.setTermIndexInterval, IndexWriterConfig.setReaderTermsIndexDivisor,
  and termsIndexDivisor from StandardDirectoryReader. These options have been no-ops
@ -39,10 +35,6 @@ New Features
 * LUCENE-4747: Move to Java 7 as minimum Java version.
  (Robert Muir, Uwe Schindler)

-* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter 
-  no longer support multiple "dictionaries" as there is only one dictionary available.
-  (Dawid Weiss)
-
 * SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
  (Ryo Onodera via Koji Sekiguchi)

@ -184,6 +176,10 @@ Changes in backwards compatibility policy
  CheckIndex.fixIndex(Status). If you used to pass a codec to this method, just
  remove it from the arguments. (Adrien Grand)

+* LUCENE-5089, SOLR-5126: Update to Morfologik 1.7.1. MorfologikAnalyzer and MorfologikFilter 
+  no longer support multiple "dictionaries" as there is only one dictionary available.
+  (Dawid Weiss)
+
 ======================= Lucene 4.4.0 =======================

 Changes in backwards compatibility policy
--- a/lucene/analysis/morfologik/ivy.xml
+++ b/lucene/analysis/morfologik/ivy.xml
@ -19,9 +19,9 @@
 <ivy-module version="2.0">
    <info organisation="org.apache.lucene" module="analyzers-morfologik"/>
    <dependencies>
-      <dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
      <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
    </dependencies>
 </ivy-module>
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.morfologik;

 import java.io.IOException;
 import java.util.*;
+import java.util.regex.Pattern;

 import morfologik.stemming.*;

@ -82,73 +83,31 @@ public class MorfologikFilter extends TokenFilter {
  }

  /**
-   * The tag encoding format has been changing in Morfologik from version
-   * to version. Let's keep both variants and determine which one to run
-   * based on this flag.
+   * A pattern used to split lemma forms.
   */
-  private final static boolean multipleTagsPerLemma = true;
+  private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");

  private void popNextLemma() {
-    if (multipleTagsPerLemma) {
-      // One tag (concatenated) per lemma.
-      final WordData lemma = lemmaList.get(lemmaListIndex++);
-      termAtt.setEmpty().append(lemma.getStem());
-      CharSequence tag = lemma.getTag();
-      if (tag != null) {
-        String[] tags = tag.toString().split("\\+|\\|");
-        for (int i = 0; i < tags.length; i++) {
-          if (tagsList.size() <= i) {
-            tagsList.add(new StringBuilder());
-          }
-          StringBuilder buffer = tagsList.get(i);
-          buffer.setLength(0);
-          buffer.append(tags[i]);
+    // One tag (concatenated) per lemma.
+    final WordData lemma = lemmaList.get(lemmaListIndex++);
+    termAtt.setEmpty().append(lemma.getStem());
+    CharSequence tag = lemma.getTag();
+    if (tag != null) {
+      String[] tags = lemmaSplitter.split(tag.toString());
+      for (int i = 0; i < tags.length; i++) {
+        if (tagsList.size() <= i) {
+          tagsList.add(new StringBuilder());
        }
-        tagsAtt.setTags(tagsList.subList(0, tags.length));
-      } else {
-        tagsAtt.setTags(Collections.<StringBuilder> emptyList());
+        StringBuilder buffer = tagsList.get(i);
+        buffer.setLength(0);
+        buffer.append(tags[i]);
      }
+      tagsAtt.setTags(tagsList.subList(0, tags.length));
    } else {
-      // One tag (concatenated) per stem (lemma repeated).
-      CharSequence currentStem;
-      int tags = 0;
-      do {
-        final WordData lemma = lemmaList.get(lemmaListIndex++);
-        currentStem = lemma.getStem();
-        final CharSequence tag = lemma.getTag();
-        if (tag != null) {
-          if (tagsList.size() <= tags) {
-            tagsList.add(new StringBuilder());
-          }
-  
-          final StringBuilder buffer = tagsList.get(tags++);  
-          buffer.setLength(0);
-          buffer.append(lemma.getTag());
-        }
-      } while (lemmaListIndex < lemmaList.size() &&
-               equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
-
-      // Set the lemma's base form and tags as attributes.
-      termAtt.setEmpty().append(currentStem);
-      tagsAtt.setTags(tagsList.subList(0, tags));
+      tagsAtt.setTags(Collections.<StringBuilder> emptyList());
    }
  }

-  /**
-   * Compare two char sequences for equality. Assumes non-null arguments. 
-   */
-  private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
-    int len1 = s1.length();
-    int len2 = s2.length();
-    if (len1 != len2) return false;
-    for (int i = len1; --i >= 0;) {
-      if (s1.charAt(i) != s2.charAt(i)) { 
-        return false; 
-      }
-    }
-    return true;
-  }
-
  /**
   * Lookup a given surface form of a token and update 
   * {@link #lemmaList} and {@link #lemmaListIndex} accordingly. 
--- a/lucene/licenses/morfologik-fsa-1.6.0.jar.sha1
+++ b/lucene/licenses/morfologik-fsa-1.6.0.jar.sha1
@ -1 +0,0 @@
-397a99307020797e6790f2faf8cf865983b52559
--- a/lucene/licenses/morfologik-fsa-1.7.1.jar.sha1
+++ b/lucene/licenses/morfologik-fsa-1.7.1.jar.sha1
@ -0,0 +1 @@
+fdf556c88d66f65440bd24024f55a52c227c0e3f
--- a/lucene/licenses/morfologik-polish-1.6.0.jar.sha1
+++ b/lucene/licenses/morfologik-polish-1.6.0.jar.sha1
@ -1 +0,0 @@
-ca0663530971b54420fc1cea00a6338f68428232
--- a/lucene/licenses/morfologik-polish-1.7.1.jar.sha1
+++ b/lucene/licenses/morfologik-polish-1.7.1.jar.sha1
@ -0,0 +1 @@
+e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28
--- a/lucene/licenses/morfologik-stemming-1.6.0.jar.sha1
+++ b/lucene/licenses/morfologik-stemming-1.6.0.jar.sha1
@ -1 +0,0 @@
-8a284571bea2cdd305cd86fbac9bab6deef31c7f
--- a/lucene/licenses/morfologik-stemming-1.7.1.jar.sha1
+++ b/lucene/licenses/morfologik-stemming-1.7.1.jar.sha1
@ -0,0 +1 @@
+c81d6c63e22e97819063cad7f1ecd20269cba720
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -25,7 +25,7 @@ $Id$
 Versions of Major Components
 ---------------------
 Apache Tika 1.4
-Carrot2 3.6.2
+Carrot2 3.8.0
 Velocity 1.7 and Velocity Tools 2.0
 Apache UIMA 2.3.1
 Apache ZooKeeper 3.4.5
@ -53,7 +53,7 @@ Other Changes
 Versions of Major Components
 ---------------------
 Apache Tika 1.4
-Carrot2 3.6.2
+Carrot2 3.8.0
 Velocity 1.7 and Velocity Tools 2.0
 Apache UIMA 2.3.1
 Apache ZooKeeper 3.4.5
@ -76,6 +76,9 @@ Detailed Change List
 New Features
 ----------------------

+* SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik 
+  to version 1.7.1 (Dawid Weiss)
+
 * SOLR-2345: Enhanced geodist() to work with an RPT field, provided that the
  field is referenced via 'sfield' and the query point is constant.
  (David Smiley)
--- a/solr/contrib/analysis-extras/ivy.xml
+++ b/solr/contrib/analysis-extras/ivy.xml
@ -20,9 +20,9 @@
    <info organisation="org.apache.solr" module="analysis-extras"/>
    <dependencies>
      <dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/>
-      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
+      <dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
      <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
    </dependencies>
 </ivy-module>
--- a/solr/contrib/clustering/ivy.xml
+++ b/solr/contrib/clustering/ivy.xml
@ -19,14 +19,25 @@
 <ivy-module version="2.0">
    <info organisation="org.apache.solr" module="clustering"/>
    <dependencies>
-      <dependency org="org.carrot2" name="carrot2-mini" rev="3.6.2" transitive="false"/>
-      <dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.0.1" transitive="false"/>
-      <dependency org="com.carrotsearch" name="hppc" rev="0.4.1" transitive="false"/>
+      <dependency org="org.carrot2" name="carrot2-mini" rev="3.8.0" transitive="false"/>
+
+      <dependency org="com.carrotsearch" name="hppc" rev="0.5.2" transitive="false"/>
+      <dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.2.0" transitive="false"/>
+      <dependency org="org.simpleframework" name="simple-xml" rev="2.7" transitive="false"/>
+
+      <dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
+      <dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
+
      <dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="1.7.4" transitive="false"/>
      <dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="1.7.4" transitive="false"/>
-      <dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
-      <dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
-      <dependency org="org.simpleframework" name="simple-xml" rev="2.6.4" transitive="false"/>
+
+      <!--
+      Included as part of Solr's environment.
+
+      com.google.guava:guava:jar:14.0.1:compile
+      commons-lang:commons-lang:jar:2.6:compile
+      -->
+
      <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> 
    </dependencies>
 </ivy-module>
--- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/DocumentClusteringEngine.java
+++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/DocumentClusteringEngine.java
@ -37,8 +37,7 @@ public abstract class DocumentClusteringEngine extends ClusteringEngine {
  public abstract NamedList cluster(SolrParams solrParams);

  /**
-   *  Experimental.  Subject to change before the next release
-   *
+   * Experimental.  Subject to change before the next release
   *
   * Cluster the set of docs.  Clustering of documents is often an expensive task that can take a long time.
   * @param docs The docs to cluster.  If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}
--- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
+++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
@ -77,6 +77,7 @@ import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 import com.google.common.io.Closeables;
+import com.google.common.io.Closer;

 /**
 * Search results clustering engine based on Carrot2 clustering algorithms.
@ -140,7 +141,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
            + ". Using the default " + resource + " from Carrot JAR.");          
        return new IResource[] {};
      } finally {
-        if (resourceStream != null) Closeables.closeQuietly(resourceStream);
+        if (resourceStream != null) {
+          try {
+            resourceStream.close();
+          } catch (IOException e) {
+            // ignore.
+          }
+        }
      }

      log.info("Loaded Solr resource: " + resourceName);
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
@ -52,7 +52,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
    SolrRequestHandler handler = core.getRequestHandler("standard");
    SolrQueryResponse rsp;
    rsp = new SolrQueryResponse();
-    rsp.add("responseHeader", new SimpleOrderedMap());
+    rsp.add("responseHeader", new SimpleOrderedMap<Object>());
    SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
    handler.handleRequest(req, rsp);
    NamedList values = rsp.getValues();
@ -70,7 +70,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
    handler = core.getRequestHandler("docClustering");

    rsp = new SolrQueryResponse();
-    rsp.add("responseHeader", new SimpleOrderedMap());
+    rsp.add("responseHeader", new SimpleOrderedMap<Object>());
    req = new LocalSolrQueryRequest(core, params);
    handler.handleRequest(req, rsp);
    values = rsp.getValues();
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/carrot2/EchoClusteringAlgorithm.java
@ -15,7 +15,6 @@ package org.apache.solr.handler.clustering.carrot2;
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-import java.util.Collections;
 import java.util.List;

 import org.carrot2.core.Cluster;
--- a/solr/licenses/attributes-binder-1.0.1.jar.sha1
+++ b/solr/licenses/attributes-binder-1.0.1.jar.sha1
@ -1 +0,0 @@
-698f0c9427a8a94e00a59575ff6c5ff9d0bdc34a
--- a/solr/licenses/attributes-binder-1.2.0.jar.sha1
+++ b/solr/licenses/attributes-binder-1.2.0.jar.sha1
@ -0,0 +1 @@
+2aa3ce620ebadea4e385fc0a54dc363cb659dca5
--- a/solr/licenses/carrot2-mini-3.6.2.jar.sha1
+++ b/solr/licenses/carrot2-mini-3.6.2.jar.sha1
@ -1 +0,0 @@
-ffd6e0f7ef6c189bf8b456ef749f1ce600d6df74
--- a/solr/licenses/carrot2-mini-3.8.0.jar.sha1
+++ b/solr/licenses/carrot2-mini-3.8.0.jar.sha1
@ -0,0 +1 @@
+65d7bbe49bad0a95d9ae9b858abafb96a666ac5a
--- a/solr/licenses/hppc-0.4.1.jar.sha1
+++ b/solr/licenses/hppc-0.4.1.jar.sha1
@ -1 +0,0 @@
-61497cafe8201435b603c6014d2abf0b3fb7c381
--- a/solr/licenses/hppc-0.5.2.jar.sha1
+++ b/solr/licenses/hppc-0.5.2.jar.sha1
@ -0,0 +1 @@
+074bcc9d152a928a4ea9ac59a5b45850bf00cd4e
--- a/solr/licenses/morfologik-fsa-1.6.0.jar.sha1
+++ b/solr/licenses/morfologik-fsa-1.6.0.jar.sha1
@ -1 +0,0 @@
-397a99307020797e6790f2faf8cf865983b52559
--- a/solr/licenses/morfologik-fsa-1.7.1.jar.sha1
+++ b/solr/licenses/morfologik-fsa-1.7.1.jar.sha1
@ -0,0 +1 @@
+fdf556c88d66f65440bd24024f55a52c227c0e3f
--- a/solr/licenses/morfologik-polish-1.6.0.jar.sha1
+++ b/solr/licenses/morfologik-polish-1.6.0.jar.sha1
@ -1 +0,0 @@
-ca0663530971b54420fc1cea00a6338f68428232
--- a/solr/licenses/morfologik-polish-1.7.1.jar.sha1
+++ b/solr/licenses/morfologik-polish-1.7.1.jar.sha1
@ -0,0 +1 @@
+e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28
--- a/solr/licenses/morfologik-stemming-1.6.0.jar.sha1
+++ b/solr/licenses/morfologik-stemming-1.6.0.jar.sha1
@ -1 +0,0 @@
-8a284571bea2cdd305cd86fbac9bab6deef31c7f
--- a/solr/licenses/morfologik-stemming-1.7.1.jar.sha1
+++ b/solr/licenses/morfologik-stemming-1.7.1.jar.sha1
@ -0,0 +1 @@
+c81d6c63e22e97819063cad7f1ecd20269cba720
--- a/solr/licenses/simple-xml-2.6.4.jar.sha1
+++ b/solr/licenses/simple-xml-2.6.4.jar.sha1
@ -1 +0,0 @@
-5b7a462882768cf65a2273d90710c9838bd5b280
--- a/solr/licenses/simple-xml-2.7.jar.sha1
+++ b/solr/licenses/simple-xml-2.7.jar.sha1
@ -0,0 +1 @@
+48f90a787b2d59faab3b8c203945e4b0db32aec4