diff --git a/dev-tools/maven/pom.xml.template b/dev-tools/maven/pom.xml.template
index 8996d7388d1..1e05fcf8668 100644
--- a/dev-tools/maven/pom.xml.template
+++ b/dev-tools/maven/pom.xml.template
@@ -407,12 +407,12 @@
org.carrot2
carrot2-mini
- 3.6.2
+ 3.8.0
org.carrot2
morfologik-polish
- 1.6.0
+ 1.7.1
org.codehaus.woodstox
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 489f79fa77c..8413b25ebb4 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -23,10 +23,6 @@ Changes in backwards compatibility policy
not positioned. This change affects all classes that inherit from
DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand)
-* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
- no longer support multiple "dictionaries" as there is only one dictionary available.
- (Dawid Weiss)
-
* LUCENE-5127: Reduce RAM usage of FixedGapTermsIndex. Remove
IndexWriterConfig.setTermIndexInterval, IndexWriterConfig.setReaderTermsIndexDivisor,
and termsIndexDivisor from StandardDirectoryReader. These options have been no-ops
@@ -39,10 +35,6 @@ New Features
* LUCENE-4747: Move to Java 7 as minimum Java version.
(Robert Muir, Uwe Schindler)
-* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
- no longer support multiple "dictionaries" as there is only one dictionary available.
- (Dawid Weiss)
-
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
(Ryo Onodera via Koji Sekiguchi)
@@ -184,6 +176,10 @@ Changes in backwards compatibility policy
CheckIndex.fixIndex(Status). If you used to pass a codec to this method, just
remove it from the arguments. (Adrien Grand)
+* LUCENE-5089, SOLR-5126: Update to Morfologik 1.7.1. MorfologikAnalyzer and MorfologikFilter
+ no longer support multiple "dictionaries" as there is only one dictionary available.
+ (Dawid Weiss)
+
======================= Lucene 4.4.0 =======================
Changes in backwards compatibility policy
diff --git a/lucene/analysis/morfologik/ivy.xml b/lucene/analysis/morfologik/ivy.xml
index 0c9c3373934..c4dd72fc7e8 100644
--- a/lucene/analysis/morfologik/ivy.xml
+++ b/lucene/analysis/morfologik/ivy.xml
@@ -19,9 +19,9 @@
-
-
-
+
+
+
diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
index 049dad1570e..5ac14cdeb55 100644
--- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
+++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.morfologik;
import java.io.IOException;
import java.util.*;
+import java.util.regex.Pattern;
import morfologik.stemming.*;
@@ -82,73 +83,31 @@ public class MorfologikFilter extends TokenFilter {
}
/**
- * The tag encoding format has been changing in Morfologik from version
- * to version. Let's keep both variants and determine which one to run
- * based on this flag.
+ * A pattern used to split lemma forms.
*/
- private final static boolean multipleTagsPerLemma = true;
+ private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");
private void popNextLemma() {
- if (multipleTagsPerLemma) {
- // One tag (concatenated) per lemma.
- final WordData lemma = lemmaList.get(lemmaListIndex++);
- termAtt.setEmpty().append(lemma.getStem());
- CharSequence tag = lemma.getTag();
- if (tag != null) {
- String[] tags = tag.toString().split("\\+|\\|");
- for (int i = 0; i < tags.length; i++) {
- if (tagsList.size() <= i) {
- tagsList.add(new StringBuilder());
- }
- StringBuilder buffer = tagsList.get(i);
- buffer.setLength(0);
- buffer.append(tags[i]);
+ // One tag (concatenated) per lemma.
+ final WordData lemma = lemmaList.get(lemmaListIndex++);
+ termAtt.setEmpty().append(lemma.getStem());
+ CharSequence tag = lemma.getTag();
+ if (tag != null) {
+ String[] tags = lemmaSplitter.split(tag.toString());
+ for (int i = 0; i < tags.length; i++) {
+ if (tagsList.size() <= i) {
+ tagsList.add(new StringBuilder());
}
- tagsAtt.setTags(tagsList.subList(0, tags.length));
- } else {
- tagsAtt.setTags(Collections. emptyList());
+ StringBuilder buffer = tagsList.get(i);
+ buffer.setLength(0);
+ buffer.append(tags[i]);
}
+ tagsAtt.setTags(tagsList.subList(0, tags.length));
} else {
- // One tag (concatenated) per stem (lemma repeated).
- CharSequence currentStem;
- int tags = 0;
- do {
- final WordData lemma = lemmaList.get(lemmaListIndex++);
- currentStem = lemma.getStem();
- final CharSequence tag = lemma.getTag();
- if (tag != null) {
- if (tagsList.size() <= tags) {
- tagsList.add(new StringBuilder());
- }
-
- final StringBuilder buffer = tagsList.get(tags++);
- buffer.setLength(0);
- buffer.append(lemma.getTag());
- }
- } while (lemmaListIndex < lemmaList.size() &&
- equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
-
- // Set the lemma's base form and tags as attributes.
- termAtt.setEmpty().append(currentStem);
- tagsAtt.setTags(tagsList.subList(0, tags));
+ tagsAtt.setTags(Collections. emptyList());
}
}
- /**
- * Compare two char sequences for equality. Assumes non-null arguments.
- */
- private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
- int len1 = s1.length();
- int len2 = s2.length();
- if (len1 != len2) return false;
- for (int i = len1; --i >= 0;) {
- if (s1.charAt(i) != s2.charAt(i)) {
- return false;
- }
- }
- return true;
- }
-
/**
* Lookup a given surface form of a token and update
* {@link #lemmaList} and {@link #lemmaListIndex} accordingly.
diff --git a/lucene/licenses/morfologik-fsa-1.6.0.jar.sha1 b/lucene/licenses/morfologik-fsa-1.6.0.jar.sha1
deleted file mode 100644
index 8041cb4027d..00000000000
--- a/lucene/licenses/morfologik-fsa-1.6.0.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-397a99307020797e6790f2faf8cf865983b52559
diff --git a/lucene/licenses/morfologik-fsa-1.7.1.jar.sha1 b/lucene/licenses/morfologik-fsa-1.7.1.jar.sha1
new file mode 100644
index 00000000000..b71174e4d03
--- /dev/null
+++ b/lucene/licenses/morfologik-fsa-1.7.1.jar.sha1
@@ -0,0 +1 @@
+fdf556c88d66f65440bd24024f55a52c227c0e3f
diff --git a/lucene/licenses/morfologik-polish-1.6.0.jar.sha1 b/lucene/licenses/morfologik-polish-1.6.0.jar.sha1
deleted file mode 100644
index b44ead1078f..00000000000
--- a/lucene/licenses/morfologik-polish-1.6.0.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-ca0663530971b54420fc1cea00a6338f68428232
diff --git a/lucene/licenses/morfologik-polish-1.7.1.jar.sha1 b/lucene/licenses/morfologik-polish-1.7.1.jar.sha1
new file mode 100644
index 00000000000..3bd0d88ae67
--- /dev/null
+++ b/lucene/licenses/morfologik-polish-1.7.1.jar.sha1
@@ -0,0 +1 @@
+e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28
diff --git a/lucene/licenses/morfologik-stemming-1.6.0.jar.sha1 b/lucene/licenses/morfologik-stemming-1.6.0.jar.sha1
deleted file mode 100644
index 4ba54674892..00000000000
--- a/lucene/licenses/morfologik-stemming-1.6.0.jar.sha1
+++ /dev/null
@@ -1 +0,0 @@
-8a284571bea2cdd305cd86fbac9bab6deef31c7f
diff --git a/lucene/licenses/morfologik-stemming-1.7.1.jar.sha1 b/lucene/licenses/morfologik-stemming-1.7.1.jar.sha1
new file mode 100644
index 00000000000..3b53503b1c2
--- /dev/null
+++ b/lucene/licenses/morfologik-stemming-1.7.1.jar.sha1
@@ -0,0 +1 @@
+c81d6c63e22e97819063cad7f1ecd20269cba720
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index ffe123ce614..4fccdf2703a 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -25,7 +25,7 @@ $Id$
Versions of Major Components
---------------------
Apache Tika 1.4
-Carrot2 3.6.2
+Carrot2 3.8.0
Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.4.5
@@ -53,7 +53,7 @@ Other Changes
Versions of Major Components
---------------------
Apache Tika 1.4
-Carrot2 3.6.2
+Carrot2 3.8.0
Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.4.5
@@ -76,6 +76,9 @@ Detailed Change List
New Features
----------------------
+* SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik
+ to version 1.7.1 (Dawid Weiss)
+
* SOLR-2345: Enhanced geodist() to work with an RPT field, provided that the
field is referenced via 'sfield' and the query point is constant.
(David Smiley)
diff --git a/solr/contrib/analysis-extras/ivy.xml b/solr/contrib/analysis-extras/ivy.xml
index 597f6060b4b..b8a1bfb7e36 100644
--- a/solr/contrib/analysis-extras/ivy.xml
+++ b/solr/contrib/analysis-extras/ivy.xml
@@ -20,9 +20,9 @@
-
-
-
+
+
+
diff --git a/solr/contrib/clustering/ivy.xml b/solr/contrib/clustering/ivy.xml
index ef3e4de21da..71f8c8c7fc9 100644
--- a/solr/contrib/clustering/ivy.xml
+++ b/solr/contrib/clustering/ivy.xml
@@ -19,14 +19,25 @@
-
-
-
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
diff --git a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/DocumentClusteringEngine.java b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/DocumentClusteringEngine.java
index 2926b8fa643..b52e9a18a97 100644
--- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/DocumentClusteringEngine.java
+++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/DocumentClusteringEngine.java
@@ -37,8 +37,7 @@ public abstract class DocumentClusteringEngine extends ClusteringEngine {
public abstract NamedList cluster(SolrParams solrParams);
/**
- * Experimental. Subject to change before the next release
- *
+ * Experimental. Subject to change before the next release
*
* Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time.
* @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}
diff --git a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
index 7ac385c87ac..e788a3cedfb 100644
--- a/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
+++ b/solr/contrib/clustering/src/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
@@ -77,6 +77,7 @@ import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
+import com.google.common.io.Closer;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
@@ -140,7 +141,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
+ ". Using the default " + resource + " from Carrot JAR.");
return new IResource[] {};
} finally {
- if (resourceStream != null) Closeables.closeQuietly(resourceStream);
+ if (resourceStream != null) {
+ try {
+ resourceStream.close();
+ } catch (IOException e) {
+ // ignore.
+ }
+ }
}
log.info("Loaded Solr resource: " + resourceName);
diff --git a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
index 17029019a88..d202042bc68 100644
--- a/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
+++ b/solr/contrib/clustering/src/test/org/apache/solr/handler/clustering/ClusteringComponentTest.java
@@ -52,7 +52,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
SolrRequestHandler handler = core.getRequestHandler("standard");
SolrQueryResponse rsp;
rsp = new SolrQueryResponse();
- rsp.add("responseHeader", new SimpleOrderedMap());
+ rsp.add("responseHeader", new SimpleOrderedMap