SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik to version 1.7.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512203 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2013-08-09 08:39:21 +00:00
parent 8177498c86
commit b40f603f46
31 changed files with 72 additions and 98 deletions

View File

@ -407,12 +407,12 @@
<dependency> <dependency>
<groupId>org.carrot2</groupId> <groupId>org.carrot2</groupId>
<artifactId>carrot2-mini</artifactId> <artifactId>carrot2-mini</artifactId>
<version>3.6.2</version> <version>3.8.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.carrot2</groupId> <groupId>org.carrot2</groupId>
<artifactId>morfologik-polish</artifactId> <artifactId>morfologik-polish</artifactId>
<version>1.6.0</version> <version>1.7.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.codehaus.woodstox</groupId> <groupId>org.codehaus.woodstox</groupId>

View File

@ -23,10 +23,6 @@ Changes in backwards compatibility policy
not positioned. This change affects all classes that inherit from not positioned. This change affects all classes that inherit from
DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand) DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand)
* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
no longer support multiple "dictionaries" as there is only one dictionary available.
(Dawid Weiss)
* LUCENE-5127: Reduce RAM usage of FixedGapTermsIndex. Remove * LUCENE-5127: Reduce RAM usage of FixedGapTermsIndex. Remove
IndexWriterConfig.setTermIndexInterval, IndexWriterConfig.setReaderTermsIndexDivisor, IndexWriterConfig.setTermIndexInterval, IndexWriterConfig.setReaderTermsIndexDivisor,
and termsIndexDivisor from StandardDirectoryReader. These options have been no-ops and termsIndexDivisor from StandardDirectoryReader. These options have been no-ops
@ -39,10 +35,6 @@ New Features
* LUCENE-4747: Move to Java 7 as minimum Java version. * LUCENE-4747: Move to Java 7 as minimum Java version.
(Robert Muir, Uwe Schindler) (Robert Muir, Uwe Schindler)
* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
no longer support multiple "dictionaries" as there is only one dictionary available.
(Dawid Weiss)
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory. * SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
(Ryo Onodera via Koji Sekiguchi) (Ryo Onodera via Koji Sekiguchi)
@ -184,6 +176,10 @@ Changes in backwards compatibility policy
CheckIndex.fixIndex(Status). If you used to pass a codec to this method, just CheckIndex.fixIndex(Status). If you used to pass a codec to this method, just
remove it from the arguments. (Adrien Grand) remove it from the arguments. (Adrien Grand)
* LUCENE-5089, SOLR-5126: Update to Morfologik 1.7.1. MorfologikAnalyzer and MorfologikFilter
no longer support multiple "dictionaries" as there is only one dictionary available.
(Dawid Weiss)
======================= Lucene 4.4.0 ======================= ======================= Lucene 4.4.0 =======================
Changes in backwards compatibility policy Changes in backwards compatibility policy

View File

@ -19,9 +19,9 @@
<ivy-module version="2.0"> <ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-morfologik"/> <info organisation="org.apache.lucene" module="analyzers-morfologik"/>
<dependencies> <dependencies>
<dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/> <dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/> <dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/> <dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies> </dependencies>
</ivy-module> </ivy-module>

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.morfologik;
import java.io.IOException; import java.io.IOException;
import java.util.*; import java.util.*;
import java.util.regex.Pattern;
import morfologik.stemming.*; import morfologik.stemming.*;
@ -82,73 +83,31 @@ public class MorfologikFilter extends TokenFilter {
} }
/** /**
* The tag encoding format has been changing in Morfologik from version * A pattern used to split lemma forms.
* to version. Let's keep both variants and determine which one to run
* based on this flag.
*/ */
private final static boolean multipleTagsPerLemma = true; private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");
private void popNextLemma() { private void popNextLemma() {
if (multipleTagsPerLemma) { // One tag (concatenated) per lemma.
// One tag (concatenated) per lemma. final WordData lemma = lemmaList.get(lemmaListIndex++);
final WordData lemma = lemmaList.get(lemmaListIndex++); termAtt.setEmpty().append(lemma.getStem());
termAtt.setEmpty().append(lemma.getStem()); CharSequence tag = lemma.getTag();
CharSequence tag = lemma.getTag(); if (tag != null) {
if (tag != null) { String[] tags = lemmaSplitter.split(tag.toString());
String[] tags = tag.toString().split("\\+|\\|"); for (int i = 0; i < tags.length; i++) {
for (int i = 0; i < tags.length; i++) { if (tagsList.size() <= i) {
if (tagsList.size() <= i) { tagsList.add(new StringBuilder());
tagsList.add(new StringBuilder());
}
StringBuilder buffer = tagsList.get(i);
buffer.setLength(0);
buffer.append(tags[i]);
} }
tagsAtt.setTags(tagsList.subList(0, tags.length)); StringBuilder buffer = tagsList.get(i);
} else { buffer.setLength(0);
tagsAtt.setTags(Collections.<StringBuilder> emptyList()); buffer.append(tags[i]);
} }
tagsAtt.setTags(tagsList.subList(0, tags.length));
} else { } else {
// One tag (concatenated) per stem (lemma repeated). tagsAtt.setTags(Collections.<StringBuilder> emptyList());
CharSequence currentStem;
int tags = 0;
do {
final WordData lemma = lemmaList.get(lemmaListIndex++);
currentStem = lemma.getStem();
final CharSequence tag = lemma.getTag();
if (tag != null) {
if (tagsList.size() <= tags) {
tagsList.add(new StringBuilder());
}
final StringBuilder buffer = tagsList.get(tags++);
buffer.setLength(0);
buffer.append(lemma.getTag());
}
} while (lemmaListIndex < lemmaList.size() &&
equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
// Set the lemma's base form and tags as attributes.
termAtt.setEmpty().append(currentStem);
tagsAtt.setTags(tagsList.subList(0, tags));
} }
} }
/**
* Compare two char sequences for equality. Assumes non-null arguments.
*/
private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
int len1 = s1.length();
int len2 = s2.length();
if (len1 != len2) return false;
for (int i = len1; --i >= 0;) {
if (s1.charAt(i) != s2.charAt(i)) {
return false;
}
}
return true;
}
/** /**
* Lookup a given surface form of a token and update * Lookup a given surface form of a token and update
* {@link #lemmaList} and {@link #lemmaListIndex} accordingly. * {@link #lemmaList} and {@link #lemmaListIndex} accordingly.

View File

@ -1 +0,0 @@
397a99307020797e6790f2faf8cf865983b52559

View File

@ -0,0 +1 @@
fdf556c88d66f65440bd24024f55a52c227c0e3f

View File

@ -1 +0,0 @@
ca0663530971b54420fc1cea00a6338f68428232

View File

@ -0,0 +1 @@
e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28

View File

@ -1 +0,0 @@
8a284571bea2cdd305cd86fbac9bab6deef31c7f

View File

@ -0,0 +1 @@
c81d6c63e22e97819063cad7f1ecd20269cba720

View File

@ -25,7 +25,7 @@ $Id$
Versions of Major Components Versions of Major Components
--------------------- ---------------------
Apache Tika 1.4 Apache Tika 1.4
Carrot2 3.6.2 Carrot2 3.8.0
Velocity 1.7 and Velocity Tools 2.0 Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1 Apache UIMA 2.3.1
Apache ZooKeeper 3.4.5 Apache ZooKeeper 3.4.5
@ -53,7 +53,7 @@ Other Changes
Versions of Major Components Versions of Major Components
--------------------- ---------------------
Apache Tika 1.4 Apache Tika 1.4
Carrot2 3.6.2 Carrot2 3.8.0
Velocity 1.7 and Velocity Tools 2.0 Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1 Apache UIMA 2.3.1
Apache ZooKeeper 3.4.5 Apache ZooKeeper 3.4.5
@ -76,6 +76,9 @@ Detailed Change List
New Features New Features
---------------------- ----------------------
* SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik
to version 1.7.1 (Dawid Weiss)
* SOLR-2345: Enhanced geodist() to work with an RPT field, provided that the * SOLR-2345: Enhanced geodist() to work with an RPT field, provided that the
field is referenced via 'sfield' and the query point is constant. field is referenced via 'sfield' and the query point is constant.
(David Smiley) (David Smiley)

View File

@ -20,9 +20,9 @@
<info organisation="org.apache.solr" module="analysis-extras"/> <info organisation="org.apache.solr" module="analysis-extras"/>
<dependencies> <dependencies>
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/> <dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/> <dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/> <dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/> <dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies> </dependencies>
</ivy-module> </ivy-module>

View File

@ -19,14 +19,25 @@
<ivy-module version="2.0"> <ivy-module version="2.0">
<info organisation="org.apache.solr" module="clustering"/> <info organisation="org.apache.solr" module="clustering"/>
<dependencies> <dependencies>
<dependency org="org.carrot2" name="carrot2-mini" rev="3.6.2" transitive="false"/> <dependency org="org.carrot2" name="carrot2-mini" rev="3.8.0" transitive="false"/>
<dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.0.1" transitive="false"/>
<dependency org="com.carrotsearch" name="hppc" rev="0.4.1" transitive="false"/> <dependency org="com.carrotsearch" name="hppc" rev="0.5.2" transitive="false"/>
<dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.2.0" transitive="false"/>
<dependency org="org.simpleframework" name="simple-xml" rev="2.7" transitive="false"/>
<dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
<dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
<dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="1.7.4" transitive="false"/> <dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="1.7.4" transitive="false"/>
<dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="1.7.4" transitive="false"/> <dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="1.7.4" transitive="false"/>
<dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
<dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/> <!--
<dependency org="org.simpleframework" name="simple-xml" rev="2.6.4" transitive="false"/> Included as part of Solr's environment.
com.google.guava:guava:jar:14.0.1:compile
commons-lang:commons-lang:jar:2.6:compile
-->
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/> <exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies> </dependencies>
</ivy-module> </ivy-module>

View File

@ -37,8 +37,7 @@ public abstract class DocumentClusteringEngine extends ClusteringEngine {
public abstract NamedList cluster(SolrParams solrParams); public abstract NamedList cluster(SolrParams solrParams);
/** /**
* Experimental. Subject to change before the next release * Experimental. Subject to change before the next release
*
* *
* Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time. * Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time.
* @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)} * @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}

View File

@ -77,6 +77,7 @@ import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import com.google.common.io.Closeables; import com.google.common.io.Closeables;
import com.google.common.io.Closer;
/** /**
* Search results clustering engine based on Carrot2 clustering algorithms. * Search results clustering engine based on Carrot2 clustering algorithms.
@ -140,7 +141,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
+ ". Using the default " + resource + " from Carrot JAR."); + ". Using the default " + resource + " from Carrot JAR.");
return new IResource[] {}; return new IResource[] {};
} finally { } finally {
if (resourceStream != null) Closeables.closeQuietly(resourceStream); if (resourceStream != null) {
try {
resourceStream.close();
} catch (IOException e) {
// ignore.
}
}
} }
log.info("Loaded Solr resource: " + resourceName); log.info("Loaded Solr resource: " + resourceName);

View File

@ -52,7 +52,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
SolrRequestHandler handler = core.getRequestHandler("standard"); SolrRequestHandler handler = core.getRequestHandler("standard");
SolrQueryResponse rsp; SolrQueryResponse rsp;
rsp = new SolrQueryResponse(); rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap()); rsp.add("responseHeader", new SimpleOrderedMap<Object>());
SolrQueryRequest req = new LocalSolrQueryRequest(core, params); SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
handler.handleRequest(req, rsp); handler.handleRequest(req, rsp);
NamedList values = rsp.getValues(); NamedList values = rsp.getValues();
@ -70,7 +70,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
handler = core.getRequestHandler("docClustering"); handler = core.getRequestHandler("docClustering");
rsp = new SolrQueryResponse(); rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap()); rsp.add("responseHeader", new SimpleOrderedMap<Object>());
req = new LocalSolrQueryRequest(core, params); req = new LocalSolrQueryRequest(core, params);
handler.handleRequest(req, rsp); handler.handleRequest(req, rsp);
values = rsp.getValues(); values = rsp.getValues();

View File

@ -15,7 +15,6 @@ package org.apache.solr.handler.clustering.carrot2;
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
import java.util.Collections;
import java.util.List; import java.util.List;
import org.carrot2.core.Cluster; import org.carrot2.core.Cluster;

View File

@ -1 +0,0 @@
698f0c9427a8a94e00a59575ff6c5ff9d0bdc34a

View File

@ -0,0 +1 @@
2aa3ce620ebadea4e385fc0a54dc363cb659dca5

View File

@ -1 +0,0 @@
ffd6e0f7ef6c189bf8b456ef749f1ce600d6df74

View File

@ -0,0 +1 @@
65d7bbe49bad0a95d9ae9b858abafb96a666ac5a

View File

@ -1 +0,0 @@
61497cafe8201435b603c6014d2abf0b3fb7c381

View File

@ -0,0 +1 @@
074bcc9d152a928a4ea9ac59a5b45850bf00cd4e

View File

@ -1 +0,0 @@
397a99307020797e6790f2faf8cf865983b52559

View File

@ -0,0 +1 @@
fdf556c88d66f65440bd24024f55a52c227c0e3f

View File

@ -1 +0,0 @@
ca0663530971b54420fc1cea00a6338f68428232

View File

@ -0,0 +1 @@
e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28

View File

@ -1 +0,0 @@
8a284571bea2cdd305cd86fbac9bab6deef31c7f

View File

@ -0,0 +1 @@
c81d6c63e22e97819063cad7f1ecd20269cba720

View File

@ -1 +0,0 @@
5b7a462882768cf65a2273d90710c9838bd5b280

View File

@ -0,0 +1 @@
48f90a787b2d59faab3b8c203945e4b0db32aec4