SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik to version 1.7.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512203 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2013-08-09 08:39:21 +00:00
parent 8177498c86
commit b40f603f46
31 changed files with 72 additions and 98 deletions

View File

@ -407,12 +407,12 @@
<dependency>
<groupId>org.carrot2</groupId>
<artifactId>carrot2-mini</artifactId>
<version>3.6.2</version>
<version>3.8.0</version>
</dependency>
<dependency>
<groupId>org.carrot2</groupId>
<artifactId>morfologik-polish</artifactId>
<version>1.6.0</version>
<version>1.7.1</version>
</dependency>
<dependency>
<groupId>org.codehaus.woodstox</groupId>

View File

@ -23,10 +23,6 @@ Changes in backwards compatibility policy
not positioned. This change affects all classes that inherit from
DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand)
* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
no longer support multiple "dictionaries" as there is only one dictionary available.
(Dawid Weiss)
* LUCENE-5127: Reduce RAM usage of FixedGapTermsIndex. Remove
IndexWriterConfig.setTermIndexInterval, IndexWriterConfig.setReaderTermsIndexDivisor,
and termsIndexDivisor from StandardDirectoryReader. These options have been no-ops
@ -39,10 +35,6 @@ New Features
* LUCENE-4747: Move to Java 7 as minimum Java version.
(Robert Muir, Uwe Schindler)
* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
no longer support multiple "dictionaries" as there is only one dictionary available.
(Dawid Weiss)
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
(Ryo Onodera via Koji Sekiguchi)
@ -184,6 +176,10 @@ Changes in backwards compatibility policy
CheckIndex.fixIndex(Status). If you used to pass a codec to this method, just
remove it from the arguments. (Adrien Grand)
* LUCENE-5089, SOLR-5126: Update to Morfologik 1.7.1. MorfologikAnalyzer and MorfologikFilter
no longer support multiple "dictionaries" as there is only one dictionary available.
(Dawid Weiss)
======================= Lucene 4.4.0 =======================
Changes in backwards compatibility policy

View File

@ -19,9 +19,9 @@
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
<dependencies>
<dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.morfologik;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
import morfologik.stemming.*;
@ -82,73 +83,31 @@ public class MorfologikFilter extends TokenFilter {
}
/**
* The tag encoding format has been changing in Morfologik from version
* to version. Let's keep both variants and determine which one to run
* based on this flag.
* A pattern used to split lemma forms.
*/
private final static boolean multipleTagsPerLemma = true;
private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");
private void popNextLemma() {
if (multipleTagsPerLemma) {
// One tag (concatenated) per lemma.
final WordData lemma = lemmaList.get(lemmaListIndex++);
termAtt.setEmpty().append(lemma.getStem());
CharSequence tag = lemma.getTag();
if (tag != null) {
String[] tags = tag.toString().split("\\+|\\|");
for (int i = 0; i < tags.length; i++) {
if (tagsList.size() <= i) {
tagsList.add(new StringBuilder());
}
StringBuilder buffer = tagsList.get(i);
buffer.setLength(0);
buffer.append(tags[i]);
// One tag (concatenated) per lemma.
final WordData lemma = lemmaList.get(lemmaListIndex++);
termAtt.setEmpty().append(lemma.getStem());
CharSequence tag = lemma.getTag();
if (tag != null) {
String[] tags = lemmaSplitter.split(tag.toString());
for (int i = 0; i < tags.length; i++) {
if (tagsList.size() <= i) {
tagsList.add(new StringBuilder());
}
tagsAtt.setTags(tagsList.subList(0, tags.length));
} else {
tagsAtt.setTags(Collections.<StringBuilder> emptyList());
StringBuilder buffer = tagsList.get(i);
buffer.setLength(0);
buffer.append(tags[i]);
}
tagsAtt.setTags(tagsList.subList(0, tags.length));
} else {
// One tag (concatenated) per stem (lemma repeated).
CharSequence currentStem;
int tags = 0;
do {
final WordData lemma = lemmaList.get(lemmaListIndex++);
currentStem = lemma.getStem();
final CharSequence tag = lemma.getTag();
if (tag != null) {
if (tagsList.size() <= tags) {
tagsList.add(new StringBuilder());
}
final StringBuilder buffer = tagsList.get(tags++);
buffer.setLength(0);
buffer.append(lemma.getTag());
}
} while (lemmaListIndex < lemmaList.size() &&
equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
// Set the lemma's base form and tags as attributes.
termAtt.setEmpty().append(currentStem);
tagsAtt.setTags(tagsList.subList(0, tags));
tagsAtt.setTags(Collections.<StringBuilder> emptyList());
}
}
/**
* Compare two char sequences for equality. Assumes non-null arguments.
*/
private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
int len1 = s1.length();
int len2 = s2.length();
if (len1 != len2) return false;
for (int i = len1; --i >= 0;) {
if (s1.charAt(i) != s2.charAt(i)) {
return false;
}
}
return true;
}
/**
* Lookup a given surface form of a token and update
* {@link #lemmaList} and {@link #lemmaListIndex} accordingly.

View File

@ -1 +0,0 @@
397a99307020797e6790f2faf8cf865983b52559

View File

@ -0,0 +1 @@
fdf556c88d66f65440bd24024f55a52c227c0e3f

View File

@ -1 +0,0 @@
ca0663530971b54420fc1cea00a6338f68428232

View File

@ -0,0 +1 @@
e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28

View File

@ -1 +0,0 @@
8a284571bea2cdd305cd86fbac9bab6deef31c7f

View File

@ -0,0 +1 @@
c81d6c63e22e97819063cad7f1ecd20269cba720

View File

@ -25,7 +25,7 @@ $Id$
Versions of Major Components
---------------------
Apache Tika 1.4
Carrot2 3.6.2
Carrot2 3.8.0
Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.4.5
@ -53,7 +53,7 @@ Other Changes
Versions of Major Components
---------------------
Apache Tika 1.4
Carrot2 3.6.2
Carrot2 3.8.0
Velocity 1.7 and Velocity Tools 2.0
Apache UIMA 2.3.1
Apache ZooKeeper 3.4.5
@ -76,6 +76,9 @@ Detailed Change List
New Features
----------------------
* SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik
to version 1.7.1 (Dawid Weiss)
* SOLR-2345: Enhanced geodist() to work with an RPT field, provided that the
field is referenced via 'sfield' and the query point is constant.
(David Smiley)

View File

@ -20,9 +20,9 @@
<info organisation="org.apache.solr" module="analysis-extras"/>
<dependencies>
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>

View File

@ -19,14 +19,25 @@
<ivy-module version="2.0">
<info organisation="org.apache.solr" module="clustering"/>
<dependencies>
<dependency org="org.carrot2" name="carrot2-mini" rev="3.6.2" transitive="false"/>
<dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.0.1" transitive="false"/>
<dependency org="com.carrotsearch" name="hppc" rev="0.4.1" transitive="false"/>
<dependency org="org.carrot2" name="carrot2-mini" rev="3.8.0" transitive="false"/>
<dependency org="com.carrotsearch" name="hppc" rev="0.5.2" transitive="false"/>
<dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.2.0" transitive="false"/>
<dependency org="org.simpleframework" name="simple-xml" rev="2.7" transitive="false"/>
<dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
<dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
<dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="1.7.4" transitive="false"/>
<dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="1.7.4" transitive="false"/>
<dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
<dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
<dependency org="org.simpleframework" name="simple-xml" rev="2.6.4" transitive="false"/>
<!--
Included as part of Solr's environment.
com.google.guava:guava:jar:14.0.1:compile
commons-lang:commons-lang:jar:2.6:compile
-->
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>

View File

@ -37,8 +37,7 @@ public abstract class DocumentClusteringEngine extends ClusteringEngine {
public abstract NamedList cluster(SolrParams solrParams);
/**
* Experimental. Subject to change before the next release
*
* Experimental. Subject to change before the next release
*
* Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time.
* @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}

View File

@ -77,6 +77,7 @@ import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import com.google.common.io.Closer;
/**
* Search results clustering engine based on Carrot2 clustering algorithms.
@ -140,7 +141,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
+ ". Using the default " + resource + " from Carrot JAR.");
return new IResource[] {};
} finally {
if (resourceStream != null) Closeables.closeQuietly(resourceStream);
if (resourceStream != null) {
try {
resourceStream.close();
} catch (IOException e) {
// ignore.
}
}
}
log.info("Loaded Solr resource: " + resourceName);

View File

@ -52,7 +52,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
SolrRequestHandler handler = core.getRequestHandler("standard");
SolrQueryResponse rsp;
rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap());
rsp.add("responseHeader", new SimpleOrderedMap<Object>());
SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
handler.handleRequest(req, rsp);
NamedList values = rsp.getValues();
@ -70,7 +70,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
handler = core.getRequestHandler("docClustering");
rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap());
rsp.add("responseHeader", new SimpleOrderedMap<Object>());
req = new LocalSolrQueryRequest(core, params);
handler.handleRequest(req, rsp);
values = rsp.getValues();

View File

@ -15,7 +15,6 @@ package org.apache.solr.handler.clustering.carrot2;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Collections;
import java.util.List;
import org.carrot2.core.Cluster;

View File

@ -1 +0,0 @@
698f0c9427a8a94e00a59575ff6c5ff9d0bdc34a

View File

@ -0,0 +1 @@
2aa3ce620ebadea4e385fc0a54dc363cb659dca5

View File

@ -1 +0,0 @@
ffd6e0f7ef6c189bf8b456ef749f1ce600d6df74

View File

@ -0,0 +1 @@
65d7bbe49bad0a95d9ae9b858abafb96a666ac5a

View File

@ -1 +0,0 @@
61497cafe8201435b603c6014d2abf0b3fb7c381

View File

@ -0,0 +1 @@
074bcc9d152a928a4ea9ac59a5b45850bf00cd4e

View File

@ -1 +0,0 @@
397a99307020797e6790f2faf8cf865983b52559

View File

@ -0,0 +1 @@
fdf556c88d66f65440bd24024f55a52c227c0e3f

View File

@ -1 +0,0 @@
ca0663530971b54420fc1cea00a6338f68428232

View File

@ -0,0 +1 @@
e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28

View File

@ -1 +0,0 @@
8a284571bea2cdd305cd86fbac9bab6deef31c7f

View File

@ -0,0 +1 @@
c81d6c63e22e97819063cad7f1ecd20269cba720

View File

@ -1 +0,0 @@
5b7a462882768cf65a2273d90710c9838bd5b280

View File

@ -0,0 +1 @@
48f90a787b2d59faab3b8c203945e4b0db32aec4