mirror of https://github.com/apache/lucene.git
SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik to version 1.7.1
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512203 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8177498c86
commit
b40f603f46
|
@ -407,12 +407,12 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.carrot2</groupId>
|
<groupId>org.carrot2</groupId>
|
||||||
<artifactId>carrot2-mini</artifactId>
|
<artifactId>carrot2-mini</artifactId>
|
||||||
<version>3.6.2</version>
|
<version>3.8.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.carrot2</groupId>
|
<groupId>org.carrot2</groupId>
|
||||||
<artifactId>morfologik-polish</artifactId>
|
<artifactId>morfologik-polish</artifactId>
|
||||||
<version>1.6.0</version>
|
<version>1.7.1</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.codehaus.woodstox</groupId>
|
<groupId>org.codehaus.woodstox</groupId>
|
||||||
|
|
|
@ -23,10 +23,6 @@ Changes in backwards compatibility policy
|
||||||
not positioned. This change affects all classes that inherit from
|
not positioned. This change affects all classes that inherit from
|
||||||
DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand)
|
DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand)
|
||||||
|
|
||||||
* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
|
|
||||||
no longer support multiple "dictionaries" as there is only one dictionary available.
|
|
||||||
(Dawid Weiss)
|
|
||||||
|
|
||||||
* LUCENE-5127: Reduce RAM usage of FixedGapTermsIndex. Remove
|
* LUCENE-5127: Reduce RAM usage of FixedGapTermsIndex. Remove
|
||||||
IndexWriterConfig.setTermIndexInterval, IndexWriterConfig.setReaderTermsIndexDivisor,
|
IndexWriterConfig.setTermIndexInterval, IndexWriterConfig.setReaderTermsIndexDivisor,
|
||||||
and termsIndexDivisor from StandardDirectoryReader. These options have been no-ops
|
and termsIndexDivisor from StandardDirectoryReader. These options have been no-ops
|
||||||
|
@ -39,10 +35,6 @@ New Features
|
||||||
* LUCENE-4747: Move to Java 7 as minimum Java version.
|
* LUCENE-4747: Move to Java 7 as minimum Java version.
|
||||||
(Robert Muir, Uwe Schindler)
|
(Robert Muir, Uwe Schindler)
|
||||||
|
|
||||||
* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
|
|
||||||
no longer support multiple "dictionaries" as there is only one dictionary available.
|
|
||||||
(Dawid Weiss)
|
|
||||||
|
|
||||||
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
|
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
|
||||||
(Ryo Onodera via Koji Sekiguchi)
|
(Ryo Onodera via Koji Sekiguchi)
|
||||||
|
|
||||||
|
@ -184,6 +176,10 @@ Changes in backwards compatibility policy
|
||||||
CheckIndex.fixIndex(Status). If you used to pass a codec to this method, just
|
CheckIndex.fixIndex(Status). If you used to pass a codec to this method, just
|
||||||
remove it from the arguments. (Adrien Grand)
|
remove it from the arguments. (Adrien Grand)
|
||||||
|
|
||||||
|
* LUCENE-5089, SOLR-5126: Update to Morfologik 1.7.1. MorfologikAnalyzer and MorfologikFilter
|
||||||
|
no longer support multiple "dictionaries" as there is only one dictionary available.
|
||||||
|
(Dawid Weiss)
|
||||||
|
|
||||||
======================= Lucene 4.4.0 =======================
|
======================= Lucene 4.4.0 =======================
|
||||||
|
|
||||||
Changes in backwards compatibility policy
|
Changes in backwards compatibility policy
|
||||||
|
|
|
@ -19,9 +19,9 @@
|
||||||
<ivy-module version="2.0">
|
<ivy-module version="2.0">
|
||||||
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
|
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
|
||||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
|
||||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
|
||||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</ivy-module>
|
</ivy-module>
|
||||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.morfologik;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import morfologik.stemming.*;
|
import morfologik.stemming.*;
|
||||||
|
|
||||||
|
@ -82,73 +83,31 @@ public class MorfologikFilter extends TokenFilter {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The tag encoding format has been changing in Morfologik from version
|
* A pattern used to split a lemma's concatenated tag string on '+' or '|'.
|
||||||
* to version. Let's keep both variants and determine which one to run
|
|
||||||
* based on this flag.
|
|
||||||
*/
|
*/
|
||||||
private final static boolean multipleTagsPerLemma = true;
|
private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");
|
||||||
|
|
||||||
private void popNextLemma() {
|
private void popNextLemma() {
|
||||||
if (multipleTagsPerLemma) {
|
// One tag (concatenated) per lemma.
|
||||||
// One tag (concatenated) per lemma.
|
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
||||||
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
termAtt.setEmpty().append(lemma.getStem());
|
||||||
termAtt.setEmpty().append(lemma.getStem());
|
CharSequence tag = lemma.getTag();
|
||||||
CharSequence tag = lemma.getTag();
|
if (tag != null) {
|
||||||
if (tag != null) {
|
String[] tags = lemmaSplitter.split(tag.toString());
|
||||||
String[] tags = tag.toString().split("\\+|\\|");
|
for (int i = 0; i < tags.length; i++) {
|
||||||
for (int i = 0; i < tags.length; i++) {
|
if (tagsList.size() <= i) {
|
||||||
if (tagsList.size() <= i) {
|
tagsList.add(new StringBuilder());
|
||||||
tagsList.add(new StringBuilder());
|
|
||||||
}
|
|
||||||
StringBuilder buffer = tagsList.get(i);
|
|
||||||
buffer.setLength(0);
|
|
||||||
buffer.append(tags[i]);
|
|
||||||
}
|
}
|
||||||
tagsAtt.setTags(tagsList.subList(0, tags.length));
|
StringBuilder buffer = tagsList.get(i);
|
||||||
} else {
|
buffer.setLength(0);
|
||||||
tagsAtt.setTags(Collections.<StringBuilder> emptyList());
|
buffer.append(tags[i]);
|
||||||
}
|
}
|
||||||
|
tagsAtt.setTags(tagsList.subList(0, tags.length));
|
||||||
} else {
|
} else {
|
||||||
// One tag (concatenated) per stem (lemma repeated).
|
tagsAtt.setTags(Collections.<StringBuilder> emptyList());
|
||||||
CharSequence currentStem;
|
|
||||||
int tags = 0;
|
|
||||||
do {
|
|
||||||
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
|
||||||
currentStem = lemma.getStem();
|
|
||||||
final CharSequence tag = lemma.getTag();
|
|
||||||
if (tag != null) {
|
|
||||||
if (tagsList.size() <= tags) {
|
|
||||||
tagsList.add(new StringBuilder());
|
|
||||||
}
|
|
||||||
|
|
||||||
final StringBuilder buffer = tagsList.get(tags++);
|
|
||||||
buffer.setLength(0);
|
|
||||||
buffer.append(lemma.getTag());
|
|
||||||
}
|
|
||||||
} while (lemmaListIndex < lemmaList.size() &&
|
|
||||||
equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
|
|
||||||
|
|
||||||
// Set the lemma's base form and tags as attributes.
|
|
||||||
termAtt.setEmpty().append(currentStem);
|
|
||||||
tagsAtt.setTags(tagsList.subList(0, tags));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Compare two char sequences for equality. Assumes non-null arguments.
|
|
||||||
*/
|
|
||||||
private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
|
|
||||||
int len1 = s1.length();
|
|
||||||
int len2 = s2.length();
|
|
||||||
if (len1 != len2) return false;
|
|
||||||
for (int i = len1; --i >= 0;) {
|
|
||||||
if (s1.charAt(i) != s2.charAt(i)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Lookup a given surface form of a token and update
|
* Lookup a given surface form of a token and update
|
||||||
* {@link #lemmaList} and {@link #lemmaListIndex} accordingly.
|
* {@link #lemmaList} and {@link #lemmaListIndex} accordingly.
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
397a99307020797e6790f2faf8cf865983b52559
|
|
|
@ -0,0 +1 @@
|
||||||
|
fdf556c88d66f65440bd24024f55a52c227c0e3f
|
|
@ -1 +0,0 @@
|
||||||
ca0663530971b54420fc1cea00a6338f68428232
|
|
|
@ -0,0 +1 @@
|
||||||
|
e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28
|
|
@ -1 +0,0 @@
|
||||||
8a284571bea2cdd305cd86fbac9bab6deef31c7f
|
|
|
@ -0,0 +1 @@
|
||||||
|
c81d6c63e22e97819063cad7f1ecd20269cba720
|
|
@ -25,7 +25,7 @@ $Id$
|
||||||
Versions of Major Components
|
Versions of Major Components
|
||||||
---------------------
|
---------------------
|
||||||
Apache Tika 1.4
|
Apache Tika 1.4
|
||||||
Carrot2 3.6.2
|
Carrot2 3.8.0
|
||||||
Velocity 1.7 and Velocity Tools 2.0
|
Velocity 1.7 and Velocity Tools 2.0
|
||||||
Apache UIMA 2.3.1
|
Apache UIMA 2.3.1
|
||||||
Apache ZooKeeper 3.4.5
|
Apache ZooKeeper 3.4.5
|
||||||
|
@ -53,7 +53,7 @@ Other Changes
|
||||||
Versions of Major Components
|
Versions of Major Components
|
||||||
---------------------
|
---------------------
|
||||||
Apache Tika 1.4
|
Apache Tika 1.4
|
||||||
Carrot2 3.6.2
|
Carrot2 3.8.0
|
||||||
Velocity 1.7 and Velocity Tools 2.0
|
Velocity 1.7 and Velocity Tools 2.0
|
||||||
Apache UIMA 2.3.1
|
Apache UIMA 2.3.1
|
||||||
Apache ZooKeeper 3.4.5
|
Apache ZooKeeper 3.4.5
|
||||||
|
@ -76,6 +76,9 @@ Detailed Change List
|
||||||
New Features
|
New Features
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
* SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik
|
||||||
|
to version 1.7.1 (Dawid Weiss)
|
||||||
|
|
||||||
* SOLR-2345: Enhanced geodist() to work with an RPT field, provided that the
|
* SOLR-2345: Enhanced geodist() to work with an RPT field, provided that the
|
||||||
field is referenced via 'sfield' and the query point is constant.
|
field is referenced via 'sfield' and the query point is constant.
|
||||||
(David Smiley)
|
(David Smiley)
|
||||||
|
|
|
@ -20,9 +20,9 @@
|
||||||
<info organisation="org.apache.solr" module="analysis-extras"/>
|
<info organisation="org.apache.solr" module="analysis-extras"/>
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
|
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
|
||||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
|
||||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
|
||||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/>
|
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
|
||||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</ivy-module>
|
</ivy-module>
|
||||||
|
|
|
@ -19,14 +19,25 @@
|
||||||
<ivy-module version="2.0">
|
<ivy-module version="2.0">
|
||||||
<info organisation="org.apache.solr" module="clustering"/>
|
<info organisation="org.apache.solr" module="clustering"/>
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency org="org.carrot2" name="carrot2-mini" rev="3.6.2" transitive="false"/>
|
<dependency org="org.carrot2" name="carrot2-mini" rev="3.8.0" transitive="false"/>
|
||||||
<dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.0.1" transitive="false"/>
|
|
||||||
<dependency org="com.carrotsearch" name="hppc" rev="0.4.1" transitive="false"/>
|
<dependency org="com.carrotsearch" name="hppc" rev="0.5.2" transitive="false"/>
|
||||||
|
<dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.2.0" transitive="false"/>
|
||||||
|
<dependency org="org.simpleframework" name="simple-xml" rev="2.7" transitive="false"/>
|
||||||
|
|
||||||
|
<dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
|
||||||
|
<dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
|
||||||
|
|
||||||
<dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="1.7.4" transitive="false"/>
|
<dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="1.7.4" transitive="false"/>
|
||||||
<dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="1.7.4" transitive="false"/>
|
<dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="1.7.4" transitive="false"/>
|
||||||
<dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
|
|
||||||
<dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
|
<!--
|
||||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.6.4" transitive="false"/>
|
Included as part of Solr's environment.
|
||||||
|
|
||||||
|
com.google.guava:guava:jar:14.0.1:compile
|
||||||
|
commons-lang:commons-lang:jar:2.6:compile
|
||||||
|
-->
|
||||||
|
|
||||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</ivy-module>
|
</ivy-module>
|
||||||
|
|
|
@ -37,8 +37,7 @@ public abstract class DocumentClusteringEngine extends ClusteringEngine {
|
||||||
public abstract NamedList cluster(SolrParams solrParams);
|
public abstract NamedList cluster(SolrParams solrParams);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Experimental. Subject to change before the next release
|
* Experimental. Subject to change before the next release
|
||||||
*
|
|
||||||
*
|
*
|
||||||
* Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time.
|
* Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time.
|
||||||
* @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}
|
* @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}
|
||||||
|
|
|
@ -77,6 +77,7 @@ import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import com.google.common.io.Closeables;
|
import com.google.common.io.Closeables;
|
||||||
|
import com.google.common.io.Closer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Search results clustering engine based on Carrot2 clustering algorithms.
|
* Search results clustering engine based on Carrot2 clustering algorithms.
|
||||||
|
@ -140,7 +141,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
||||||
+ ". Using the default " + resource + " from Carrot JAR.");
|
+ ". Using the default " + resource + " from Carrot JAR.");
|
||||||
return new IResource[] {};
|
return new IResource[] {};
|
||||||
} finally {
|
} finally {
|
||||||
if (resourceStream != null) Closeables.closeQuietly(resourceStream);
|
if (resourceStream != null) {
|
||||||
|
try {
|
||||||
|
resourceStream.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
// ignore.
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Loaded Solr resource: " + resourceName);
|
log.info("Loaded Solr resource: " + resourceName);
|
||||||
|
|
|
@ -52,7 +52,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
|
||||||
SolrRequestHandler handler = core.getRequestHandler("standard");
|
SolrRequestHandler handler = core.getRequestHandler("standard");
|
||||||
SolrQueryResponse rsp;
|
SolrQueryResponse rsp;
|
||||||
rsp = new SolrQueryResponse();
|
rsp = new SolrQueryResponse();
|
||||||
rsp.add("responseHeader", new SimpleOrderedMap());
|
rsp.add("responseHeader", new SimpleOrderedMap<Object>());
|
||||||
SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
|
SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
|
||||||
handler.handleRequest(req, rsp);
|
handler.handleRequest(req, rsp);
|
||||||
NamedList values = rsp.getValues();
|
NamedList values = rsp.getValues();
|
||||||
|
@ -70,7 +70,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
|
||||||
handler = core.getRequestHandler("docClustering");
|
handler = core.getRequestHandler("docClustering");
|
||||||
|
|
||||||
rsp = new SolrQueryResponse();
|
rsp = new SolrQueryResponse();
|
||||||
rsp.add("responseHeader", new SimpleOrderedMap());
|
rsp.add("responseHeader", new SimpleOrderedMap<Object>());
|
||||||
req = new LocalSolrQueryRequest(core, params);
|
req = new LocalSolrQueryRequest(core, params);
|
||||||
handler.handleRequest(req, rsp);
|
handler.handleRequest(req, rsp);
|
||||||
values = rsp.getValues();
|
values = rsp.getValues();
|
||||||
|
|
|
@ -15,7 +15,6 @@ package org.apache.solr.handler.clustering.carrot2;
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.carrot2.core.Cluster;
|
import org.carrot2.core.Cluster;
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
698f0c9427a8a94e00a59575ff6c5ff9d0bdc34a
|
|
|
@ -0,0 +1 @@
|
||||||
|
2aa3ce620ebadea4e385fc0a54dc363cb659dca5
|
|
@ -1 +0,0 @@
|
||||||
ffd6e0f7ef6c189bf8b456ef749f1ce600d6df74
|
|
|
@ -0,0 +1 @@
|
||||||
|
65d7bbe49bad0a95d9ae9b858abafb96a666ac5a
|
|
@ -1 +0,0 @@
|
||||||
61497cafe8201435b603c6014d2abf0b3fb7c381
|
|
|
@ -0,0 +1 @@
|
||||||
|
074bcc9d152a928a4ea9ac59a5b45850bf00cd4e
|
|
@ -1 +0,0 @@
|
||||||
397a99307020797e6790f2faf8cf865983b52559
|
|
|
@ -0,0 +1 @@
|
||||||
|
fdf556c88d66f65440bd24024f55a52c227c0e3f
|
|
@ -1 +0,0 @@
|
||||||
ca0663530971b54420fc1cea00a6338f68428232
|
|
|
@ -0,0 +1 @@
|
||||||
|
e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28
|
|
@ -1 +0,0 @@
|
||||||
8a284571bea2cdd305cd86fbac9bab6deef31c7f
|
|
|
@ -0,0 +1 @@
|
||||||
|
c81d6c63e22e97819063cad7f1ecd20269cba720
|
|
@ -1 +0,0 @@
|
||||||
5b7a462882768cf65a2273d90710c9838bd5b280
|
|
|
@ -0,0 +1 @@
|
||||||
|
48f90a787b2d59faab3b8c203945e4b0db32aec4
|
Loading…
Reference in New Issue