mirror of https://github.com/apache/lucene.git
SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik to version 1.7.1
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1512203 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8177498c86
commit
b40f603f46
|
@ -407,12 +407,12 @@
|
|||
<dependency>
|
||||
<groupId>org.carrot2</groupId>
|
||||
<artifactId>carrot2-mini</artifactId>
|
||||
<version>3.6.2</version>
|
||||
<version>3.8.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.carrot2</groupId>
|
||||
<artifactId>morfologik-polish</artifactId>
|
||||
<version>1.6.0</version>
|
||||
<version>1.7.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.codehaus.woodstox</groupId>
|
||||
|
|
|
@ -23,10 +23,6 @@ Changes in backwards compatibility policy
|
|||
not positioned. This change affects all classes that inherit from
|
||||
DocIdSetIterator, including DocsEnum and DocsAndPositionsEnum. (Adrien Grand)
|
||||
|
||||
* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
|
||||
no longer support multiple "dictionaries" as there is only one dictionary available.
|
||||
(Dawid Weiss)
|
||||
|
||||
* LUCENE-5127: Reduce RAM usage of FixedGapTermsIndex. Remove
|
||||
IndexWriterConfig.setTermIndexInterval, IndexWriterConfig.setReaderTermsIndexDivisor,
|
||||
and termsIndexDivisor from StandardDirectoryReader. These options have been no-ops
|
||||
|
@ -39,10 +35,6 @@ New Features
|
|||
* LUCENE-4747: Move to Java 7 as minimum Java version.
|
||||
(Robert Muir, Uwe Schindler)
|
||||
|
||||
* LUCENE-5089: Update to Morfologik 1.6.0. MorfologikAnalyzer and MorfologikFilter
|
||||
no longer support multiple "dictionaries" as there is only one dictionary available.
|
||||
(Dawid Weiss)
|
||||
|
||||
* SOLR-3359: Added analyzer attribute/property to SynonymFilterFactory.
|
||||
(Ryo Onodera via Koji Sekiguchi)
|
||||
|
||||
|
@ -184,6 +176,10 @@ Changes in backwards compatibility policy
|
|||
CheckIndex.fixIndex(Status). If you used to pass a codec to this method, just
|
||||
remove it from the arguments. (Adrien Grand)
|
||||
|
||||
* LUCENE-5089, SOLR-5126: Update to Morfologik 1.7.1. MorfologikAnalyzer and MorfologikFilter
|
||||
no longer support multiple "dictionaries" as there is only one dictionary available.
|
||||
(Dawid Weiss)
|
||||
|
||||
======================= Lucene 4.4.0 =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
|
|
@ -19,9 +19,9 @@
|
|||
<ivy-module version="2.0">
|
||||
<info organisation="org.apache.lucene" module="analyzers-morfologik"/>
|
||||
<dependencies>
|
||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
|
||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||
</dependencies>
|
||||
</ivy-module>
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.morfologik;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import morfologik.stemming.*;
|
||||
|
||||
|
@ -82,73 +83,31 @@ public class MorfologikFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/**
|
||||
* The tag encoding format has been changing in Morfologik from version
|
||||
* to version. Let's keep both variants and determine which one to run
|
||||
* based on this flag.
|
||||
* A pattern used to split lemma forms.
|
||||
*/
|
||||
private final static boolean multipleTagsPerLemma = true;
|
||||
private final static Pattern lemmaSplitter = Pattern.compile("\\+|\\|");
|
||||
|
||||
private void popNextLemma() {
|
||||
if (multipleTagsPerLemma) {
|
||||
// One tag (concatenated) per lemma.
|
||||
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
||||
termAtt.setEmpty().append(lemma.getStem());
|
||||
CharSequence tag = lemma.getTag();
|
||||
if (tag != null) {
|
||||
String[] tags = tag.toString().split("\\+|\\|");
|
||||
for (int i = 0; i < tags.length; i++) {
|
||||
if (tagsList.size() <= i) {
|
||||
tagsList.add(new StringBuilder());
|
||||
}
|
||||
StringBuilder buffer = tagsList.get(i);
|
||||
buffer.setLength(0);
|
||||
buffer.append(tags[i]);
|
||||
// One tag (concatenated) per lemma.
|
||||
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
||||
termAtt.setEmpty().append(lemma.getStem());
|
||||
CharSequence tag = lemma.getTag();
|
||||
if (tag != null) {
|
||||
String[] tags = lemmaSplitter.split(tag.toString());
|
||||
for (int i = 0; i < tags.length; i++) {
|
||||
if (tagsList.size() <= i) {
|
||||
tagsList.add(new StringBuilder());
|
||||
}
|
||||
tagsAtt.setTags(tagsList.subList(0, tags.length));
|
||||
} else {
|
||||
tagsAtt.setTags(Collections.<StringBuilder> emptyList());
|
||||
StringBuilder buffer = tagsList.get(i);
|
||||
buffer.setLength(0);
|
||||
buffer.append(tags[i]);
|
||||
}
|
||||
tagsAtt.setTags(tagsList.subList(0, tags.length));
|
||||
} else {
|
||||
// One tag (concatenated) per stem (lemma repeated).
|
||||
CharSequence currentStem;
|
||||
int tags = 0;
|
||||
do {
|
||||
final WordData lemma = lemmaList.get(lemmaListIndex++);
|
||||
currentStem = lemma.getStem();
|
||||
final CharSequence tag = lemma.getTag();
|
||||
if (tag != null) {
|
||||
if (tagsList.size() <= tags) {
|
||||
tagsList.add(new StringBuilder());
|
||||
}
|
||||
|
||||
final StringBuilder buffer = tagsList.get(tags++);
|
||||
buffer.setLength(0);
|
||||
buffer.append(lemma.getTag());
|
||||
}
|
||||
} while (lemmaListIndex < lemmaList.size() &&
|
||||
equalCharSequences(lemmaList.get(lemmaListIndex).getStem(), currentStem));
|
||||
|
||||
// Set the lemma's base form and tags as attributes.
|
||||
termAtt.setEmpty().append(currentStem);
|
||||
tagsAtt.setTags(tagsList.subList(0, tags));
|
||||
tagsAtt.setTags(Collections.<StringBuilder> emptyList());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two char sequences for equality. Assumes non-null arguments.
|
||||
*/
|
||||
private static final boolean equalCharSequences(CharSequence s1, CharSequence s2) {
|
||||
int len1 = s1.length();
|
||||
int len2 = s2.length();
|
||||
if (len1 != len2) return false;
|
||||
for (int i = len1; --i >= 0;) {
|
||||
if (s1.charAt(i) != s2.charAt(i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lookup a given surface form of a token and update
|
||||
* {@link #lemmaList} and {@link #lemmaListIndex} accordingly.
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
397a99307020797e6790f2faf8cf865983b52559
|
|
@ -0,0 +1 @@
|
|||
fdf556c88d66f65440bd24024f55a52c227c0e3f
|
|
@ -1 +0,0 @@
|
|||
ca0663530971b54420fc1cea00a6338f68428232
|
|
@ -0,0 +1 @@
|
|||
e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28
|
|
@ -1 +0,0 @@
|
|||
8a284571bea2cdd305cd86fbac9bab6deef31c7f
|
|
@ -0,0 +1 @@
|
|||
c81d6c63e22e97819063cad7f1ecd20269cba720
|
|
@ -25,7 +25,7 @@ $Id$
|
|||
Versions of Major Components
|
||||
---------------------
|
||||
Apache Tika 1.4
|
||||
Carrot2 3.6.2
|
||||
Carrot2 3.8.0
|
||||
Velocity 1.7 and Velocity Tools 2.0
|
||||
Apache UIMA 2.3.1
|
||||
Apache ZooKeeper 3.4.5
|
||||
|
@ -53,7 +53,7 @@ Other Changes
|
|||
Versions of Major Components
|
||||
---------------------
|
||||
Apache Tika 1.4
|
||||
Carrot2 3.6.2
|
||||
Carrot2 3.8.0
|
||||
Velocity 1.7 and Velocity Tools 2.0
|
||||
Apache UIMA 2.3.1
|
||||
Apache ZooKeeper 3.4.5
|
||||
|
@ -76,6 +76,9 @@ Detailed Change List
|
|||
New Features
|
||||
----------------------
|
||||
|
||||
* SOLR-5126: Update Carrot2 clustering to version 3.8.0, update Morfologik
|
||||
to version 1.7.1 (Dawid Weiss)
|
||||
|
||||
* SOLR-2345: Enhanced geodist() to work with an RPT field, provided that the
|
||||
field is referenced via 'sfield' and the query point is constant.
|
||||
(David Smiley)
|
||||
|
|
|
@ -20,9 +20,9 @@
|
|||
<info organisation="org.apache.solr" module="analysis-extras"/>
|
||||
<dependencies>
|
||||
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.6.0" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.6.0" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.6.0" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.7.1" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.7.1" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.7.1" transitive="false"/>
|
||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||
</dependencies>
|
||||
</ivy-module>
|
||||
|
|
|
@ -19,14 +19,25 @@
|
|||
<ivy-module version="2.0">
|
||||
<info organisation="org.apache.solr" module="clustering"/>
|
||||
<dependencies>
|
||||
<dependency org="org.carrot2" name="carrot2-mini" rev="3.6.2" transitive="false"/>
|
||||
<dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.0.1" transitive="false"/>
|
||||
<dependency org="com.carrotsearch" name="hppc" rev="0.4.1" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="carrot2-mini" rev="3.8.0" transitive="false"/>
|
||||
|
||||
<dependency org="com.carrotsearch" name="hppc" rev="0.5.2" transitive="false"/>
|
||||
<dependency org="org.carrot2.attributes" name="attributes-binder" rev="1.2.0" transitive="false"/>
|
||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.7" transitive="false"/>
|
||||
|
||||
<dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
|
||||
<dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
|
||||
|
||||
<dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="1.7.4" transitive="false"/>
|
||||
<dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="1.7.4" transitive="false"/>
|
||||
<dependency org="org.apache.mahout" name="mahout-collections" rev="1.0" transitive="false"/>
|
||||
<dependency org="org.apache.mahout" name="mahout-math" rev="0.6" transitive="false"/>
|
||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.6.4" transitive="false"/>
|
||||
|
||||
<!--
|
||||
Included as part of Solr's environment.
|
||||
|
||||
com.google.guava:guava:jar:14.0.1:compile
|
||||
commons-lang:commons-lang:jar:2.6:compile
|
||||
-->
|
||||
|
||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||
</dependencies>
|
||||
</ivy-module>
|
||||
|
|
|
@ -37,8 +37,7 @@ public abstract class DocumentClusteringEngine extends ClusteringEngine {
|
|||
public abstract NamedList cluster(SolrParams solrParams);
|
||||
|
||||
/**
|
||||
* Experimental. Subject to change before the next release
|
||||
*
|
||||
* Experimental. Subject to change before the next release
|
||||
*
|
||||
* Cluster the set of docs. Clustering of documents is often an expensive task that can take a long time.
|
||||
* @param docs The docs to cluster. If null, cluster all docs as in {@link #cluster(org.apache.solr.common.params.SolrParams)}
|
||||
|
|
|
@ -77,6 +77,7 @@ import com.google.common.collect.Lists;
|
|||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.common.io.Closeables;
|
||||
import com.google.common.io.Closer;
|
||||
|
||||
/**
|
||||
* Search results clustering engine based on Carrot2 clustering algorithms.
|
||||
|
@ -140,7 +141,13 @@ public class CarrotClusteringEngine extends SearchClusteringEngine {
|
|||
+ ". Using the default " + resource + " from Carrot JAR.");
|
||||
return new IResource[] {};
|
||||
} finally {
|
||||
if (resourceStream != null) Closeables.closeQuietly(resourceStream);
|
||||
if (resourceStream != null) {
|
||||
try {
|
||||
resourceStream.close();
|
||||
} catch (IOException e) {
|
||||
// ignore.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.info("Loaded Solr resource: " + resourceName);
|
||||
|
|
|
@ -52,7 +52,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
|
|||
SolrRequestHandler handler = core.getRequestHandler("standard");
|
||||
SolrQueryResponse rsp;
|
||||
rsp = new SolrQueryResponse();
|
||||
rsp.add("responseHeader", new SimpleOrderedMap());
|
||||
rsp.add("responseHeader", new SimpleOrderedMap<Object>());
|
||||
SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
|
||||
handler.handleRequest(req, rsp);
|
||||
NamedList values = rsp.getValues();
|
||||
|
@ -70,7 +70,7 @@ public class ClusteringComponentTest extends AbstractClusteringTestCase {
|
|||
handler = core.getRequestHandler("docClustering");
|
||||
|
||||
rsp = new SolrQueryResponse();
|
||||
rsp.add("responseHeader", new SimpleOrderedMap());
|
||||
rsp.add("responseHeader", new SimpleOrderedMap<Object>());
|
||||
req = new LocalSolrQueryRequest(core, params);
|
||||
handler.handleRequest(req, rsp);
|
||||
values = rsp.getValues();
|
||||
|
|
|
@ -15,7 +15,6 @@ package org.apache.solr.handler.clustering.carrot2;
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.carrot2.core.Cluster;
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
698f0c9427a8a94e00a59575ff6c5ff9d0bdc34a
|
|
@ -0,0 +1 @@
|
|||
2aa3ce620ebadea4e385fc0a54dc363cb659dca5
|
|
@ -1 +0,0 @@
|
|||
ffd6e0f7ef6c189bf8b456ef749f1ce600d6df74
|
|
@ -0,0 +1 @@
|
|||
65d7bbe49bad0a95d9ae9b858abafb96a666ac5a
|
|
@ -1 +0,0 @@
|
|||
61497cafe8201435b603c6014d2abf0b3fb7c381
|
|
@ -0,0 +1 @@
|
|||
074bcc9d152a928a4ea9ac59a5b45850bf00cd4e
|
|
@ -1 +0,0 @@
|
|||
397a99307020797e6790f2faf8cf865983b52559
|
|
@ -0,0 +1 @@
|
|||
fdf556c88d66f65440bd24024f55a52c227c0e3f
|
|
@ -1 +0,0 @@
|
|||
ca0663530971b54420fc1cea00a6338f68428232
|
|
@ -0,0 +1 @@
|
|||
e03b9feb39f6e2c0ac7c37e220d01cdae66d3a28
|
|
@ -1 +0,0 @@
|
|||
8a284571bea2cdd305cd86fbac9bab6deef31c7f
|
|
@ -0,0 +1 @@
|
|||
c81d6c63e22e97819063cad7f1ecd20269cba720
|
|
@ -1 +0,0 @@
|
|||
5b7a462882768cf65a2273d90710c9838bd5b280
|
|
@ -0,0 +1 @@
|
|||
48f90a787b2d59faab3b8c203945e4b0db32aec4
|
Loading…
Reference in New Issue