From 548f973766e8e31d4f2edaeebbc3e400141ab559 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Mon, 23 Nov 2009 17:18:16 +0000 Subject: [PATCH] SOLR-1302: Why should numbers get to have all the fun, add String distance functions, too git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@883421 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 +- .../apache/solr/search/ValueSourceParser.java | 29 ++++++ .../distance/StringDistanceFunction.java | 96 +++++++++++++++++++ .../search/function/TestFunctionQuery.java | 8 ++ 4 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 src/java/org/apache/solr/search/function/distance/StringDistanceFunction.java diff --git a/CHANGES.txt b/CHANGES.txt index 301d73eff00..31a85085b68 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -34,7 +34,7 @@ Detailed Change List New Features ---------------------- -* SOLR-1302: Added several new distance based functions, including Great Circle (haversine), Manhattan and Euclidean. +* SOLR-1302: Added several new distance based functions, including Great Circle (haversine), Manhattan, Euclidean and String (using the StringDistance methods in the Lucene Spellchecker). Also added geohash(), deg() and rad() convenience functions. See http://wiki.apache.org/solr/FunctionQuery. (gsingers) * SOLR-1553: New dismax parser implementation (accessible as "edismax") diff --git a/src/java/org/apache/solr/search/ValueSourceParser.java b/src/java/org/apache/solr/search/ValueSourceParser.java index 4f8f4e4c470..a7ade07a725 100755 --- a/src/java/org/apache/solr/search/ValueSourceParser.java +++ b/src/java/org/apache/solr/search/ValueSourceParser.java @@ -20,6 +20,10 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.Query; import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.spell.JaroWinklerDistance; +import org.apache.lucene.search.spell.LevensteinDistance; +import org.apache.lucene.search.spell.NGramDistance; +import org.apache.lucene.search.spell.StringDistance; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.NamedList; import org.apache.solr.schema.DateField; @@ -34,6 +38,7 @@ import org.apache.solr.search.function.distance.SquaredEuclideanFunction; import org.apache.solr.search.function.distance.VectorDistanceFunction; import org.apache.solr.search.function.distance.GeohashHaversineFunction; import org.apache.solr.search.function.distance.GeohashFunction; +import org.apache.solr.search.function.distance.StringDistanceFunction; import org.apache.solr.util.plugin.NamedListInitializedPlugin; import java.io.IOException; @@ -255,6 +260,30 @@ public abstract class ValueSourceParser implements NamedListInitializedPlugin { return new GeohashFunction(lat, lon); } }); + addParser("strdist", new ValueSourceParser() { + public ValueSource parse(FunctionQParser fp) throws ParseException { + + ValueSource str1 = fp.parseValueSource(); + ValueSource str2 = fp.parseValueSource(); + String distClass = fp.parseArg(); + + StringDistance dist = null; + if (distClass.equalsIgnoreCase("jw")) { + dist = new JaroWinklerDistance(); + } else if (distClass.equalsIgnoreCase("edit")) { + dist = new LevensteinDistance(); + } else if (distClass.equalsIgnoreCase("ngram")) { + int ngram = 2; + if (fp.hasMoreArguments()) { + ngram = fp.parseInt(); + } + dist = new NGramDistance(ngram); + } else { + dist = (StringDistance) fp.req.getCore().getResourceLoader().newInstance(distClass); + } + return new StringDistanceFunction(str1, str2, dist); + } + }); addParser(new DoubleParser("rad") { public double func(int doc, DocValues vals) { diff --git a/src/java/org/apache/solr/search/function/distance/StringDistanceFunction.java b/src/java/org/apache/solr/search/function/distance/StringDistanceFunction.java new file mode 100644 index 00000000000..8396d6e6bb6 --- /dev/null +++ b/src/java/org/apache/solr/search/function/distance/StringDistanceFunction.java @@ -0,0 +1,96 @@ +package org.apache.solr.search.function.distance; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.spell.StringDistance; +import org.apache.solr.search.function.DocValues; +import org.apache.solr.search.function.ValueSource; + +import java.io.IOException; +import java.util.Map; + + +/** + * + * + **/ +public class StringDistanceFunction extends ValueSource { + protected ValueSource str1, str2; + protected StringDistance dist; + + /** + * @param str1 + * @param str2 + * @param measure + */ + public StringDistanceFunction(ValueSource str1, ValueSource str2, StringDistance measure) { + this.str1 = str1; + this.str2 = str2; + dist = measure; + + + } + + @Override + public DocValues getValues(Map context, IndexReader reader) throws IOException { + final DocValues str1DV = str1.getValues(context, reader); + final DocValues str2DV = str2.getValues(context, reader); + return new DocValues() { + + public float floatVal(int doc) { + return (float) dist.getDistance(str1DV.strVal(doc), str2DV.strVal(doc)); + } + + public int intVal(int doc) { + return (int) doubleVal(doc); + } + + public long longVal(int doc) { + return (long) doubleVal(doc); + } + + public double doubleVal(int doc) { + return (double) floatVal(doc); + } + + @Override + public String toString(int doc) { + StringBuilder sb = new StringBuilder(); + sb.append("strdist").append('('); + sb.append(str1DV.toString(doc)).append(',').append(str2DV.toString(doc)) + .append(", dist=").append(dist.getClass().getName()); + sb.append(')'); + return sb.toString(); + } + }; + } + + public String description() { + StringBuilder sb = new StringBuilder(); + sb.append("strdist").append('('); + sb.append(str1).append(',').append(str2).append(", dist=").append(dist.getClass().getName()); + sb.append(')'); + return sb.toString(); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof StringDistanceFunction)) return false; + + StringDistanceFunction that = (StringDistanceFunction) o; + + if (!dist.equals(that.dist)) return false; + if (!str1.equals(that.str1)) return false; + if (!str2.equals(that.str2)) return false; + + return true; + } + + @Override + public int hashCode() { + int result = str1.hashCode(); + result = 31 * result + str2.hashCode(); + result = 31 * result + dist.hashCode(); + return result; + } +} diff --git a/src/test/org/apache/solr/search/function/TestFunctionQuery.java b/src/test/org/apache/solr/search/function/TestFunctionQuery.java index f2f53720f8e..4df20dd67f0 100755 --- a/src/test/org/apache/solr/search/function/TestFunctionQuery.java +++ b/src/test/org/apache/solr/search/function/TestFunctionQuery.java @@ -347,6 +347,14 @@ public class TestFunctionQuery extends AbstractSolrTestCase { assertQ(req("fl", "*,score", "q", "{!func}deg(y_td)", "fq", "id:3"), "//float[@name='score']='45.0'"); } + public void testStrDistance() throws Exception { + assertU(adoc("id", "1", "x_s", "foil")); + assertU(commit()); + assertQ(req("fl", "*,score", "q", "{!func}strdist(x_s, 'foit', edit)", "fq", "id:1"), "//float[@name='score']='0.75'"); + assertQ(req("fl", "*,score", "q", "{!func}strdist(x_s, 'foit', jw)", "fq", "id:1"), "//float[@name='score']='0.8833333'"); + assertQ(req("fl", "*,score", "q", "{!func}strdist(x_s, 'foit', ngram, 2)", "fq", "id:1"), "//float[@name='score']='0.875'"); + } + public void dofunc(String func, double val) throws Exception { // String sval = Double.toString(val); String sval = Float.toString((float)val);