5080 new similarity algorithm to handle numeric normalization (#5082)

* solution

* test

* docs + changelog

* fix formatting

---------

Co-authored-by: justindar <justin.dar@smilecdr.com>
This commit is contained in:
jdar8 2023-07-20 10:20:51 -07:00 committed by GitHub
parent 2fbd3fa272
commit e02468b552
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 147 additions and 7 deletions

View File

@ -0,0 +1,7 @@
---
type: add
issue: 5080
title: "Added numeric variants of the existing MDM similarity algorithms. These variants normalize the input
by removing all non-numeric characters from the string before the similarity algorithm is applied. This is
useful for measuring similarity between identifying numbers or phone numbers, where dashes or
other special separating characters may be used."

View File

@ -616,6 +616,46 @@ The following algorithms are currently supported:
</td> </td>
<td></td> <td></td>
</tr> </tr>
<tr>
<td>NUMERIC_JARO_WINKLER</td>
<td>similarity</td>
<td>
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#jaro-winkler">tdebatty Jaro Winkler</a>
</td>
<td></td>
</tr>
<tr>
<td>NUMERIC_COSINE</td>
<td>similarity</td>
<td>
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#cosine-similarity">tdebatty Cosine Similarity</a>
</td>
<td></td>
</tr>
<tr>
<td>NUMERIC_JACCARD</td>
<td>similarity</td>
<td>
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#jaccard-index">tdebatty Jaccard Index</a>
</td>
<td></td>
</tr>
<tr>
<td>NUMERIC_LEVENSCHTEIN</td>
<td>similarity</td>
<td>
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#normalized-levenshtein">tdebatty Normalized Levenshtein</a>
</td>
<td></td>
</tr>
<tr>
<td>NUMERIC_SORENSEN_DICE</td>
<td>similarity</td>
<td>
Removes all non-numeric characters before applying <a href="https://github.com/tdebatty/java-string-similarity#sorensen-dice-coefficient">tdebatty Sorensen-Dice coefficient</a>
</td>
<td></td>
</tr>
</tbody> </tbody>
</table> </table>

View File

@ -0,0 +1,30 @@
package ca.uhn.fhir.mdm.rules.similarity;
import ca.uhn.fhir.context.FhirContext;
import ca.uhn.fhir.context.phonetic.NumericEncoder;
import ca.uhn.fhir.mdm.rules.matcher.util.StringMatcherUtils;
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
import org.hl7.fhir.instance.model.api.IBase;
import org.hl7.fhir.instance.model.api.IPrimitiveType;
/**
 * Similarity measure for two IBase fields whose String representations are passed through a
 * {@link NumericEncoder} before the wrapped string similarity is applied.
 * <p>
 * NOTE(review): the accompanying MDM documentation describes the NUMERIC_* algorithms as removing all
 * non-numeric characters before comparison; that normalization is presumably what the encoder
 * performs - confirm against {@link NumericEncoder}.
 */
public class HapiNumericSimilarity extends HapiStringSimilarity {
	private final NumericEncoder myNumericEncoder = new NumericEncoder();

	/**
	 * @param theStringSimilarity the normalized string similarity applied to the encoded values
	 */
	public HapiNumericSimilarity(NormalizedStringSimilarity theStringSimilarity) {
		super(theStringSimilarity);
	}

	/**
	 * Encodes both values with the numeric encoder and delegates to the wrapped string similarity.
	 *
	 * @param theFhirContext not used by this implementation
	 * @param theLeftBase    the left-hand value to compare
	 * @param theRightBase   the right-hand value to compare
	 * @param theExact       forwarded to {@link StringMatcherUtils#extractString} when reading each value
	 * @return the similarity of the two encoded strings, or 0.0 when either argument is not an
	 *         {@link IPrimitiveType}
	 */
	@Override
	public double similarity(FhirContext theFhirContext, IBase theLeftBase, IBase theRightBase, boolean theExact) {
		boolean bothPrimitive = theLeftBase instanceof IPrimitiveType && theRightBase instanceof IPrimitiveType;
		if (!bothPrimitive) {
			return 0.0;
		}

		String encodedLeft = encodeValue((IPrimitiveType<?>) theLeftBase, theExact);
		String encodedRight = encodeValue((IPrimitiveType<?>) theRightBase, theExact);
		return myStringSimilarity.similarity(encodedLeft, encodedRight);
	}

	/**
	 * Extracts the String form of the primitive and runs it through the numeric encoder.
	 */
	private String encodeValue(IPrimitiveType<?> thePrimitive, boolean theExact) {
		return myNumericEncoder.encode(StringMatcherUtils.extractString(thePrimitive, theExact));
	}
}

View File

@ -29,7 +29,7 @@ import org.hl7.fhir.instance.model.api.IPrimitiveType;
* Similarity measure for two IBase fields whose similarity can be measured by their String representations. * Similarity measure for two IBase fields whose similarity can be measured by their String representations.
*/ */
public class HapiStringSimilarity implements IMdmFieldSimilarity { public class HapiStringSimilarity implements IMdmFieldSimilarity {
private final NormalizedStringSimilarity myStringSimilarity; protected final NormalizedStringSimilarity myStringSimilarity;
public HapiStringSimilarity(NormalizedStringSimilarity theStringSimilarity) { public HapiStringSimilarity(NormalizedStringSimilarity theStringSimilarity) {
myStringSimilarity = theStringSimilarity; myStringSimilarity = theStringSimilarity;

View File

@ -35,7 +35,12 @@ public enum MdmSimilarityEnum {
COSINE(new HapiStringSimilarity(new Cosine())), COSINE(new HapiStringSimilarity(new Cosine())),
JACCARD(new HapiStringSimilarity(new Jaccard())), JACCARD(new HapiStringSimilarity(new Jaccard())),
LEVENSCHTEIN(new HapiStringSimilarity(new NormalizedLevenshtein())), LEVENSCHTEIN(new HapiStringSimilarity(new NormalizedLevenshtein())),
SORENSEN_DICE(new HapiStringSimilarity(new SorensenDice())); SORENSEN_DICE(new HapiStringSimilarity(new SorensenDice())),
NUMERIC_JARO_WINKLER(new HapiNumericSimilarity(new JaroWinkler())),
NUMERIC_COSINE(new HapiNumericSimilarity(new Cosine())),
NUMERIC_JACCARD(new HapiNumericSimilarity(new Jaccard())),
NUMERIC_LEVENSCHTEIN(new HapiNumericSimilarity(new NormalizedLevenshtein())),
NUMERIC_SORENSEN_DICE(new HapiNumericSimilarity(new SorensenDice()));
private final IMdmFieldSimilarity myMdmFieldSimilarity; private final IMdmFieldSimilarity myMdmFieldSimilarity;

View File

@ -0,0 +1,58 @@
package ca.uhn.fhir.mdm.rules.matcher;
import ca.uhn.fhir.mdm.rules.similarity.HapiNumericSimilarity;
import ca.uhn.fhir.mdm.rules.similarity.HapiStringSimilarity;
import ca.uhn.fhir.mdm.rules.similarity.IMdmFieldSimilarity;
import info.debatty.java.stringsimilarity.Cosine;
import info.debatty.java.stringsimilarity.Jaccard;
import info.debatty.java.stringsimilarity.JaroWinkler;
import info.debatty.java.stringsimilarity.NormalizedLevenshtein;
import info.debatty.java.stringsimilarity.SorensenDice;
import org.hl7.fhir.r4.model.StringType;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Tests for {@link HapiNumericSimilarity}: each numeric variant is exercised with values that contain
 * non-numeric characters and compared either against an exact match or against the plain string
 * similarity of the digits-only input.
 */
public class NumericSimilarityR4Test extends StringSimilarityR4Test {
	private static final HapiStringSimilarity NUMERIC_JARO_WINKLER = new HapiNumericSimilarity(new JaroWinkler());
	private static final HapiStringSimilarity NUMERIC_COSINE = new HapiNumericSimilarity(new Cosine());
	private static final HapiStringSimilarity NUMERIC_JACCARD = new HapiNumericSimilarity(new Jaccard());
	private static final HapiStringSimilarity NUMERIC_LEVENSCHTEIN = new HapiNumericSimilarity(new NormalizedLevenshtein());
	private static final HapiStringSimilarity NUMERIC_SORENSEN_DICE = new HapiNumericSimilarity(new SorensenDice());

	/**
	 * Values that differ only by non-numeric characters must be reported as identical (1.0) by
	 * every numeric similarity.
	 */
	@ParameterizedTest
	@CsvSource({
		"123-45-6789, 123456789",
		"1234-5-6789, 123456789",
		"abc123, 123",
		// No trailing comma: it would add an extra, empty CSV column that the test method does not consume
		"(416) 967-1111, 4169671111"
	})
	public void testNumericSimilarity_withExactMatches(String theLeft, String theRight) {
		assertEquals(1.0, similarity(NUMERIC_JARO_WINKLER, theLeft, theRight));
		assertEquals(1.0, similarity(NUMERIC_COSINE, theLeft, theRight));
		assertEquals(1.0, similarity(NUMERIC_JACCARD, theLeft, theRight));
		assertEquals(1.0, similarity(NUMERIC_LEVENSCHTEIN, theLeft, theRight));
		assertEquals(1.0, similarity(NUMERIC_SORENSEN_DICE, theLeft, theRight));
	}

	/**
	 * A numeric similarity applied to a value containing non-numeric characters must give the same
	 * score as the corresponding string similarity applied to the digits-only form of that value.
	 */
	@ParameterizedTest
	@CsvSource({
		"123546789, 123-54-6789, 123456789",
		"123456789, 1234-5-6789, 123456789",
		"321, abc321, 123",
		// No trailing comma: it would add an extra, empty CSV column that the test method does not consume
		"1231231234, (123) 123-1234, 1231234321"
	})
	public void testNumericSimilarity_givesSameResultAsStringSimilarity(String theLeft, String theLeftWithNonNumerics, String theRight) {
		assertEquals(similarity(JARO_WINKLER, theLeft, theRight), similarity(NUMERIC_JARO_WINKLER, theLeftWithNonNumerics, theRight));
		assertEquals(similarity(COSINE, theLeft, theRight), similarity(NUMERIC_COSINE, theLeftWithNonNumerics, theRight));
		assertEquals(similarity(JACCARD, theLeft, theRight), similarity(NUMERIC_JACCARD, theLeftWithNonNumerics, theRight));
		assertEquals(similarity(LEVENSCHTEIN, theLeft, theRight), similarity(NUMERIC_LEVENSCHTEIN, theLeftWithNonNumerics, theRight));
		assertEquals(similarity(SORENSEN_DICE, theLeft, theRight), similarity(NUMERIC_SORENSEN_DICE, theLeftWithNonNumerics, theRight));
	}

	/**
	 * Wraps both strings in {@link StringType} and evaluates the given similarity with exact matching disabled.
	 */
	private double similarity(IMdmFieldSimilarity theSimilarity, String theLeft, String theRight) {
		return theSimilarity.similarity(ourFhirContext, new StringType(theLeft), new StringType(theRight), false);
	}
}

View File

@ -17,11 +17,11 @@ public class StringSimilarityR4Test extends BaseMatcherR4Test {
public static final String LEFT = "somon"; public static final String LEFT = "somon";
public static final String RIGHT = "slomon"; public static final String RIGHT = "slomon";
private static final HapiStringSimilarity JARO_WINKLER = new HapiStringSimilarity(new JaroWinkler()); protected static final HapiStringSimilarity JARO_WINKLER = new HapiStringSimilarity(new JaroWinkler());
private static final HapiStringSimilarity COSINE = new HapiStringSimilarity(new Cosine()); protected static final HapiStringSimilarity COSINE = new HapiStringSimilarity(new Cosine());
private static final HapiStringSimilarity JACCARD = new HapiStringSimilarity(new Jaccard()); protected static final HapiStringSimilarity JACCARD = new HapiStringSimilarity(new Jaccard());
private static final HapiStringSimilarity LEVENSCHTEIN = new HapiStringSimilarity(new NormalizedLevenshtein()); protected static final HapiStringSimilarity LEVENSCHTEIN = new HapiStringSimilarity(new NormalizedLevenshtein());
private static final HapiStringSimilarity SORENSEN_DICE = new HapiStringSimilarity(new SorensenDice()); protected static final HapiStringSimilarity SORENSEN_DICE = new HapiStringSimilarity(new SorensenDice());
@Test @Test
public void testSlomon() { public void testSlomon() {