diff --git a/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/changelog/6_8_0/5080-new-similarity-algorithm-to-handle-numeric-normalization.yaml b/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/changelog/6_8_0/5080-new-similarity-algorithm-to-handle-numeric-normalization.yaml new file mode 100644 index 00000000000..6219850a3af --- /dev/null +++ b/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/changelog/6_8_0/5080-new-similarity-algorithm-to-handle-numeric-normalization.yaml @@ -0,0 +1,7 @@ +--- +type: add +issue: 5080 +title: "Extended the existing MDM similarity algorithms to numeric values such that the input is normalized +by removing all non-numeric characters from the string before the similarity algorithm is applied. This can +be useful when wanting to measure similarity between identifying numbers or phone numbers where dashes or +other special separating characters may be used." diff --git a/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/docs/server_jpa_mdm/mdm_rules.md b/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/docs/server_jpa_mdm/mdm_rules.md index 85a43e30ac6..0a56dea7a22 100644 --- a/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/docs/server_jpa_mdm/mdm_rules.md +++ b/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/docs/server_jpa_mdm/mdm_rules.md @@ -616,6 +616,46 @@ The following algorithms are currently supported: + + NUMERIC_JARO_WINKLER + similarity + + Removes all non-numeric characters before applying tdebatty Jaro Winkler + + + + + NUMERIC_COSINE + similarity + + Removes all non-numeric characters before applying tdebatty Cosine Similarity + + + + + NUMERIC_JACCARD + similarity + + Removes all non-numeric characters before applying tdebatty Jaccard Index + + + + + NUMERIC_LEVENSCHTEIN + similarity + + Removes all non-numeric characters before applying tdebatty Normalized Levenshtein + + + + + NUMERIC_SORENSEN_DICE + similarity + + Removes all non-numeric characters before applying tdebatty Sorensen-Dice coefficient 
+ + + diff --git a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiNumericSimilarity.java b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiNumericSimilarity.java new file mode 100644 index 00000000000..96aa0f0c5ec --- /dev/null +++ b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiNumericSimilarity.java @@ -0,0 +1,30 @@ +package ca.uhn.fhir.mdm.rules.similarity; + +import ca.uhn.fhir.context.FhirContext; +import ca.uhn.fhir.context.phonetic.NumericEncoder; +import ca.uhn.fhir.mdm.rules.matcher.util.StringMatcherUtils; +import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity; +import org.hl7.fhir.instance.model.api.IBase; +import org.hl7.fhir.instance.model.api.IPrimitiveType; + /** Similarity measure for two IBase fields that passes each extracted string value through a NumericEncoder and then applies the wrapped NormalizedStringSimilarity to the encoded values. Per the changelog entry accompanying this class, the intent is to remove all non-numeric characters (for example dashes in identifiers or phone numbers) before the similarity algorithm runs. */ +public class HapiNumericSimilarity extends HapiStringSimilarity { + private final NumericEncoder encoder = new NumericEncoder(); + + public HapiNumericSimilarity(NormalizedStringSimilarity theStringSimilarity) { + super(theStringSimilarity); + } + /** Returns the wrapped similarity of the two encoded strings when both arguments are IPrimitiveType instances, and 0.0 otherwise. theFhirContext is not used by this implementation; theExact is forwarded unchanged to StringMatcherUtils.extractString. NOTE(review): two inputs that contain no digits presumably both encode to an empty string, so the result then depends on how the wrapped algorithm scores two empty strings - confirm this is acceptable. */ + @Override + public double similarity(FhirContext theFhirContext, IBase theLeftBase, IBase theRightBase, boolean theExact) { + /* Only primitive-typed values are compared; any other combination falls through to the 0.0 return below. */ + if (theLeftBase instanceof IPrimitiveType && theRightBase instanceof IPrimitiveType) { + String leftString = + encoder.encode(StringMatcherUtils.extractString((IPrimitiveType) theLeftBase, theExact)); + String rightString = + encoder.encode(StringMatcherUtils.extractString((IPrimitiveType) theRightBase, theExact)); + + return myStringSimilarity.similarity(leftString, rightString); + } + return 0.0; + } +} diff --git a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiStringSimilarity.java b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiStringSimilarity.java index c5e50effeae..1a9e0f23d36 100644 --- a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiStringSimilarity.java +++ 
b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiStringSimilarity.java @@ -29,7 +29,7 @@ import org.hl7.fhir.instance.model.api.IPrimitiveType; * Similarity measure for two IBase fields whose similarity can be measured by their String representations. */ public class HapiStringSimilarity implements IMdmFieldSimilarity { - private final NormalizedStringSimilarity myStringSimilarity; + protected final NormalizedStringSimilarity myStringSimilarity; public HapiStringSimilarity(NormalizedStringSimilarity theStringSimilarity) { myStringSimilarity = theStringSimilarity; diff --git a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/MdmSimilarityEnum.java b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/MdmSimilarityEnum.java index c62bd4ed355..a1460c2ff68 100644 --- a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/MdmSimilarityEnum.java +++ b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/MdmSimilarityEnum.java @@ -35,7 +35,12 @@ public enum MdmSimilarityEnum { COSINE(new HapiStringSimilarity(new Cosine())), JACCARD(new HapiStringSimilarity(new Jaccard())), LEVENSCHTEIN(new HapiStringSimilarity(new NormalizedLevenshtein())), - SORENSEN_DICE(new HapiStringSimilarity(new SorensenDice())); + SORENSEN_DICE(new HapiStringSimilarity(new SorensenDice())), + NUMERIC_JARO_WINKLER(new HapiNumericSimilarity(new JaroWinkler())), + NUMERIC_COSINE(new HapiNumericSimilarity(new Cosine())), + NUMERIC_JACCARD(new HapiNumericSimilarity(new Jaccard())), + NUMERIC_LEVENSCHTEIN(new HapiNumericSimilarity(new NormalizedLevenshtein())), + NUMERIC_SORENSEN_DICE(new HapiNumericSimilarity(new SorensenDice())); private final IMdmFieldSimilarity myMdmFieldSimilarity; diff --git a/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/NumericSimilarityR4Test.java b/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/NumericSimilarityR4Test.java new file mode 100644 index 
00000000000..77c15407ea0 --- /dev/null +++ b/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/NumericSimilarityR4Test.java @@ -0,0 +1,58 @@ +package ca.uhn.fhir.mdm.rules.matcher; + +import ca.uhn.fhir.mdm.rules.similarity.HapiNumericSimilarity; +import ca.uhn.fhir.mdm.rules.similarity.HapiStringSimilarity; +import ca.uhn.fhir.mdm.rules.similarity.IMdmFieldSimilarity; +import info.debatty.java.stringsimilarity.Cosine; +import info.debatty.java.stringsimilarity.Jaccard; +import info.debatty.java.stringsimilarity.JaroWinkler; +import info.debatty.java.stringsimilarity.NormalizedLevenshtein; +import info.debatty.java.stringsimilarity.SorensenDice; +import org.hl7.fhir.r4.model.StringType; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; + /** Parameterized tests for HapiNumericSimilarity wrapped around each of the five string similarity algorithms (Jaro Winkler, Cosine, Jaccard, Normalized Levenshtein, Sorensen-Dice). Extends StringSimilarityR4Test so the plain string similarity constants declared there can be used as the reference values. */ +public class NumericSimilarityR4Test extends StringSimilarityR4Test { + private static final HapiStringSimilarity NUMERIC_JARO_WINKLER = new HapiNumericSimilarity(new JaroWinkler()); + private static final HapiStringSimilarity NUMERIC_COSINE = new HapiNumericSimilarity(new Cosine()); + private static final HapiStringSimilarity NUMERIC_JACCARD = new HapiNumericSimilarity(new Jaccard()); + private static final HapiStringSimilarity NUMERIC_LEVENSCHTEIN = new HapiNumericSimilarity(new NormalizedLevenshtein()); + private static final HapiStringSimilarity NUMERIC_SORENSEN_DICE = new HapiNumericSimilarity(new SorensenDice()); + + /** Each CSV row holds two values that differ only in their non-digit characters, so every numeric similarity is expected to report exactly 1.0. NOTE(review): the last row ends with a trailing comma, which yields an extra empty column - confirm the JUnit version in use ignores surplus columns. */ + @ParameterizedTest + @CsvSource({ + "123-45-6789, 123456789", + "1234-5-6789, 123456789", + "abc123, 123", + "(416) 967-1111, 4169671111," + }) + public void testNumericSimilarity_withExactMatches(String theLeft, String theRight) { + assertEquals(1.0, similarity(NUMERIC_JARO_WINKLER, theLeft, theRight)); + assertEquals(1.0, similarity(NUMERIC_COSINE, theLeft, theRight)); + assertEquals(1.0, similarity(NUMERIC_JACCARD, theLeft, theRight)); + assertEquals(1.0, 
similarity(NUMERIC_LEVENSCHTEIN, theLeft, theRight)); + assertEquals(1.0, similarity(NUMERIC_SORENSEN_DICE, theLeft, theRight)); + } + /** In each CSV row theLeft is theLeftWithNonNumerics with its non-digit characters removed. The test asserts that the numeric similarity of (theLeftWithNonNumerics, theRight) equals the plain string similarity of (theLeft, theRight) for every algorithm. NOTE(review): the last row ends with a trailing comma - confirm the surplus empty column is tolerated. */ + @ParameterizedTest + @CsvSource({ + "123546789, 123-54-6789, 123456789", + "123456789, 1234-5-6789, 123456789", + "321, abc321, 123", + "1231231234, (123) 123-1234, 1231234321," + }) + public void testNumericSimilarity_givesSameResultAsStringSimilarity(String theLeft, String theLeftWithNonNumerics, String theRight) { + assertEquals(similarity(JARO_WINKLER, theLeft, theRight), similarity(NUMERIC_JARO_WINKLER, theLeftWithNonNumerics, theRight)); + assertEquals(similarity(COSINE, theLeft, theRight), similarity(NUMERIC_COSINE, theLeftWithNonNumerics, theRight)); + assertEquals(similarity(JACCARD, theLeft, theRight), similarity(NUMERIC_JACCARD, theLeftWithNonNumerics, theRight)); + assertEquals(similarity(LEVENSCHTEIN, theLeft, theRight), similarity(NUMERIC_LEVENSCHTEIN, theLeftWithNonNumerics, theRight)); + assertEquals(similarity(SORENSEN_DICE, theLeft, theRight), similarity(NUMERIC_SORENSEN_DICE, theLeftWithNonNumerics, theRight)); + } + /** Helper that wraps both strings in StringType and evaluates the given similarity with theExact set to false. ourFhirContext is not declared in this class; presumably it is inherited from the test base class - verify. */ + private double similarity(IMdmFieldSimilarity theSimilarity, String theLeft, String theRight) { + return theSimilarity.similarity(ourFhirContext, new StringType(theLeft), new StringType(theRight), false); + } +} diff --git a/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/StringSimilarityR4Test.java b/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/StringSimilarityR4Test.java index c806be9b99e..9fa76bb4a02 100644 --- a/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/StringSimilarityR4Test.java +++ b/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/StringSimilarityR4Test.java @@ -17,11 +17,11 @@ public class StringSimilarityR4Test extends BaseMatcherR4Test { public static final String LEFT = "somon"; public static final String RIGHT = "slomon"; - private static final HapiStringSimilarity JARO_WINKLER = new HapiStringSimilarity(new 
JaroWinkler()); - private static final HapiStringSimilarity COSINE = new HapiStringSimilarity(new Cosine()); - private static final HapiStringSimilarity JACCARD = new HapiStringSimilarity(new Jaccard()); - private static final HapiStringSimilarity LEVENSCHTEIN = new HapiStringSimilarity(new NormalizedLevenshtein()); - private static final HapiStringSimilarity SORENSEN_DICE = new HapiStringSimilarity(new SorensenDice()); + protected static final HapiStringSimilarity JARO_WINKLER = new HapiStringSimilarity(new JaroWinkler()); + protected static final HapiStringSimilarity COSINE = new HapiStringSimilarity(new Cosine()); + protected static final HapiStringSimilarity JACCARD = new HapiStringSimilarity(new Jaccard()); + protected static final HapiStringSimilarity LEVENSCHTEIN = new HapiStringSimilarity(new NormalizedLevenshtein()); + protected static final HapiStringSimilarity SORENSEN_DICE = new HapiStringSimilarity(new SorensenDice()); @Test public void testSlomon() {