diff --git a/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/changelog/6_8_0/5080-new-similarity-algorithm-to-handle-numeric-normalization.yaml b/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/changelog/6_8_0/5080-new-similarity-algorithm-to-handle-numeric-normalization.yaml
new file mode 100644
index 00000000000..6219850a3af
--- /dev/null
+++ b/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/changelog/6_8_0/5080-new-similarity-algorithm-to-handle-numeric-normalization.yaml
@@ -0,0 +1,7 @@
+---
+type: add
+issue: 5080
+title: "Added numeric variants of the existing MDM similarity algorithms (NUMERIC_JARO_WINKLER, NUMERIC_COSINE,
+NUMERIC_JACCARD, NUMERIC_LEVENSCHTEIN and NUMERIC_SORENSEN_DICE). These normalize each input by removing all
+non-numeric characters from the string before the similarity algorithm is applied. This is useful when comparing
+identifying numbers or phone numbers, where dashes or other separator characters may be used."
diff --git a/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/docs/server_jpa_mdm/mdm_rules.md b/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/docs/server_jpa_mdm/mdm_rules.md
index 85a43e30ac6..0a56dea7a22 100644
--- a/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/docs/server_jpa_mdm/mdm_rules.md
+++ b/hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/docs/server_jpa_mdm/mdm_rules.md
@@ -616,6 +616,46 @@ The following algorithms are currently supported:
|
+    <tr>
+        <td>NUMERIC_JARO_WINKLER</td>
+        <td>similarity</td>
+        <td>
+            Removes all non-numeric characters before applying tdebatty Jaro Winkler
+        </td>
+        <td></td>
+    </tr>
+    <tr>
+        <td>NUMERIC_COSINE</td>
+        <td>similarity</td>
+        <td>
+            Removes all non-numeric characters before applying tdebatty Cosine Similarity
+        </td>
+        <td></td>
+    </tr>
+    <tr>
+        <td>NUMERIC_JACCARD</td>
+        <td>similarity</td>
+        <td>
+            Removes all non-numeric characters before applying tdebatty Jaccard Index
+        </td>
+        <td></td>
+    </tr>
+    <tr>
+        <td>NUMERIC_LEVENSCHTEIN</td>
+        <td>similarity</td>
+        <td>
+            Removes all non-numeric characters before applying tdebatty Normalized Levenshtein
+        </td>
+        <td></td>
+    </tr>
+    <tr>
+        <td>NUMERIC_SORENSEN_DICE</td>
+        <td>similarity</td>
+        <td>
+            Removes all non-numeric characters before applying tdebatty Sorensen-Dice coefficient
+        </td>
+        <td></td>
+    </tr>
diff --git a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiNumericSimilarity.java b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiNumericSimilarity.java
new file mode 100644
index 00000000000..96aa0f0c5ec
--- /dev/null
+++ b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiNumericSimilarity.java
@@ -0,0 +1,46 @@
+package ca.uhn.fhir.mdm.rules.similarity;
+
+import ca.uhn.fhir.context.FhirContext;
+import ca.uhn.fhir.context.phonetic.NumericEncoder;
+import ca.uhn.fhir.mdm.rules.matcher.util.StringMatcherUtils;
+import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
+import org.hl7.fhir.instance.model.api.IBase;
+import org.hl7.fhir.instance.model.api.IPrimitiveType;
+
+/**
+ * Similarity measure for two IBase fields that runs each field's string value through a
+ * {@link NumericEncoder} before scoring the pair with the wrapped string similarity algorithm.
+ */
+public class HapiNumericSimilarity extends HapiStringSimilarity {
+    private final NumericEncoder myNumericEncoder = new NumericEncoder();
+
+    /**
+     * @param theStringSimilarity the algorithm used to score the encoded values
+     */
+    public HapiNumericSimilarity(NormalizedStringSimilarity theStringSimilarity) {
+        super(theStringSimilarity);
+    }
+
+    /**
+     * Scores two fields by comparing their numerically encoded string values.
+     *
+     * @param theFhirContext not read by this implementation
+     * @param theLeftBase the first field to compare
+     * @param theRightBase the second field to compare
+     * @param theExact passed through to {@link StringMatcherUtils#extractString}
+     * @return the wrapped algorithm's score, or 0.0 when either field is not an {@link IPrimitiveType}
+     */
+    @Override
+    public double similarity(FhirContext theFhirContext, IBase theLeftBase, IBase theRightBase, boolean theExact) {
+        if (theLeftBase instanceof IPrimitiveType && theRightBase instanceof IPrimitiveType) {
+            String leftString = myNumericEncoder.encode(
+                    StringMatcherUtils.extractString((IPrimitiveType<?>) theLeftBase, theExact));
+            String rightString = myNumericEncoder.encode(
+                    StringMatcherUtils.extractString((IPrimitiveType<?>) theRightBase, theExact));
+
+            return myStringSimilarity.similarity(leftString, rightString);
+        }
+        // At least one side is not a primitive type, so nothing is compared.
+        return 0.0;
+    }
+}
diff --git a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiStringSimilarity.java b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiStringSimilarity.java
index c5e50effeae..1a9e0f23d36 100644
--- a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiStringSimilarity.java
+++ b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/HapiStringSimilarity.java
@@ -29,7 +29,7 @@ import org.hl7.fhir.instance.model.api.IPrimitiveType;
* Similarity measure for two IBase fields whose similarity can be measured by their String representations.
*/
public class HapiStringSimilarity implements IMdmFieldSimilarity {
- private final NormalizedStringSimilarity myStringSimilarity;
+ protected final NormalizedStringSimilarity myStringSimilarity;
public HapiStringSimilarity(NormalizedStringSimilarity theStringSimilarity) {
myStringSimilarity = theStringSimilarity;
diff --git a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/MdmSimilarityEnum.java b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/MdmSimilarityEnum.java
index c62bd4ed355..a1460c2ff68 100644
--- a/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/MdmSimilarityEnum.java
+++ b/hapi-fhir-server-mdm/src/main/java/ca/uhn/fhir/mdm/rules/similarity/MdmSimilarityEnum.java
@@ -35,7 +35,12 @@ public enum MdmSimilarityEnum {
COSINE(new HapiStringSimilarity(new Cosine())),
JACCARD(new HapiStringSimilarity(new Jaccard())),
LEVENSCHTEIN(new HapiStringSimilarity(new NormalizedLevenshtein())),
- SORENSEN_DICE(new HapiStringSimilarity(new SorensenDice()));
+ SORENSEN_DICE(new HapiStringSimilarity(new SorensenDice())),
+ NUMERIC_JARO_WINKLER(new HapiNumericSimilarity(new JaroWinkler())),
+ NUMERIC_COSINE(new HapiNumericSimilarity(new Cosine())),
+ NUMERIC_JACCARD(new HapiNumericSimilarity(new Jaccard())),
+ NUMERIC_LEVENSCHTEIN(new HapiNumericSimilarity(new NormalizedLevenshtein())),
+ NUMERIC_SORENSEN_DICE(new HapiNumericSimilarity(new SorensenDice()));
private final IMdmFieldSimilarity myMdmFieldSimilarity;
diff --git a/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/NumericSimilarityR4Test.java b/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/NumericSimilarityR4Test.java
new file mode 100644
index 00000000000..77c15407ea0
--- /dev/null
+++ b/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/NumericSimilarityR4Test.java
@@ -0,0 +1,63 @@
+package ca.uhn.fhir.mdm.rules.matcher;
+
+import ca.uhn.fhir.mdm.rules.similarity.HapiNumericSimilarity;
+import ca.uhn.fhir.mdm.rules.similarity.HapiStringSimilarity;
+import ca.uhn.fhir.mdm.rules.similarity.IMdmFieldSimilarity;
+import info.debatty.java.stringsimilarity.Cosine;
+import info.debatty.java.stringsimilarity.Jaccard;
+import info.debatty.java.stringsimilarity.JaroWinkler;
+import info.debatty.java.stringsimilarity.NormalizedLevenshtein;
+import info.debatty.java.stringsimilarity.SorensenDice;
+import org.hl7.fhir.r4.model.StringType;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Checks that each numeric similarity variant scores inputs as if their non-numeric characters were absent.
+ */
+public class NumericSimilarityR4Test extends StringSimilarityR4Test {
+    private static final HapiStringSimilarity NUMERIC_JARO_WINKLER = new HapiNumericSimilarity(new JaroWinkler());
+    private static final HapiStringSimilarity NUMERIC_COSINE = new HapiNumericSimilarity(new Cosine());
+    private static final HapiStringSimilarity NUMERIC_JACCARD = new HapiNumericSimilarity(new Jaccard());
+    private static final HapiStringSimilarity NUMERIC_LEVENSCHTEIN = new HapiNumericSimilarity(new NormalizedLevenshtein());
+    private static final HapiStringSimilarity NUMERIC_SORENSEN_DICE = new HapiNumericSimilarity(new SorensenDice());
+
+    /** Each pair differs only in non-numeric characters, so every numeric variant must score it 1.0. */
+    @ParameterizedTest
+    @CsvSource({
+        "123-45-6789, 123456789",
+        "1234-5-6789, 123456789",
+        "abc123, 123",
+        "(416) 967-1111, 4169671111"
+    })
+    public void testNumericSimilarity_withExactMatches(String theLeft, String theRight) {
+        assertEquals(1.0, similarity(NUMERIC_JARO_WINKLER, theLeft, theRight));
+        assertEquals(1.0, similarity(NUMERIC_COSINE, theLeft, theRight));
+        assertEquals(1.0, similarity(NUMERIC_JACCARD, theLeft, theRight));
+        assertEquals(1.0, similarity(NUMERIC_LEVENSCHTEIN, theLeft, theRight));
+        assertEquals(1.0, similarity(NUMERIC_SORENSEN_DICE, theLeft, theRight));
+    }
+
+    /** A numeric variant given theLeftWithNonNumerics must score the same as the plain algorithm given theLeft. */
+    @ParameterizedTest
+    @CsvSource({
+        "123546789, 123-54-6789, 123456789",
+        "123456789, 1234-5-6789, 123456789",
+        "321, abc321, 123",
+        "1231231234, (123) 123-1234, 1231234321"
+    })
+    public void testNumericSimilarity_givesSameResultAsStringSimilarity(String theLeft, String theLeftWithNonNumerics, String theRight) {
+        assertEquals(similarity(JARO_WINKLER, theLeft, theRight), similarity(NUMERIC_JARO_WINKLER, theLeftWithNonNumerics, theRight));
+        assertEquals(similarity(COSINE, theLeft, theRight), similarity(NUMERIC_COSINE, theLeftWithNonNumerics, theRight));
+        assertEquals(similarity(JACCARD, theLeft, theRight), similarity(NUMERIC_JACCARD, theLeftWithNonNumerics, theRight));
+        assertEquals(similarity(LEVENSCHTEIN, theLeft, theRight), similarity(NUMERIC_LEVENSCHTEIN, theLeftWithNonNumerics, theRight));
+        assertEquals(similarity(SORENSEN_DICE, theLeft, theRight), similarity(NUMERIC_SORENSEN_DICE, theLeftWithNonNumerics, theRight));
+    }
+
+    /** Wraps both strings in a StringType and scores them with theSimilarity, passing false for theExact. */
+    private double similarity(IMdmFieldSimilarity theSimilarity, String theLeft, String theRight) {
+        return theSimilarity.similarity(ourFhirContext, new StringType(theLeft), new StringType(theRight), false);
+    }
+}
diff --git a/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/StringSimilarityR4Test.java b/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/StringSimilarityR4Test.java
index c806be9b99e..9fa76bb4a02 100644
--- a/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/StringSimilarityR4Test.java
+++ b/hapi-fhir-server-mdm/src/test/java/ca/uhn/fhir/mdm/rules/matcher/StringSimilarityR4Test.java
@@ -17,11 +17,11 @@ public class StringSimilarityR4Test extends BaseMatcherR4Test {
public static final String LEFT = "somon";
public static final String RIGHT = "slomon";
- private static final HapiStringSimilarity JARO_WINKLER = new HapiStringSimilarity(new JaroWinkler());
- private static final HapiStringSimilarity COSINE = new HapiStringSimilarity(new Cosine());
- private static final HapiStringSimilarity JACCARD = new HapiStringSimilarity(new Jaccard());
- private static final HapiStringSimilarity LEVENSCHTEIN = new HapiStringSimilarity(new NormalizedLevenshtein());
- private static final HapiStringSimilarity SORENSEN_DICE = new HapiStringSimilarity(new SorensenDice());
+ protected static final HapiStringSimilarity JARO_WINKLER = new HapiStringSimilarity(new JaroWinkler());
+ protected static final HapiStringSimilarity COSINE = new HapiStringSimilarity(new Cosine());
+ protected static final HapiStringSimilarity JACCARD = new HapiStringSimilarity(new Jaccard());
+ protected static final HapiStringSimilarity LEVENSCHTEIN = new HapiStringSimilarity(new NormalizedLevenshtein());
+ protected static final HapiStringSimilarity SORENSEN_DICE = new HapiStringSimilarity(new SorensenDice());
@Test
public void testSlomon() {