added numeric matcher (#2547)

* added numeric matcher

* changelog

* performance optimization

* fix test
This commit is contained in:
Ken Stevens 2021-04-14 13:15:30 -04:00 committed by GitHub
parent 4cd0409bae
commit 550602b2f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 110 additions and 28 deletions

View File

@ -0,0 +1,18 @@
package ca.uhn.fhir.context.phonetic;
import com.google.common.base.CharMatcher;
/**
 * Encoder that strips everything except the ASCII digits {@code 0}-{@code 9} from its input.
 * Useful for numerical identifiers like phone numbers, address parts etc.
 * <p>
 * This should not be used where decimals are important. A new "quantity encoder" should be
 * added to handle cases like that.
 */
public class NumericEncoder implements IPhoneticEncoder {
	/** Matches the characters '0' through '9'; shared by every call to {@link #encode(String)}. */
	private static final CharMatcher DIGITS_ONLY = CharMatcher.inRange('0', '9');

	/**
	 * @return the fixed identifier of this encoder, {@code "NUMERIC"}
	 */
	@Override
	public String name() {
		return "NUMERIC";
	}

	/**
	 * Keeps the digits of the supplied string, in their original order, and drops the rest.
	 *
	 * @param theString the raw value to encode
	 * @return the digits contained in {@code theString}
	 */
	@Override
	public String encode(String theString) {
		return DIGITS_ONLY.retainFrom(theString);
	}
}

View File

@ -39,7 +39,8 @@ public enum PhoneticEncoderEnum {
METAPHONE(new ApacheEncoder("METAPHONE", new Metaphone())),
NYSIIS(new ApacheEncoder("NYSIIS", new Nysiis())),
REFINED_SOUNDEX(new ApacheEncoder("REFINED_SOUNDEX", new RefinedSoundex())),
SOUNDEX(new ApacheEncoder("SOUNDEX", new Soundex()));
SOUNDEX(new ApacheEncoder("SOUNDEX", new Soundex())),
NUMERIC(new NumericEncoder());
private final IPhoneticEncoder myPhoneticEncoder;

View File

@ -1,14 +1,14 @@
package ca.uhn.fhir.context.phonetic;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.hamcrest.Matchers.startsWith;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.endsWith;
import static org.hamcrest.Matchers.startsWith;
import static org.junit.jupiter.api.Assertions.assertEquals;
class PhoneticEncoderTest {
private static final Logger ourLog = LoggerFactory.getLogger(PhoneticEncoderTest.class);
@ -23,7 +23,11 @@ class PhoneticEncoderTest {
public void testEncodeAddress(PhoneticEncoderEnum thePhoneticEncoderEnum) {
String encoded = thePhoneticEncoderEnum.getPhoneticEncoder().encode(ADDRESS_LINE);
ourLog.info("{}: {}", thePhoneticEncoderEnum.name(), encoded);
assertThat(encoded, startsWith(NUMBER + " "));
assertThat(encoded, endsWith(" " + SUITE));
if (thePhoneticEncoderEnum == PhoneticEncoderEnum.NUMERIC) {
assertEquals(NUMBER + SUITE, encoded);
} else {
assertThat(encoded, startsWith(NUMBER + " "));
assertThat(encoded, endsWith(" " + SUITE));
}
}
}

View File

@ -0,0 +1,5 @@
---
type: add
issue: 2547
title: "Added a new NUMERIC MDM matcher for matching phone numbers. Also added a NUMERIC phonetic encoder to support
adding NUMERIC-encoded search parameters (e.g. when MDM candidate searching needs to match on phone numbers)."

View File

@ -292,10 +292,10 @@ The following algorithms are currently supported:
<td>Gail = Gael, Gail != Gale, Thomas != Tom</td>
</tr>
<tr>
<td>CAVERPHONE1</td>
<td>CAVERPHONE2</td>
<td>matcher</td>
<td>
<a href="https://commons.apache.org/proper/commons-codec/apidocs/org/apache/commons/codec/language/Caverphone1.html">Apache Caverphone1</a>
<a href="https://commons.apache.org/proper/commons-codec/apidocs/org/apache/commons/codec/language/Caverphone2.html">Apache Caverphone2</a>
</td>
<td>Gail = Gael, Gail = Gale, Thomas != Tom</td>
</tr>
@ -379,6 +379,14 @@ The following algorithms are currently supported:
</td>
<td>2019-12,Month = 2019-12-19,Day</td>
</tr>
<tr>
<td>NUMERIC</td>
<td>matcher</td>
<td>
Remove all non-numeric characters from the string before comparing.
</td>
<td>4169671111 = (416) 967-1111</td>
</tr>
<tr>
<td>NAME_ANY_ORDER</td>
<td>matcher</td>

View File

@ -1,12 +1,13 @@
package ca.uhn.fhir.jpa.dao.dstu3;
import ca.uhn.fhir.context.phonetic.ApacheEncoder;
import ca.uhn.fhir.context.phonetic.NumericEncoder;
import ca.uhn.fhir.context.phonetic.PhoneticEncoderEnum;
import ca.uhn.fhir.jpa.api.config.DaoConfig;
import ca.uhn.fhir.jpa.model.entity.ResourceIndexedSearchParamString;
import ca.uhn.fhir.jpa.searchparam.SearchParameterMap;
import ca.uhn.fhir.rest.server.util.ISearchParamRegistry;
import ca.uhn.fhir.rest.param.StringParam;
import ca.uhn.fhir.rest.server.util.ISearchParamRegistry;
import ca.uhn.fhir.util.HapiExtensions;
import org.apache.commons.codec.language.Soundex;
import org.hl7.fhir.dstu3.model.Enumerations;
@ -35,10 +36,14 @@ public class FhirResourceDaoDstu3PhoneticSearchNoFtTest extends BaseJpaDstu3Test
public static final String GAIL = "Gail";
public static final String NAME_SOUNDEX_SP = "nameSoundex";
public static final String ADDRESS_LINE_SOUNDEX_SP = "addressLineSoundex";
public static final String PHONE_NUMBER_SP = "phoneNumber";
private static final String BOB = "BOB";
private static final String ADDRESS = "123 Nohili St";
private static final String ADDRESS_CLOSE = "123 Nohily St";
private static final String ADDRESS_FAR = "123 College St";
private static final String PHONE = "4169671111";
private static final String PHONE_CLOSE = "(416) 967-1111";
private static final String PHONE_FAR = "416 421 0421";
@Autowired
ISearchParamRegistry mySearchParamRegistry;
@ -49,8 +54,9 @@ public class FhirResourceDaoDstu3PhoneticSearchNoFtTest extends BaseJpaDstu3Test
myDaoConfig.setReuseCachedSearchResultsForMillis(null);
myDaoConfig.setFetchSizeDefaultMaximum(new DaoConfig().getFetchSizeDefaultMaximum());
createSoundexSearchParameter(NAME_SOUNDEX_SP, PhoneticEncoderEnum.SOUNDEX, "Patient.name");
createSoundexSearchParameter(ADDRESS_LINE_SOUNDEX_SP, PhoneticEncoderEnum.SOUNDEX, "Patient.address.line");
createPhoneticSearchParameter(NAME_SOUNDEX_SP, PhoneticEncoderEnum.SOUNDEX, "Patient.name");
createPhoneticSearchParameter(ADDRESS_LINE_SOUNDEX_SP, PhoneticEncoderEnum.SOUNDEX, "Patient.address.line");
createPhoneticSearchParameter(PHONE_NUMBER_SP, PhoneticEncoderEnum.NUMERIC, "Patient.telecom");
mySearchParamRegistry.forceRefresh();
mySearchParamRegistry.setPhoneticEncoder(new ApacheEncoder(PhoneticEncoderEnum.SOUNDEX.name(), new Soundex()));
}
@ -70,6 +76,15 @@ public class FhirResourceDaoDstu3PhoneticSearchNoFtTest extends BaseJpaDstu3Test
ourLog.info("Encoded address: {}", soundex.encode(ADDRESS));
}
@Test
public void testNumeric() {
	// Encode each sample once up front, then compare the encoded forms
	NumericEncoder encoder = new NumericEncoder();
	String encodedPhone = encoder.encode(PHONE);
	String encodedPhoneClose = encoder.encode(PHONE_CLOSE);
	String encodedPhoneFar = encoder.encode(PHONE_FAR);

	// The formatted and the plain number must both reduce to the plain number
	assertEquals(PHONE, encodedPhoneClose);
	assertEquals(PHONE, encodedPhone);
	assertEquals(encodedPhone, encodedPhoneClose);

	// A different number must not produce the same encoding
	assertNotEquals(encodedPhone, encodedPhoneFar);
}
@Test
public void phoneticMatch() {
Patient patient;
@ -77,15 +92,16 @@ public class FhirResourceDaoDstu3PhoneticSearchNoFtTest extends BaseJpaDstu3Test
patient = new Patient();
patient.addName().addGiven(GALE);
patient.addAddress().addLine(ADDRESS);
patient.addTelecom().setValue(PHONE);
ourLog.info(myFhirCtx.newJsonParser().setPrettyPrint(true).encodeResourceToString(patient));
IIdType pId = myPatientDao.create(patient, mySrd).getId().toUnqualifiedVersionless();
List<ResourceIndexedSearchParamString> stringParams = myResourceIndexedSearchParamStringDao.findAll();
assertThat(stringParams, hasSize(6));
assertThat(stringParams, hasSize(7));
List<String> stringParamNames = stringParams.stream().map(ResourceIndexedSearchParamString::getParamName).collect(Collectors.toList());
assertThat(stringParamNames, containsInAnyOrder(Patient.SP_NAME, Patient.SP_GIVEN, Patient.SP_PHONETIC, NAME_SOUNDEX_SP, Patient.SP_ADDRESS, ADDRESS_LINE_SOUNDEX_SP));
assertThat(stringParamNames, containsInAnyOrder(Patient.SP_NAME, Patient.SP_GIVEN, Patient.SP_PHONETIC, NAME_SOUNDEX_SP, Patient.SP_ADDRESS, ADDRESS_LINE_SOUNDEX_SP, PHONE_NUMBER_SP));
assertSearchMatch(pId, Patient.SP_PHONETIC, GALE);
assertSearchMatch(pId, Patient.SP_PHONETIC, GAIL);
@ -98,6 +114,10 @@ public class FhirResourceDaoDstu3PhoneticSearchNoFtTest extends BaseJpaDstu3Test
assertSearchMatch(pId, ADDRESS_LINE_SOUNDEX_SP, ADDRESS);
assertSearchMatch(pId, ADDRESS_LINE_SOUNDEX_SP, ADDRESS_CLOSE);
assertNoMatch(ADDRESS_LINE_SOUNDEX_SP, ADDRESS_FAR);
assertSearchMatch(pId, PHONE_NUMBER_SP, PHONE);
assertSearchMatch(pId, PHONE_NUMBER_SP, PHONE_CLOSE);
assertNoMatch(PHONE_NUMBER_SP, PHONE_FAR);
}
private void assertSearchMatch(IIdType thePId1, String theSp, String theValue) {
@ -114,7 +134,7 @@ public class FhirResourceDaoDstu3PhoneticSearchNoFtTest extends BaseJpaDstu3Test
assertThat(toUnqualifiedVersionlessIdValues(myPatientDao.search(map)), hasSize(0));
}
private void createSoundexSearchParameter(String theCode, PhoneticEncoderEnum theEncoder, String theFhirPath) {
private void createPhoneticSearchParameter(String theCode, PhoneticEncoderEnum theEncoder, String theFhirPath) {
SearchParameter searchParameter = new SearchParameter();
searchParameter.addBase("Patient");
searchParameter.setCode(theCode);

View File

@ -51,7 +51,8 @@ public enum MdmMatcherEnum {
IDENTIFIER(new IdentifierMatcher()),
EMPTY_FIELD(new EmptyFieldMatcher()),
EXTENSION_ANY_ORDER(new ExtensionMatcher());
EXTENSION_ANY_ORDER(new ExtensionMatcher()),
NUMERIC(new HapiStringMatcher(new NumericMatcher()));
private final IMdmFieldMatcher myMdmFieldMatcher;

View File

@ -0,0 +1,16 @@
package ca.uhn.fhir.mdm.rules.matcher;
import ca.uhn.fhir.context.phonetic.NumericEncoder;
/**
 * String matcher that runs both values through a {@link NumericEncoder} and compares the results.
 * Useful for numerical identifiers like phone numbers, address parts etc.
 * <p>
 * This should not be used where decimals are important. A new "quantity matcher" should be
 * added to handle cases like that.
 */
public class NumericMatcher implements IMdmStringMatcher {
	private final NumericEncoder myNumericEncoder = new NumericEncoder();

	/**
	 * @param theLeftString  the first value to compare
	 * @param theRightString the second value to compare
	 * @return {@code true} when both values have equal encoded forms
	 */
	@Override
	public boolean matches(String theLeftString, String theRightString) {
		return myNumericEncoder.encode(theLeftString).equals(myNumericEncoder.encode(theRightString));
	}
}

View File

@ -14,24 +14,33 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
public class StringMatcherR4Test extends BaseMatcherR4Test {
private static final Logger ourLog = LoggerFactory.getLogger(StringMatcherR4Test.class);
public static final String LEFT = "namadega";
public static final String RIGHT = "namaedga";
public static final String LEFT_NAME = "namadega";
public static final String RIGHT_NAME = "namaedga";
@Test
public void testNamadega() {
assertTrue(match(MdmMatcherEnum.COLOGNE, LEFT, RIGHT));
assertTrue(match(MdmMatcherEnum.DOUBLE_METAPHONE, LEFT, RIGHT));
assertTrue(match(MdmMatcherEnum.MATCH_RATING_APPROACH, LEFT, RIGHT));
assertTrue(match(MdmMatcherEnum.METAPHONE, LEFT, RIGHT));
assertTrue(match(MdmMatcherEnum.SOUNDEX, LEFT, RIGHT));
assertTrue(match(MdmMatcherEnum.METAPHONE, LEFT, RIGHT));
String left = LEFT_NAME;
String right = RIGHT_NAME;
assertTrue(match(MdmMatcherEnum.COLOGNE, left, right));
assertTrue(match(MdmMatcherEnum.DOUBLE_METAPHONE, left, right));
assertTrue(match(MdmMatcherEnum.MATCH_RATING_APPROACH, left, right));
assertTrue(match(MdmMatcherEnum.METAPHONE, left, right));
assertTrue(match(MdmMatcherEnum.SOUNDEX, left, right));
assertTrue(match(MdmMatcherEnum.METAPHONE, left, right));
assertFalse(match(MdmMatcherEnum.CAVERPHONE1, LEFT, RIGHT));
assertFalse(match(MdmMatcherEnum.CAVERPHONE2, LEFT, RIGHT));
assertFalse(match(MdmMatcherEnum.NYSIIS, LEFT, RIGHT));
assertFalse(match(MdmMatcherEnum.REFINED_SOUNDEX, LEFT, RIGHT));
assertFalse(match(MdmMatcherEnum.STRING, LEFT, RIGHT));
assertFalse(match(MdmMatcherEnum.SUBSTRING, LEFT, RIGHT));
assertFalse(match(MdmMatcherEnum.CAVERPHONE1, left, right));
assertFalse(match(MdmMatcherEnum.CAVERPHONE2, left, right));
assertFalse(match(MdmMatcherEnum.NYSIIS, left, right));
assertFalse(match(MdmMatcherEnum.REFINED_SOUNDEX, left, right));
assertFalse(match(MdmMatcherEnum.STRING, left, right));
assertFalse(match(MdmMatcherEnum.SUBSTRING, left, right));
}
@Test
public void testNumeric() {
	String plainPhone = "4169671111";
	String formattedPhone = "(416) 967-1111";
	String formattedPhoneWithExtension = "(416) 967-1111x123";

	// Same number, different punctuation: expected to match
	assertTrue(match(MdmMatcherEnum.NUMERIC, plainPhone, formattedPhone));
	// First digit differs: expected not to match
	assertFalse(match(MdmMatcherEnum.NUMERIC, "5169671111", formattedPhone));
	// Trailing extension adds extra digits: expected not to match
	assertFalse(match(MdmMatcherEnum.NUMERIC, plainPhone, formattedPhoneWithExtension));
}
@Test