From e565b1c94873bc0dbe7adacfc3ecbfd6b7dca425 Mon Sep 17 00:00:00 2001
From: Ken Stevens
Date: Fri, 10 Jul 2020 15:08:50 -0400
Subject: [PATCH] done (#1976)

---
Review notes (not part of the commit message):
 - skip the empty first part that split() emits when the input starts with a separator
 - hand a null string to the wrapped encoder instead of throwing NullPointerException
 - compile the separator regex once; drop the unused Test import from the new test
 - the blob ids on the "index" lines below are those of the original revision of this patch

 .../fhir/context/phonetic/ApacheEncoder.java  | 61 +++++++++++++++++++
 .../context/phonetic/PhoneticEncoderTest.java | 41 +++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 hapi-fhir-base/src/test/java/ca/uhn/fhir/context/phonetic/PhoneticEncoderTest.java

diff --git a/hapi-fhir-base/src/main/java/ca/uhn/fhir/context/phonetic/ApacheEncoder.java b/hapi-fhir-base/src/main/java/ca/uhn/fhir/context/phonetic/ApacheEncoder.java
index 11bd84abf0d..2f2c11838df 100644
--- a/hapi-fhir-base/src/main/java/ca/uhn/fhir/context/phonetic/ApacheEncoder.java
+++ b/hapi-fhir-base/src/main/java/ca/uhn/fhir/context/phonetic/ApacheEncoder.java
@@ -22,9 +22,15 @@ package ca.uhn.fhir.context.phonetic;
 
 import org.apache.commons.codec.EncoderException;
 import org.apache.commons.codec.StringEncoder;
+import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.util.StringJoiner;
+import java.util.regex.Pattern;
+
 public class ApacheEncoder implements IPhoneticEncoder {
 	private static final Logger ourLog = LoggerFactory.getLogger(ApacheEncoder.class);
+	// Separates the parts of a multi-part string: any run of whitespace or other non-word characters
+	private static final Pattern ourPartSeparator = Pattern.compile("[\\s\\W]+");
 
@@ -44,10 +50,65 @@ public class ApacheEncoder implements IPhoneticEncoder {
 	@Override
 	public String encode(String theString) {
 		try {
+			// If the string contains a space, encode alpha parts separately so, for example, numbers are preserved in address lines.
+			// A null string is handed to the wrapped encoder untouched rather than dereferenced here.
+			if (theString != null && theString.contains(" ")) {
+				return encodeStringWithSpaces(theString);
+			}
 			return myStringEncoder.encode(theString);
 		} catch (EncoderException e) {
 			ourLog.error("Failed to encode string " + theString, e);
 			return theString;
 		}
 	}
+
+	/**
+	 * Encodes a multi-part string. Consecutive alphabetic parts are encoded together as one string,
+	 * while every other part (e.g. a street number) is copied to the output unchanged.
+	 *
+	 * @param theString the string to encode; parts are separated by whitespace or other non-word characters
+	 * @return the encoded and the preserved parts, in their original order, joined by single spaces
+	 * @throws EncoderException if the wrapped encoder fails on a run of alphabetic parts
+	 */
+	private String encodeStringWithSpaces(String theString) throws EncoderException {
+		StringJoiner joiner = new StringJoiner(" ");
+
+		// Collects the current run of consecutive alpha parts
+		StringJoiner alphaJoiner = new StringJoiner(" ");
+
+		for (String part : ourPartSeparator.split(theString)) {
+			if (part.isEmpty()) {
+				// A leading separator makes split() emit an empty first part; it carries no content
+				continue;
+			}
+			if (StringUtils.isAlpha(part)) {
+				alphaJoiner.add(part);
+			} else {
+				// Once we hit a non-alpha part, encode all the alpha parts together as a single string
+				// This is to allow encoders like METAPHONE to match Hans Peter to Hanspeter
+				alphaJoiner = encodeAlphaParts(joiner, alphaJoiner);
+				joiner.add(part);
+			}
+		}
+		encodeAlphaParts(joiner, alphaJoiner);
+
+		return joiner.toString();
+	}
+
+	/**
+	 * Encodes the pending alpha parts, if any, as a single string and appends the result to the output.
+	 *
+	 * @param theJoiner the output being assembled
+	 * @param theAlphaJoiner the pending run of alpha parts
+	 * @return the joiner to collect the next run into: a fresh one if the pending run was encoded,
+	 * otherwise the given (still empty) one
+	 * @throws EncoderException if the wrapped encoder fails
+	 */
+	private StringJoiner encodeAlphaParts(StringJoiner theJoiner, StringJoiner theAlphaJoiner) throws EncoderException {
+		if (theAlphaJoiner.length() == 0) {
+			return theAlphaJoiner;
+		}
+		theJoiner.add(myStringEncoder.encode(theAlphaJoiner.toString()));
+		return new StringJoiner(" ");
+	}
 }
diff --git a/hapi-fhir-base/src/test/java/ca/uhn/fhir/context/phonetic/PhoneticEncoderTest.java b/hapi-fhir-base/src/test/java/ca/uhn/fhir/context/phonetic/PhoneticEncoderTest.java
new file mode 100644
index 00000000000..bca150978cb
--- /dev/null
+++ b/hapi-fhir-base/src/test/java/ca/uhn/fhir/context/phonetic/PhoneticEncoderTest.java
@@ -0,0 +1,41 @@
+package ca.uhn.fhir.context.phonetic;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.endsWith;
+import static org.hamcrest.Matchers.startsWith;
+
+/**
+ * Checks that every {@link PhoneticEncoderEnum} encoder leaves the numeric parts of an address line intact.
+ */
+class PhoneticEncoderTest {
+	private static final Logger ourLog = LoggerFactory.getLogger(PhoneticEncoderTest.class);
+
+	private static final String NUMBER = "123";
+	private static final String STREET = "Nohili St, Suite";
+	private static final String SUITE = "456";
+	private static final String ADDRESS_LINE = NUMBER + " " + STREET + " " + SUITE;
+
+	@ParameterizedTest
+	@EnumSource(PhoneticEncoderEnum.class)
+	public void testEncodeAddress(PhoneticEncoderEnum thePhoneticEncoderEnum) {
+		String encoded = thePhoneticEncoderEnum.getPhoneticEncoder().encode(ADDRESS_LINE);
+		ourLog.info("{}: {}", thePhoneticEncoderEnum.name(), encoded);
+		assertThat(encoded, startsWith(NUMBER + " "));
+		assertThat(encoded, endsWith(" " + SUITE));
+	}
+
+	@ParameterizedTest
+	@EnumSource(PhoneticEncoderEnum.class)
+	public void testEncodeAddressWithLeadingSpace(PhoneticEncoderEnum thePhoneticEncoderEnum) {
+		String encoded = thePhoneticEncoderEnum.getPhoneticEncoder().encode(" " + ADDRESS_LINE);
+		ourLog.info("{}: {}", thePhoneticEncoderEnum.name(), encoded);
+		// The leading separator must not leak into the output as an empty first part
+		assertThat(encoded, startsWith(NUMBER + " "));
+		assertThat(encoded, endsWith(" " + SUITE));
+	}
+}