Change hsearch to consider unquoted parameter strings as prefix match (#4045)

* Change hsearch to consider unquoted parameter strings as prefix match

* Adjust test according to spec (code fixed before).

* Add :text handling to StringParam

* Adjust StringParam tokenization to spec.
Enable StringParam tests.
Add changelog.

* Update hapi-fhir-docs/src/main/resources/ca/uhn/hapi/fhir/changelog/6_2_0/4034-align-text-query-syntax-with-hapi-string-search.yaml

Co-authored-by: michaelabuckley <michael.buckley@smilecdr.com>

* Avoid changing simple query parameters.
Implement suggested tests.

* Consider a few Lucene Simple Query Syntax special cases

* Add search syntax changes to documentation and test to make sure samples are good.

Co-authored-by: juan.marchionatto <juan.marchionatto@smilecdr.com>
Co-authored-by: michaelabuckley <michael.buckley@smilecdr.com>
This commit is contained in:
jmarchionatto 2022-09-21 16:09:09 -04:00 committed by GitHub
parent 392bfb790f
commit 27ecba796b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 346 additions and 15 deletions

View File

@ -0,0 +1,4 @@
---
type: add
issue: 4034
title: "Added support for the `:text` modifier on string search parameters. This corrects an issue when using Elastic/Lucene indexing enabled where prefix match was used instead."

View File

@ -1,4 +1,4 @@
# HAPI FHIR JPA Lucene/Elasticsearch Indexing
**# HAPI FHIR JPA Lucene/Elasticsearch Indexing
The HAPI JPA Server supports optional indexing via Hibernate Search when configured to use Lucene or Elasticsearch.
This is required to support the `_content`, or `_text` search parameters.
@ -15,17 +15,36 @@ The Extended Lucene string search indexing supports the default search, as well
- Default searching matches by prefix, insensitive to case or accents
- `:exact` matches the entire string, matching case and accents
- `:contains` extends the default search to match any substring of the text
- `:text` provides a rich search syntax as using the Simple Query Syntax as defined by
[Lucene](https://lucene.apache.org/core/8_10_1/queryparser/org/apache/lucene/queryparser/simple/SimpleQueryParser.html) and
[Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax).
- `:text` provides a rich search syntax as using a [modified Simple Query Syntax](#modified-simple-query-syntax).
## Token search
The Extended Lucene Indexing supports the default token search by code, system, or system+code,
as well as with the `:text` modifier.
The `:text` modifier provides the same Simple Query Syntax used by string `:text` searches.
The `:text` modifier provides the same modified Simple Query Syntax used by string `:text` searches.
See https://www.hl7.org/fhir/search.html#token.
## Modified Simple Query Syntax
The `:text` search for token and string, Hapi provides a modified version of the Simple Query Syntax provided by
[Lucene](https://lucene.apache.org/core/8_10_1/queryparser/org/apache/lucene/queryparser/simple/SimpleQueryParser.html) and
[Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax).
Terms are delimited by whitespace, or query punctuation `"()|+'`. Literal uses of these characters must be escaped by '\'.
When the query only contains one or more bare terms, they are each converted to a prefix search to match the behaviour of a normal string search.
When multiple terms are present, they must all match (i.e. `AND`). For `OR` behaviour use the `|` operator between terms.
But if any special SQS syntax is active, the query is used as is.
To ensure that the query is used as-is, quote bare terms with the `"` or character. E.g. `without any special syntax characters
Examples:
| Fhir Query String | Executed Query | Matches | No Match | Note |
|-----------------|------------------|-------------|----------------|--------------------------------------------|
| Smit | Smit* | John Smith | John Smi | |
| Jo Smit | Jo* Smit* | John Smith | John Frank | Multiple bare terms are `AND` |
| frank &vert; john | frank &vert; john | Frank Smith | Franklin Smith | SQS characters disable prefix wildcard |
| 'frank' | 'frank' | Frank Smith | Franklin Smith | Quoted terms are exact match |
## Quantity search

View File

@ -221,7 +221,7 @@ public class ExtendedHSearchClauseBuilder {
}
for (List<? extends IQueryParameterType> nextAnd : stringAndOrTerms) {
Set<String> terms = extractOrStringParams(nextAnd);
Set<String> terms = TermHelper.makePrefixSearchTerm(extractOrStringParams(nextAnd));
ourLog.debug("addStringTextSearch {}, {}", theSearchParamName, terms);
if (!terms.isEmpty()) {
String query = terms.stream()

View File

@ -113,7 +113,7 @@ public class ExtendedHSearchSearchBuilder {
} else if (param instanceof StringParam) {
switch (modifier) {
// we support string:text, string:contains, string:exact, and unmodified string.
case Constants.PARAMQUALIFIER_TOKEN_TEXT:
case Constants.PARAMQUALIFIER_STRING_TEXT:
case Constants.PARAMQUALIFIER_STRING_EXACT:
case Constants.PARAMQUALIFIER_STRING_CONTAINS:
case EMPTY_MODIFIER:

View File

@ -0,0 +1,58 @@
package ca.uhn.fhir.jpa.dao.search;
import org.apache.commons.lang3.StringUtils;
import java.util.Arrays;
import java.util.Set;
import java.util.stream.Collectors;
public class TermHelper {
/** characters which indicate the string parameter is a simple query string */
private static final char[] simpleQuerySyntaxCharacters = new char[] { '+', '|', '"', '(', ')', '~' };
/**
* Each input set element is:
* _ copied to the output set unchanged if it contains a '*' character or is quoted
* _ trimmed, tokenized by spaces, and suffixed by ' *', and each resulting string copied to the output set
*/
public static Set<String> makePrefixSearchTerm(Set<String> theStringSet) {
return theStringSet.stream()
.map(s -> isToLeftUntouched(s) || isQuoted(s) ? s : suffixTokensWithStar(s) )
.collect(Collectors.toSet());
}
private static String suffixTokensWithStar(String theStr) {
StringBuilder sb = new StringBuilder();
Arrays.stream(theStr.trim().split(" "))
.forEach(s -> sb.append(s).append("* "));
return sb.toString().trim();
}
private static boolean isQuoted(String theS) {
return ( theS.startsWith("\"") && theS.endsWith("\"") ) ||
( theS.startsWith("'") && theS.endsWith("'") );
}
/**
* Returns true when the input string is recognized as Lucene Simple Query Syntax
* @see "https://lucene.apache.org/core/8_11_2/queryparser/org/apache/lucene/queryparser/simple/SimpleQueryParser.html"
*/
static boolean isToLeftUntouched(String theString) {
// remove backslashed * and - characters from string before checking, as those shouldn't be considered
if (theString.startsWith("-")) { return true; } // it is SimpleQuerySyntax
if (theString.endsWith("*")) { return true; } // it is SimpleQuerySyntax
return StringUtils.containsAny(theString, simpleQuerySyntaxCharacters);
}
}

View File

@ -0,0 +1,65 @@
package ca.uhn.fhir.jpa.dao.search;
import org.junit.jupiter.api.Test;
import java.util.Collections;
import java.util.Set;
import static org.junit.jupiter.api.Assertions.*;
class TermHelperTest {
@Test
void empty_returns_empty() {
assertEquals( Collections.emptySet(), TermHelper.makePrefixSearchTerm(Collections.emptySet()) );
}
@Test
void noQuotedSpcedOrStarElements_return_star_suffixed() {
Set<String> result = TermHelper.makePrefixSearchTerm(Set.of("abc", "def", "ghi"));
assertEquals( Set.of("abc*", "def*", "ghi*"), result );
}
@Test
void quotedElements_return_unchanged() {
Set<String> result = TermHelper.makePrefixSearchTerm(Set.of("'abc'", "\"def ghi\"", "\"jkl\""));
assertEquals( Set.of("'abc'", "\"def ghi\"", "\"jkl\""), result );
}
@Test
void unquotedStarContainingElements_spaces_or_not_return_unchanged() {
Set<String> result = TermHelper.makePrefixSearchTerm(Set.of("abc*", "*cde", "ef*g", "hij* klm"));
assertEquals( TermHelper.makePrefixSearchTerm(Set.of("abc*", "*cde", "ef*g", "hij* klm")), result );
}
@Test
void unquotedSpaceContainingElements_return_splitted_in_spaces_and_star_suffixed() {
Set<String> result = TermHelper.makePrefixSearchTerm(Set.of("abc", "cde", "hij klm"));
assertEquals( TermHelper.makePrefixSearchTerm(Set.of("abc*", "cde*", "hij* klm*")), result );
}
@Test
void multiSimpleTerm_hasSimpleTermsWildcarded() {
Set<String> result = TermHelper.makePrefixSearchTerm(Set.of("abc def"));
assertEquals( Set.of("abc* def*"), result );
}
@Test
void simpleQuerySyntax_mustBeLeftUnchanged() {
Set<String> result = TermHelper.makePrefixSearchTerm(Set.of("(def | efg)", "(def efg)", "ghi +(\"abc\" \"def\")"));
assertEquals( Set.of("(def | efg)", "(def efg)", "ghi +(\"abc\" \"def\")"), result );
}
@Test
void isToLeftUntouchedRemovesbackslashedStarAndHypenBeforeChecking() {
assertTrue(TermHelper.isToLeftUntouched("-ab\\*cd\\-ef"), "When first char is a hyphen");
assertTrue(TermHelper.isToLeftUntouched("abcdef*"), "When last char is a star");
assertFalse(TermHelper.isToLeftUntouched("\\-ab\\*cd\\-ef"), "When all stars and hyphens are backslashed");
assertFalse(TermHelper.isToLeftUntouched("\\-ab*cd-ef"), "When all stars and hyphens are backslashed or internal");
assertFalse(TermHelper.isToLeftUntouched("\\-ab\\*c*d\\-ef"), "When all stars and hyphens are backslashed minus an internal star");
assertFalse(TermHelper.isToLeftUntouched("\\-ab\\*cd\\-e-f"), "When all stars and hyphens are backslashed minus an internal hyphen");
assertTrue(TermHelper.isToLeftUntouched("\\-ab\\*c+d\\-ef"), "When all stars and hyphens are backslashed but there is a plus");
assertFalse(TermHelper.isToLeftUntouched("\\ab cd\\fg"), "When only backslashes");
}
}

View File

@ -34,14 +34,12 @@ import ca.uhn.fhir.rest.api.Constants;
import ca.uhn.fhir.rest.api.SearchTotalModeEnum;
import ca.uhn.fhir.rest.api.SortSpec;
import ca.uhn.fhir.rest.api.server.IBundleProvider;
import ca.uhn.fhir.rest.api.server.RequestDetails;
import ca.uhn.fhir.rest.api.server.storage.ResourcePersistentId;
import ca.uhn.fhir.rest.param.ReferenceParam;
import ca.uhn.fhir.rest.param.StringOrListParam;
import ca.uhn.fhir.rest.param.StringParam;
import ca.uhn.fhir.rest.param.TokenParam;
import ca.uhn.fhir.rest.param.TokenParamModifier;
import ca.uhn.fhir.rest.server.IPagingProvider;
import ca.uhn.fhir.rest.server.servlet.ServletRequestDetails;
import ca.uhn.fhir.rest.server.util.ISearchParamRegistry;
import ca.uhn.fhir.storage.test.BaseDateSearchDaoTests;
@ -75,7 +73,6 @@ import org.hl7.fhir.r4.model.RiskAssessment;
import org.hl7.fhir.r4.model.StringType;
import org.hl7.fhir.r4.model.ValueSet;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Nested;
@ -122,6 +119,7 @@ import static org.hamcrest.Matchers.containsInAnyOrder;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.hasItems;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.notNullValue;
@ -130,9 +128,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.spy;
import static org.mockito.Mockito.when;
@ExtendWith(SpringExtension.class)
@ExtendWith(MockitoExtension.class)
@ -151,9 +146,12 @@ import static org.mockito.Mockito.when;
,FhirResourceDaoR4SearchWithElasticSearchIT.TestDirtiesContextTestExecutionListener.class
})
public class FhirResourceDaoR4SearchWithElasticSearchIT extends BaseJpaTest implements ITestDataBuilder {
private static final org.slf4j.Logger ourLog = org.slf4j.LoggerFactory.getLogger(FhirResourceDaoR4SearchWithElasticSearchIT.class);
public static final String URL_MY_CODE_SYSTEM = "http://example.com/my_code_system";
public static final String URL_MY_VALUE_SET = "http://example.com/my_value_set";
private static final org.slf4j.Logger ourLog = org.slf4j.LoggerFactory.getLogger(FhirResourceDaoR4SearchWithElasticSearchIT.class);
private static final String SPACE = "%20";
@Autowired
protected DaoConfig myDaoConfig;
@Autowired
@ -416,6 +414,185 @@ public class FhirResourceDaoR4SearchWithElasticSearchIT extends BaseJpaTest impl
}
}
@Nested
public class StringTextSearch {
@Test
void secondWordFound() {
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Cloudy, yellow") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?value-string:text=yellow");
assertThat(resourceIds, hasItem(id1));
}
@Test
void stringMatchesPrefixAndWhole() {
// smit - matches "smit" and "smith"
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "John Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Carl Smit") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?value-string:text=smit");
assertThat(resourceIds, hasItems(id1, id2));
}
@Test
void stringPlusStarMatchesPrefixAndWhole() {
// smit* - matches "smit" and "smith"
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "John Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Carl Smit") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?_elements=valueString&value-string:text=smit*");
assertThat(resourceIds, hasItems(id1, id2));
}
@Test
void quotedStringMatchesIdenticalButNotAsPrefix() {
// "smit"- matches "smit", but not "smith"
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "John Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Carl Smit") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?value-string:text=\"smit\"");
assertThat(resourceIds, contains(id2));
}
@Test
void stringTokensAreAnded() {
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "John Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Carl Smit") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?value-string:text=car%20smit");
assertThat(resourceIds, hasItems(id2));
}
@Nested
public class DocumentationSamplesTests {
@Test
void line1() {
// | Fhir Query String | Executed Query | Matches | No Match
// | Smit | Smit* | John Smith | John Smi
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "John Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "John Smi") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?value-string:text=Smit");
assertThat(resourceIds, hasItems(id1));
}
@Test
void line2() {
// | Fhir Query String | Executed Query | Matches | No Match | Note
// | Jo Smit | Jo* Smit* | John Smith | John Frank | Multiple bare terms are `AND`
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "John Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "John Frank") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?value-string:text=Jo%20Smit");
assertThat(resourceIds, hasItems(id1));
}
@Test
void line3() {
// | Fhir Query String | Executed Query | Matches | No Match | Note
// | frank &vert; john | frank &vert; john | Frank Smith | Franklin Smith | SQS characters disable prefix wildcard
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Frank Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Franklin Smith") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?value-string:text=frank|john");
assertThat(resourceIds, hasItems(id1));
}
@Test
void line4() {
// | Fhir Query String | Executed Query | Matches | No Match | Note
// | 'frank' | 'frank' | Frank Smith | Franklin Smith | Quoted terms are exact match
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Frank Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withPrimitiveAttribute("valueString", "Franklin Smith") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?value-string:text='frank'");
assertThat(resourceIds, hasItems(id1));
}
}
}
@Nested
public class TokenTextSearch {
@Test
void secondWordFound() {
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withObservationCode("http://example.com", "code-AA", "Cloudy, yellow") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?code:text=yellow");
assertThat(resourceIds, hasItem(id1));
}
@Test
void stringMatchesPrefixAndWhole() {
// smit - matches "smit" and "smith"
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withObservationCode("http://example.com", "code-AA", "John Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withObservationCode("http://example.com", "code-BB", "Carl Smit") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?code:text=smit");
assertThat(resourceIds, hasItems(id1, id2));
}
@Test
void stringPlusStarMatchesPrefixAndWhole() {
// smit* - matches "smit" and "smith"
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withObservationCode("http://example.com", "code-AA", "Adam Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withObservationCode("http://example.com", "code-BB", "John Smit") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?code:text=smit*");
assertThat(resourceIds, hasItems(id1, id2));
}
@Test
void quotedStringMatchesIdenticalButNotAsPrefix() {
// "smit"- matches "smit", but not "smith"
String id1 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withObservationCode("http://example.com", "code-AA", "John Smith") )).getIdPart();
String id2 = myTestDataBuilder.createObservation(List.of(
myTestDataBuilder.withObservationCode("http://example.com", "code-BB", "Karl Smit") )).getIdPart();
List<String> resourceIds = myTestDaoSearch.searchForIds("/Observation?code:text=\"Smit\"");
assertThat(resourceIds, contains(id2));
}
}
@Test
public void testResourceCodeTextSearch() {
IIdType id1, id2, id3, id4;
@ -492,7 +669,14 @@ public class FhirResourceDaoR4SearchWithElasticSearchIT extends BaseJpaTest impl
// prefix
SearchParameterMap map = new SearchParameterMap();
map.add("code", new TokenParam("Bod").setModifier(TokenParamModifier.TEXT));
assertThat("Bare prefix does not match", toUnqualifiedVersionlessIdValues(myObservationDao.search(map)), Matchers.empty());
assertObservationSearchMatches("Bare prefix matches", map, id2);
}
{
// prefix
SearchParameterMap map = new SearchParameterMap();
map.add("code", new TokenParam("Bod").setModifier(TokenParamModifier.TEXT));
assertObservationSearchMatches("Bare prefix matches any word, not only first", map, id2);
}
{
@ -529,6 +713,7 @@ public class FhirResourceDaoR4SearchWithElasticSearchIT extends BaseJpaTest impl
}
}
@Test
public void testResourceReferenceSearchForCanonicalReferences() {
String questionnaireCanonicalUrl = "https://test.fhir.org/R4/Questionnaire/xl-5000-q";