Work on fulltext suggestions

This commit is contained in:
jamesagnew 2015-11-03 21:34:35 -05:00
parent 3fc7a16735
commit 1fb9f02d14
7 changed files with 146 additions and 53 deletions

View File

@ -106,6 +106,7 @@ import ca.uhn.fhir.model.dstu.resource.BaseResource;
import ca.uhn.fhir.model.dstu2.composite.MetaDt;
import ca.uhn.fhir.model.primitive.IdDt;
import ca.uhn.fhir.model.primitive.InstantDt;
import ca.uhn.fhir.model.primitive.StringDt;
import ca.uhn.fhir.model.valueset.BundleEntryTransactionMethodEnum;
import ca.uhn.fhir.parser.DataFormatException;
import ca.uhn.fhir.parser.IParser;
@ -174,7 +175,7 @@ public abstract class BaseHapiFhirDao<T extends IBaseResource> implements IDao {
// @PersistenceContext(name = "FHIR_UT", type = PersistenceContextType.TRANSACTION, unitName = "FHIR_UT")
@PersistenceContext(type = PersistenceContextType.TRANSACTION)
private EntityManager myEntityManager;
protected EntityManager myEntityManager;
@Autowired
private PlatformTransactionManager myPlatformTransactionManager;
@ -1546,19 +1547,19 @@ public abstract class BaseHapiFhirDao<T extends IBaseResource> implements IDao {
}
private String parseContentTextIntoWords(IResource theResource) {
StringBuilder b = new StringBuilder();
StringBuilder retVal = new StringBuilder();
@SuppressWarnings("rawtypes")
List<IPrimitiveType> childElements = getContext().newTerser().getAllPopulatedChildElementsOfType(theResource, IPrimitiveType.class);
for (@SuppressWarnings("rawtypes") IPrimitiveType nextType : childElements) {
String nextValue = nextType.getValueAsString();
if (isNotBlank(nextValue)) {
if (b.length() > 0 && b.charAt(b.length() - 1) != ' ') {
b.append(' ');
if (nextType instanceof StringDt) {
String nextValue = nextType.getValueAsString();
if (isNotBlank(nextValue)) {
retVal.append(nextValue.replace("\n", " ").replace("\r", " "));
retVal.append("\n");
}
b.append(nextValue);
}
}
return b.toString();
return retVal.toString();
}
public BaseHasResource readEntity(IIdType theValueId) {

View File

@ -26,8 +26,6 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.persistence.EntityManager;
import javax.persistence.PersistenceContext;
import javax.persistence.Query;
import javax.persistence.Tuple;
import javax.persistence.TypedQuery;
@ -60,9 +58,6 @@ public abstract class BaseHapiFhirSystemDao<T> extends BaseHapiFhirDao<IBaseReso
private static final org.slf4j.Logger ourLog = org.slf4j.LoggerFactory.getLogger(BaseHapiFhirSystemDao.class);
@PersistenceContext()
protected EntityManager myEntityManager;
@Autowired
private PlatformTransactionManager myTxManager;
@ -222,10 +217,6 @@ public abstract class BaseHapiFhirSystemDao<T> extends BaseHapiFhirDao<IBaseReso
}
}
public void setEntityManager(EntityManager theEntityManager) {
myEntityManager = theEntityManager;
}
public void setTxManager(PlatformTransactionManager theTxManager) {
myTxManager = theTxManager;
}

View File

@ -35,16 +35,13 @@ import javax.persistence.PersistenceContextType;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.commons.lang3.builder.HashCodeBuilder;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenGroup;
import org.hibernate.search.jpa.FullTextEntityManager;
import org.hibernate.search.jpa.FullTextQuery;
@ -56,7 +53,6 @@ import org.springframework.transaction.annotation.Transactional;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import ca.uhn.fhir.jpa.dao.FhirSearchDao.MySuggestionFormatter;
import ca.uhn.fhir.jpa.entity.ResourceTable;
import ca.uhn.fhir.model.api.IQueryParameterType;
import ca.uhn.fhir.model.dstu.resource.BaseResource;
@ -207,16 +203,29 @@ public class FhirSearchDao extends BaseHapiFhirDao<IBaseResource> implements ISe
String nextValue = (String) nextAsArray[0];
try {
MySuggestionFormatter formatter = new MySuggestionFormatter(suggestions);
MySuggestionFormatter formatter = new MySuggestionFormatter(theText, suggestions);
Scorer scorer = new QueryScorer(textQuery);
Highlighter highlighter = new Highlighter(formatter, scorer);
Analyzer analyzer = em.getSearchFactory().getAnalyzer(ResourceTable.class);
highlighter.getBestFragment(analyzer.tokenStream("myContentText", nextValue), nextValue);
highlighter.getBestFragment(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue);
highlighter.getBestFragment(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue);
highlighter.getBestFragment(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue);
formatter.setAnalyzer("myContentTextPhonetic");
highlighter.getBestFragments(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue, 10);
formatter.setAnalyzer("myContentTextNGram");
highlighter.getBestFragments(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue, 10);
formatter.setFindPhrasesWith();
formatter.setAnalyzer("myContentTextEdgeNGram");
highlighter.getBestFragments(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue, 10);
// formatter.setAnalyzer("myContentText");
// highlighter.getBestFragments(analyzer.tokenStream("myContentText", nextValue), nextValue, 10);
// formatter.setAnalyzer("myContentTextNGram");
// highlighter.getBestFragments(analyzer.tokenStream("myContentTextNGram", nextValue), nextValue, 10);
// formatter.setAnalyzer("myContentTextEdgeNGram");
// highlighter.getBestFragments(analyzer.tokenStream("myContentTextEdgeNGram", nextValue), nextValue, 10);
// formatter.setAnalyzer("myContentTextPhonetic");
// highlighter.getBestFragments(analyzer.tokenStream("myContentTextPhonetic", nextValue), nextValue, 10);
} catch (Exception e) {
throw new InternalErrorException(e);
}
@ -227,7 +236,11 @@ public class FhirSearchDao extends BaseHapiFhirDao<IBaseResource> implements ISe
Set<String> terms = Sets.newHashSet();
for (Iterator<Suggestion> iter = suggestions.iterator(); iter.hasNext(); ) {
if (!terms.add(iter.next().getTerm())) {
String nextTerm = iter.next().getTerm().toLowerCase();
// if (nextTerm.contains("\n")) {
// iter.remove();
// } else
if (!terms.add(nextTerm)) {
iter.remove();
}
}
@ -269,16 +282,52 @@ public class FhirSearchDao extends BaseHapiFhirDao<IBaseResource> implements ISe
public class MySuggestionFormatter implements Formatter {
private List<Suggestion> mySuggestions;
private String myAnalyzer;
private ArrayList<String> myPartialMatchPhrases;
private ArrayList<Float> myPartialMatchScores;
private String myOriginalSearch;
public MySuggestionFormatter(List<Suggestion> theSuggestions) {
public MySuggestionFormatter(String theOriginalSearch, List<Suggestion> theSuggestions) {
myOriginalSearch = theOriginalSearch;
mySuggestions = theSuggestions;
}
public void setFindPhrasesWith() {
myPartialMatchPhrases = new ArrayList<String>();
myPartialMatchScores = new ArrayList<Float>();
for (Suggestion next : mySuggestions) {
myPartialMatchPhrases.add(' ' + next.myTerm);
myPartialMatchScores.add(next.myScore);
}
myPartialMatchPhrases.add(myOriginalSearch);
myPartialMatchScores.add(1.0f);
}
public void setAnalyzer(String theString) {
myAnalyzer = theString;
}
@Override
public String highlightTerm(String theOriginalText, TokenGroup theTokenGroup) {
ourLog.info("{} Found {} with score {}", new Object[] {myAnalyzer, theOriginalText, theTokenGroup.getTotalScore()});
if (theTokenGroup.getTotalScore() > 0) {
mySuggestions.add(new Suggestion(theOriginalText, theTokenGroup.getTotalScore()));
float score = theTokenGroup.getTotalScore();
if (theOriginalText.equalsIgnoreCase(myOriginalSearch)) {
score = score + 1.0f;
}
mySuggestions.add(new Suggestion(theOriginalText, score));
} else if (myPartialMatchPhrases != null) {
if (theOriginalText.length() < 100) {
for (int i = 0; i < myPartialMatchPhrases.size(); i++) {
if (theOriginalText.contains(myPartialMatchPhrases.get(i))) {
mySuggestions.add(new Suggestion(theOriginalText, myPartialMatchScores.get(i) - 0.5f));
}
}
}
}
return null;
}

View File

@ -41,14 +41,14 @@ import javax.persistence.Transient;
import org.apache.commons.lang3.builder.ToStringBuilder;
import org.apache.commons.lang3.builder.ToStringStyle;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.core.StopFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
import org.apache.lucene.analysis.ngram.EdgeNGramFilterFactory;
import org.apache.lucene.analysis.ngram.NGramFilterFactory;
import org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory;
import org.apache.lucene.analysis.pattern.PatternTokenizerFactory;
import org.apache.lucene.analysis.phonetic.PhoneticFilterFactory;
import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.standard.StandardFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
@ -81,19 +81,22 @@ import ca.uhn.fhir.rest.server.exceptions.UnprocessableEntityException;
})
@AnalyzerDefs({
@AnalyzerDef(name = "autocompleteEdgeAnalyzer",
tokenizer = @TokenizerDef(factory = KeywordTokenizerFactory.class),
tokenizer = @TokenizerDef(factory = PatternTokenizerFactory.class, params= {
@Parameter(name="pattern", value="(.*)"),
@Parameter(name="group", value="1")
}),
filters = {
@TokenFilterDef(factory = PatternReplaceFilterFactory.class, params = {
@Parameter(name = "pattern",value = "([^a-zA-Z0-9\\.])"),
@Parameter(name = "replacement", value = " "),
@Parameter(name = "replace", value = "all")
}),
// @TokenFilterDef(factory = PatternReplaceFilterFactory.class, params = {
// @Parameter(name = "pattern",value = "([^a-zA-Z0-9\\.])"),
// @Parameter(name = "replacement", value = " "),
// @Parameter(name = "replace", value = "all")
// }),
@TokenFilterDef(factory = LowerCaseFilterFactory.class),
@TokenFilterDef(factory = StopFilterFactory.class),
@TokenFilterDef(factory = EdgeNGramFilterFactory.class, params = {
@Parameter(name = "minGramSize", value = "3"),
@Parameter(name = "maxGramSize", value = "50")
})
}),
}),
@AnalyzerDef(name = "autocompletePhoneticAnalyzer",
tokenizer = @TokenizerDef(factory=StandardTokenizerFactory.class),

View File

@ -52,6 +52,7 @@ import ca.uhn.fhir.model.dstu2.resource.DiagnosticReport;
import ca.uhn.fhir.model.dstu2.resource.Encounter;
import ca.uhn.fhir.model.dstu2.resource.Immunization;
import ca.uhn.fhir.model.dstu2.resource.Location;
import ca.uhn.fhir.model.dstu2.resource.Media;
import ca.uhn.fhir.model.dstu2.resource.Medication;
import ca.uhn.fhir.model.dstu2.resource.MedicationOrder;
import ca.uhn.fhir.model.dstu2.resource.Observation;
@ -126,6 +127,9 @@ public abstract class BaseJpaDstu2Test extends BaseJpaTest {
@Qualifier("myPatientDaoDstu2")
protected IFhirResourceDaoPatient<Patient> myPatientDao;
@Autowired
@Qualifier("myMediaDaoDstu2")
protected IFhirResourceDao<Media> myMediaDao;
@Autowired
@Qualifier("myPractitionerDaoDstu2")
protected IFhirResourceDao<Practitioner> myPractitionerDao;
@Autowired

View File

@ -16,8 +16,10 @@ import org.junit.Test;
import ca.uhn.fhir.jpa.dao.FhirSearchDao.Suggestion;
import ca.uhn.fhir.model.dstu2.resource.Device;
import ca.uhn.fhir.model.dstu2.resource.Media;
import ca.uhn.fhir.model.dstu2.resource.Observation;
import ca.uhn.fhir.model.dstu2.resource.Patient;
import ca.uhn.fhir.model.primitive.Base64BinaryDt;
import ca.uhn.fhir.model.primitive.StringDt;
import ca.uhn.fhir.rest.param.StringAndListParam;
import ca.uhn.fhir.rest.param.StringOrListParam;
@ -28,6 +30,46 @@ public class FhirResourceDaoDstu2SearchFtTest extends BaseJpaDstu2Test {
private static final org.slf4j.Logger ourLog = org.slf4j.LoggerFactory.getLogger(FhirResourceDaoDstu2SearchFtTest.class);
@Test
public void testSuggestIgnoresBase64Content() {
Patient patient = new Patient();
patient.addName().addFamily("testSuggest");
IIdType ptId = myPatientDao.create(patient).getId().toUnqualifiedVersionless();
Media med = new Media();
med.getSubject().setReference(ptId);
med.getSubtype().setText("Systolic Blood Pressure");
med.getContent().setContentType("LCws");
med.getContent().setData(new Base64BinaryDt(new byte[] {44,44,44,44,44,44,44,44}));
med.getContent().setTitle("bbbb syst");
myMediaDao.create(med);
ourLog.info(myFhirCtx.newJsonParser().encodeResourceToString(med));
List<Suggestion> output = mySearchDao.suggestKeywords("Patient/" + ptId.getIdPart() + "/$everything", "_content", "press");
ourLog.info("Found: " + output);
assertEquals(2, output.size());
assertEquals("Pressure", output.get(0).getTerm());
assertEquals("Systolic Blood Pressure", output.get(1).getTerm());
output = mySearchDao.suggestKeywords("Patient/" + ptId.getIdPart() + "/$everything", "_content", "prezure");
ourLog.info("Found: " + output);
assertEquals(2, output.size());
assertEquals("Pressure", output.get(0).getTerm());
assertEquals("Systolic Blood Pressure", output.get(1).getTerm());
output = mySearchDao.suggestKeywords("Patient/" + ptId.getIdPart() + "/$everything", "_content", "syst");
ourLog.info("Found: " + output);
assertEquals(4, output.size());
assertEquals("syst", output.get(0).getTerm());
assertEquals("bbbb syst", output.get(1).getTerm());
assertEquals("Systolic", output.get(2).getTerm());
assertEquals("Systolic Blood Pressure", output.get(3).getTerm());
output = mySearchDao.suggestKeywords("Patient/" + ptId.getIdPart() + "/$everything", "_content", "LCws");
ourLog.info("Found: " + output);
assertEquals(0, output.size());
}
@Test
public void testSuggest() {
Patient patient = new Patient();
@ -47,6 +89,7 @@ public class FhirResourceDaoDstu2SearchFtTest extends BaseJpaDstu2Test {
obs = new Observation();
obs.getSubject().setReference(ptId);
obs.getCode().setText("ZXC HELLO");
obs.addComponent().getCode().setText("HHHHHHHHHH");
myObservationDao.create(obs);
/*
@ -79,8 +122,9 @@ public class FhirResourceDaoDstu2SearchFtTest extends BaseJpaDstu2Test {
output = mySearchDao.suggestKeywords("Patient/" + ptId.getIdPart() + "/$everything", "_content", "HELO");
ourLog.info("Found: " + output);
assertEquals(1, output.size());
assertEquals(2, output.size());
assertEquals("HELLO", output.get(0).getTerm());
assertEquals("ZXC HELLO", output.get(1).getTerm());
output = mySearchDao.suggestKeywords("Patient/" + ptId.getIdPart() + "/$everything", "_content", "Z");
ourLog.info("Found: " + output);
@ -88,8 +132,9 @@ public class FhirResourceDaoDstu2SearchFtTest extends BaseJpaDstu2Test {
output = mySearchDao.suggestKeywords("Patient/" + ptId.getIdPart() + "/$everything", "_content", "ZX");
ourLog.info("Found: " + output);
assertEquals(1, output.size());
assertEquals(2, output.size());
assertEquals("ZXC", output.get(0).getTerm());
assertEquals("ZXC HELLO", output.get(1).getTerm());
}

pom.xml (20 changed lines)
View File

@ -425,47 +425,47 @@
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-http</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlets</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-servlet</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-util</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-webapp</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty.websocket</groupId>
<artifactId>websocket-api</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty.websocket</groupId>
<artifactId>websocket-client</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.eclipse.jetty.websocket</groupId>
<artifactId>websocket-server</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</dependency>
<dependency>
<groupId>org.fusesource.jansi</groupId>
@ -742,7 +742,7 @@
<plugin>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-maven-plugin</artifactId>
<version>9.2.6.v20141205</version>
<version>9.2.13.v20150730</version>
</plugin>
<plugin>
<groupId>org.eluder.coveralls</groupId>