Added fix to QueryScorer: if a query has multiple WeightedTerms with different weights for the same term, the highest weight is now used for scoring that term (previously the last weight in the list was used).

SimpleHTMLEncoder now encodes characters outside the ASCII range as numeric character entities, as per the suggestion here: http://issues.apache.org/bugzilla/show_bug.cgi?id=36333

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@279088 13f79535-47bb-0310-9956-ffa450edef68
2005-09-06 20:19:50 +00:00 · 2005-09-06 20:19:50 +00:00 · c00b260ecf
parent f6b07dabe8
commit c00b260ecf
2 changed files with 15 additions and 3 deletions
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
@@ -67,8 +67,13 @@ public class QueryScorer implements Scorer
 		termsToFind = new HashMap();
 		for (int i = 0; i < weightedTerms.length; i++)
 		{
-			termsToFind.put(weightedTerms[i].term,weightedTerms[i]);
-			maxTermWeight=Math.max(maxTermWeight,weightedTerms[i].getWeight());
+			WeightedTerm existingTerm=(WeightedTerm) termsToFind.get(weightedTerms[i].term);
+			if( (existingTerm==null) ||(existingTerm.weight<weightedTerms[i].weight) )
+			{
+				//if a term is defined more than once, always use the highest scoring weight
+				termsToFind.put(weightedTerms[i].term,weightedTerms[i]);
+				maxTermWeight=Math.max(maxTermWeight,weightedTerms[i].getWeight());
+			}
 		}
 	}
 	
--- a/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
+++ b/contrib/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLEncoder.java
@@ -66,7 +66,14 @@ public class SimpleHTMLEncoder implements Encoder
 				break;

 			default:
-				result.append(ch);
+				   if (ch < 128) 
+				   {
+			           result.append(ch);
+			       } 
+				   else 
+			       {
+			           result.append("&#").append((int)ch).append(";");
+			       }
 			}
 		}