LUCENE-590: Demo HTML parser gives incorrect summaries when title is repeated as a heading

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031467 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-11-05 08:19:34 +00:00
parent 048cdb57f4
commit c54ea4da67
4 changed files with 12 additions and 2 deletions

View File

@ -141,6 +141,9 @@ Bug fixes
* LUCENE-2246: Fix contrib/demo for Turkish html documents.
(Selim Nadi via Robert Muir)
* LUCENE-590: Demo HTML parser gives incorrect summaries when title is repeated as a heading
(Curtis d'Entremont via Robert Muir)
API Changes
* LUCENE-2147: Spatial GeoHashUtils now always decode GeoHash strings

View File

@ -84,7 +84,7 @@ InterruptedException {
String sum = summary.toString().trim();
String tit = getTitle();
if (sum.startsWith(tit) || sum.equals(""))
if (sum.equals(""))
return tit;
else
return sum;

View File

@ -111,7 +111,7 @@ InterruptedException {
String sum = summary.toString().trim();
String tit = getTitle();
if (sum.startsWith(tit) || sum.equals(""))
if (sum.equals(""))
return tit;
else
return sum;

View File

@ -105,6 +105,13 @@ public class TestHtmlParser extends LuceneTestCase {
assertEquals(200, parser.getSummary().length());
}
// LUCENE-590: the document body begins with the same word as the title
// ("Summary"); the summary must still be the full body text.
public void testSummaryTitle() throws Exception {
  final String expectedSummary = "Summary of the document";
  final String html = "<html><head><title>Summary</title></head><body>Summary of the document</body></html>";
  // Parse the markup from an in-memory reader and ask for the summary directly.
  final String actualSummary = new HTMLParser(new StringReader(html)).getSummary();
  assertEquals(expectedSummary, actualSummary);
}
// LUCENE-2246
public void testTurkish() throws Exception {
String text = "<html><body>" +