From c54ea4da676733a34db2042509e586173d4187b5 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 5 Nov 2010 08:19:34 +0000 Subject: [PATCH] LUCENE-590: Demo HTML parser gives incorrect summaries when title is repeated as a heading git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031467 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 3 +++ .../src/java/org/apache/lucene/demo/html/HTMLParser.java | 2 +- .../src/java/org/apache/lucene/demo/html/HTMLParser.jj | 2 +- .../test/org/apache/lucene/demo/html/TestHtmlParser.java | 7 +++++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index c559127a85a..8be38816a0e 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -140,6 +140,9 @@ Bug fixes * LUCENE-2246: Fix contrib/demo for Turkish html documents. (Selim Nadi via Robert Muir) + +* LUCENE-590: Demo HTML parser gives incorrect summaries when title is repeated as a heading + (Curtis d'Entremont via Robert Muir) API Changes diff --git a/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java b/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java index c982ba23c3e..7bf9e38f560 100644 --- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java +++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.java @@ -84,7 +84,7 @@ InterruptedException { String sum = summary.toString().trim(); String tit = getTitle(); - if (sum.startsWith(tit) || sum.equals("")) + if (sum.equals("")) return tit; else return sum; diff --git a/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj b/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj index 683d3a37f80..25504aebc8c 100644 --- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj +++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/html/HTMLParser.jj @@ -111,7 +111,7 @@ 
InterruptedException { String sum = summary.toString().trim(); String tit = getTitle(); - if (sum.startsWith(tit) || sum.equals("")) + if (sum.equals("")) return tit; else return sum; diff --git a/lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java b/lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java index 10db66168b8..c567de18a4a 100644 --- a/lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java +++ b/lucene/contrib/demo/src/test/org/apache/lucene/demo/html/TestHtmlParser.java @@ -105,6 +105,13 @@ public class TestHtmlParser extends LuceneTestCase { assertEquals(200, parser.getSummary().length()); } + // LUCENE-590 + public void testSummaryTitle() throws Exception { + String text = "<html><head><title>Summary</title></head><body>Summary of the document</body></html>"; + HTMLParser parser = new HTMLParser(new StringReader(text)); + assertEquals("Summary of the document", parser.getSummary()); + } + // LUCENE-2246 public void testTurkish() throws Exception { String text = "" +