From fe8274576962da09d8ccfa0545d9ff6692af93ae Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Fri, 5 Nov 2010 08:46:20 +0000 Subject: [PATCH] LUCENE-591: index meta keywords in contrib/demo git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031474 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 3 +++ .../org/apache/lucene/demo/HTMLDocument.java | 5 +++++ .../test/org/apache/lucene/demo/TestDemo.java | 20 +++++++++++++++++++ .../lucene/demo/test-files/html/test2.html | 9 +++++++++ .../lucene/demo/test-files/queries2.txt | 1 + 5 files changed, 38 insertions(+) create mode 100644 lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test2.html create mode 100644 lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries2.txt diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 8be38816a0e..63b253af23d 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -143,6 +143,9 @@ Bug fixes * LUCENE-590: Demo HTML parser gives incorrect summaries when title is repeated as a heading (Curtis d'Entremont via Robert Muir) + +* LUCENE-591: The demo indexer now indexes meta keywords. + (Curtis d'Entremont via Robert Muir) API Changes diff --git a/lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java b/lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java index 64ab05acc2e..67cafbf2fad 100644 --- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java +++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/HTMLDocument.java @@ -70,6 +70,11 @@ public class HTMLDocument { // Add the tag-stripped contents as a Reader-valued Text field so it will // get tokenized and indexed. doc.add(new Field("contents", parser.getReader())); + + // add any document keywords if they exist + String keywords = parser.getMetaTags().getProperty("keywords"); + if (keywords != null) + doc.add(new Field("contents", keywords, Field.Store.NO, Field.Index.ANALYZED)); // Add the summary as a field that is stored and returned with // hit documents for display. diff --git a/lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java b/lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java index 79e96dc287b..6fbe35187e8 100644 --- a/lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java +++ b/lucene/contrib/demo/src/test/org/apache/lucene/demo/TestDemo.java @@ -43,4 +43,24 @@ public class TestDemo extends LuceneTestCase { System.setOut(outSave); } } + + // LUCENE-591 + public void testIndexKeywords() throws Exception { + File dir = getDataFile("test-files/html"); + File indexDir = new File(TEMP_DIR, "demoIndex2"); + IndexHTML.main(new String[] { "-create", "-index", indexDir.getPath(), dir.getPath() }); + File queries = getDataFile("test-files/queries2.txt"); + PrintStream outSave = System.out; + try { + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + PrintStream fakeSystemOut = new PrintStream(bytes); + System.setOut(fakeSystemOut); + SearchFiles.main(new String[] { "-index", indexDir.getPath(), "-queries", queries.getPath()}); + fakeSystemOut.flush(); + String output = bytes.toString(); // intentionally use default encoding + assertTrue(output.contains("1 total matching documents")); + } finally { + System.setOut(outSave); + } + } } diff --git a/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test2.html b/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test2.html new file mode 100644 index 00000000000..cddd3effa16 --- /dev/null +++ b/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/html/test2.html @@ -0,0 +1,9 @@ + + + + + + + This document is actually not about cats! + + diff --git a/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries2.txt b/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries2.txt new file mode 100644 index 00000000000..86cc85730b7 --- /dev/null +++ b/lucene/contrib/demo/src/test/org/apache/lucene/demo/test-files/queries2.txt @@ -0,0 +1 @@ ++contents:dogs +contents:fish