LUCENE-591: index meta keywords in contrib/demo

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1031474 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-11-05 08:46:20 +00:00
parent c54ea4da67
commit fe82745769
5 changed files with 38 additions and 0 deletions

View File

@ -143,6 +143,9 @@ Bug fixes
* LUCENE-590: Demo HTML parser gives incorrect summaries when title is repeated as a heading
(Curtis d'Entremont via Robert Muir)
* LUCENE-591: The demo indexer now indexes meta keywords.
(Curtis d'Entremont via Robert Muir)
API Changes

View File

@ -70,6 +70,11 @@ public class HTMLDocument {
// Add the tag-stripped contents as a Reader-valued Text field so it will
// get tokenized and indexed.
doc.add(new Field("contents", parser.getReader()));
// add any document keywords if they exist
String keywords = parser.getMetaTags().getProperty("keywords");
if (keywords != null)
doc.add(new Field("contents", keywords, Field.Store.NO, Field.Index.ANALYZED));
// Add the summary as a field that is stored and returned with
// hit documents for display.

View File

@ -43,4 +43,24 @@ public class TestDemo extends LuceneTestCase {
System.setOut(outSave);
}
}
/**
 * Regression test for LUCENE-591 (indexing of meta keywords by the demo).
 *
 * <p>Builds a fresh index of the {@code test-files/html} directory with
 * {@link IndexHTML}, then runs {@link SearchFiles} against it using the
 * queries in {@code test-files/queries2.txt}. {@code System.out} is
 * redirected while searching so that the printed hit count can be checked;
 * the original stream is always restored afterwards.
 *
 * @throws Exception if indexing or searching fails
 */
public void testIndexKeywords() throws Exception {
  File dir = getDataFile("test-files/html");
  File indexDir = new File(TEMP_DIR, "demoIndex2");
  IndexHTML.main(new String[] { "-create", "-index", indexDir.getPath(), dir.getPath() });
  File queries = getDataFile("test-files/queries2.txt");
  PrintStream outSave = System.out;
  try {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    PrintStream fakeSystemOut = new PrintStream(bytes);
    System.setOut(fakeSystemOut);
    SearchFiles.main(new String[] { "-index", indexDir.getPath(), "-queries", queries.getPath()});
    fakeSystemOut.flush();
    String output = bytes.toString(); // intentionally use default encoding
    // Include the captured output in the failure message: System.out was
    // redirected, so without this a failure would show nothing useful.
    assertTrue("expected \"1 total matching documents\" in search output, but got: " + output,
        output.contains("1 total matching documents"));
  } finally {
    // Restore the real stdout even if the search or the assertion fails.
    System.setOut(outSave);
  }
}
}

View File

@ -0,0 +1,9 @@
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
<meta name="keywords" content="dogs,fish" />
</head>
<body>
This document is actually not about cats!
</body>
</html>

View File

@ -0,0 +1 @@
+contents:dogs +contents:fish