From 333e77a43177386e7b2d6377ea6397dd98778402 Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Mon, 6 Jul 2009 19:55:05 +0000
Subject: [PATCH] LUCENE-1704: allow specifying the Tidy configuration file when parsing HTML docs with contrib/ant

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@791587 13f79535-47bb-0310-9956-ffa450edef68
---
 contrib/CHANGES.txt                         |  4 ++
 .../org/apache/lucene/ant/HtmlDocument.java | 53 +++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt
index 92992d89ae7..3fa53425272 100644
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@@ -61,6 +61,10 @@ New features
 
  6. LUCENE-1578: Support for loading unoptimized readers to the
     constructor of InstantiatedIndex. (Karl Wettin)
+
+ 7. LUCENE-1704: Allow specifying the Tidy configuration file when
+    parsing HTML docs with contrib/ant. (Keith Sprochi via Mike
+    McCandless)
 
 Optimizations
 
diff --git a/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java b/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java
index 956523e433c..c45d55ac9a7 100644
--- a/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java
+++ b/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java
@@ -87,6 +87,59 @@ public class HtmlDocument {
   }
 
 
+  /**
+   * Constructs an HtmlDocument by parsing a {@link java.io.File} with Tidy.
+   * @param file the File containing the HTML to parse
+   * @param tidyConfigFile the full path to the Tidy config file
+   * @exception IOException if an I/O exception occurs */
+  public HtmlDocument(File file, String tidyConfigFile) throws IOException {
+    Tidy tidy = new Tidy();
+    tidy.setConfigurationFromFile(tidyConfigFile);
+    tidy.setQuiet(true);  // set after the config file has been loaded
+    tidy.setShowWarnings(false);
+    FileInputStream in = new FileInputStream(file);
+    try {
+      rawDoc = tidy.parseDOM(in, null).getDocumentElement();
+    } finally {
+      in.close();  // release the file handle even if parsing fails
+    }
+  }
+
+  /**
+   * Creates a Lucene Document from a
+   * {@link java.io.File}.
+   * @param file the File containing the HTML to parse
+   * @param tidyConfigFile the full path to the Tidy config file
+   * @return a Document with "title", "contents" and "rawcontents" fields
+   * @exception IOException if an I/O exception occurs */
+  public static org.apache.lucene.document.Document
+    Document(File file, String tidyConfigFile) throws IOException {
+
+    HtmlDocument htmlDoc = new HtmlDocument(file, tidyConfigFile);
+
+    org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
+
+    luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
+    luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.ANALYZED));
+
+    // Re-read the file as text for the stored-only "rawcontents" field.
+    StringWriter sw = new StringWriter();
+    BufferedReader br = new BufferedReader(new FileReader(file));
+    try {
+      // NOTE(review): readLine() drops line terminators, so lines end up
+      // joined in the stored text; left unchanged to keep existing output.
+      String line;
+      while ((line = br.readLine()) != null) {
+        sw.write(line);
+      }
+    } finally {
+      br.close();  // release the file handle even if a read fails
+    }
+    luceneDoc.add(new Field("rawcontents", sw.toString(), Field.Store.YES, Field.Index.NO));
+
+    return luceneDoc;
+  }
+
   /**
    * Creates a Lucene Document from an {@link
    * java.io.InputStream}.