diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt
index 92992d89ae7..3fa53425272 100644
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@@ -61,6 +61,10 @@ New features
6. LUCENE-1578: Support for loading unoptimized readers to the
constructor of InstantiatedIndex. (Karl Wettin)
+
+ 7. LUCENE-1704: Allow specifying the Tidy configuration file when
+ parsing HTML docs with contrib/ant. (Keith Sprochi via Mike
+ McCandless)
Optimizations
diff --git a/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java b/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java
index 956523e433c..c45d55ac9a7 100644
--- a/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java
+++ b/contrib/ant/src/java/org/apache/lucene/ant/HtmlDocument.java
@@ -87,6 +87,59 @@ public class HtmlDocument {
}
+  /**
+   *  Constructs an HtmlDocument from a {@link java.io.File},
+   *  configuring Tidy from the given configuration file before
+   *  the HTML is parsed.
+   *
+   *@param  file             the File containing the HTML to parse
+   *@param  tidyConfigFile   the String containing the full path
+   *                         to the Tidy config file
+   *@exception  IOException  if an I/O exception occurs
+   */
+  public HtmlDocument(File file, String tidyConfigFile) throws IOException {
+    Tidy tidy = new Tidy();
+    tidy.setConfigurationFromFile(tidyConfigFile);
+    // These two setters run after the configuration file is loaded;
+    // presumably they take precedence over it (confirm against JTidy).
+    tidy.setQuiet(true);
+    tidy.setShowWarnings(false);
+
+    // Close the stream ourselves: nothing else in this constructor
+    // holds a reference to it once parsing has finished or failed.
+    FileInputStream in = new FileInputStream(file);
+    org.w3c.dom.Document root;
+    try {
+      root = tidy.parseDOM(in, null);
+    } finally {
+      in.close();
+    }
+    rawDoc = root.getDocumentElement();
+  }
+
+  /**
+   *  Creates a Lucene Document from a {@link java.io.File},
+   *  parsing the HTML with Tidy configured from the given file.
+   *  The returned document has three stored fields: "title" and
+   *  "contents" (both analyzed) taken from the parsed HTML, and
+   *  "rawcontents" (not indexed) holding the text of the file as
+   *  read line by line.
+   *
+   *@param  file             the File containing the HTML to parse
+   *@param  tidyConfigFile   the full path to the Tidy config file
+   *@return                  the Lucene Document built from the file
+   *@exception  IOException  if an I/O exception occurs
+   */
+  public static org.apache.lucene.document.Document
+    Document(File file, String tidyConfigFile) throws IOException {
+
+    HtmlDocument htmlDoc = new HtmlDocument(file, tidyConfigFile);
+
+    org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
+
+    luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
+    luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.ANALYZED));
+
+    // Read the file a second time to capture its text for the
+    // "rawcontents" field. readLine() strips line terminators, so the
+    // lines are concatenated without separators. FileReader decodes
+    // using the platform default charset.
+    StringWriter sw = new StringWriter();
+    BufferedReader br = new BufferedReader(new FileReader(file));
+    try {
+      String line = br.readLine();
+      while (line != null) {
+        sw.write(line);
+        line = br.readLine();
+      }
+    } finally {
+      // Release the file handle even when reading fails part-way.
+      br.close();
+    }
+    String contents = sw.toString();
+
+    luceneDoc.add(new Field("rawcontents", contents, Field.Store.YES, Field.Index.NO));
+
+    return luceneDoc;
+  }
+
/**
* Creates a Lucene Document
from an {@link
* java.io.InputStream}.