LUCENE-1704: allow specifying the Tidy configuration file when parsing HTML docs with contrib/ant

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@791587 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-07-06 19:55:05 +00:00
parent f7fa579971
commit 333e77a431
2 changed files with 57 additions and 0 deletions

View File

@ -61,6 +61,10 @@ New features
6. LUCENE-1578: Support for loading unoptimized readers to the
constructor of InstantiatedIndex. (Karl Wettin)
7. LUCENE-1704: Allow specifying the Tidy configuration file when
parsing HTML docs with contrib/ant. (Keith Sprochi via Mike
McCandless)
Optimizations

View File

@ -87,6 +87,59 @@ public class HtmlDocument {
} }
/**
 * Constructs an <code>HtmlDocument</code> from a
 * {@link java.io.File}, loading the Tidy configuration from
 * the given file before parsing.
 *
 * @param file the <code>File</code> containing the
 *   HTML to parse
 * @param tidyConfigFile the <code>String</code>
 *   containing the full path to the Tidy config file
 * @exception IOException if an I/O exception occurs while
 *   opening, reading or closing <code>file</code>
 */
public HtmlDocument(File file, String tidyConfigFile) throws IOException {
  Tidy tidy = new Tidy();
  tidy.setConfigurationFromFile(tidyConfigFile);
  // Set after the configuration file has been loaded; presumably these
  // take precedence over whatever the file specifies -- verify against JTidy.
  tidy.setQuiet(true);
  tidy.setShowWarnings(false);

  // Close the stream ourselves: nothing here shows parseDOM closing it,
  // so leaving it open would leak a file handle for every parsed document.
  FileInputStream in = new FileInputStream(file);
  try {
    org.w3c.dom.Document root = tidy.parseDOM(in, null);
    rawDoc = root.getDocumentElement();
  } finally {
    in.close();
  }
}
/**
 * Creates a Lucene <code>Document</code> from a
 * {@link java.io.File}, parsing the HTML with Tidy configured
 * from the given configuration file.
 *
 * <p>The returned document carries three stored fields:
 * <code>title</code> and <code>contents</code> (both analyzed,
 * taken from the parsed HTML) and <code>rawcontents</code>
 * (not indexed), which holds the text of the file with the
 * line terminators removed.
 *
 * @param file the <code>File</code> containing the HTML to
 *   parse
 * @param tidyConfigFile the full path to the Tidy
 *   config file
 * @return the populated Lucene <code>Document</code>
 * @exception IOException if an I/O exception occurs while
 *   parsing or reading <code>file</code>
 */
public static org.apache.lucene.document.Document
    Document(File file, String tidyConfigFile) throws IOException {
  HtmlDocument htmlDoc = new HtmlDocument(file, tidyConfigFile);
  org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
  luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
  luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.ANALYZED));

  // Re-read the file as text for the raw copy. readLine() strips line
  // terminators and none are re-added, so lines are concatenated directly.
  // NOTE(review): FileReader decodes with the platform default charset.
  StringBuffer raw = new StringBuffer();
  BufferedReader br =
      new BufferedReader(new FileReader(file));
  try {
    String line = br.readLine();
    while (line != null) {
      raw.append(line);
      line = br.readLine();
    }
  } finally {
    // Release the file handle even when reading fails part-way.
    br.close();
  }

  luceneDoc.add(new Field("rawcontents", raw.toString(), Field.Store.YES, Field.Index.NO));
  return luceneDoc;
}
/** /**
* Creates a Lucene <code>Document</code> from an {@link * Creates a Lucene <code>Document</code> from an {@link
* java.io.InputStream}. * java.io.InputStream}.