mirror of https://github.com/apache/lucene.git
LUCENE-1704: allow specifying the Tidy configuration file when parsing HTML docs with contrib/ant
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@791587 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f7fa579971
commit
333e77a431
|
@ -62,6 +62,10 @@ New features
|
|||
6. LUCENE-1578: Support for loading unoptimized readers to the
|
||||
constructor of InstantiatedIndex. (Karl Wettin)
|
||||
|
||||
7. LUCENE-1704: Allow specifying the Tidy configuration file when
|
||||
parsing HTML docs with contrib/ant. (Keith Sprochi via Mike
|
||||
McCandless)
|
||||
|
||||
Optimizations
|
||||
|
||||
1. LUCENE-1643: Re-use the collation key (RawCollationKey) for
|
||||
|
|
|
@ -87,6 +87,59 @@ public class HtmlDocument {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* Constructs an <code>HtmlDocument</code> from a
|
||||
* {@link java.io.File}.
|
||||
* @param file the <code>File</code> containing the
|
||||
* HTML to parse
|
||||
* @param tidyConfigFile the <code>String</code>
|
||||
* containing the full path to the Tidy config file
|
||||
* @exception IOException if an I/O exception occurs */
|
||||
public HtmlDocument(File file, String tidyConfigFile) throws IOException {
|
||||
Tidy tidy = new Tidy();
|
||||
tidy.setConfigurationFromFile(tidyConfigFile);
|
||||
tidy.setQuiet(true);
|
||||
tidy.setShowWarnings(false);
|
||||
org.w3c.dom.Document root =
|
||||
tidy.parseDOM(new FileInputStream(file), null);
|
||||
rawDoc = root.getDocumentElement();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Lucene <code>Document</code> from a
|
||||
* {@link java.io.File}.
|
||||
* @param file
|
||||
* @param tidyConfigFile the full path to the Tidy
|
||||
* config file
|
||||
* @exception IOException */
|
||||
public static org.apache.lucene.document.Document
|
||||
Document(File file, String tidyConfigFile) throws IOException {
|
||||
|
||||
HtmlDocument htmlDoc = new HtmlDocument(file, tidyConfigFile);
|
||||
|
||||
org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
|
||||
|
||||
luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
|
||||
luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.ANALYZED));
|
||||
|
||||
String contents = null;
|
||||
BufferedReader br =
|
||||
new BufferedReader(new FileReader(file));
|
||||
StringWriter sw = new StringWriter();
|
||||
String line = br.readLine();
|
||||
while (line != null) {
|
||||
sw.write(line);
|
||||
line = br.readLine();
|
||||
}
|
||||
br.close();
|
||||
contents = sw.toString();
|
||||
sw.close();
|
||||
|
||||
luceneDoc.add(new Field("rawcontents", contents, Field.Store.YES, Field.Index.NO));
|
||||
|
||||
return luceneDoc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a Lucene <code>Document</code> from an {@link
|
||||
* java.io.InputStream}.
|
||||
|
|
Loading…
Reference in New Issue