mirror of https://github.com/apache/lucene.git
LUCENE-1704: allow specifying the Tidy configuration file when parsing HTML docs with contrib/ant
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@791587 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f7fa579971
commit
333e77a431
|
@ -61,6 +61,10 @@ New features
|
||||||
|
|
||||||
6. LUCENE-1578: Support for loading unoptimized readers to the
|
6. LUCENE-1578: Support for loading unoptimized readers to the
|
||||||
constructor of InstantiatedIndex. (Karl Wettin)
|
constructor of InstantiatedIndex. (Karl Wettin)
|
||||||
|
|
||||||
|
7. LUCENE-1704: Allow specifying the Tidy configuration file when
|
||||||
|
parsing HTML docs with contrib/ant. (Keith Sprochi via Mike
|
||||||
|
McCandless)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
|
|
|
@ -87,6 +87,59 @@ public class HtmlDocument {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs an <code>HtmlDocument</code> from a
|
||||||
|
* {@link java.io.File}.
|
||||||
|
* @param file the <code>File</code> containing the
|
||||||
|
* HTML to parse
|
||||||
|
* @param tidyConfigFile the <code>String</code>
|
||||||
|
* containing the full path to the Tidy config file
|
||||||
|
* @exception IOException if an I/O exception occurs */
|
||||||
|
public HtmlDocument(File file, String tidyConfigFile) throws IOException {
|
||||||
|
Tidy tidy = new Tidy();
|
||||||
|
tidy.setConfigurationFromFile(tidyConfigFile);
|
||||||
|
tidy.setQuiet(true);
|
||||||
|
tidy.setShowWarnings(false);
|
||||||
|
org.w3c.dom.Document root =
|
||||||
|
tidy.parseDOM(new FileInputStream(file), null);
|
||||||
|
rawDoc = root.getDocumentElement();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a Lucene <code>Document</code> from a
|
||||||
|
* {@link java.io.File}.
|
||||||
|
* @param file
|
||||||
|
* @param tidyConfigFile the full path to the Tidy
|
||||||
|
* config file
|
||||||
|
* @exception IOException */
|
||||||
|
public static org.apache.lucene.document.Document
|
||||||
|
Document(File file, String tidyConfigFile) throws IOException {
|
||||||
|
|
||||||
|
HtmlDocument htmlDoc = new HtmlDocument(file, tidyConfigFile);
|
||||||
|
|
||||||
|
org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
|
||||||
|
|
||||||
|
luceneDoc.add(new Field("title", htmlDoc.getTitle(), Field.Store.YES, Field.Index.ANALYZED));
|
||||||
|
luceneDoc.add(new Field("contents", htmlDoc.getBody(), Field.Store.YES, Field.Index.ANALYZED));
|
||||||
|
|
||||||
|
String contents = null;
|
||||||
|
BufferedReader br =
|
||||||
|
new BufferedReader(new FileReader(file));
|
||||||
|
StringWriter sw = new StringWriter();
|
||||||
|
String line = br.readLine();
|
||||||
|
while (line != null) {
|
||||||
|
sw.write(line);
|
||||||
|
line = br.readLine();
|
||||||
|
}
|
||||||
|
br.close();
|
||||||
|
contents = sw.toString();
|
||||||
|
sw.close();
|
||||||
|
|
||||||
|
luceneDoc.add(new Field("rawcontents", contents, Field.Store.YES, Field.Index.NO));
|
||||||
|
|
||||||
|
return luceneDoc;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a Lucene <code>Document</code> from an {@link
|
* Creates a Lucene <code>Document</code> from an {@link
|
||||||
* java.io.InputStream}.
|
* java.io.InputStream}.
|
||||||
|
|
Loading…
Reference in New Issue