mirror of https://github.com/apache/lucene.git
Use the HTMLParser constructor that takes a FileInputStream and make sure it gets closed. This was not the case with the constructor that takes a File.
Thus I deprecated that one. I guess the demo isn't part of the "official" API but there are surely people who use it for more than just testing. PR: 28187 git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150395 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
fecb54ff64
commit
35d5406541
|
@ -61,19 +61,26 @@ public class HTMLDocument {
|
||||||
// tokenized prior to indexing.
|
// tokenized prior to indexing.
|
||||||
doc.add(new Field("uid", uid(f), false, true, false));
|
doc.add(new Field("uid", uid(f), false, true, false));
|
||||||
|
|
||||||
HTMLParser parser = new HTMLParser(f);
|
FileInputStream fis = null;
|
||||||
|
try {
|
||||||
|
fis = new FileInputStream(f);
|
||||||
|
HTMLParser parser = new HTMLParser(fis);
|
||||||
|
|
||||||
|
// Add the tag-stripped contents as a Reader-valued Text field so it will
|
||||||
|
// get tokenized and indexed.
|
||||||
|
doc.add(Field.Text("contents", parser.getReader()));
|
||||||
|
|
||||||
// Add the tag-stripped contents as a Reader-valued Text field so it will
|
// Add the summary as an UnIndexed field, so that it is stored and returned
|
||||||
// get tokenized and indexed.
|
// with hit documents for display.
|
||||||
doc.add(Field.Text("contents", parser.getReader()));
|
doc.add(Field.UnIndexed("summary", parser.getSummary()));
|
||||||
|
|
||||||
// Add the summary as an UnIndexed field, so that it is stored and returned
|
// Add the title as a separate Text field, so that it can be searched
|
||||||
// with hit documents for display.
|
// separately.
|
||||||
doc.add(Field.UnIndexed("summary", parser.getSummary()));
|
doc.add(Field.Text("title", parser.getTitle()));
|
||||||
|
} finally {
|
||||||
// Add the title as a separate Text field, so that it can be searched
|
if (fis != null)
|
||||||
// separately.
|
fis.close();
|
||||||
doc.add(Field.Text("title", parser.getTitle()));
|
}
|
||||||
|
|
||||||
// return the document
|
// return the document
|
||||||
return doc;
|
return doc;
|
||||||
|
|
|
@ -40,6 +40,9 @@ public class HTMLParser implements HTMLParserConstants {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated Use HTMLParser(FileInputStream) instead
|
||||||
|
*/
|
||||||
public HTMLParser(File file) throws FileNotFoundException {
|
public HTMLParser(File file) throws FileNotFoundException {
|
||||||
this(new FileInputStream(file));
|
this(new FileInputStream(file));
|
||||||
}
|
}
|
||||||
|
@ -450,18 +453,18 @@ null)
|
||||||
finally { jj_save(1, xla); }
|
finally { jj_save(1, xla); }
|
||||||
}
|
}
|
||||||
|
|
||||||
final private boolean jj_3_1() {
|
|
||||||
if (jj_scan_token(ArgQuote1)) return true;
|
|
||||||
if (jj_scan_token(CloseQuote1)) return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
final private boolean jj_3_2() {
|
final private boolean jj_3_2() {
|
||||||
if (jj_scan_token(ArgQuote2)) return true;
|
if (jj_scan_token(ArgQuote2)) return true;
|
||||||
if (jj_scan_token(CloseQuote2)) return true;
|
if (jj_scan_token(CloseQuote2)) return true;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final private boolean jj_3_1() {
|
||||||
|
if (jj_scan_token(ArgQuote1)) return true;
|
||||||
|
if (jj_scan_token(CloseQuote1)) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
public HTMLParserTokenManager token_source;
|
public HTMLParserTokenManager token_source;
|
||||||
SimpleCharStream jj_input_stream;
|
SimpleCharStream jj_input_stream;
|
||||||
public Token token, jj_nt;
|
public Token token, jj_nt;
|
||||||
|
|
|
@ -104,6 +104,9 @@ public class HTMLParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated Use HTMLParser(FileInputStream) instead
|
||||||
|
*/
|
||||||
public HTMLParser(File file) throws FileNotFoundException {
|
public HTMLParser(File file) throws FileNotFoundException {
|
||||||
this(new FileInputStream(file));
|
this(new FileInputStream(file));
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,7 +19,7 @@ package org.apache.lucene.demo.html;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
||||||
class Test {
|
class Test {
|
||||||
public static void main(String[] argv) throws Exception {
|
public static void main(String[] argv) throws IOException, InterruptedException {
|
||||||
if ("-dir".equals(argv[0])) {
|
if ("-dir".equals(argv[0])) {
|
||||||
String[] files = new File(argv[1]).list();
|
String[] files = new File(argv[1]).list();
|
||||||
java.util.Arrays.sort(files);
|
java.util.Arrays.sort(files);
|
||||||
|
@ -32,12 +32,19 @@ class Test {
|
||||||
parse(new File(argv[0]));
|
parse(new File(argv[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void parse(File file) throws Exception {
|
public static void parse(File file) throws IOException, InterruptedException {
|
||||||
HTMLParser parser = new HTMLParser(file);
|
FileInputStream fis = null;
|
||||||
System.out.println("Title: " + Entities.encode(parser.getTitle()));
|
try {
|
||||||
System.out.println("Summary: " + Entities.encode(parser.getSummary()));
|
fis = new FileInputStream(file);
|
||||||
LineNumberReader reader = new LineNumberReader(parser.getReader());
|
HTMLParser parser = new HTMLParser(fis);
|
||||||
for (String l = reader.readLine(); l != null; l = reader.readLine())
|
System.out.println("Title: " + Entities.encode(parser.getTitle()));
|
||||||
System.out.println(l);
|
System.out.println("Summary: " + Entities.encode(parser.getSummary()));
|
||||||
|
System.out.println("Content:");
|
||||||
|
LineNumberReader reader = new LineNumberReader(parser.getReader());
|
||||||
|
for (String l = reader.readLine(); l != null; l = reader.readLine())
|
||||||
|
System.out.println(l);
|
||||||
|
} finally {
|
||||||
|
if (fis != null) fis.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue