my long promised index task. its in need of refactoring, but it does the job it was intended to do. docs and cleanup will follow.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150801 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2002-07-10 02:20:40 +00:00
parent 2bd4df4bcb
commit 6bf08ccff7
14 changed files with 1028 additions and 0 deletions

View File

@ -0,0 +1,121 @@
<?xml version="1.0"?>
<project name="lucene-ant" default="default">
<description>
Lucene Ant integration
</description>
<property name="build.dir" location="build"/>
<property name="build.classes.dir" location="${build.dir}/classes"/>
<property name="test.dir" location="${build.dir}/test"/>
<property name="test.classes.dir" location="${test.dir}/classes"/>
<property name="dist.dir" location="dist"/>
<property name="lucene-ant.jar" location="${dist.dir}/lucene-ant.jar"/>
<property name="jtidy.jar" location="lib/Tidy.jar"/>
<property name="junit.jar" location="${ant.home}/lib/junit.jar"/>
<property name="lucene.bin.dir" location="../../../jakarta-lucene/bin"/>
<property name="build.debug" value="true"/>
<property name="junit.fork" value="true"/>
<!-- ========================================================== -->
<!-- Datatype declarations -->
<!-- ========================================================== -->
<path id="compile.classpath">
<fileset dir="${lucene.bin.dir}" includes="lucene*.jar"/>
<pathelement location="${jtidy.jar}"/>
</path>
<path id="test.classpath">
<path refid="compile.classpath"/>
<pathelement location="${junit.jar}"/>
<pathelement location="${build.classes.dir}"/>
<pathelement location="${test.classes.dir}"/>
</path>
<target name="default" depends="test,dist"
description="build everything"
/>
<target name="init">
<echo message="Building ${ant.project.name}"/>
<tstamp/>
<mkdir dir="${build.dir}"/>
<mkdir dir="${build.classes.dir}"/>
<mkdir dir="${dist.dir}"/>
<mkdir dir="${test.dir}"/>
<mkdir dir="${test.classes.dir}"/>
</target>
<target name="clean"
description="Deletes all previous build artifacts">
<delete dir="${build.dir}"/>
<delete dir="${build.classes.dir}"/>
<delete dir="${dist.dir}"/>
<delete dir="${test.dir}"/>
<delete dir="${test.classes.dir}"/>
</target>
<target name="dist" depends="compile"
description="Create JAR">
<echo file="${build.classes.dir}/taskdef.properties">
index=org.apache.lucene.ant.IndexTask
</echo>
<jar destfile="${lucene-ant.jar}"
basedir="${build.classes.dir}"
/>
</target>
<target name="compile" depends="init">
<javac destdir="${build.classes.dir}"
debug="${build.debug}"
includeAntRuntime="yes"
srcdir="src/main"
classpathref="compile.classpath"
/>
</target>
<target name="test-compile" depends="compile">
<javac destdir="${test.classes.dir}"
debug="${build.debug}"
includeAntRuntime="yes"
srcdir="src/test"
classpathref="test.classpath"
/>
<copy todir="${test.classes.dir}">
<fileset dir="src/test" excludes="**/*.java"/>
</copy>
</target>
<target name="test" depends="test-compile">
<junit printsummary="no"
errorProperty="test.failed"
failureProperty="test.failed"
fork="${junit.fork}">
<classpath refid="test.classpath"/>
<sysproperty key="docs.dir" file="${test.classes.dir}"/>
<sysproperty key="index.dir" file="${test.dir}/index"/>
<formatter type="brief" usefile="false"/>
<test name="${testcase}" if="testcase"/>
<batchtest todir="${test.data.dir}" unless="testcase">
<fileset dir="${test.classes.dir}"
includes="**/*Test.class"
/>
</batchtest>
</junit>
<fail if="test.failed">
Unit tests failed. Check log or reports for details
</fail>
</target>
</project>

View File

@ -0,0 +1,2 @@
AnyObjectId[af4eed0506b53f17a4d22e4f1630ee03cb7991e5] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,23 @@
package org.apache.lucene.ant;
import java.io.File;
import org.apache.lucene.document.Document;
/**
* Allows a class to act as a Lucene document handler
*
*@author Erik Hatcher
*@created October 27, 2001
*/
public interface DocumentHandler {
/**
* Gets the document attribute of the DocumentHandler object
*
*@param file Description of Parameter
*@return The document value
*@throws DocumentHandlerException
*/
public Document getDocument(File file)
throws DocumentHandlerException;
}

View File

@ -0,0 +1,54 @@
package org.apache.lucene.ant;
import java.io.PrintStream;
import java.io.PrintWriter;
/**
*/
public class DocumentHandlerException extends Exception
{
private Throwable cause;
public DocumentHandlerException() {
super();
}
public DocumentHandlerException(String message) {
super(message);
}
public DocumentHandlerException(Throwable cause) {
super(cause.toString());
this.cause = cause;
}
public Throwable getException() {
return cause;
}
// Override stack trace methods to show original cause:
public void printStackTrace() {
printStackTrace(System.err);
}
public void printStackTrace(PrintStream ps) {
synchronized (ps) {
super.printStackTrace(ps);
if (cause != null) {
ps.println("--- Nested Exception ---");
cause.printStackTrace(ps);
}
}
}
public void printStackTrace(PrintWriter pw) {
synchronized (pw) {
super.printStackTrace(pw);
if (cause != null) {
pw.println("--- Nested Exception ---");
cause.printStackTrace(pw);
}
}
}
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.ant;
import java.io.File;
import org.apache.lucene.document.Document;
/**
* Decides which class used to create the Lucene Document
* object based on its file extension.
*
*@author Erik Hatcher
*@created October 28, 2001
*@todo Add dynamic file extension/classname mappings for
* extensibility
*/
public class FileExtensionDocumentHandler
implements DocumentHandler {
/**
* Gets the document attribute of the
* FileExtensionDocumentHandler object
*
*@param file Description of
* Parameter
*@return The document value
*@exception DocumentHandlerException Description of
* Exception
*/
public Document getDocument(File file)
throws DocumentHandlerException {
Document doc = null;
String name = file.getName();
try {
if (name.endsWith(".txt")) {
doc = TextDocument.Document(file);
}
if (name.endsWith(".html")) {
doc = HtmlDocument.Document(file);
}
}
catch (java.io.IOException e) {
throw new DocumentHandlerException(e);
}
return doc;
}
}

View File

@ -0,0 +1,232 @@
package org.apache.lucene.ant;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
// Imports commented out since there is a name clash and fully
// qualified class names will be used in the code. Imports are
// left for ease of maintenance.
import org.apache.lucene.document.Field;
//import org.apache.lucene.document.Document;
//import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;
/**
* The <code>HtmlDocument</code> class creates a Lucene {@link
* org.apache.lucene.document.Document} from an HTML document. <P>
*
* It does this by using JTidy package. It can take input input
* from {@link java.io.File} or {@link java.io.InputStream}.
*
*@author Erik Hatcher
*@created October 27, 2001
*/
public class HtmlDocument {
private Element rawDoc;
//-------------------------------------------------------------
// Constructors
//-------------------------------------------------------------
/**
* Constructs an <code>HtmlDocument</code> from a {@link
* java.io.File}.
*
*@param file the <code>File</code> containing the
* HTML to parse
*@exception IOException if an I/O exception occurs
*@since
*/
public HtmlDocument(File file) throws IOException {
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
org.w3c.dom.Document root =
tidy.parseDOM(new FileInputStream(file), null);
rawDoc = root.getDocumentElement();
}
/**
* Constructs an <code>HtmlDocument</code> from an {@link
* java.io.InputStream}.
*
*@param is the <code>InputStream</code>
* containing the HTML
*@exception IOException if I/O exception occurs
*@since
*/
public HtmlDocument(InputStream is) throws IOException {
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
org.w3c.dom.Document root = tidy.parseDOM(is, null);
rawDoc = root.getDocumentElement();
}
/**
* Creates a Lucene <code>Document</code> from an {@link
* java.io.InputStream}.
*
*@param is
*@return
*@exception IOException
*/
public static org.apache.lucene.document.Document
getDocument(InputStream is) throws IOException {
HtmlDocument htmlDoc = new HtmlDocument(is);
org.apache.lucene.document.Document luceneDoc =
new org.apache.lucene.document.Document();
luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
return luceneDoc;
}
//-------------------------------------------------------------
// Public methods
//-------------------------------------------------------------
/**
* Creates a Lucene <code>Document</code> from a {@link
* java.io.File}.
*
*@param file
*@return
*@exception IOException
*/
public static org.apache.lucene.document.Document
Document(File file) throws IOException {
HtmlDocument htmlDoc = new HtmlDocument(file);
org.apache.lucene.document.Document luceneDoc =
new org.apache.lucene.document.Document();
luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
String contents = null;
BufferedReader br =
new BufferedReader(new FileReader(file));
StringWriter sw = new StringWriter();
String line = br.readLine();
while (line != null) {
sw.write(line);
line = br.readLine();
}
br.close();
contents = sw.toString();
sw.close();
luceneDoc.add(Field.UnIndexed("rawcontents", contents));
return luceneDoc;
}
//-------------------------------------------------------------
// Private methods
//-------------------------------------------------------------
/**
* Runs <code>HtmlDocument</code> on the files specified on
* the command line.
*
*@param args Command line arguments
*@exception Exception Description of Exception
*/
private static void main(String args[]) throws Exception {
// HtmlDocument doc = new HtmlDocument(new File(args[0]));
// System.out.println("Title = " + doc.getTitle());
// System.out.println("Body = " + doc.getBody());
HtmlDocument doc =
new HtmlDocument(new FileInputStream(new File(args[0])));
System.out.println("Title = " + doc.getTitle());
System.out.println("Body = " + doc.getBody());
}
/**
* Gets the title attribute of the <code>HtmlDocument</code>
* object.
*
*@return the title value
*/
public String getTitle() {
if (rawDoc == null) {
return null;
}
String title = "";
NodeList nl = rawDoc.getElementsByTagName("title");
if (nl.getLength() > 0) {
Element titleElement = ((Element) nl.item(0));
Text text = (Text) titleElement.getFirstChild();
if (text != null) {
title = text.getData();
}
}
return title;
}
/**
* Gets the bodyText attribute of the
* <code>HtmlDocument</code> object.
*
*@return the bodyText value
*/
public String getBody() {
if (rawDoc == null) {
return null;
}
String body = "";
NodeList nl = rawDoc.getElementsByTagName("body");
if (nl.getLength() > 0) {
body = getBodyText(nl.item(0));
}
return body;
}
/**
* Gets the bodyText attribute of the
* <code>HtmlDocument</code> object.
*
*@param node a DOM Node
*@return The bodyText value
*/
private String getBodyText(Node node) {
NodeList nl = node.getChildNodes();
StringBuffer buffer = new StringBuffer();
for (int i = 0; i < nl.getLength(); i++) {
Node child = nl.item(i);
switch (child.getNodeType()) {
case Node.ELEMENT_NODE:
buffer.append(getBodyText(child));
buffer.append(" ");
break;
case Node.TEXT_NODE:
buffer.append(((Text) child).getData());
break;
}
}
return buffer.toString();
}
}

View File

@ -0,0 +1,286 @@
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.DirectoryScanner;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.Task;
import org.apache.tools.ant.types.FileSet;
/**
* Builds a Lucene index from a fileset.
*
* @author Erik Hatcher
*/
public class IndexTask extends Task {
/**
* file list
*/
private Vector filesets = new Vector();
/**
* overwrite index?
*/
private boolean overwrite = false;
/**
* index path
*/
private File indexPath;
/**
* document handler classname
*/
private String handlerClassName =
"org.apache.lucene.ant.FileExtensionDocumentHandler";
/**
* document handler instance
*/
private DocumentHandler handler;
/**
* Lucene merge factor
*/
private int mergeFactor = 20;
/**
* Specifies the directory where the index will be stored
*
* @param indexPath The new index value
*/
public void setIndex(File indexPath) {
this.indexPath = indexPath;
}
/**
* Sets the mergeFactor attribute of the IndexTask object
*
*@param mergeFactor The new mergeFactor value
*/
public void setMergeFactor(int mergeFactor) {
this.mergeFactor = mergeFactor;
}
/**
* If true, index will be overwritten.
*
* @param overwrite The new overwrite value
*/
public void setOverwrite(boolean overwrite) {
this.overwrite = overwrite;
}
/**
* Classname of document handler.
*
* @param classname The new documentHandler value
*/
public void setDocumentHandler(String classname) {
handlerClassName = classname;
}
/**
* Adds a set of files.
*
* @param set FileSet to be added
*/
public void addFileset(FileSet set) {
filesets.addElement(set);
}
/**
* Begins the indexing
*
* @exception BuildException If an error occurs indexing the
* fileset
* @todo add classpath handling so handler does not
* have to be in system classpath
*/
public void execute() throws BuildException {
try {
Class clazz = Class.forName(handlerClassName);
handler = (DocumentHandler) clazz.newInstance();
}
catch (ClassNotFoundException cnfe) {
throw new BuildException(cnfe);
}
catch (InstantiationException ie) {
throw new BuildException(ie);
}
catch (IllegalAccessException iae) {
throw new BuildException(iae);
}
try {
indexDocs();
}
catch (IOException e) {
throw new BuildException(e);
}
}
/**
* index the fileset
*
* @exception IOException Description of Exception
* @todo refactor - definitely lots of room for improvement here
*/
private void indexDocs() throws IOException {
Date start = new Date();
boolean create = overwrite;
// If the index directory doesn't exist,
// create it and force create mode
if (indexPath.mkdirs() && !overwrite) {
create = true;
}
Searcher searcher = null;
Analyzer analyzer = new StopAnalyzer();
boolean checkLastModified = false;
if (!create) {
try {
searcher = new IndexSearcher(indexPath.getAbsolutePath());
checkLastModified = true;
}
catch (IOException ioe) {
log("IOException: " + ioe.getMessage());
// Empty - ignore, which indicates to index all
// documents
}
}
log("checkLastModified = " + checkLastModified);
IndexWriter writer =
new IndexWriter(indexPath, analyzer, create);
int totalFiles = 0;
int totalIndexed = 0;
int totalIgnored = 0;
try {
writer.mergeFactor = mergeFactor;
for (int i = 0; i < filesets.size(); i++) {
FileSet fs = (FileSet) filesets.elementAt(i);
if (fs != null) {
DirectoryScanner ds =
fs.getDirectoryScanner(project);
String[] dsfiles = ds.getIncludedFiles();
File baseDir = ds.getBasedir();
for (int j = 0; j < dsfiles.length; j++) {
File file = new File(baseDir, dsfiles[j]);
totalFiles++;
if (!file.exists() || !file.canRead()) {
throw new BuildException("File \"" +
file.getAbsolutePath()
+ "\" does not exist or is not readable.");
}
boolean indexIt = true;
if (checkLastModified) {
Hits hits = null;
Term pathTerm =
new Term("path", file.getPath());
TermQuery query =
new TermQuery(pathTerm);
hits = searcher.search(query);
// if document is found, compare the
// indexed last modified time with the
// current file
// - don't index if up to date
if (hits.length() > 0) {
Document doc = hits.doc(0);
String indexModified =
doc.get("modified");
if (indexModified != null) {
if (DateField.stringToTime(indexModified)
== file.lastModified()) {
indexIt = false;
}
}
}
}
if (indexIt) {
try {
log("Indexing " + file.getPath(),
Project.MSG_VERBOSE);
Document doc =
handler.getDocument(file);
if (doc == null) {
totalIgnored++;
}
else {
// Add the path of the file as a field named "path". Use a Text field, so
// that the index stores the path, and so that the path is searchable
doc.add(Field.Keyword("path", file.getPath()));
// Add the last modified date of the file a field named "modified". Use a
// Keyword field, so that it's searchable, but so that no attempt is made
// to tokenize the field into words.
doc.add(Field.Keyword("modified",
DateField.timeToString(file.lastModified())));
writer.addDocument(doc);
totalIndexed++;
}
}
catch (DocumentHandlerException e) {
throw new BuildException(e);
}
}
}
// for j
}
// if (fs != null)
}
// for i
writer.optimize();
}
//try
finally {
// always make sure everything gets closed,
// no matter how we exit.
writer.close();
if (searcher != null) {
searcher.close();
}
}
Date end = new Date();
log(totalIndexed + " out of " + totalFiles + " indexed (" +
totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
" milliseconds");
}
}

View File

@ -0,0 +1,82 @@
package org.apache.lucene.ant;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/**
* A utility for making Lucene Documents from a File.
*
*@author Erik Hatcher
*@created December 6, 2001
*@todo Fix JavaDoc comments here
*/
public class TextDocument {
private String contents;
/**
* Constructor for the TextDocument object
*
*@param file Description of Parameter
*@exception IOException Description of Exception
*/
public TextDocument(File file) throws IOException {
BufferedReader br =
new BufferedReader(new FileReader(file));
StringWriter sw = new StringWriter();
String line = br.readLine();
while (line != null) {
sw.write(line);
line = br.readLine();
}
br.close();
contents = sw.toString();
sw.close();
}
/**
* Makes a document for a File. <p>
*
* The document has a single field:
* <ul>
* <li> <code>contents</code>--containing the full contents
* of the file, as a Text field;
*
*@param f Description of Parameter
*@return Description of the Returned Value
*@exception IOException Description of Exception
*/
public static Document Document(File f) throws IOException {
TextDocument textDoc = new TextDocument(f);
// make a new, empty document
Document doc = new Document();
doc.add(Field.Text("contents", textDoc.getContents()));
doc.add(Field.UnIndexed("rawcontents",
textDoc.getContents()));
// return the document
return doc;
}
/**
*@return The contents value
*@todo finish this method
*/
public String getContents() {
return contents;
}
}

View File

@ -0,0 +1,22 @@
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import junit.framework.TestCase;
public abstract class DocumentTestCase extends TestCase
{
public DocumentTestCase(String name) {
super(name);
}
protected File getFile(String filename) throws IOException {
String fullname =
this.getClass().getResource(filename).getFile();
File file = new File(fullname);
return file;
}
}

View File

@ -0,0 +1,29 @@
package org.apache.lucene.ant;
import java.io.IOException;
import org.apache.lucene.ant.DocumentTestCase;
import org.apache.lucene.ant.HtmlDocument;
public class HtmlDocumentTest extends DocumentTestCase
{
public HtmlDocumentTest (String name) {
super(name);
}
HtmlDocument doc;
public void setUp() throws IOException {
doc = new HtmlDocument(getFile("test.html"));
}
public void testDoc() {
assertEquals("Title", "Test Title", doc.getTitle());
assertTrue("Body", doc.getBody().startsWith("This is some test"));
}
public void tearDown() {
doc = null;
}
}

View File

@ -0,0 +1,92 @@
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.ant.IndexTask;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.types.FileSet;
/**
* Test cases for index task
*
*@author Erik Hatcher
*/
public class IndexTaskTest extends TestCase {
private final static String docHandler =
"org.apache.lucene.ant.FileExtensionDocumentHandler";
private String docsDir = System.getProperty("docs.dir");
private String indexDir = System.getProperty("index.dir");
private Searcher searcher;
private Analyzer analyzer;
/**
* Constructor for the IndexTaskTest object
*
*@param name Description of Parameter
*/
public IndexTaskTest(String name) {
super(name);
}
/**
* The JUnit setup method
*
*@exception IOException Description of Exception
*/
public void setUp() throws IOException {
Project project = new Project();
IndexTask task = new IndexTask();
FileSet fs = new FileSet();
fs.setDir(new File(docsDir));
task.addFileset(fs);
task.setOverwrite(true);
task.setDocumentHandler(docHandler);
task.setIndex(new File(indexDir));
task.setProject(project);
task.execute();
searcher = new IndexSearcher(indexDir);
analyzer = new StopAnalyzer();
}
/**
* A unit test for JUnit
*/
public void testSearch() throws IOException, ParseException {
System.out.println("sysout");
System.err.println("syserr");
Query query = QueryParser.parse("test", "contents", analyzer);
Hits hits = searcher.search(query);
assertEquals("Find document(s)", 2, hits.length());
}
/**
* The teardown method for JUnit
* @todo remove indexDir?
*/
public void tearDown() throws IOException {
searcher.close();
}
}

View File

@ -0,0 +1,28 @@
package org.apache.lucene.ant;
import java.io.IOException;
import org.apache.lucene.ant.DocumentTestCase;
import org.apache.lucene.ant.TextDocument;
public class TextDocumentTest extends DocumentTestCase
{
public TextDocumentTest (String name) {
super(name);
}
TextDocument doc;
public void setUp() throws IOException {
doc = new TextDocument(getFile("test.txt"));
}
public void testDoc() {
assertEquals("Contents", "Test Contents", doc.getContents());
}
public void tearDown() {
doc = null;
}
}

View File

@ -0,0 +1,7 @@
<html>
<head>
<title>Test Title</title>
</head>
<body>
<i>This is <b>some</b>test</i>
</body>

View File

@ -0,0 +1 @@
Test Contents