my long promised index task. its in need of refactoring, but it does the job it was intended to do. docs and cleanup will follow.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150801 13f79535-47bb-0310-9956-ffa450edef68
2002-07-10 02:20:40 +00:00 · 2002-07-10 02:20:40 +00:00 · 6bf08ccff7
parent 2bd4df4bcb
commit 6bf08ccff7
14 changed files with 1028 additions and 0 deletions
--- a/sandbox/projects/ant/build.xml
+++ b/sandbox/projects/ant/build.xml
@ -0,0 +1,121 @@
+<?xml version="1.0"?>
+
+<project name="lucene-ant" default="default">
+
+  <description>
+    Lucene Ant integration
+  </description>
+
+  <property name="build.dir" location="build"/>
+  <property name="build.classes.dir" location="${build.dir}/classes"/>
+  <property name="test.dir" location="${build.dir}/test"/>
+  <property name="test.classes.dir" location="${test.dir}/classes"/>
+  <property name="dist.dir" location="dist"/>
+  <property name="lucene-ant.jar" location="${dist.dir}/lucene-ant.jar"/>
+
+  <property name="jtidy.jar" location="lib/Tidy.jar"/>
+  <property name="junit.jar" location="${ant.home}/lib/junit.jar"/>
+  <property name="lucene.bin.dir" location="../../../jakarta-lucene/bin"/>
+
+  <property name="build.debug" value="true"/>
+  <property name="junit.fork"  value="true"/>
+
+  <!-- ========================================================== -->
+  <!-- Datatype declarations                                      -->
+  <!-- ========================================================== -->
+  <path id="compile.classpath">
+    <fileset dir="${lucene.bin.dir}" includes="lucene*.jar"/>
+    <pathelement location="${jtidy.jar}"/>
+  </path>
+
+  <path id="test.classpath">
+    <path refid="compile.classpath"/>
+    <pathelement location="${junit.jar}"/>
+    <pathelement location="${build.classes.dir}"/>
+    <pathelement location="${test.classes.dir}"/>
+  </path>
+
+
+  <target name="default" depends="test,dist"
+          description="build everything"
+  />
+
+  <target name="init">
+    <echo message="Building ${ant.project.name}"/>
+    <tstamp/>
+
+    <mkdir dir="${build.dir}"/>
+    <mkdir dir="${build.classes.dir}"/>
+    <mkdir dir="${dist.dir}"/>
+
+    <mkdir dir="${test.dir}"/>
+    <mkdir dir="${test.classes.dir}"/>
+  </target>
+
+  <target name="clean"
+          description="Deletes all previous build artifacts">
+    <delete dir="${build.dir}"/>
+    <delete dir="${build.classes.dir}"/>
+    <delete dir="${dist.dir}"/>
+
+    <delete dir="${test.dir}"/>
+    <delete dir="${test.classes.dir}"/>
+  </target>
+
+  <target name="dist" depends="compile"
+          description="Create JAR">
+    <echo file="${build.classes.dir}/taskdef.properties">
+      index=org.apache.lucene.ant.IndexTask
+    </echo>
+    <jar destfile="${lucene-ant.jar}"
+         basedir="${build.classes.dir}"
+    />
+  </target>
+
+  <target name="compile" depends="init">
+    <javac destdir="${build.classes.dir}"
+           debug="${build.debug}"
+           includeAntRuntime="yes"
+           srcdir="src/main"
+           classpathref="compile.classpath"
+    />
+  </target>
+
+  <target name="test-compile" depends="compile">
+    <javac destdir="${test.classes.dir}"
+           debug="${build.debug}"
+           includeAntRuntime="yes"
+           srcdir="src/test"
+           classpathref="test.classpath"
+    />
+
+    <copy todir="${test.classes.dir}">
+      <fileset dir="src/test" excludes="**/*.java"/>
+    </copy>
+  </target>
+
+  <target name="test" depends="test-compile">
+    <junit printsummary="no"
+           errorProperty="test.failed"
+           failureProperty="test.failed"
+           fork="${junit.fork}">
+      <classpath refid="test.classpath"/>
+      <sysproperty key="docs.dir" file="${test.classes.dir}"/>
+      <sysproperty key="index.dir" file="${test.dir}/index"/>
+      <formatter type="brief" usefile="false"/>
+      <test name="${testcase}" if="testcase"/>
+      <batchtest todir="${test.data.dir}" unless="testcase">
+        <fileset dir="${test.classes.dir}"
+                 includes="**/*Test.class"
+        />
+      </batchtest>
+    </junit>
+
+    <fail if="test.failed">
+      Unit tests failed.  Check log or reports for details
+    </fail>
+
+  </target>
+
+
+</project>
--- a/sandbox/projects/ant/lib/Tidy.jar
+++ b/sandbox/projects/ant/lib/Tidy.jar
@ -0,0 +1,2 @@
+AnyObjectId[af4eed0506b53f17a4d22e4f1630ee03cb7991e5] was removed in git history.
+Apache SVN contains full history.
--- a/sandbox/projects/ant/src/main/org/apache/lucene/ant/DocumentHandler.java
+++ b/sandbox/projects/ant/src/main/org/apache/lucene/ant/DocumentHandler.java
@ -0,0 +1,23 @@
+package org.apache.lucene.ant;
+
+import java.io.File;
+import org.apache.lucene.document.Document;
+
+/**
+ *  Allows a class to act as a Lucene document handler
+ *
+ *@author     Erik Hatcher
+ *@created    October 27, 2001
+ */
+public interface DocumentHandler {
+    /**
+     *  Gets the document attribute of the DocumentHandler object
+     *
+     *@param  file  Description of Parameter
+     *@return       The document value
+     *@throws DocumentHandlerException
+     */
+    public Document getDocument(File file)
+                                  throws DocumentHandlerException;
+}
+
--- a/sandbox/projects/ant/src/main/org/apache/lucene/ant/DocumentHandlerException.java
+++ b/sandbox/projects/ant/src/main/org/apache/lucene/ant/DocumentHandlerException.java
@ -0,0 +1,54 @@
+package org.apache.lucene.ant;
+
+import java.io.PrintStream;
+import java.io.PrintWriter;
+
+/**
+ */
+public class DocumentHandlerException extends Exception
+{
+    private Throwable cause;
+    
+    public DocumentHandlerException() {
+        super();
+    }
+    
+    public DocumentHandlerException(String message) {
+        super(message);
+    }
+    
+    public DocumentHandlerException(Throwable cause) {
+        super(cause.toString());
+        this.cause = cause;
+    }
+    
+    public Throwable getException() {
+        return cause;
+    }
+
+    // Override stack trace methods to show original cause:
+    public void printStackTrace() {
+        printStackTrace(System.err);
+    }
+    
+    public void printStackTrace(PrintStream ps) {
+        synchronized (ps) {
+            super.printStackTrace(ps);
+            if (cause != null) {
+                ps.println("--- Nested Exception ---");
+                cause.printStackTrace(ps);
+            }
+        }
+    }
+    
+    public void printStackTrace(PrintWriter pw) {
+        synchronized (pw) {
+            super.printStackTrace(pw);
+            if (cause != null) {
+                pw.println("--- Nested Exception ---");
+                cause.printStackTrace(pw);
+            }
+        }
+    }
+}
+
--- a/sandbox/projects/ant/src/main/org/apache/lucene/ant/FileExtensionDocumentHandler.java
+++ b/sandbox/projects/ant/src/main/org/apache/lucene/ant/FileExtensionDocumentHandler.java
@ -0,0 +1,49 @@
+package org.apache.lucene.ant;
+
+import java.io.File;
+import org.apache.lucene.document.Document;
+
+/**
+ *  Decides which class used to create the Lucene Document
+ *  object based on its file extension.
+ *
+ *@author     Erik Hatcher
+ *@created    October 28, 2001
+ *@todo Add dynamic file extension/classname mappings for
+ *      extensibility
+ */
+public class FileExtensionDocumentHandler
+                                       implements DocumentHandler {
+    /**
+     *  Gets the document attribute of the
+     *  FileExtensionDocumentHandler object
+     *
+     *@param  file                          Description of
+     *      Parameter
+     *@return                               The document value
+     *@exception  DocumentHandlerException  Description of
+     *      Exception
+     */
+    public Document getDocument(File file)
+                                  throws DocumentHandlerException {
+        Document doc = null;
+
+        String name = file.getName();
+
+        try {
+            if (name.endsWith(".txt")) {
+                doc = TextDocument.Document(file);
+            }
+
+            if (name.endsWith(".html")) {
+                doc = HtmlDocument.Document(file);
+            }
+        }
+        catch (java.io.IOException e) {
+            throw new DocumentHandlerException(e);
+        }
+
+        return doc;
+    }
+}
+
--- a/sandbox/projects/ant/src/main/org/apache/lucene/ant/HtmlDocument.java
+++ b/sandbox/projects/ant/src/main/org/apache/lucene/ant/HtmlDocument.java
@ -0,0 +1,232 @@
+package org.apache.lucene.ant;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+
+// Imports commented out since there is a name clash and fully
+// qualified class names will be used in the code.  Imports are
+// left for ease of maintenance.
+import org.apache.lucene.document.Field;
+//import org.apache.lucene.document.Document;
+//import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Text;
+import org.w3c.tidy.Tidy;
+
+/**
+ *  The <code>HtmlDocument</code> class creates a Lucene {@link
+ *  org.apache.lucene.document.Document} from an HTML document. <P>
+ *
+ *  It does this by using JTidy package. It can take input input
+ *  from {@link java.io.File} or {@link java.io.InputStream}.
+ *
+ *@author     Erik Hatcher
+ *@created    October 27, 2001
+ */
+public class HtmlDocument {
+    private Element rawDoc;
+
+
+    //-------------------------------------------------------------
+    // Constructors
+    //-------------------------------------------------------------
+
+    /**
+     *  Constructs an <code>HtmlDocument</code> from a {@link
+     *  java.io.File}.
+     *
+     *@param  file             the <code>File</code> containing the
+     *      HTML to parse
+     *@exception  IOException  if an I/O exception occurs
+     *@since
+     */
+    public HtmlDocument(File file) throws IOException {
+        Tidy tidy = new Tidy();
+        tidy.setQuiet(true);
+        tidy.setShowWarnings(false);
+        org.w3c.dom.Document root = 
+                    tidy.parseDOM(new FileInputStream(file), null);
+        rawDoc = root.getDocumentElement();
+    }
+
+
+    /**
+     *  Constructs an <code>HtmlDocument</code> from an {@link
+     *  java.io.InputStream}.
+     *
+     *@param  is               the <code>InputStream</code>
+     *      containing the HTML
+     *@exception  IOException  if I/O exception occurs
+     *@since
+     */
+    public HtmlDocument(InputStream is) throws IOException {
+        Tidy tidy = new Tidy();
+        tidy.setQuiet(true);
+        tidy.setShowWarnings(false);
+        org.w3c.dom.Document root = tidy.parseDOM(is, null);
+        rawDoc = root.getDocumentElement();
+    }
+
+
+    /**
+     *  Creates a Lucene <code>Document</code> from an {@link
+     *  java.io.InputStream}.
+     *
+     *@param  is
+     *@return
+     *@exception  IOException
+     */
+    public static org.apache.lucene.document.Document
+                   getDocument(InputStream is) throws IOException {
+        HtmlDocument htmlDoc = new HtmlDocument(is);
+        org.apache.lucene.document.Document luceneDoc =
+                new org.apache.lucene.document.Document();
+
+        luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
+        luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
+
+        return luceneDoc;
+    }
+
+
+    //-------------------------------------------------------------
+    // Public methods
+    //-------------------------------------------------------------
+
+    /**
+     *  Creates a Lucene <code>Document</code> from a {@link
+     *  java.io.File}.
+     *
+     *@param  file
+     *@return
+     *@exception  IOException
+     */
+    public static org.apache.lucene.document.Document
+                           Document(File file) throws IOException {
+        HtmlDocument htmlDoc = new HtmlDocument(file);
+        org.apache.lucene.document.Document luceneDoc =
+                new org.apache.lucene.document.Document();
+
+        luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
+        luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
+
+        String contents = null;
+        BufferedReader br =
+                          new BufferedReader(new FileReader(file));
+        StringWriter sw = new StringWriter();
+        String line = br.readLine();
+        while (line != null) {
+            sw.write(line);
+            line = br.readLine();
+        }
+        br.close();
+        contents = sw.toString();
+        sw.close();
+
+        luceneDoc.add(Field.UnIndexed("rawcontents", contents));
+
+        return luceneDoc;
+    }
+
+
+    //-------------------------------------------------------------
+    // Private methods
+    //-------------------------------------------------------------
+
+    /**
+     *  Runs <code>HtmlDocument</code> on the files specified on
+     *  the command line.
+     *
+     *@param  args           Command line arguments
+     *@exception  Exception  Description of Exception
+     */
+    private static void main(String args[]) throws Exception {
+//         HtmlDocument doc = new HtmlDocument(new File(args[0]));
+//         System.out.println("Title = " + doc.getTitle());
+//         System.out.println("Body  = " + doc.getBody());
+
+        HtmlDocument doc =
+          new HtmlDocument(new FileInputStream(new File(args[0])));
+        System.out.println("Title = " + doc.getTitle());
+        System.out.println("Body  = " + doc.getBody());
+    }
+
+
+    /**
+     *  Gets the title attribute of the <code>HtmlDocument</code>
+     *  object.
+     *
+     *@return    the title value
+     */
+    public String getTitle() {
+        if (rawDoc == null) {
+            return null;
+        }
+
+        String title = "";
+
+        NodeList nl = rawDoc.getElementsByTagName("title");
+        if (nl.getLength() > 0) {
+            Element titleElement = ((Element) nl.item(0));
+            Text text = (Text) titleElement.getFirstChild();
+            if (text != null) {
+                title = text.getData();
+            }
+        }
+        return title;
+    }
+
+
+    /**
+     *  Gets the bodyText attribute of the
+     *  <code>HtmlDocument</code> object.
+     *
+     *@return    the bodyText value
+     */
+    public String getBody() {
+        if (rawDoc == null) {
+            return null;
+        }
+
+        String body = "";
+        NodeList nl = rawDoc.getElementsByTagName("body");
+        if (nl.getLength() > 0) {
+            body = getBodyText(nl.item(0));
+        }
+        return body;
+    }
+
+
+    /**
+     *  Gets the bodyText attribute of the
+     *  <code>HtmlDocument</code> object.
+     *
+     *@param  node  a DOM Node
+     *@return       The bodyText value
+     */
+    private String getBodyText(Node node) {
+        NodeList nl = node.getChildNodes();
+        StringBuffer buffer = new StringBuffer();
+        for (int i = 0; i < nl.getLength(); i++) {
+            Node child = nl.item(i);
+            switch (child.getNodeType()) {
+                case Node.ELEMENT_NODE:
+                    buffer.append(getBodyText(child));
+                    buffer.append(" ");
+                    break;
+                case Node.TEXT_NODE:
+                    buffer.append(((Text) child).getData());
+                    break;
+            }
+        }
+        return buffer.toString();
+    }
+}
+
--- a/sandbox/projects/ant/src/main/org/apache/lucene/ant/IndexTask.java
+++ b/sandbox/projects/ant/src/main/org/apache/lucene/ant/IndexTask.java
@ -0,0 +1,286 @@
+package org.apache.lucene.ant;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Date;
+import java.util.Vector;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.document.DateField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.TermQuery;
+
+import org.apache.tools.ant.BuildException;
+import org.apache.tools.ant.DirectoryScanner;
+import org.apache.tools.ant.Project;
+import org.apache.tools.ant.Task;
+import org.apache.tools.ant.types.FileSet;
+
+/**
+ * Builds a Lucene index from a fileset.
+ *
+ * @author     Erik Hatcher
+ */
+public class IndexTask extends Task {
+    /**
+     *  file list
+     */
+    private Vector filesets = new Vector();
+    
+    /**
+     *  overwrite index?
+     */
+    private boolean overwrite = false;
+
+    /**
+     *  index path
+     */
+    private File indexPath;
+
+    /**
+     *  document handler classname
+     */
+    private String handlerClassName =
+            "org.apache.lucene.ant.FileExtensionDocumentHandler";
+
+    /**
+     *  document handler instance
+     */
+    private DocumentHandler handler;
+
+    /**
+     *  Lucene merge factor
+     */
+    private int mergeFactor = 20;
+
+
+    /**
+     *  Specifies the directory where the index will be stored
+     *
+     * @param  indexPath  The new index value
+     */
+    public void setIndex(File indexPath) {
+        this.indexPath = indexPath;
+    }
+
+    /**
+     *  Sets the mergeFactor attribute of the IndexTask object
+     *
+     *@param  mergeFactor  The new mergeFactor value
+     */
+    public void setMergeFactor(int mergeFactor) {
+        this.mergeFactor = mergeFactor;
+    }
+
+
+    /**
+     * If true, index will be overwritten.
+     *
+     * @param  overwrite  The new overwrite value
+     */
+    public void setOverwrite(boolean overwrite) {
+        this.overwrite = overwrite;
+    }
+
+
+    /**
+     * Classname of document handler.
+     *
+     * @param  classname  The new documentHandler value
+     */
+    public void setDocumentHandler(String classname) {
+        handlerClassName = classname;
+    }
+
+
+    /**
+     *  Adds a set of files.
+     *
+     * @param  set  FileSet to be added
+     */
+    public void addFileset(FileSet set) {
+        filesets.addElement(set);
+    }
+
+
+    /**
+     *  Begins the indexing
+     *
+     * @exception  BuildException  If an error occurs indexing the
+     *      fileset
+     * @todo add classpath handling so handler does not
+     *       have to be in system classpath
+     */
+    public void execute() throws BuildException {
+        try {
+            Class clazz = Class.forName(handlerClassName);
+            handler = (DocumentHandler) clazz.newInstance();
+        }
+        catch (ClassNotFoundException cnfe) {
+            throw new BuildException(cnfe);
+        }
+        catch (InstantiationException ie) {
+            throw new BuildException(ie);
+        }
+        catch (IllegalAccessException iae) {
+            throw new BuildException(iae);
+        }
+
+        try {
+            indexDocs();
+        }
+        catch (IOException e) {
+            throw new BuildException(e);
+        }
+    }
+
+
+    /**
+     *  index the fileset
+     *
+     * @exception  IOException  Description of Exception
+     * @todo refactor - definitely lots of room for improvement here
+     */
+    private void indexDocs() throws IOException {
+        Date start = new Date();
+
+        boolean create = overwrite;
+        // If the index directory doesn't exist,
+        // create it and force create mode
+        if (indexPath.mkdirs() && !overwrite) {
+            create = true;
+        }
+
+        Searcher searcher = null;
+        Analyzer analyzer = new StopAnalyzer();
+        boolean checkLastModified = false;
+        if (!create) {
+            try {
+                searcher = new IndexSearcher(indexPath.getAbsolutePath());
+                checkLastModified = true;
+            }
+            catch (IOException ioe) {
+                log("IOException: " + ioe.getMessage());
+                // Empty - ignore, which indicates to index all
+                // documents
+            }
+        }
+
+        log("checkLastModified = " + checkLastModified);
+
+        IndexWriter writer =
+                       new IndexWriter(indexPath, analyzer, create);
+        int totalFiles = 0;
+        int totalIndexed = 0;
+        int totalIgnored = 0;
+        try {
+            writer.mergeFactor = mergeFactor;
+
+            for (int i = 0; i < filesets.size(); i++) {
+                FileSet fs = (FileSet) filesets.elementAt(i);
+                if (fs != null) {
+                    DirectoryScanner ds =
+                                   fs.getDirectoryScanner(project);
+                    String[] dsfiles = ds.getIncludedFiles();
+                    File baseDir = ds.getBasedir();
+
+                    for (int j = 0; j < dsfiles.length; j++) {
+                        File file = new File(baseDir, dsfiles[j]);
+                        totalFiles++;
+
+                        if (!file.exists() || !file.canRead()) {
+                            throw new BuildException("File \"" +
+                        file.getAbsolutePath()
+                        + "\" does not exist or is not readable.");
+                        }
+
+                        boolean indexIt = true;
+
+                        if (checkLastModified) {
+                            Hits hits = null;
+                            Term pathTerm = 
+                                  new Term("path", file.getPath());
+                            TermQuery query =
+                                           new TermQuery(pathTerm);
+                            hits = searcher.search(query);
+
+                            // if document is found, compare the
+                            // indexed last modified time with the
+                            // current file
+                            // - don't index if up to date
+                            if (hits.length() > 0) {
+                                Document doc = hits.doc(0);
+                                String indexModified =
+                                               doc.get("modified");
+                                if (indexModified != null) {
+                                    if (DateField.stringToTime(indexModified)
+                                             == file.lastModified()) {
+                                        indexIt = false;
+                                    }
+                                }
+                            }
+                        }
+
+                        if (indexIt) {
+                            try {
+                                log("Indexing " + file.getPath(),
+                                    Project.MSG_VERBOSE);
+                                Document doc =
+                                         handler.getDocument(file);
+
+                                if (doc == null) {
+                                    totalIgnored++;
+                                }
+                                else {
+                                    // Add the path of the file as a field named "path".  Use a Text field, so
+                                    // that the index stores the path, and so that the path is searchable
+                                    doc.add(Field.Keyword("path", file.getPath()));
+
+                                    // Add the last modified date of the file a field named "modified".  Use a
+                                    // Keyword field, so that it's searchable, but so that no attempt is made
+                                    // to tokenize the field into words.
+                                    doc.add(Field.Keyword("modified",
+                                            DateField.timeToString(file.lastModified())));
+
+                                    writer.addDocument(doc);
+                                    totalIndexed++;
+                                }
+                            }
+                            catch (DocumentHandlerException e) {
+                                throw new BuildException(e);
+                            }
+                        }
+                    }
+                    // for j
+                }
+                // if (fs != null)
+            }
+            // for i
+
+            writer.optimize();
+        }
+        //try
+        finally {
+            // always make sure everything gets closed,
+            // no matter how we exit.
+            writer.close();
+            if (searcher != null) {
+                searcher.close();
+            }
+        }
+
+        Date end = new Date();
+
+        log(totalIndexed + " out of " + totalFiles + " indexed (" +
+                totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
+                " milliseconds");
+    }
+}
+
--- a/sandbox/projects/ant/src/main/org/apache/lucene/ant/TextDocument.java
+++ b/sandbox/projects/ant/src/main/org/apache/lucene/ant/TextDocument.java
@ -0,0 +1,82 @@
+package org.apache.lucene.ant;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.StringWriter;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+
+/**
+ *  A utility for making Lucene Documents from a File.
+ *
+ *@author     Erik Hatcher
+ *@created    December 6, 2001
+ *@todo       Fix JavaDoc comments here
+ */
+
+public class TextDocument {
+    private String contents;
+
+
+    /**
+     *  Constructor for the TextDocument object
+     *
+     *@param  file             Description of Parameter
+     *@exception  IOException  Description of Exception
+     */
+    public TextDocument(File file) throws IOException {
+        BufferedReader br =
+                new BufferedReader(new FileReader(file));
+        StringWriter sw = new StringWriter();
+
+        String line = br.readLine();
+        while (line != null) {
+            sw.write(line);
+            line = br.readLine();
+        }
+        br.close();
+
+        contents = sw.toString();
+        sw.close();
+    }
+
+
+    /**
+     *  Makes a document for a File. <p>
+     *
+     *  The document has a single field:
+     *  <ul>
+     *    <li> <code>contents</code>--containing the full contents
+     *    of the file, as a Text field;
+     *
+     *@param  f                Description of Parameter
+     *@return                  Description of the Returned Value
+     *@exception  IOException  Description of Exception
+     */
+    public static Document Document(File f) throws IOException {
+
+        TextDocument textDoc = new TextDocument(f);
+        // make a new, empty document
+        Document doc = new Document();
+
+        doc.add(Field.Text("contents", textDoc.getContents()));
+        doc.add(Field.UnIndexed("rawcontents", 
+                                           textDoc.getContents()));
+
+        // return the document
+        return doc;
+    }
+
+
+    /**
+     *@return    The contents value
+     *@todo      finish this method
+     */
+    public String getContents() {
+        return contents;
+    }
+}
+
--- a/sandbox/projects/ant/src/test/org/apache/lucene/ant/DocumentTestCase.java
+++ b/sandbox/projects/ant/src/test/org/apache/lucene/ant/DocumentTestCase.java
@ -0,0 +1,22 @@
+package org.apache.lucene.ant;
+
+import java.io.File;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+public abstract class DocumentTestCase extends TestCase
+{
+    public DocumentTestCase(String name) {
+        super(name);
+    }
+    
+    protected File getFile(String filename) throws IOException {
+        String fullname =
+                     this.getClass().getResource(filename).getFile();
+                     
+        File file = new File(fullname);
+        
+        return file;
+    }
+}
--- a/sandbox/projects/ant/src/test/org/apache/lucene/ant/HtmlDocumentTest.java
+++ b/sandbox/projects/ant/src/test/org/apache/lucene/ant/HtmlDocumentTest.java
@ -0,0 +1,29 @@
+package org.apache.lucene.ant;
+
+import java.io.IOException;
+
+import org.apache.lucene.ant.DocumentTestCase;
+import org.apache.lucene.ant.HtmlDocument;
+
+public class HtmlDocumentTest extends DocumentTestCase
+{
+    public HtmlDocumentTest (String name) {
+        super(name);
+    }
+    
+    HtmlDocument doc;
+    
+    public void setUp() throws IOException {
+        doc = new HtmlDocument(getFile("test.html"));
+    }
+    
+    public void testDoc() {
+        assertEquals("Title", "Test Title", doc.getTitle());
+        assertTrue("Body", doc.getBody().startsWith("This is some test"));
+    }
+    
+    public void tearDown() {
+        doc = null;
+    }
+}
+
--- a/sandbox/projects/ant/src/test/org/apache/lucene/ant/IndexTaskTest.java
+++ b/sandbox/projects/ant/src/test/org/apache/lucene/ant/IndexTaskTest.java
@ -0,0 +1,92 @@
+package org.apache.lucene.ant;
+
+import java.io.File;
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.ant.IndexTask;
+
+import org.apache.tools.ant.Project;
+import org.apache.tools.ant.types.FileSet;
+
+/**
+ *  Test cases for index task
+ *
+ *@author     Erik Hatcher
+ */
+public class IndexTaskTest extends TestCase {
+    private final static String docHandler =
+            "org.apache.lucene.ant.FileExtensionDocumentHandler";
+
+    private String docsDir = System.getProperty("docs.dir");
+    private String indexDir = System.getProperty("index.dir");
+    
+    private Searcher searcher;
+    private Analyzer analyzer;
+
+
+    /**
+     *  Constructor for the IndexTaskTest object
+     *
+     *@param  name  Description of Parameter
+     */
+    public IndexTaskTest(String name) {
+        super(name);
+    }
+
+
+    /**
+     *  The JUnit setup method
+     *
+     *@exception  IOException  Description of Exception
+     */
+    public void setUp() throws IOException {
+        Project project = new Project();
+
+        IndexTask task = new IndexTask();
+        FileSet fs = new FileSet();
+        fs.setDir(new File(docsDir));
+        task.addFileset(fs);
+        task.setOverwrite(true);
+        task.setDocumentHandler(docHandler);
+        task.setIndex(new File(indexDir));
+        task.setProject(project);
+        task.execute();
+
+        searcher = new IndexSearcher(indexDir);
+        analyzer = new StopAnalyzer();
+    }
+
+
+    /**
+     *  A unit test for JUnit
+     */
+    public void testSearch() throws IOException, ParseException {
+        System.out.println("sysout");
+        System.err.println("syserr");
+        Query query = QueryParser.parse("test", "contents", analyzer);
+
+        Hits hits = searcher.search(query);
+        
+        assertEquals("Find document(s)", 2, hits.length());
+    }
+
+    /**
+     *  The teardown method for JUnit
+     * @todo remove indexDir?
+     */
+    public void tearDown() throws IOException {
+        searcher.close();
+    }
+}
+
--- a/sandbox/projects/ant/src/test/org/apache/lucene/ant/TextDocumentTest.java
+++ b/sandbox/projects/ant/src/test/org/apache/lucene/ant/TextDocumentTest.java
@ -0,0 +1,28 @@
+package org.apache.lucene.ant;
+
+import java.io.IOException;
+
+import org.apache.lucene.ant.DocumentTestCase;
+import org.apache.lucene.ant.TextDocument;
+
+public class TextDocumentTest extends DocumentTestCase
+{
+    public TextDocumentTest (String name) {
+        super(name);
+    }
+    
+    TextDocument doc;
+    
+    public void setUp() throws IOException {
+        doc = new TextDocument(getFile("test.txt"));
+    }
+    
+    public void testDoc() {
+        assertEquals("Contents", "Test Contents", doc.getContents());
+    }
+    
+    public void tearDown() {
+        doc = null;
+    }
+}
+
--- a/sandbox/projects/ant/src/test/org/apache/lucene/ant/test.html
+++ b/sandbox/projects/ant/src/test/org/apache/lucene/ant/test.html
@ -0,0 +1,7 @@
+<html>
+  <head>
+    <title>Test Title</title>
+  </head>
+  <body>
+    <i>This is <b>some</b>test</i>
+  </body>
--- a/sandbox/projects/ant/src/test/org/apache/lucene/ant/test.txt
+++ b/sandbox/projects/ant/src/test/org/apache/lucene/ant/test.txt
@ -0,0 +1 @@
+Test Contents