mirror of https://github.com/apache/lucene.git
My long-promised index task. It's in need of refactoring, but it does the job it was intended to do. Docs and cleanup will follow.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150801 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
@ -0,0 +1,121 @@
<?xml version="1.0"?>
<project name="lucene-ant" default="default">
Lucene Ant integration
<property name="build.dir" location="build"/>
<property name="build.classes.dir" location="${build.dir}/classes"/>
<property name="test.dir" location="${build.dir}/test"/>
<property name="test.classes.dir" location="${test.dir}/classes"/>
<property name="dist.dir" location="dist"/>
<property name="lucene-ant.jar" location="${dist.dir}/lucene-ant.jar"/>
<property name="jtidy.jar" location="lib/Tidy.jar"/>
<property name="junit.jar" location="${ant.home}/lib/junit.jar"/>
<property name="lucene.bin.dir" location="../../../jakarta-lucene/bin"/>
<property name="build.debug" value="true"/>
<property name="junit.fork" value="true"/>
<!-- ========================================================== -->
<!-- Datatype declarations -->
<!-- ========================================================== -->
<path id="compile.classpath">
<fileset dir="${lucene.bin.dir}" includes="lucene*.jar"/>
<pathelement location="${jtidy.jar}"/>
<path id="test.classpath">
<path refid="compile.classpath"/>
<pathelement location="${junit.jar}"/>
<pathelement location="${build.classes.dir}"/>
<pathelement location="${test.classes.dir}"/>
<target name="default" depends="test,dist"
description="build everything"
<target name="init">
<echo message="Building ${ant.project.name}"/>
<mkdir dir="${build.dir}"/>
<mkdir dir="${build.classes.dir}"/>
<mkdir dir="${dist.dir}"/>
<mkdir dir="${test.dir}"/>
<mkdir dir="${test.classes.dir}"/>
<target name="clean"
description="Deletes all previous build artifacts">
<delete dir="${build.dir}"/>
<delete dir="${build.classes.dir}"/>
<delete dir="${dist.dir}"/>
<delete dir="${test.dir}"/>
<delete dir="${test.classes.dir}"/>
<target name="dist" depends="compile"
description="Create JAR">
<echo file="${build.classes.dir}/taskdef.properties">
<jar destfile="${lucene-ant.jar}"
<target name="compile" depends="init">
<javac destdir="${build.classes.dir}"
<target name="test-compile" depends="compile">
<javac destdir="${test.classes.dir}"
<copy todir="${test.classes.dir}">
<fileset dir="src/test" excludes="**/*.java"/>
<target name="test" depends="test-compile">
<junit printsummary="no"
<classpath refid="test.classpath"/>
<sysproperty key="docs.dir" file="${test.classes.dir}"/>
<sysproperty key="index.dir" file="${test.dir}/index"/>
<formatter type="brief" usefile="false"/>
<test name="${testcase}" if="testcase"/>
<batchtest todir="${test.data.dir}" unless="testcase">
<fileset dir="${test.classes.dir}"
<fail if="test.failed">
Unit tests failed. Check log or reports for details
@ -0,0 +1,2 @@
AnyObjectId[af4eed0506b53f17a4d22e4f1630ee03cb7991e5] was removed in git history.
Apache SVN contains full history.
@ -0,0 +1,23 @@
package org.apache.lucene.ant;
import java.io.File;
import org.apache.lucene.document.Document;
* Allows a class to act as a Lucene document handler
*@author Erik Hatcher
*@created October 27, 2001
public interface DocumentHandler {
* Builds the Lucene document that represents the given file
*@param file The file to build a document from
*@return The document for the file; callers should expect null when no document could be built
*@throws DocumentHandlerException
public Document getDocument(File file)
throws DocumentHandlerException;
@ -0,0 +1,54 @@
package org.apache.lucene.ant;
import java.io.PrintStream;
import java.io.PrintWriter;
public class DocumentHandlerException extends Exception
private Throwable cause;
public DocumentHandlerException() {
public DocumentHandlerException(String message) {
public DocumentHandlerException(Throwable cause) {
this.cause = cause;
public Throwable getException() {
return cause;
// Override stack trace methods to show original cause:
public void printStackTrace() {
public void printStackTrace(PrintStream ps) {
synchronized (ps) {
if (cause != null) {
ps.println("--- Nested Exception ---");
public void printStackTrace(PrintWriter pw) {
synchronized (pw) {
if (cause != null) {
pw.println("--- Nested Exception ---");
@ -0,0 +1,49 @@
package org.apache.lucene.ant;
import java.io.File;
import org.apache.lucene.document.Document;
* Decides which class is used to create the Lucene Document
* object based on its file extension.
*@author Erik Hatcher
*@created October 28, 2001
*@todo Add dynamic file extension/classname mappings for
* extensibility
public class FileExtensionDocumentHandler
implements DocumentHandler {
* Gets the document attribute of the
* FileExtensionDocumentHandler object
*@param file Description of
* Parameter
*@return The document value
*@exception DocumentHandlerException Description of
* Exception
// Chooses a document builder from the file name's suffix. An IOException
// raised while building is rethrown wrapped in a DocumentHandlerException.
public Document getDocument(File file)
throws DocumentHandlerException {
// Stays null unless one of the suffix checks below matches, so callers
// receive null for any other kind of file.
Document doc = null;
String name = file.getName();
try {
// String.endsWith is case-sensitive: only lower-case ".txt" / ".html" match.
if (name.endsWith(".txt")) {
doc = TextDocument.Document(file);
if (name.endsWith(".html")) {
doc = HtmlDocument.Document(file);
catch (java.io.IOException e) {
// Keep the original exception as the nested cause.
throw new DocumentHandlerException(e);
return doc;
@ -0,0 +1,232 @@
package org.apache.lucene.ant;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
// Imports commented out since there is a name clash and fully
// qualified class names will be used in the code. Imports are
// left for ease of maintenance.
import org.apache.lucene.document.Field;
//import org.apache.lucene.document.Document;
//import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;
* The <code>HtmlDocument</code> class creates a Lucene {@link
* org.apache.lucene.document.Document} from an HTML document. <P>
* It does this by using the JTidy package. It can take input
* from {@link java.io.File} or {@link java.io.InputStream}.
*@author Erik Hatcher
*@created October 27, 2001
public class HtmlDocument {
private Element rawDoc;
// Constructors
* Constructs an <code>HtmlDocument</code> from a {@link
* java.io.File}.
*@param file the <code>File</code> containing the
* HTML to parse
*@exception IOException if an I/O exception occurs
public HtmlDocument(File file) throws IOException {
Tidy tidy = new Tidy();
// NOTE(review): the FileInputStream created below is never assigned to a
// variable, so nothing visible here closes it - confirm whether
// Tidy.parseDOM closes its input, otherwise the file handle leaks.
org.w3c.dom.Document root =
tidy.parseDOM(new FileInputStream(file), null);
// Only the root element is kept; getTitle() and getBody() search beneath it.
rawDoc = root.getDocumentElement();
* Constructs an <code>HtmlDocument</code> from an {@link
* java.io.InputStream}.
*@param is the <code>InputStream</code>
* containing the HTML
*@exception IOException if I/O exception occurs
public HtmlDocument(InputStream is) throws IOException {
Tidy tidy = new Tidy();
org.w3c.dom.Document root = tidy.parseDOM(is, null);
rawDoc = root.getDocumentElement();
* Creates a Lucene <code>Document</code> from an {@link
* java.io.InputStream}.
*@param is
*@exception IOException
public static org.apache.lucene.document.Document
getDocument(InputStream is) throws IOException {
HtmlDocument htmlDoc = new HtmlDocument(is);
org.apache.lucene.document.Document luceneDoc =
new org.apache.lucene.document.Document();
luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
return luceneDoc;
// Public methods
* Creates a Lucene <code>Document</code> from a {@link
* java.io.File}.
*@param file
*@exception IOException
public static org.apache.lucene.document.Document
Document(File file) throws IOException {
HtmlDocument htmlDoc = new HtmlDocument(file);
org.apache.lucene.document.Document luceneDoc =
new org.apache.lucene.document.Document();
luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
String contents = null;
BufferedReader br =
new BufferedReader(new FileReader(file));
StringWriter sw = new StringWriter();
String line = br.readLine();
while (line != null) {
line = br.readLine();
contents = sw.toString();
luceneDoc.add(Field.UnIndexed("rawcontents", contents));
return luceneDoc;
// Private methods
* Runs <code>HtmlDocument</code> on the files specified on
* the command line.
*@param args Command line arguments
*@exception Exception Description of Exception
// NOTE(review): declared private, so this is not the public static main
// that the java launcher expects - confirm whether that is intentional.
private static void main(String args[]) throws Exception {
// Earlier variant that exercised the File-based constructor:
// HtmlDocument doc = new HtmlDocument(new File(args[0]));
// System.out.println("Title = " + doc.getTitle());
// System.out.println("Body = " + doc.getBody());
// Parses the file named by the first argument through the InputStream
// constructor and prints the extracted title and body.
HtmlDocument doc =
new HtmlDocument(new FileInputStream(new File(args[0])));
System.out.println("Title = " + doc.getTitle());
System.out.println("Body = " + doc.getBody());
* Gets the title attribute of the <code>HtmlDocument</code>
* object.
*@return the title value
public String getTitle() {
// Without a parsed root element the result is null rather than "".
if (rawDoc == null) {
return null;
// Default when no <title> element, or an empty one, is found.
String title = "";
NodeList nl = rawDoc.getElementsByTagName("title");
// Only the first <title> element is consulted.
if (nl.getLength() > 0) {
Element titleElement = ((Element) nl.item(0));
// NOTE(review): assumes the first child of <title> is a text node; a
// different node type would make this cast fail - confirm.
Text text = (Text) titleElement.getFirstChild();
// A <title> with no children leaves the "" default in place.
if (text != null) {
title = text.getData();
return title;
* Gets the bodyText attribute of the
* <code>HtmlDocument</code> object.
*@return the bodyText value
public String getBody() {
if (rawDoc == null) {
return null;
String body = "";
NodeList nl = rawDoc.getElementsByTagName("body");
if (nl.getLength() > 0) {
body = getBodyText(nl.item(0));
return body;
* Gets the bodyText attribute of the
* <code>HtmlDocument</code> object.
*@param node a DOM Node
*@return The bodyText value
private String getBodyText(Node node) {
NodeList nl = node.getChildNodes();
StringBuffer buffer = new StringBuffer();
for (int i = 0; i < nl.getLength(); i++) {
Node child = nl.item(i);
switch (child.getNodeType()) {
buffer.append(" ");
case Node.TEXT_NODE:
buffer.append(((Text) child).getData());
return buffer.toString();
@ -0,0 +1,286 @@
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Vector;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.DirectoryScanner;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.Task;
import org.apache.tools.ant.types.FileSet;
* Builds a Lucene index from a fileset.
* @author Erik Hatcher
public class IndexTask extends Task {
* file list
private Vector filesets = new Vector();
* overwrite index?
private boolean overwrite = false;
* index path
private File indexPath;
* document handler classname
private String handlerClassName =
* document handler instance
private DocumentHandler handler;
* Lucene merge factor
private int mergeFactor = 20;
* Specifies the directory where the index will be stored
* @param indexPath The new index value
public void setIndex(File indexPath) {
this.indexPath = indexPath;
* Sets the mergeFactor attribute of the IndexTask object
*@param mergeFactor The new mergeFactor value
public void setMergeFactor(int mergeFactor) {
this.mergeFactor = mergeFactor;
* If true, index will be overwritten.
* @param overwrite The new overwrite value
public void setOverwrite(boolean overwrite) {
this.overwrite = overwrite;
* Classname of document handler.
* @param classname The new documentHandler value
public void setDocumentHandler(String classname) {
handlerClassName = classname;
* Adds a set of files.
* @param set FileSet to be added
public void addFileset(FileSet set) {
* Begins the indexing
* @exception BuildException If an error occurs indexing the
* fileset
* @todo add classpath handling so handler does not
* have to be in system classpath
public void execute() throws BuildException {
try {
Class clazz = Class.forName(handlerClassName);
handler = (DocumentHandler) clazz.newInstance();
catch (ClassNotFoundException cnfe) {
throw new BuildException(cnfe);
catch (InstantiationException ie) {
throw new BuildException(ie);
catch (IllegalAccessException iae) {
throw new BuildException(iae);
try {
catch (IOException e) {
throw new BuildException(e);
* index the fileset
* @exception IOException Description of Exception
* @todo refactor - definitely lots of room for improvement here
// Scans the configured filesets and decides, per file, whether it needs to be
// (re)indexed. A file is skipped when the existing index already holds an entry
// for its path whose stored modification time equals the file's last-modified time.
// NOTE(review): several statements of this method are not present in this view
// (e.g. the right-hand sides of "DirectoryScanner ds =" and "Document doc =");
// consult the complete source before changing any logic here.
private void indexDocs() throws IOException {
Date start = new Date();
boolean create = overwrite;
// If the index directory doesn't exist,
// create it and force create mode.
// File.mkdirs() returns true only when it actually created the directory.
if (indexPath.mkdirs() && !overwrite) {
create = true;
Searcher searcher = null;
Analyzer analyzer = new StopAnalyzer();
boolean checkLastModified = false;
// An existing index is opened for searching so that unchanged files can be skipped.
if (!create) {
try {
searcher = new IndexSearcher(indexPath.getAbsolutePath());
checkLastModified = true;
catch (IOException ioe) {
log("IOException: " + ioe.getMessage());
// Not fatal: the existing index could not be opened, so
// checkLastModified stays false and every file is indexed
log("checkLastModified = " + checkLastModified);
IndexWriter writer =
new IndexWriter(indexPath, analyzer, create);
// Counters reported in the summary log line at the end.
int totalFiles = 0;
int totalIndexed = 0;
int totalIgnored = 0;
try {
writer.mergeFactor = mergeFactor;
for (int i = 0; i < filesets.size(); i++) {
FileSet fs = (FileSet) filesets.elementAt(i);
if (fs != null) {
DirectoryScanner ds =
String[] dsfiles = ds.getIncludedFiles();
File baseDir = ds.getBasedir();
for (int j = 0; j < dsfiles.length; j++) {
// Included file names are resolved against the scanner's base directory.
File file = new File(baseDir, dsfiles[j]);
if (!file.exists() || !file.canRead()) {
throw new BuildException("File \"" +
+ "\" does not exist or is not readable.");
boolean indexIt = true;
if (checkLastModified) {
Hits hits = null;
// Exact-term lookup on the "path" field added to each document below.
Term pathTerm =
new Term("path", file.getPath());
TermQuery query =
new TermQuery(pathTerm);
hits = searcher.search(query);
// if document is found, compare the
// indexed last modified time with the
// current file
// - don't index if up to date
if (hits.length() > 0) {
// Only the first hit for the path is examined.
Document doc = hits.doc(0);
String indexModified =
if (indexModified != null) {
if (DateField.stringToTime(indexModified)
== file.lastModified()) {
indexIt = false;
if (indexIt) {
try {
log("Indexing " + file.getPath(),
Document doc =
if (doc == null) {
else {
// Add the path of the file as a field named "path". Use a Keyword field, so
// that the index stores the path and the path is searchable as a single,
// untokenized term (the up-to-date check above looks it up with a TermQuery)
doc.add(Field.Keyword("path", file.getPath()));
// Add the last modified date of the file as a field named "modified". Use a
// Keyword field, so that it's searchable, but so that no attempt is made
// to tokenize the field into words.
catch (DocumentHandlerException e) {
throw new BuildException(e);
// for j
// if (fs != null)
// for i
finally {
// always make sure everything gets closed,
// no matter how we exit.
if (searcher != null) {
Date end = new Date();
// Summary: indexed / total / ignored counts and elapsed wall-clock time.
log(totalIndexed + " out of " + totalFiles + " indexed (" +
totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
" milliseconds");
@ -0,0 +1,82 @@
package org.apache.lucene.ant;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringWriter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
* A utility for making Lucene Documents from a File.
*@author Erik Hatcher
*@created December 6, 2001
*@todo Fix JavaDoc comments here
public class TextDocument {
private String contents;
* Constructor for the TextDocument object
*@param file Description of Parameter
*@exception IOException Description of Exception
public TextDocument(File file) throws IOException {
BufferedReader br =
new BufferedReader(new FileReader(file));
StringWriter sw = new StringWriter();
String line = br.readLine();
while (line != null) {
line = br.readLine();
contents = sw.toString();
* Makes a document for a File. <p>
* The document has a single field:
* <ul>
* <li> <code>contents</code>--containing the full contents
* of the file, as a Text field;
*@param f Description of Parameter
*@return Description of the Returned Value
*@exception IOException Description of Exception
public static Document Document(File f) throws IOException {
TextDocument textDoc = new TextDocument(f);
// make a new, empty document
Document doc = new Document();
doc.add(Field.Text("contents", textDoc.getContents()));
// return the document
return doc;
*@return The contents value
*@todo finish this method
public String getContents() {
return contents;
@ -0,0 +1,22 @@
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import junit.framework.TestCase;
public abstract class DocumentTestCase extends TestCase
public DocumentTestCase(String name) {
protected File getFile(String filename) throws IOException {
String fullname =
File file = new File(fullname);
return file;
@ -0,0 +1,29 @@
package org.apache.lucene.ant;
import java.io.IOException;
import org.apache.lucene.ant.DocumentTestCase;
import org.apache.lucene.ant.HtmlDocument;
public class HtmlDocumentTest extends DocumentTestCase
public HtmlDocumentTest (String name) {
HtmlDocument doc;
public void setUp() throws IOException {
doc = new HtmlDocument(getFile("test.html"));
public void testDoc() {
assertEquals("Title", "Test Title", doc.getTitle());
assertTrue("Body", doc.getBody().startsWith("This is some test"));
public void tearDown() {
doc = null;
@ -0,0 +1,92 @@
package org.apache.lucene.ant;
import java.io.File;
import java.io.IOException;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.ant.IndexTask;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.types.FileSet;
* Test cases for index task
*@author Erik Hatcher
public class IndexTaskTest extends TestCase {
private final static String docHandler =
private String docsDir = System.getProperty("docs.dir");
private String indexDir = System.getProperty("index.dir");
private Searcher searcher;
private Analyzer analyzer;
* Constructor for the IndexTaskTest object
*@param name Description of Parameter
public IndexTaskTest(String name) {
* The JUnit setup method
*@exception IOException Description of Exception
public void setUp() throws IOException {
Project project = new Project();
IndexTask task = new IndexTask();
FileSet fs = new FileSet();
fs.setDir(new File(docsDir));
task.setIndex(new File(indexDir));
searcher = new IndexSearcher(indexDir);
analyzer = new StopAnalyzer();
* A unit test for JUnit
public void testSearch() throws IOException, ParseException {
Query query = QueryParser.parse("test", "contents", analyzer);
Hits hits = searcher.search(query);
assertEquals("Find document(s)", 2, hits.length());
* The teardown method for JUnit
* @todo remove indexDir?
public void tearDown() throws IOException {
@ -0,0 +1,28 @@
package org.apache.lucene.ant;
import java.io.IOException;
import org.apache.lucene.ant.DocumentTestCase;
import org.apache.lucene.ant.TextDocument;
public class TextDocumentTest extends DocumentTestCase
public TextDocumentTest (String name) {
TextDocument doc;
public void setUp() throws IOException {
doc = new TextDocument(getFile("test.txt"));
public void testDoc() {
assertEquals("Contents", "Test Contents", doc.getContents());
public void tearDown() {
doc = null;
@ -0,0 +1,7 @@
<title>Test Title</title>
<i>This is <b>some</b>test</i>
@ -0,0 +1 @@
Test Contents
Reference in New Issue