mirror of https://github.com/apache/lucene.git
My long-promised index task. It's in need of refactoring, but it does the job it was intended to do. Docs and cleanup will follow.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150801 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2bd4df4bcb
commit
6bf08ccff7
|
@ -0,0 +1,121 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<project name="lucene-ant" default="default">
  <description>
    Lucene Ant integration
  </description>

  <!-- Output locations -->
  <property name="build.dir" location="build"/>
  <property name="build.classes.dir" location="${build.dir}/classes"/>
  <property name="test.dir" location="${build.dir}/test"/>
  <property name="test.classes.dir" location="${test.dir}/classes"/>
  <!-- Referenced by <batchtest todir="..."> in the "test" target; it was
       previously never defined, so the literal text ${test.data.dir}
       was passed to the junit task. -->
  <property name="test.data.dir" location="${test.dir}/data"/>
  <property name="dist.dir" location="dist"/>
  <property name="lucene-ant.jar" location="${dist.dir}/lucene-ant.jar"/>

  <!-- External libraries -->
  <property name="jtidy.jar" location="lib/Tidy.jar"/>
  <property name="junit.jar" location="${ant.home}/lib/junit.jar"/>
  <property name="lucene.bin.dir" location="../../../jakarta-lucene/bin"/>

  <!-- Build switches -->
  <property name="build.debug" value="true"/>
  <property name="junit.fork" value="true"/>

  <!-- ========================================================== -->
  <!-- Datatype declarations                                      -->
  <!-- ========================================================== -->
  <path id="compile.classpath">
    <fileset dir="${lucene.bin.dir}" includes="lucene*.jar"/>
    <pathelement location="${jtidy.jar}"/>
  </path>

  <path id="test.classpath">
    <path refid="compile.classpath"/>
    <pathelement location="${junit.jar}"/>
    <pathelement location="${build.classes.dir}"/>
    <pathelement location="${test.classes.dir}"/>
  </path>


  <target name="default" depends="test,dist"
    description="build everything"
  />

  <!-- Creates every output directory used by the other targets -->
  <target name="init">
    <echo message="Building ${ant.project.name}"/>
    <tstamp/>

    <mkdir dir="${build.dir}"/>
    <mkdir dir="${build.classes.dir}"/>
    <mkdir dir="${dist.dir}"/>

    <mkdir dir="${test.dir}"/>
    <mkdir dir="${test.classes.dir}"/>
    <mkdir dir="${test.data.dir}"/>
  </target>

  <!-- Each directory is deleted explicitly so that clean still works
       when a property has been overridden to point outside build.dir -->
  <target name="clean"
    description="Deletes all previous build artifacts">
    <delete dir="${build.dir}"/>
    <delete dir="${build.classes.dir}"/>
    <delete dir="${dist.dir}"/>

    <delete dir="${test.dir}"/>
    <delete dir="${test.classes.dir}"/>
    <delete dir="${test.data.dir}"/>
  </target>

  <!-- Writes taskdef.properties into the classes directory so that it
       ends up inside the JAR next to the compiled task -->
  <target name="dist" depends="compile"
    description="Create JAR">
    <echo file="${build.classes.dir}/taskdef.properties">
      index=org.apache.lucene.ant.IndexTask
    </echo>
    <jar destfile="${lucene-ant.jar}"
      basedir="${build.classes.dir}"
    />
  </target>

  <target name="compile" depends="init">
    <javac destdir="${build.classes.dir}"
      debug="${build.debug}"
      includeAntRuntime="yes"
      srcdir="src/main"
      classpathref="compile.classpath"
    />
  </target>

  <target name="test-compile" depends="compile">
    <javac destdir="${test.classes.dir}"
      debug="${build.debug}"
      includeAntRuntime="yes"
      srcdir="src/test"
      classpathref="test.classpath"
    />

    <!-- Non-Java test resources (e.g. test.html, test.txt) must sit
         beside the compiled test classes -->
    <copy todir="${test.classes.dir}">
      <fileset dir="src/test" excludes="**/*.java"/>
    </copy>
  </target>

  <!-- Runs a single test when -Dtestcase=... is given, otherwise every
       *Test class; failures are collected and reported at the end -->
  <target name="test" depends="test-compile">
    <junit printsummary="no"
      errorProperty="test.failed"
      failureProperty="test.failed"
      fork="${junit.fork}">
      <classpath refid="test.classpath"/>
      <sysproperty key="docs.dir" file="${test.classes.dir}"/>
      <sysproperty key="index.dir" file="${test.dir}/index"/>
      <formatter type="brief" usefile="false"/>
      <test name="${testcase}" if="testcase"/>
      <batchtest todir="${test.data.dir}" unless="testcase">
        <fileset dir="${test.classes.dir}"
          includes="**/*Test.class"
        />
      </batchtest>
    </junit>

    <fail if="test.failed">
      Unit tests failed.  Check log or reports for details
    </fail>

  </target>


</project>
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[af4eed0506b53f17a4d22e4f1630ee03cb7991e5] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,23 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.File;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
/**
|
||||
* Allows a class to act as a Lucene document handler
|
||||
*
|
||||
*@author Erik Hatcher
|
||||
*@created October 27, 2001
|
||||
*/
|
||||
public interface DocumentHandler {
|
||||
/**
|
||||
* Gets the document attribute of the DocumentHandler object
|
||||
*
|
||||
*@param file Description of Parameter
|
||||
*@return The document value
|
||||
*@throws DocumentHandlerException
|
||||
*/
|
||||
public Document getDocument(File file)
|
||||
throws DocumentHandlerException;
|
||||
}
|
||||
|
|
@ -0,0 +1,54 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
/**
 *  Checked exception thrown when a file cannot be turned into a
 *  Lucene document.
 *
 *  <p>The underlying problem, when there is one, is registered with
 *  {@link Throwable} itself, so it is available from
 *  {@link #getCause()} and appears in stack traces as a standard
 *  "Caused by:" section. {@link #getException()} is kept for callers
 *  written against the original API.
 */
public class DocumentHandlerException extends Exception
{
    /**
     *  Creates an exception with neither message nor cause.
     */
    public DocumentHandlerException() {
        super();
    }

    /**
     *  Creates an exception with a message and no cause.
     *
     *@param  message  description of the failure
     */
    public DocumentHandlerException(String message) {
        super(message);
    }

    /**
     *  Creates an exception that wraps another throwable. The message
     *  becomes <code>cause.toString()</code>, or <code>null</code>
     *  when <code>cause</code> is <code>null</code>.
     *
     *@param  cause  the underlying problem; may be <code>null</code>
     */
    public DocumentHandlerException(Throwable cause) {
        // Throwable(Throwable) derives the message from the cause and is
        // null-safe, unlike the previous super(cause.toString()).
        super(cause);
    }

    /**
     *  Creates an exception with both a message and a cause.
     *
     *@param  message  description of the failure
     *@param  cause    the underlying problem; may be <code>null</code>
     */
    public DocumentHandlerException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     *  Returns the wrapped throwable.
     *
     *@return    the cause given at construction, or <code>null</code>
     *      if there was none
     */
    public Throwable getException() {
        return getCause();
    }
}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.File;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
/**
|
||||
* Decides which class used to create the Lucene Document
|
||||
* object based on its file extension.
|
||||
*
|
||||
*@author Erik Hatcher
|
||||
*@created October 28, 2001
|
||||
*@todo Add dynamic file extension/classname mappings for
|
||||
* extensibility
|
||||
*/
|
||||
public class FileExtensionDocumentHandler
|
||||
implements DocumentHandler {
|
||||
/**
|
||||
* Gets the document attribute of the
|
||||
* FileExtensionDocumentHandler object
|
||||
*
|
||||
*@param file Description of
|
||||
* Parameter
|
||||
*@return The document value
|
||||
*@exception DocumentHandlerException Description of
|
||||
* Exception
|
||||
*/
|
||||
public Document getDocument(File file)
|
||||
throws DocumentHandlerException {
|
||||
Document doc = null;
|
||||
|
||||
String name = file.getName();
|
||||
|
||||
try {
|
||||
if (name.endsWith(".txt")) {
|
||||
doc = TextDocument.Document(file);
|
||||
}
|
||||
|
||||
if (name.endsWith(".html")) {
|
||||
doc = HtmlDocument.Document(file);
|
||||
}
|
||||
}
|
||||
catch (java.io.IOException e) {
|
||||
throw new DocumentHandlerException(e);
|
||||
}
|
||||
|
||||
return doc;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,232 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringWriter;
|
||||
|
||||
// Imports commented out since there is a name clash and fully
|
||||
// qualified class names will be used in the code. Imports are
|
||||
// left for ease of maintenance.
|
||||
import org.apache.lucene.document.Field;
|
||||
//import org.apache.lucene.document.Document;
|
||||
//import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Element;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.w3c.dom.Text;
|
||||
import org.w3c.tidy.Tidy;
|
||||
|
||||
/**
|
||||
* The <code>HtmlDocument</code> class creates a Lucene {@link
|
||||
* org.apache.lucene.document.Document} from an HTML document. <P>
|
||||
*
|
||||
* It does this by using JTidy package. It can take input input
|
||||
* from {@link java.io.File} or {@link java.io.InputStream}.
|
||||
*
|
||||
*@author Erik Hatcher
|
||||
*@created October 27, 2001
|
||||
*/
|
||||
public class HtmlDocument {
|
||||
private Element rawDoc;
|
||||
|
||||
|
||||
//-------------------------------------------------------------
|
||||
// Constructors
|
||||
//-------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Constructs an <code>HtmlDocument</code> from a {@link
|
||||
* java.io.File}.
|
||||
*
|
||||
*@param file the <code>File</code> containing the
|
||||
* HTML to parse
|
||||
*@exception IOException if an I/O exception occurs
|
||||
*@since
|
||||
*/
|
||||
public HtmlDocument(File file) throws IOException {
|
||||
Tidy tidy = new Tidy();
|
||||
tidy.setQuiet(true);
|
||||
tidy.setShowWarnings(false);
|
||||
org.w3c.dom.Document root =
|
||||
tidy.parseDOM(new FileInputStream(file), null);
|
||||
rawDoc = root.getDocumentElement();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Constructs an <code>HtmlDocument</code> from an {@link
|
||||
* java.io.InputStream}.
|
||||
*
|
||||
*@param is the <code>InputStream</code>
|
||||
* containing the HTML
|
||||
*@exception IOException if I/O exception occurs
|
||||
*@since
|
||||
*/
|
||||
public HtmlDocument(InputStream is) throws IOException {
|
||||
Tidy tidy = new Tidy();
|
||||
tidy.setQuiet(true);
|
||||
tidy.setShowWarnings(false);
|
||||
org.w3c.dom.Document root = tidy.parseDOM(is, null);
|
||||
rawDoc = root.getDocumentElement();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a Lucene <code>Document</code> from an {@link
|
||||
* java.io.InputStream}.
|
||||
*
|
||||
*@param is
|
||||
*@return
|
||||
*@exception IOException
|
||||
*/
|
||||
public static org.apache.lucene.document.Document
|
||||
getDocument(InputStream is) throws IOException {
|
||||
HtmlDocument htmlDoc = new HtmlDocument(is);
|
||||
org.apache.lucene.document.Document luceneDoc =
|
||||
new org.apache.lucene.document.Document();
|
||||
|
||||
luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
|
||||
luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
|
||||
|
||||
return luceneDoc;
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------
|
||||
// Public methods
|
||||
//-------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a Lucene <code>Document</code> from a {@link
|
||||
* java.io.File}.
|
||||
*
|
||||
*@param file
|
||||
*@return
|
||||
*@exception IOException
|
||||
*/
|
||||
public static org.apache.lucene.document.Document
|
||||
Document(File file) throws IOException {
|
||||
HtmlDocument htmlDoc = new HtmlDocument(file);
|
||||
org.apache.lucene.document.Document luceneDoc =
|
||||
new org.apache.lucene.document.Document();
|
||||
|
||||
luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
|
||||
luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
|
||||
|
||||
String contents = null;
|
||||
BufferedReader br =
|
||||
new BufferedReader(new FileReader(file));
|
||||
StringWriter sw = new StringWriter();
|
||||
String line = br.readLine();
|
||||
while (line != null) {
|
||||
sw.write(line);
|
||||
line = br.readLine();
|
||||
}
|
||||
br.close();
|
||||
contents = sw.toString();
|
||||
sw.close();
|
||||
|
||||
luceneDoc.add(Field.UnIndexed("rawcontents", contents));
|
||||
|
||||
return luceneDoc;
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------
|
||||
// Private methods
|
||||
//-------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Runs <code>HtmlDocument</code> on the files specified on
|
||||
* the command line.
|
||||
*
|
||||
*@param args Command line arguments
|
||||
*@exception Exception Description of Exception
|
||||
*/
|
||||
private static void main(String args[]) throws Exception {
|
||||
// HtmlDocument doc = new HtmlDocument(new File(args[0]));
|
||||
// System.out.println("Title = " + doc.getTitle());
|
||||
// System.out.println("Body = " + doc.getBody());
|
||||
|
||||
HtmlDocument doc =
|
||||
new HtmlDocument(new FileInputStream(new File(args[0])));
|
||||
System.out.println("Title = " + doc.getTitle());
|
||||
System.out.println("Body = " + doc.getBody());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the title attribute of the <code>HtmlDocument</code>
|
||||
* object.
|
||||
*
|
||||
*@return the title value
|
||||
*/
|
||||
public String getTitle() {
|
||||
if (rawDoc == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String title = "";
|
||||
|
||||
NodeList nl = rawDoc.getElementsByTagName("title");
|
||||
if (nl.getLength() > 0) {
|
||||
Element titleElement = ((Element) nl.item(0));
|
||||
Text text = (Text) titleElement.getFirstChild();
|
||||
if (text != null) {
|
||||
title = text.getData();
|
||||
}
|
||||
}
|
||||
return title;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the bodyText attribute of the
|
||||
* <code>HtmlDocument</code> object.
|
||||
*
|
||||
*@return the bodyText value
|
||||
*/
|
||||
public String getBody() {
|
||||
if (rawDoc == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String body = "";
|
||||
NodeList nl = rawDoc.getElementsByTagName("body");
|
||||
if (nl.getLength() > 0) {
|
||||
body = getBodyText(nl.item(0));
|
||||
}
|
||||
return body;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Gets the bodyText attribute of the
|
||||
* <code>HtmlDocument</code> object.
|
||||
*
|
||||
*@param node a DOM Node
|
||||
*@return The bodyText value
|
||||
*/
|
||||
private String getBodyText(Node node) {
|
||||
NodeList nl = node.getChildNodes();
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
for (int i = 0; i < nl.getLength(); i++) {
|
||||
Node child = nl.item(i);
|
||||
switch (child.getNodeType()) {
|
||||
case Node.ELEMENT_NODE:
|
||||
buffer.append(getBodyText(child));
|
||||
buffer.append(" ");
|
||||
break;
|
||||
case Node.TEXT_NODE:
|
||||
buffer.append(((Text) child).getData());
|
||||
break;
|
||||
}
|
||||
}
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,286 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
import java.util.Vector;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.document.DateField;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
|
||||
import org.apache.tools.ant.BuildException;
|
||||
import org.apache.tools.ant.DirectoryScanner;
|
||||
import org.apache.tools.ant.Project;
|
||||
import org.apache.tools.ant.Task;
|
||||
import org.apache.tools.ant.types.FileSet;
|
||||
|
||||
/**
|
||||
* Builds a Lucene index from a fileset.
|
||||
*
|
||||
* @author Erik Hatcher
|
||||
*/
|
||||
public class IndexTask extends Task {
|
||||
/**
|
||||
* file list
|
||||
*/
|
||||
private Vector filesets = new Vector();
|
||||
|
||||
/**
|
||||
* overwrite index?
|
||||
*/
|
||||
private boolean overwrite = false;
|
||||
|
||||
/**
|
||||
* index path
|
||||
*/
|
||||
private File indexPath;
|
||||
|
||||
/**
|
||||
* document handler classname
|
||||
*/
|
||||
private String handlerClassName =
|
||||
"org.apache.lucene.ant.FileExtensionDocumentHandler";
|
||||
|
||||
/**
|
||||
* document handler instance
|
||||
*/
|
||||
private DocumentHandler handler;
|
||||
|
||||
/**
|
||||
* Lucene merge factor
|
||||
*/
|
||||
private int mergeFactor = 20;
|
||||
|
||||
|
||||
/**
|
||||
* Specifies the directory where the index will be stored
|
||||
*
|
||||
* @param indexPath The new index value
|
||||
*/
|
||||
public void setIndex(File indexPath) {
|
||||
this.indexPath = indexPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the mergeFactor attribute of the IndexTask object
|
||||
*
|
||||
*@param mergeFactor The new mergeFactor value
|
||||
*/
|
||||
public void setMergeFactor(int mergeFactor) {
|
||||
this.mergeFactor = mergeFactor;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* If true, index will be overwritten.
|
||||
*
|
||||
* @param overwrite The new overwrite value
|
||||
*/
|
||||
public void setOverwrite(boolean overwrite) {
|
||||
this.overwrite = overwrite;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Classname of document handler.
|
||||
*
|
||||
* @param classname The new documentHandler value
|
||||
*/
|
||||
public void setDocumentHandler(String classname) {
|
||||
handlerClassName = classname;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds a set of files.
|
||||
*
|
||||
* @param set FileSet to be added
|
||||
*/
|
||||
public void addFileset(FileSet set) {
|
||||
filesets.addElement(set);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Begins the indexing
|
||||
*
|
||||
* @exception BuildException If an error occurs indexing the
|
||||
* fileset
|
||||
* @todo add classpath handling so handler does not
|
||||
* have to be in system classpath
|
||||
*/
|
||||
public void execute() throws BuildException {
|
||||
try {
|
||||
Class clazz = Class.forName(handlerClassName);
|
||||
handler = (DocumentHandler) clazz.newInstance();
|
||||
}
|
||||
catch (ClassNotFoundException cnfe) {
|
||||
throw new BuildException(cnfe);
|
||||
}
|
||||
catch (InstantiationException ie) {
|
||||
throw new BuildException(ie);
|
||||
}
|
||||
catch (IllegalAccessException iae) {
|
||||
throw new BuildException(iae);
|
||||
}
|
||||
|
||||
try {
|
||||
indexDocs();
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new BuildException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* index the fileset
|
||||
*
|
||||
* @exception IOException Description of Exception
|
||||
* @todo refactor - definitely lots of room for improvement here
|
||||
*/
|
||||
private void indexDocs() throws IOException {
|
||||
Date start = new Date();
|
||||
|
||||
boolean create = overwrite;
|
||||
// If the index directory doesn't exist,
|
||||
// create it and force create mode
|
||||
if (indexPath.mkdirs() && !overwrite) {
|
||||
create = true;
|
||||
}
|
||||
|
||||
Searcher searcher = null;
|
||||
Analyzer analyzer = new StopAnalyzer();
|
||||
boolean checkLastModified = false;
|
||||
if (!create) {
|
||||
try {
|
||||
searcher = new IndexSearcher(indexPath.getAbsolutePath());
|
||||
checkLastModified = true;
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
log("IOException: " + ioe.getMessage());
|
||||
// Empty - ignore, which indicates to index all
|
||||
// documents
|
||||
}
|
||||
}
|
||||
|
||||
log("checkLastModified = " + checkLastModified);
|
||||
|
||||
IndexWriter writer =
|
||||
new IndexWriter(indexPath, analyzer, create);
|
||||
int totalFiles = 0;
|
||||
int totalIndexed = 0;
|
||||
int totalIgnored = 0;
|
||||
try {
|
||||
writer.mergeFactor = mergeFactor;
|
||||
|
||||
for (int i = 0; i < filesets.size(); i++) {
|
||||
FileSet fs = (FileSet) filesets.elementAt(i);
|
||||
if (fs != null) {
|
||||
DirectoryScanner ds =
|
||||
fs.getDirectoryScanner(project);
|
||||
String[] dsfiles = ds.getIncludedFiles();
|
||||
File baseDir = ds.getBasedir();
|
||||
|
||||
for (int j = 0; j < dsfiles.length; j++) {
|
||||
File file = new File(baseDir, dsfiles[j]);
|
||||
totalFiles++;
|
||||
|
||||
if (!file.exists() || !file.canRead()) {
|
||||
throw new BuildException("File \"" +
|
||||
file.getAbsolutePath()
|
||||
+ "\" does not exist or is not readable.");
|
||||
}
|
||||
|
||||
boolean indexIt = true;
|
||||
|
||||
if (checkLastModified) {
|
||||
Hits hits = null;
|
||||
Term pathTerm =
|
||||
new Term("path", file.getPath());
|
||||
TermQuery query =
|
||||
new TermQuery(pathTerm);
|
||||
hits = searcher.search(query);
|
||||
|
||||
// if document is found, compare the
|
||||
// indexed last modified time with the
|
||||
// current file
|
||||
// - don't index if up to date
|
||||
if (hits.length() > 0) {
|
||||
Document doc = hits.doc(0);
|
||||
String indexModified =
|
||||
doc.get("modified");
|
||||
if (indexModified != null) {
|
||||
if (DateField.stringToTime(indexModified)
|
||||
== file.lastModified()) {
|
||||
indexIt = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (indexIt) {
|
||||
try {
|
||||
log("Indexing " + file.getPath(),
|
||||
Project.MSG_VERBOSE);
|
||||
Document doc =
|
||||
handler.getDocument(file);
|
||||
|
||||
if (doc == null) {
|
||||
totalIgnored++;
|
||||
}
|
||||
else {
|
||||
// Add the path of the file as a field named "path". Use a Text field, so
|
||||
// that the index stores the path, and so that the path is searchable
|
||||
doc.add(Field.Keyword("path", file.getPath()));
|
||||
|
||||
// Add the last modified date of the file a field named "modified". Use a
|
||||
// Keyword field, so that it's searchable, but so that no attempt is made
|
||||
// to tokenize the field into words.
|
||||
doc.add(Field.Keyword("modified",
|
||||
DateField.timeToString(file.lastModified())));
|
||||
|
||||
writer.addDocument(doc);
|
||||
totalIndexed++;
|
||||
}
|
||||
}
|
||||
catch (DocumentHandlerException e) {
|
||||
throw new BuildException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
// for j
|
||||
}
|
||||
// if (fs != null)
|
||||
}
|
||||
// for i
|
||||
|
||||
writer.optimize();
|
||||
}
|
||||
//try
|
||||
finally {
|
||||
// always make sure everything gets closed,
|
||||
// no matter how we exit.
|
||||
writer.close();
|
||||
if (searcher != null) {
|
||||
searcher.close();
|
||||
}
|
||||
}
|
||||
|
||||
Date end = new Date();
|
||||
|
||||
log(totalIndexed + " out of " + totalFiles + " indexed (" +
|
||||
totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) +
|
||||
" milliseconds");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
/**
|
||||
* A utility for making Lucene Documents from a File.
|
||||
*
|
||||
*@author Erik Hatcher
|
||||
*@created December 6, 2001
|
||||
*@todo Fix JavaDoc comments here
|
||||
*/
|
||||
|
||||
public class TextDocument {
|
||||
private String contents;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for the TextDocument object
|
||||
*
|
||||
*@param file Description of Parameter
|
||||
*@exception IOException Description of Exception
|
||||
*/
|
||||
public TextDocument(File file) throws IOException {
|
||||
BufferedReader br =
|
||||
new BufferedReader(new FileReader(file));
|
||||
StringWriter sw = new StringWriter();
|
||||
|
||||
String line = br.readLine();
|
||||
while (line != null) {
|
||||
sw.write(line);
|
||||
line = br.readLine();
|
||||
}
|
||||
br.close();
|
||||
|
||||
contents = sw.toString();
|
||||
sw.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Makes a document for a File. <p>
|
||||
*
|
||||
* The document has a single field:
|
||||
* <ul>
|
||||
* <li> <code>contents</code>--containing the full contents
|
||||
* of the file, as a Text field;
|
||||
*
|
||||
*@param f Description of Parameter
|
||||
*@return Description of the Returned Value
|
||||
*@exception IOException Description of Exception
|
||||
*/
|
||||
public static Document Document(File f) throws IOException {
|
||||
|
||||
TextDocument textDoc = new TextDocument(f);
|
||||
// make a new, empty document
|
||||
Document doc = new Document();
|
||||
|
||||
doc.add(Field.Text("contents", textDoc.getContents()));
|
||||
doc.add(Field.UnIndexed("rawcontents",
|
||||
textDoc.getContents()));
|
||||
|
||||
// return the document
|
||||
return doc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*@return The contents value
|
||||
*@todo finish this method
|
||||
*/
|
||||
public String getContents() {
|
||||
return contents;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public abstract class DocumentTestCase extends TestCase
|
||||
{
|
||||
public DocumentTestCase(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
protected File getFile(String filename) throws IOException {
|
||||
String fullname =
|
||||
this.getClass().getResource(filename).getFile();
|
||||
|
||||
File file = new File(fullname);
|
||||
|
||||
return file;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.ant.DocumentTestCase;
|
||||
import org.apache.lucene.ant.HtmlDocument;
|
||||
|
||||
public class HtmlDocumentTest extends DocumentTestCase
|
||||
{
|
||||
public HtmlDocumentTest (String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
HtmlDocument doc;
|
||||
|
||||
public void setUp() throws IOException {
|
||||
doc = new HtmlDocument(getFile("test.html"));
|
||||
}
|
||||
|
||||
public void testDoc() {
|
||||
assertEquals("Title", "Test Title", doc.getTitle());
|
||||
assertTrue("Body", doc.getBody().startsWith("This is some test"));
|
||||
}
|
||||
|
||||
public void tearDown() {
|
||||
doc = null;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,92 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.queryParser.ParseException;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.ant.IndexTask;
|
||||
|
||||
import org.apache.tools.ant.Project;
|
||||
import org.apache.tools.ant.types.FileSet;
|
||||
|
||||
/**
|
||||
* Test cases for index task
|
||||
*
|
||||
*@author Erik Hatcher
|
||||
*/
|
||||
public class IndexTaskTest extends TestCase {
|
||||
private final static String docHandler =
|
||||
"org.apache.lucene.ant.FileExtensionDocumentHandler";
|
||||
|
||||
private String docsDir = System.getProperty("docs.dir");
|
||||
private String indexDir = System.getProperty("index.dir");
|
||||
|
||||
private Searcher searcher;
|
||||
private Analyzer analyzer;
|
||||
|
||||
|
||||
/**
|
||||
* Constructor for the IndexTaskTest object
|
||||
*
|
||||
*@param name Description of Parameter
|
||||
*/
|
||||
public IndexTaskTest(String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* The JUnit setup method
|
||||
*
|
||||
*@exception IOException Description of Exception
|
||||
*/
|
||||
public void setUp() throws IOException {
|
||||
Project project = new Project();
|
||||
|
||||
IndexTask task = new IndexTask();
|
||||
FileSet fs = new FileSet();
|
||||
fs.setDir(new File(docsDir));
|
||||
task.addFileset(fs);
|
||||
task.setOverwrite(true);
|
||||
task.setDocumentHandler(docHandler);
|
||||
task.setIndex(new File(indexDir));
|
||||
task.setProject(project);
|
||||
task.execute();
|
||||
|
||||
searcher = new IndexSearcher(indexDir);
|
||||
analyzer = new StopAnalyzer();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A unit test for JUnit
|
||||
*/
|
||||
public void testSearch() throws IOException, ParseException {
|
||||
System.out.println("sysout");
|
||||
System.err.println("syserr");
|
||||
Query query = QueryParser.parse("test", "contents", analyzer);
|
||||
|
||||
Hits hits = searcher.search(query);
|
||||
|
||||
assertEquals("Find document(s)", 2, hits.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* The teardown method for JUnit
|
||||
* @todo remove indexDir?
|
||||
*/
|
||||
public void tearDown() throws IOException {
|
||||
searcher.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package org.apache.lucene.ant;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.ant.DocumentTestCase;
|
||||
import org.apache.lucene.ant.TextDocument;
|
||||
|
||||
public class TextDocumentTest extends DocumentTestCase
|
||||
{
|
||||
public TextDocumentTest (String name) {
|
||||
super(name);
|
||||
}
|
||||
|
||||
TextDocument doc;
|
||||
|
||||
public void setUp() throws IOException {
|
||||
doc = new TextDocument(getFile("test.txt"));
|
||||
}
|
||||
|
||||
public void testDoc() {
|
||||
assertEquals("Contents", "Test Contents", doc.getContents());
|
||||
}
|
||||
|
||||
public void tearDown() {
|
||||
doc = null;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
<html>
|
||||
<head>
|
||||
<title>Test Title</title>
|
||||
</head>
|
||||
<body>
|
||||
<i>This is <b>some</b>test</i>
|
||||
</body>
|
|
@ -0,0 +1 @@
|
|||
Test Contents
|
Loading…
Reference in New Issue