Importing the classes seems to have warped the whitespace. Here's my attempt to get things back to normal.

Introduced a new datasource and contenthandler mechanism. The alteration is too major for the individual changes to be enumerated.


git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150758 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Kelvin Tan 2002-05-08 15:52:37 +00:00
parent 5b5ea958c9
commit a716edd6d1
16 changed files with 662 additions and 354 deletions

View File

@ -26,12 +26,12 @@ package search;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache POI" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,22 +54,35 @@ package search;
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
import java.util.Map;
import java.util.Set;
/** /**
* Generic implementation of a datasource. * Generic implementation of a datasource.
*
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
*/ */
public abstract class AbstractDataSource implements DataSource public abstract class AbstractDataSource implements DataSource
{ {
protected SearchConfiguration config; protected AbstractDataSource()
public AbstractDataSource(SearchConfiguration config)
{ {
this.config = config;
} }
public SearchConfiguration getConfig() protected AbstractDataSource(Map map)
{ {
return this.config; loadFields(map);
}
/**
* Fields to index.
*/
protected String[] fields;
/**
* Convenience method to load fields to index into a Map.
*/
protected void loadFields(Map map)
{
Set fieldSet = map.keySet();
fields = new String[fieldSet.size()];
fieldSet.toArray(fields);
} }
} }

View File

@ -1,5 +1,4 @@
package search; package search;
/* ==================================================================== /* ====================================================================
* The Apache Software License, Version 1.1 * The Apache Software License, Version 1.1
* *
@ -26,12 +25,12 @@ package search;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache POI" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,28 +53,49 @@ package search;
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
import java.util.List; import java.util.Map;
/** /**
* A datasource is any source of data (filesystem, database, URL, etc) * A datasource is any source of data (filesystem, database, URL, etc)
* which is indexed by SearchIndexer. * which is indexed by SearchIndexer.
*
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
*/ */
public interface DataSource public interface DataSource
{ {
/**
* Key in the map (located in the list returned by getData)
* to represent the class name of the object being indexed.
*/
public static final String OBJECT_CLASS = "objectClass"; public static final String OBJECT_CLASS = "objectClass";
public static final String OBJECT_IDENTIFIER = "objectid";
/** /**
* Retrieve a list of Maps. Each map represents the * Key in the map (located in the list returned by getData)
* to represent the uuid of the object being indexed.
*/
public static final String OBJECT_IDENTIFIER = "objectId";
/**
* The key in the map (located in the list returned by getData)
* to represent nested datasources.
*/
public static final String NESTED_DATASOURCE = "nestedDataSource";
/**
* Key in the map (located in the list returned by getData)
* to represent the id of the datasource's container. Applies to
* nested datasources.
*/
public static final String CONTAINER_IDENTIFIER = "containerId";
/**
* Key in the map to represent the class name of the Search Result
* object for this datasource (if any).
*/
public static final String SEARCH_RESULT_CLASSNAME = "resultClassname";
/**
* Retrieve an array of Maps. Each map represents
* a document to be indexed. The key:value pair of the map * a document to be indexed. The key:value pair of the map
* is the data of the document. * is the metadata of the document.
*/ */
public List getData() throws Exception; public Map[] getData() throws Exception;
/**
* Obtain the SearchConfiguration object used to configure the datasource.
*/
public SearchConfiguration getConfig();
} }

View File

@ -26,12 +26,12 @@ package search;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -55,177 +55,263 @@ package search;
*/ */
import org.apache.log4j.Category; import org.apache.log4j.Category;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import search.util.StringUtils;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.io.Reader;
import java.util.List; import java.util.*;
import java.util.Map;
import search.util.IOUtils;
import search.contenthandler.FileContentHandler;
import search.contenthandler.ContentHandlerFactory;
/** /**
* <p> * <p>
* A document is the atomic unit used for indexing purposes. It consists of * A document is the atomic unit used for indexing purposes. It consists of
* metadata as well as its file contents. File contents are handled by {@link FileContentHandler}. * metadata as well as its file contents. File contents are handled by
* {@link ContentHandler}.
* </p> * </p>
* <p> * <p>
* DocumentHandler creates the {@link org.apache.lucene.document.Document}, * DocumentHandler creates the {@link org.apache.lucene.document.Document},
* adds the standard fields to it, delegates to {@link FileContentHandler} to handle * adds fields to it, delegates to {@link ContentHandler} to handle
* file contents, then adds to the {@link org.apache.lucene.index.IndexWriter}. * file contents.
* </p> * </p>
* <p>
* The standard fields are:<br>
* <ul>
* <li>filePath : Full filesystem path to the document
* <li>fileName : File name of the document
* <li>fileLastModifiedDate : Date the file was last modified
* <li>fileSize : Size of the file in bytes
* <li>fileFormat : Extension of the file {@see com.marketingbright.core.util.IOUtils#getFileExtension}
* </ul>
* </p>
*
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
*/ */
public class DocumentHandler public class DocumentHandler
{ {
public static final String[] STANDARD_SEARCH_FIELDS = /**
{"filePath", "fileName", "fileLastModifiedDate", "fileSize", "fileFormat"}; * Field to retrieve all documents.
private static Category cat = Category.getInstance(DocumentHandler.class.getName()); */
private static Map customFields; public static final String ALL_DOCUMENTS_FIELD = "AllDocuments";
private static final String EMPTY_STRING = "";
private static Category cat = Category.getInstance(DocumentHandler.class);
private static boolean isDebugEnabled = cat.isDebugEnabled();
/**
* Should parent documents include the data of their children?
*/
private static boolean parentEncapsulation = false;
/** /**
* Document object this DocumentHandler is handling. * Document object this DocumentHandler is handling.
*/ */
private Document doc; private Document doc;
/** /**
* Parent Document (null if none). * Map of metadata for this document. Contains the field:value pair
* to be added to the document.
*/ */
private Document parentDoc; private Map metadata;
/** /**
* IndexWriter to add this document to. * Map of fields. Contains field:type_of_field pair.
*/
private Map customFields;
/**
* IndexWriter.
*/ */
private IndexWriter writer; private IndexWriter writer;
public static void setCustomFields(Map aCustomFields) /**
{ * A collection of documents to be added to the writer.
customFields = aCustomFields; */
} private List documents = new ArrayList();
public DocumentHandler(IndexWriter writer) /**
* Ctor.
*
* @param Map of metadata for this document.
* @param Map of fields.
* @param Writer.
*/
public DocumentHandler(Map metadata,
Map customFields,
IndexWriter writer)
{ {
this.metadata = metadata;
this.customFields = customFields;
this.writer = writer; this.writer = writer;
doc = new Document();
} }
public DocumentHandler(IndexWriter writer, Document parentDoc) /**
* Handles the actual processing of the document.
*/
public void process() throws IOException, Exception
{ {
this(writer); String objectid = (String) metadata.get(DataSource.OBJECT_IDENTIFIER);
this.parentDoc = parentDoc; if (objectid == null)
} return;
doc = createDocument();
public void process(Map metadata) throws IOException addMapToDoc(metadata);
{ addNestedDataSource(metadata);
File contentFile = new File((String) metadata.get("filePath")); doc.add(Field.Text(ALL_DOCUMENTS_FIELD, ALL_DOCUMENTS_FIELD));
//documents.add(doc);
// add the standard fields if (writer != null)
doc.add(Field.Keyword("filePath", contentFile.toString()));
doc.add(Field.Text("fileName", contentFile.getName()));
doc.add(Field.Keyword("fileLastModifiedDate", DateField.timeToString(contentFile.lastModified())));
doc.add(Field.Keyword("fileSize", String.valueOf(contentFile.length())));
doc.add(Field.Text("fileFormat", IOUtils.getFileExtension(contentFile)));
// check if this is a document from datasource where
// custom fields need to be added
if (parentDoc == null)
{ {
// add the custom fields addToWriter();
for (Iterator it = customFields.keySet().iterator(); it.hasNext();)
{
String field = (String) it.next();
String value = (String) metadata.get(field);
String type = (String) customFields.get(field);
addFieldToDoc(type, field, value);
}
// Add OBJECT_CLASS_FIELD and OBJECT_IDENTIFIER
// to populate the result templates with the proper
// objects
doc.add(Field.UnIndexed(DataSource.OBJECT_CLASS,
(String) metadata.get(DataSource.OBJECT_CLASS)));
doc.add(Field.Text(DataSource.OBJECT_IDENTIFIER,
(String) metadata.get(DataSource.OBJECT_IDENTIFIER)));
} }
else else
{ {
for (Iterator it = customFields.keySet().iterator(); it.hasNext();) documents.add(doc);
{
String field = (String) it.next();
String value = parentDoc.get(field);
String type = (String) customFields.get(field);
addFieldToDoc(type, field, value);
}
// Add OBJECT_CLASS_FIELD and OBJECT_IDENTIFIER
// to populate the result templates with the proper
// objects
doc.add(Field.UnIndexed(DataSource.OBJECT_CLASS,
parentDoc.get(DataSource.OBJECT_CLASS)));
doc.add(Field.Text(DataSource.OBJECT_IDENTIFIER,
parentDoc.get(DataSource.OBJECT_IDENTIFIER)));
} }
if (!metadata.containsKey("fileContents")) }
private List getDocuments()
{
return documents;
}
private Document createDocument()
{
return new Document();
}
/**
* Add the contents of a Map to a document.
*
* @param Map to add.
*/
private void addMapToDoc(Map map)
{
for (Iterator it = map.keySet().iterator(); it.hasNext();)
{ {
String extension = IOUtils.getFileExtension(contentFile); String field = (String) it.next();
FileContentHandler cHandler = ContentHandlerFactory.getContentHandler(extension); Object value = map.get(field);
if (cHandler != null) if (value instanceof String)
{ {
cHandler.parse(doc, contentFile); String type = null;
if (cHandler.isNested()) if (customFields != null)
{ {
List nestedData = cHandler.getNestedData(); type = (String) customFields.get(field);
cat.debug("Nested data list size:" + nestedData.size());
for (int i = 0; i < nestedData.size(); i++)
{
Map dataMap = (Map) nestedData.get(i);
DocumentHandler handler = new DocumentHandler(writer, doc);
handler.process(dataMap);
}
} }
addFieldToDoc(type, field, (String) value);
}
else if (value instanceof Reader)
{
addFieldToDoc(field, (Reader) value);
}
}
}
/**
* Add nested datasources.
*
* @param Map which contains the nested datasources.
*/
private void addNestedDataSource(Map map) throws Exception
{
Object o = map.get(DataSource.NESTED_DATASOURCE);
if (o == null)
return;
if (o instanceof List)
{
List nestedDataSource = (List) o;
for (int i = 0; i < nestedDataSource.size(); i++)
{
DataSource ds = (DataSource) nestedDataSource.get(i);
addDataSource(ds);
}
}
else if (o instanceof DataSource)
{
DataSource ds = (DataSource) o;
addDataSource(ds);
}
}
/**
* Datasources are basically a collection of data maps to be indexed.
* addMapToDoc is invoked for each map.
*
* @param Datasource to add.
*/
private void addDataSource(DataSource ds) throws Exception
{
Map[] data = ds.getData();
for (int i = 0; i < data.length; i++)
{
Map map = data[i];
if (map.containsKey(DataSource.OBJECT_IDENTIFIER))
{
/**
* Create a new document because child datasources may need
* to be retrieved independently of parent doc.
*/
DocumentHandler docHandler = new DocumentHandler(map, null, null);
docHandler.process();
documents.addAll(docHandler.getDocuments());
} }
else else
{ {
cat.warn("FileContentHandler not found for " + contentFile.getName()); addMapToDoc(map);
/**
* Add nested datasources of this datasource's data
*/
addNestedDataSource(map);
} }
} }
else
doc.add(Field.Text("fileContents", (String) metadata.get("fileContents")));
addToWriter();
}
public void addToWriter() throws IOException
{
writer.addDocument(this.doc);
} }
/**
* Adds a String-based field to a document.
*
* @param Type of field.
* @param Name of field.
* @param Value of field.
*/
private void addFieldToDoc(String type, String field, String value) private void addFieldToDoc(String type, String field, String value)
{ {
if (value == null) if (value == null)
value = EMPTY_STRING; value = StringUtils.EMPTY_STRING;
if (type.equalsIgnoreCase(SearchConfiguration.TEXT_FIELD_TYPE)) if (SearchConfiguration.KEYWORD_FIELD_TYPE.equalsIgnoreCase(type))
doc.add(Field.Text(field, value));
else if (type.equalsIgnoreCase(SearchConfiguration.KEYWORD_FIELD_TYPE))
doc.add(Field.Keyword(field, value)); doc.add(Field.Keyword(field, value));
else if (type.equalsIgnoreCase(SearchConfiguration.UNINDEXED_FIELD_TYPE)) else if (SearchConfiguration.UNINDEXED_FIELD_TYPE.equalsIgnoreCase(type))
doc.add(Field.UnIndexed(field, value)); doc.add(Field.UnIndexed(field, value));
else if (type.equalsIgnoreCase(SearchConfiguration.UNSTORED_FIELD_TYPE)) else if (SearchConfiguration.UNSTORED_FIELD_TYPE.equalsIgnoreCase(type))
doc.add(Field.UnStored(field, value)); doc.add(Field.UnStored(field, value));
else
doc.add(Field.Text(field, value));
}
/**
* Adds a Reader-based field to a document.
*
* @param Name of field.
* @param Reader.
*/
private void addFieldToDoc(String field, Reader reader)
{
doc.add(Field.Text(field, reader));
}
/**
* Adds documents to the IndexWriter.
*/
private void addToWriter() throws IOException
{
if (parentEncapsulation)
{
for (int i = 0; i < documents.size(); i++)
{
Document d = (Document) documents.get(i);
for (Enumeration e = d.fields(); e.hasMoreElements();)
{
Field f = (Field) e.nextElement();
String fieldName = f.name();
if (!fieldName.equals(DataSource.CONTAINER_IDENTIFIER)
&& !fieldName.equals(DataSource.OBJECT_CLASS)
&& !fieldName.equals(DataSource.OBJECT_IDENTIFIER))
{
doc.add(f);
}
}
}
}
writer.addDocument(doc);
for (int i = 0; i < documents.size(); i++)
{
writer.addDocument((Document) documents.get(i));
}
//cat.debug((documents.size() + 1) + " documents added.");
} }
} }

View File

@ -26,12 +26,12 @@ package search;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -55,9 +55,12 @@ package search;
*/ */
import org.apache.lucene.document.DateField; import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Field; import search.contenthandler.FileContentHandler;
import search.contenthandler.FileContentHandlerFactory;
import search.util.IOUtils;
import java.io.File; import java.io.File;
import java.io.Reader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
@ -70,23 +73,38 @@ import java.util.Map;
*/ */
public class FSDataSource extends AbstractDataSource public class FSDataSource extends AbstractDataSource
{ {
private File targetDirectory; public static final String FILE_PATH_FIELD = "filePath";
public static final String FILE_NAME_FIELD = "fileName";
public static final String FILE_SIZE_FIELD = "fileSize";
public static final String FILE_FORMAT_FIELD = "fileFormat";
public static final String FILE_CONTENTS_FIELD = "fileContents";
public static final String FILE_LAST_MODIFIED_DATE_FIELD = "fileLastModifiedDate";
public FSDataSource(SearchConfiguration config) private File targetFileOrDir;
public FSDataSource(String targetFileOrDirStr)
{ {
super(config); this(new File(targetFileOrDirStr));
} }
public List getData() public FSDataSource(File targetFileOrDir)
{ {
List returnData = new ArrayList(); setTargetDirectory(targetFileOrDir);
loadDataFromFiles(targetDirectory, returnData); }
public Map[] getData()
{
Map[] returnData = null;
List temp = new ArrayList();
loadDataFromFiles(targetFileOrDir, temp);
returnData = new Map[temp.size()];
returnData = (Map[]) temp.toArray(returnData);
return returnData; return returnData;
} }
public void setTargetDirectory(File targetDirectory) public void setTargetDirectory(File targetFileOrDir)
{ {
this.targetDirectory = targetDirectory; this.targetFileOrDir = targetFileOrDir;
} }
private void loadDataFromFiles(File f, List list) private void loadDataFromFiles(File f, List list)
@ -102,8 +120,40 @@ public class FSDataSource extends AbstractDataSource
else else
{ {
Map dataMap = new HashMap(); Map dataMap = new HashMap();
dataMap.put("filePath", f.getPath()); dataMap.put(FILE_PATH_FIELD, f.getPath());
dataMap.put(FILE_NAME_FIELD, f.getName());
dataMap.put(FILE_LAST_MODIFIED_DATE_FIELD,
DateField.timeToString(f.lastModified()));
dataMap.put(FILE_SIZE_FIELD, String.valueOf(f.length()));
dataMap.put(FILE_FORMAT_FIELD,
IOUtils.getFileExtension(f));
addFileContents(f, dataMap);
list.add(dataMap); list.add(dataMap);
} }
} }
private void addFileContents(File targetFile, Map dataMap)
{
FileContentHandler cHandler =
FileContentHandlerFactory.getContentHandler(targetFile);
if (cHandler != null)
{
if (cHandler.fileContentIsReadable())
{
Reader r = cHandler.getReader();
if (r != null)
{
dataMap.put(FILE_CONTENTS_FIELD, r);
}
}
if (cHandler.containsNestedData())
{
dataMap.put(NESTED_DATASOURCE, cHandler.getNestedDataSource());
}
}
else
{
//cat.warn("ContentHandler not found for " + contentFile.getName());
}
}
} }

View File

@ -26,12 +26,12 @@ package search;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED

View File

@ -26,12 +26,12 @@ package search;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -59,7 +59,7 @@ import org.jdom.Document;
import org.jdom.Element; import org.jdom.Element;
import org.jdom.input.SAXBuilder; import org.jdom.input.SAXBuilder;
import search.util.DataUnformatFilter; import search.util.DataUnformatFilter;
import search.contenthandler.ContentHandlerFactory; import search.contenthandler.FileContentHandlerFactory;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
@ -158,7 +158,7 @@ public class SearchConfiguration
{ {
if (defaultExtension[i] != null && defaultExtension[i].equals("true")) if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
{ {
contentHandlers.put(ContentHandlerFactory.DEFAULT_HANDLER_KEY contentHandlers.put(FileContentHandlerFactory.DEFAULT_HANDLER_KEY
, generateObject(handlers[i])); , generateObject(handlers[i]));
} }
} }

View File

@ -26,12 +26,12 @@ package search;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache POI" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -53,80 +53,75 @@ package search;
* information on the Apache Software Foundation, please see * information on the Apache Software Foundation, please see
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
import org.apache.log4j.Category;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.log4j.Category;
import java.io.IOException; import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import search.contenthandler.ContentHandlerFactory; import search.contenthandler.FileContentHandlerFactory;
/** /**
* Entry point for search engine indexing. * Entry point for search engine indexing.
* <p> * <p>
* SearchIndexer is responsible for creating the IndexWriter {@see org.apache.lucene.index.IndexWriter} * SearchIndexer is responsible for creating the IndexWriter
* and passing it to DocumentHandlers {@link DocumentHandler} to index individual documents. * {@see org.apache.lucene.index.IndexWriter} and passing it to
* DocumentHandlers {@link DocumentHandler} to index individual documents.
* </p> * </p>
*
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
*/ */
public class SearchIndexer public class SearchIndexer
{ {
private static Category cat = Category.getInstance(SearchIndexer.class); private static Category cat = Category.getInstance(SearchIndexer.class);
private IndexWriter fsWriter;
private IndexWriter writer; private SearchConfiguration config;
private DataSource source;
private int indexedDocuments = 0; private int indexedDocuments = 0;
public SearchIndexer() throws IOException public SearchIndexer() throws IOException
{ {
writer = new IndexWriter("/usr/local/lucene/index", Analyzer a = new StandardAnalyzer();
new StandardAnalyzer(), true); String indexDirectory = "/usr/path/to/index";
fsWriter = new IndexWriter(indexDirectory, a, true);
fsWriter.maxFieldLength = 1000000;
} }
public void index() throws IOException, Exception /**
* Indexes documents.
*/
public synchronized void index() throws IOException, Exception
{ {
cat.debug("Initiating indexing..."); cat.debug("Initiating search engine indexing...");
long start = System.currentTimeMillis();
init(); loadConfig();
List dataMapList = source.getData(); fsWriter.optimize();
for (int i = 0; i < dataMapList.size(); i++) fsWriter.close();
{ long stop = System.currentTimeMillis();
Map map = (Map) dataMapList.get(i); cat.debug("Indexing took " + (stop - start) + " milliseconds");
DocumentHandler docHandler = new DocumentHandler(writer);
try
{
docHandler.process(map);
++indexedDocuments;
}
catch (IOException ioe)
{
cat.error("Error encountered indexing:" + ioe.getMessage(),
ioe);
}
}
writer.optimize();
writer.close();
cat.debug(indexedDocuments + " documents were indexed.");
}
public void setSource(DataSource source)
{
this.source = source;
}
public void init()
{
ContentHandlerFactory.setContentHandlers(source.getConfig().getContentHandlers());
DocumentHandler.setCustomFields(source.getConfig().getCustomFields());
} }
public int getIndexedDocuments() public int getIndexedDocuments()
{ {
return this.indexedDocuments; return this.indexedDocuments;
} }
private void loadConfig() throws IllegalConfigurationException
{
config = new SearchConfiguration("/path/to/config");
FileContentHandlerFactory.setHandlerRegistry(config.getContentHandlers());
}
private void indexDataSource(DataSource source, Map customFields)
throws Exception
{
Map[] data = source.getData();
// here's a good place to spawn a couple of threads for indexing
for (int i = 0; i < data.length; i++)
{
DocumentHandler docHandler =
new DocumentHandler(data[i], customFields, fsWriter);
docHandler.process();
}
}
} }

View File

@ -26,12 +26,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,33 +54,35 @@ package search.contenthandler;
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
import org.apache.lucene.document.Document; import java.io.Reader;
import java.io.File;
import java.util.List; import java.util.List;
/** /**
* A content handler determines how to index a file's contents. * A content handler determines how to index a file's contents.
*
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
*/ */
public interface FileContentHandler public interface FileContentHandler
{ {
/** /**
* Perform filetype-specific actions to index the file's contents and * Do the file contents of this file have any meaning? Should
* add it to the {@link org.apache.lucene.document.Document} object. * its contents be indexed?
*/ */
public void parse(Document doc, File f); public boolean fileContentIsReadable();
/** /**
* Is this a collection of files? * Returns a reader for this file's contents.
*/ */
public boolean isNested(); public Reader getReader();
/** /**
* Return the collection of files contained within the parent file. * Does this file have nested data within?
*/ */
public List getNestedData(); public boolean containsNestedData();
public Object clone(); /**
* Return the datasources contained within the parent file.
* This can be URLs contained within a HTML file, files
* within a ZIP file, basically anything represented by a
* DataSource.
*/
public List getNestedDataSource();
} }

View File

@ -26,12 +26,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,9 +54,8 @@ package search.contenthandler;
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
import org.apache.lucene.document.Document;
import java.io.File; import java.io.File;
import java.io.Reader;
import java.util.List; import java.util.List;
/** /**
@ -70,12 +69,20 @@ import java.util.List;
*/ */
public abstract class FileContentHandlerAdapter implements FileContentHandler public abstract class FileContentHandlerAdapter implements FileContentHandler
{ {
public void parse(Document doc, File f) protected File file;
protected FileContentHandlerAdapter(File file)
{ {
this.file = file;
} }
public List getNestedData()
/**
* Default implementation that supplies no reader.
* NOTE(review): presumably handlers with readable content override
* this - confirm against the concrete subclasses.
*
* @return always null
*/
public Reader getReader()
{
return null;
}
public List getNestedDataSource()
{ {
return null; return null;
} }
public abstract Object clone();
} }

View File

@ -26,12 +26,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -57,29 +57,123 @@ package search.contenthandler;
import org.apache.log4j.Category; import org.apache.log4j.Category;
import java.util.Map; import java.util.Map;
import java.io.File;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Constructor;
import search.util.IOUtils;
/** /**
* Factory responsible for obtaining ContentHandlers. * Factory responsible for obtaining ContentHandlers.
* *
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a> * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
*/ */
public abstract class ContentHandlerFactory public abstract class FileContentHandlerFactory
{ {
public static final String DEFAULT_HANDLER_KEY = "DEFAULT"; public static final String DEFAULT_HANDLER_KEY = "DEFAULT";
static Category cat = Category.getInstance(ContentHandlerFactory.class.getName()); static Category cat = Category.getInstance(FileContentHandlerFactory.class.getName());
private static Map handlerCache = null; private static Map handlerRegistry;
public static FileContentHandler getContentHandler(String extension)
public static FileContentHandler getContentHandler(File f)
{ {
if (handlerCache.containsKey(extension)) String extension = IOUtils.getFileExtension(f);
return (FileContentHandler) ((FileContentHandler) handlerCache.get(extension)).clone(); if (handlerRegistry.containsKey(extension))
else if (handlerCache.containsKey(DEFAULT_HANDLER_KEY)) {
return (FileContentHandler) ((FileContentHandler) handlerCache.get(DEFAULT_HANDLER_KEY)).clone(); String handlerClassname = (String) handlerRegistry.get(extension);
return (FileContentHandler) generateObject(handlerClassname,
new Class[]{File.class},
new Object[]{f});
}
else if (handlerRegistry.containsKey(DEFAULT_HANDLER_KEY))
{
String handlerClassname = (String) handlerRegistry.get(DEFAULT_HANDLER_KEY);
return (FileContentHandler) generateObject(handlerClassname);
}
else else
{
return NullHandler.getInstance(); return NullHandler.getInstance();
}
} }
public static void setContentHandlers(Map contentHandlers) public static void setHandlerRegistry(Map handlerRegistry)
{ {
handlerCache = contentHandlers; FileContentHandlerFactory.handlerRegistry = handlerRegistry;
}
/**
 * Creates an instance of the named class through its no-argument
 * constructor. Lookup, instantiation and access failures are logged and
 * reported to the caller as a null result rather than an exception.
 *
 * @param className name of the class to instantiate, as accepted by
 *                  {@link Class#forName(String)}
 * @return the new instance, or null if it could not be created
 */
private static Object generateObject(String className)
{
    try
    {
        return Class.forName(className).newInstance();
    }
    catch (ClassNotFoundException notFound)
    {
        cat.error(notFound.getMessage() + " No class named '" + className + "' was found.", notFound);
    }
    catch (InstantiationException notInstantiable)
    {
        cat.error(notInstantiable.getMessage() + " Class named '" + className + "' could not be instantiated.", notInstantiable);
    }
    catch (IllegalAccessException noAccess)
    {
        cat.error(noAccess.getMessage() + " No access to class named '" + className + "'.", noAccess);
    }
    // Reached only after one of the failures above has been logged.
    return null;
}
/**
* Utility method to return an object based on its class name.
*
* @param type Class name of object to be generated
* @param clazz Class array of parameters.
* @param args Object array of arguments.
* @return Object
*/
private static Object generateObject(String className,
Class[] clazz,
Object[] args)
{
Object o = null;
try
{
Class c = Class.forName(className);
Constructor con = c.getConstructor(clazz);
if (con != null)
{
o = con.newInstance(args);
}
else
throw new InstantiationException("Constructor with arguments:" + clazz.toString() + " non-existent.");
}
catch (ClassNotFoundException cnfe)
{
cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
}
catch (InstantiationException ie)
{
cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
}
catch (IllegalAccessException iae)
{
cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
}
catch (NoSuchMethodException nsme)
{
cat.error(nsme.getMessage() + " No method in class named '" + className + "'.", nsme);
}
catch (InvocationTargetException ite)
{
cat.error(ite.getMessage() + " in class named '" + className + "'.", ite);
}
return o;
} }
} }

View File

@ -26,12 +26,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -55,17 +55,14 @@ package search.contenthandler;
*/ */
import org.apache.log4j.Category; import org.apache.log4j.Category;
import org.apache.lucene.document.DateField; import search.DataSource;
import org.apache.lucene.document.Document; import search.FSDataSource;
import search.util.IOUtils;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.io.Reader;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import search.util.IOUtils;
/** /**
* Handles GZip content. * Handles GZip content.
@ -74,51 +71,60 @@ import search.util.IOUtils;
*/ */
public class GZipHandler extends NestedFileContentHandlerAdapter public class GZipHandler extends NestedFileContentHandlerAdapter
{ {
static Category cat = Category.getInstance(GZipHandler.class.getName()); private static Category cat = Category.getInstance(GZipHandler.class.getName());
public void parse(Document doc, File f) public GZipHandler(File file)
{ {
if (!f.exists()) super(file);
return; }
/**
* This handler supplies no reader for the gzip file itself; its
* fileContentIsReadable() reports false accordingly.
*
* @return always null
*/
public Reader getReader()
{
return null;
}
public List getNestedDataSource()
{
if (!file.exists())
return null;
try try
{ {
File tempDir = new File(TEMP_FOLDER); File tempDir = new File(TEMP_FOLDER);
tempDir.mkdirs(); tempDir.mkdirs();
tempDir.deleteOnExit(); tempDir.deleteOnExit();
String filename = f.getName(); String filename = file.getName();
File tempFile = new File(tempDir, filename.substring(0, filename.lastIndexOf("."))); File tempFile = new File(tempDir, filename.substring(0, filename.lastIndexOf(".")));
tempFile.deleteOnExit(); tempFile.deleteOnExit();
IOUtils.extractGZip(f, tempFile); IOUtils.extractGZip(file, tempFile);
indexGZipDirectory(tempDir, dataMapList); indexGZipDirectory(tempDir);
} }
catch (IOException ioe) catch (IOException ioe)
{ {
cat.error("IOException ungzipping " + f.toString(), ioe); cat.error("IOException ungzipping " + file.toString(), ioe);
} }
return nestedDataSource;
}
public boolean fileContentIsReadable()
{
return false;
} }
// only one file, but let's just treat it like a directory anyway // only one file, but let's just treat it like a directory anyway
private void indexGZipDirectory(File dir, List dataMapList) private void indexGZipDirectory(File dir)
{ {
if (dir.isDirectory()) if (dir.isDirectory())
{ {
File[] dirContents = dir.listFiles(); File[] dirContents = dir.listFiles();
for (int i = 0; i < dirContents.length; i++) for (int i = 0; i < dirContents.length; i++)
{ {
indexGZipDirectory(dirContents[i], dataMapList); indexGZipDirectory(dirContents[i]);
} }
} }
else if (dir.isFile()) else if (dir.isFile())
{ {
// here create new DataMap for the gzip entry DataSource ds = new FSDataSource(dir);
Map dataMap = new HashMap(); nestedDataSource.add(nestedDataSource);
dataMap.put("filePath", dir.toString());
dataMapList.add(dataMap);
} }
} }
public Object clone()
{
return new GZipHandler();
}
} }

View File

@ -26,12 +26,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -76,17 +76,15 @@ public abstract class NestedFileContentHandlerAdapter
protected final String TEMP_FOLDER = "/usr/temp" + '/' protected final String TEMP_FOLDER = "/usr/temp" + '/'
+ Math.random() + '/'; + Math.random() + '/';
protected List dataMapList = new ArrayList(); protected List nestedDataSource;
public abstract void parse(Document doc, File f); public NestedFileContentHandlerAdapter(File file)
{
super(file);
}
public boolean isNested() public boolean containsNestedData()
{ {
return true; return true;
} }
public List getNestedData()
{
return this.dataMapList;
}
} }

View File

@ -1,5 +1,8 @@
package search.contenthandler; package search.contenthandler;
import java.io.File;
import java.io.Reader;
/* ==================================================================== /* ====================================================================
* The Apache Software License, Version 1.1 * The Apache Software License, Version 1.1
* *
@ -26,12 +29,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -61,19 +64,29 @@ package search.contenthandler;
*/ */
public class NullHandler extends FileContentHandlerAdapter public class NullHandler extends FileContentHandlerAdapter
{ {
static NullHandler singleton = new NullHandler(); private static NullHandler singleton = new NullHandler(null);
public static FileContentHandler getInstance() public static FileContentHandler getInstance()
{ {
return singleton; return singleton;
} }
public Object clone() private NullHandler(File file)
{ {
return this; super(file);
} }
public boolean isNested() public boolean fileContentIsReadable()
{
return false;
}
/**
* The null handler supplies no reader; its fileContentIsReadable()
* reports false accordingly.
*
* @return always null
*/
public Reader getReader()
{
return null;
}
public boolean containsNestedData()
{ {
return false; return false;
} }

View File

@ -26,12 +26,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,17 +54,16 @@ package search.contenthandler;
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
import search.util.IOUtils;
import org.apache.log4j.Category; import org.apache.log4j.Category;
import org.apache.lucene.document.DateField; import search.DataSource;
import org.apache.lucene.document.Document; import search.FSDataSource;
import search.util.IOUtils;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
/** /**
* Handles Tar files. * Handles Tar files.
@ -75,44 +74,58 @@ public class TARHandler extends NestedFileContentHandlerAdapter
{ {
static Category cat = Category.getInstance(TARHandler.class.getName()); static Category cat = Category.getInstance(TARHandler.class.getName());
public void parse(Document doc, File f) public TARHandler(File file)
{ {
if (!f.exists()) super(file);
return; }
/**
* This handler supplies no reader for the tar file itself.
*
* @return always null
*/
public Reader getReader()
{
return null;
}
/**
* Reports that the tar file's own contents are not to be read directly.
* NOTE(review): presumably the archive's entries are meant to be reached
* through getNestedDataSource() instead - confirm.
*
* @return always false
*/
public boolean fileContentIsReadable()
{
return false;
}
public List getNestedDataSource()
{
if (!file.exists())
return null;
if (nestedDataSource == null)
{
nestedDataSource = new ArrayList();
}
try try
{ {
File tempDir = new File(TEMP_FOLDER); File tempDir = new File(TEMP_FOLDER);
tempDir.deleteOnExit(); tempDir.deleteOnExit();
IOUtils.extractTar(f, tempDir); IOUtils.extractTar(file, tempDir);
indexTarDirectory(tempDir, dataMapList); indexTarDirectory(tempDir);
} }
catch (IOException ioe) catch (IOException ioe)
{ {
cat.error(ioe.getMessage(), ioe); cat.error(ioe.getMessage(), ioe);
} }
return nestedDataSource;
} }
private void indexTarDirectory(File dir, List dataMapList) private void indexTarDirectory(File dir)
{ {
if (dir.isDirectory()) if (dir.isDirectory())
{ {
File[] dirContents = dir.listFiles(); File[] dirContents = dir.listFiles();
for (int i = 0; i < dirContents.length; i++) for (int i = 0; i < dirContents.length; i++)
{ {
indexTarDirectory(dirContents[i], dataMapList); indexTarDirectory(dirContents[i]);
} }
} }
else if (dir.isFile()) else if (dir.isFile())
{ {
// here create new DataMap for the tarred file // here create new DataMap for the tarred file
Map dataMap = new HashMap(); DataSource ds = new FSDataSource(dir);
dataMap.put("filePath", dir.toString()); nestedDataSource.add(nestedDataSource);
dataMapList.add(dataMap);
} }
} }
public Object clone()
{
return new TARHandler();
}
} }

View File

@ -26,12 +26,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -71,37 +71,37 @@ public class TextHandler extends FileContentHandlerAdapter
{ {
static Category cat = Category.getInstance(TextHandler.class.getName()); static Category cat = Category.getInstance(TextHandler.class.getName());
public void parse(Document doc, File f) public TextHandler(File file)
{ {
if (!f.exists()) super(file);
{
cat.error(f.toString() + " doesn't exist! Failing silently...");
return;
}
doc.add(Field.Text("fileContents", getReader(f)));
} }
public boolean isNested() public Reader getReader()
{
if (!file.exists())
{
cat.error(file.toString() + " doesn't exist! Failing silently...");
return null;
}
return getReader(file);
}
public boolean containsNestedData()
{ {
return false; return false;
} }
/**
* Reports that a text file's contents can be read; the contents are
* made available through getReader().
*
* @return always true
*/
public boolean fileContentIsReadable()
{
return true;
}
private Reader getReader(File f) private Reader getReader(File f)
{ {
Reader reader = null; Reader reader = null;
try try
{ {
BufferedReader br = new BufferedReader(new FileReader(f)); reader = new FileReader(f);
String s = null;
StringBuffer strbf = new StringBuffer();
while ((s = br.readLine()) != null)
{
if (s.trim().length() > 0)
{
strbf.append(StringUtils.removeUnreadableCharacters(s));
}
}
reader = new StringReader(strbf.toString());
} }
catch (FileNotFoundException nfe) catch (FileNotFoundException nfe)
{ {
@ -113,9 +113,4 @@ public class TextHandler extends FileContentHandlerAdapter
} }
return reader; return reader;
} }
public Object clone()
{
return new TextHandler();
}
} }

View File

@ -26,12 +26,12 @@ package search.contenthandler;
* if and wherever such third-party acknowledgments normally appear. * if and wherever such third-party acknowledgments normally appear.
* *
* 4. The names "Apache" and "Apache Software Foundation" and * 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Turbine" must not be used to endorse or promote products * "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For * derived from this software without prior written permission. For
* written permission, please contact apache@apache.org. * written permission, please contact apache@apache.org.
* *
* 5. Products derived from this software may not be called "Apache", * 5. Products derived from this software may not be called "Apache",
* "Apache Turbine", nor may "Apache" appear in their name, without * "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation. * prior written permission of the Apache Software Foundation.
* *
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,15 +54,17 @@ package search.contenthandler;
* <http://www.apache.org/>. * <http://www.apache.org/>.
*/ */
import search.util.IOUtils;
import org.apache.log4j.Category; import org.apache.log4j.Category;
import org.apache.lucene.document.Document; import search.DataSource;
import search.FSDataSource;
import search.util.IOUtils;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashMap; import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipException; import java.util.zip.ZipException;
import java.util.zip.ZipFile; import java.util.zip.ZipFile;
@ -74,15 +76,34 @@ import java.util.zip.ZipFile;
*/ */
public class ZIPHandler extends NestedFileContentHandlerAdapter public class ZIPHandler extends NestedFileContentHandlerAdapter
{ {
static Category cat = Category.getInstance(ZIPHandler.class.getName()); private static Category cat = Category.getInstance(ZIPHandler.class);
public void parse(Document doc, File f) public ZIPHandler(File file)
{ {
if (!f.exists()) super(file);
return; }
/**
* Reports that the zip file's own contents are not to be read directly.
* NOTE(review): presumably the archive's entries are meant to be reached
* through getNestedDataSource() instead - confirm.
*
* @return always false
*/
public boolean fileContentIsReadable()
{
return false;
}
/**
* This handler supplies no reader for the zip file itself.
*
* @return always null
*/
public Reader getReader()
{
return null;
}
public List getNestedDataSource()
{
if (!file.exists())
return null;
if (nestedDataSource == null)
{
nestedDataSource = new ArrayList();
}
try try
{ {
ZipFile zFile = new ZipFile(f); ZipFile zFile = new ZipFile(file);
for (Enumeration e = zFile.entries(); e.hasMoreElements();) for (Enumeration e = zFile.entries(); e.hasMoreElements();)
{ {
ZipEntry entry = (ZipEntry) e.nextElement(); ZipEntry entry = (ZipEntry) e.nextElement();
@ -92,9 +113,8 @@ public class ZIPHandler extends NestedFileContentHandlerAdapter
if (!entry.isDirectory()) if (!entry.isDirectory())
{ {
// create a new DataMap for each zip entry // create a new DataMap for each zip entry
Map dataMap = new HashMap(); DataSource ds = new FSDataSource(TEMP_FOLDER + entryName);
dataMap.put("filePath", TEMP_FOLDER + entryName); nestedDataSource.add(ds);
dataMapList.add(dataMap);
} }
} }
zFile.close(); zFile.close();
@ -107,10 +127,6 @@ public class ZIPHandler extends NestedFileContentHandlerAdapter
{ {
cat.error("IOException parsing zip:" + ioe.getMessage(), ioe); cat.error("IOException parsing zip:" + ioe.getMessage(), ioe);
} }
} return nestedDataSource;
public Object clone()
{
return new ZIPHandler();
} }
} }