mirror of https://github.com/apache/lucene.git
Importing the classes seems to have warped the whitespace. Here's my attempt to get things back to normal.
Introduced a new datasource and contenthandler mechanism. The alteration is too large for the individual changes to be enumerated. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150758 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5b5ea958c9
commit
a716edd6d1
|
@ -26,12 +26,12 @@ package search;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -54,22 +54,35 @@ package search;
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generic implementation of a datasource.
|
* Generic implementation of a datasource.
|
||||||
*
|
|
||||||
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
|
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractDataSource implements DataSource
|
public abstract class AbstractDataSource implements DataSource
|
||||||
{
|
{
|
||||||
protected SearchConfiguration config;
|
protected AbstractDataSource()
|
||||||
|
|
||||||
public AbstractDataSource(SearchConfiguration config)
|
|
||||||
{
|
{
|
||||||
this.config = config;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public SearchConfiguration getConfig()
|
protected AbstractDataSource(Map map)
|
||||||
{
|
{
|
||||||
return this.config;
|
loadFields(map);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fields to index.
|
||||||
|
*/
|
||||||
|
protected String[] fields;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convenience method to load fields to index into a Map.
|
||||||
|
*/
|
||||||
|
protected void loadFields(Map map)
|
||||||
|
{
|
||||||
|
Set fieldSet = map.keySet();
|
||||||
|
fields = new String[fieldSet.size()];
|
||||||
|
fieldSet.toArray(fields);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
package search;
|
package search;
|
||||||
|
|
||||||
/* ====================================================================
|
/* ====================================================================
|
||||||
* The Apache Software License, Version 1.1
|
* The Apache Software License, Version 1.1
|
||||||
*
|
*
|
||||||
|
@ -26,12 +25,12 @@ package search;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -54,28 +53,49 @@ package search;
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A datasource is any source of data (filesystem, database, URL, etc)
|
* A datasource is any source of data (filesystem, database, URL, etc)
|
||||||
* which is indexed by SearchIndexer.
|
* which is indexed by SearchIndexer.
|
||||||
*
|
|
||||||
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
|
|
||||||
*/
|
*/
|
||||||
public interface DataSource
|
public interface DataSource
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* Key in the map (located in the list returned by getData)
|
||||||
|
* to represent the class name of the object being indexed.
|
||||||
|
*/
|
||||||
public static final String OBJECT_CLASS = "objectClass";
|
public static final String OBJECT_CLASS = "objectClass";
|
||||||
public static final String OBJECT_IDENTIFIER = "objectid";
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieve a list of Maps. Each map represents the
|
* Key in the map (located in the list returned by getData)
|
||||||
|
* to represent the uuid of the object being indexed.
|
||||||
|
*/
|
||||||
|
public static final String OBJECT_IDENTIFIER = "objectId";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The key in the map (located in the list returned by getData)
|
||||||
|
* to represent nested datasources.
|
||||||
|
*/
|
||||||
|
public static final String NESTED_DATASOURCE = "nestedDataSource";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Key in the map (located in the list returned by getData)
|
||||||
|
* to represent the id of the datasource's container. Applies to
|
||||||
|
* nested datasources.
|
||||||
|
*/
|
||||||
|
public static final String CONTAINER_IDENTIFIER = "containerId";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Key in the map to represent the class name of the Search Result
|
||||||
|
* object for this datasource (if any).
|
||||||
|
*/
|
||||||
|
public static final String SEARCH_RESULT_CLASSNAME = "resultClassname";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Retrieve an array of Maps. Each map represents
|
||||||
* a document to be indexed. The key:value pair of the map
|
* a document to be indexed. The key:value pair of the map
|
||||||
* is the data of the document.
|
* is the metadata of the document.
|
||||||
*/
|
*/
|
||||||
public List getData() throws Exception;
|
public Map[] getData() throws Exception;
|
||||||
|
|
||||||
/**
|
|
||||||
* Obtain the SearchConfiguration object used to configure the datasource.
|
|
||||||
*/
|
|
||||||
public SearchConfiguration getConfig();
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -55,177 +55,263 @@ package search;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.log4j.Category;
|
import org.apache.log4j.Category;
|
||||||
import org.apache.lucene.document.DateField;
|
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import search.util.StringUtils;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Iterator;
|
import java.io.Reader;
|
||||||
import java.util.List;
|
import java.util.*;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import search.util.IOUtils;
|
|
||||||
import search.contenthandler.FileContentHandler;
|
|
||||||
import search.contenthandler.ContentHandlerFactory;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* <p>
|
* <p>
|
||||||
* A document is the atomic unit used for indexing purposes. It consists of
|
* A document is the atomic unit used for indexing purposes. It consists of
|
||||||
* metadata as well as its file contents. File contents are handled by {@link FileContentHandler}.
|
* metadata as well as its file contents. File contents are handled by
|
||||||
|
* {@link ContentHandler}.
|
||||||
* </p>
|
* </p>
|
||||||
* <p>
|
* <p>
|
||||||
* DocumentHandler creates the {@link org.apache.lucene.document.Document},
|
* DocumentHandler creates the {@link org.apache.lucene.document.Document},
|
||||||
* adds the standard fields to it, delegates to {@link FileContentHandler} to handle
|
* adds fields to it, delegates to {@link ContentHandler} to handle
|
||||||
* file contents, then adds to the {@link org.apache.lucene.index.IndexWriter}.
|
* file contents.
|
||||||
* </p>
|
* </p>
|
||||||
* <p>
|
|
||||||
* The standard fields are:<br>
|
|
||||||
* <ul>
|
|
||||||
* <li>filePath : Full filesystem path to the document
|
|
||||||
* <li>fileName : File name of the document
|
|
||||||
* <li>fileLastModifiedDate : Date the file was last modified
|
|
||||||
* <li>fileSize : Size of the file in bytes
|
|
||||||
* <li>fileFormat : Extension of the file {@see com.marketingbright.core.util.IOUtils#getFileExtension}
|
|
||||||
* </ul>
|
|
||||||
* </p>
|
|
||||||
*
|
|
||||||
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
|
|
||||||
*/
|
*/
|
||||||
public class DocumentHandler
|
public class DocumentHandler
|
||||||
{
|
{
|
||||||
public static final String[] STANDARD_SEARCH_FIELDS =
|
/**
|
||||||
{"filePath", "fileName", "fileLastModifiedDate", "fileSize", "fileFormat"};
|
* Field to retrieve all documents.
|
||||||
private static Category cat = Category.getInstance(DocumentHandler.class.getName());
|
*/
|
||||||
private static Map customFields;
|
public static final String ALL_DOCUMENTS_FIELD = "AllDocuments";
|
||||||
private static final String EMPTY_STRING = "";
|
|
||||||
|
|
||||||
|
private static Category cat = Category.getInstance(DocumentHandler.class);
|
||||||
|
|
||||||
|
private static boolean isDebugEnabled = cat.isDebugEnabled();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should parent documents include the data of their children?
|
||||||
|
*/
|
||||||
|
private static boolean parentEncapsulation = false;
|
||||||
/**
|
/**
|
||||||
* Document object this DocumentHandler is handling.
|
* Document object this DocumentHandler is handling.
|
||||||
*/
|
*/
|
||||||
private Document doc;
|
private Document doc;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parent Document (null if none).
|
* Map of metadata for this document. Contains the field:value pair
|
||||||
|
* to be added to the document.
|
||||||
*/
|
*/
|
||||||
private Document parentDoc;
|
private Map metadata;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* IndexWriter to add this document to.
|
* Map of fields. Contains field:type_of_field pair.
|
||||||
|
*/
|
||||||
|
private Map customFields;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* IndexWriter.
|
||||||
*/
|
*/
|
||||||
private IndexWriter writer;
|
private IndexWriter writer;
|
||||||
|
|
||||||
public static void setCustomFields(Map aCustomFields)
|
/**
|
||||||
{
|
* A collection of documents to be added to the writer.
|
||||||
customFields = aCustomFields;
|
*/
|
||||||
}
|
private List documents = new ArrayList();
|
||||||
|
|
||||||
public DocumentHandler(IndexWriter writer)
|
/**
|
||||||
|
* Ctor.
|
||||||
|
*
|
||||||
|
* @param Map of metadata for this document.
|
||||||
|
* @param Map of fields.
|
||||||
|
* @param Writer.
|
||||||
|
*/
|
||||||
|
public DocumentHandler(Map metadata,
|
||||||
|
Map customFields,
|
||||||
|
IndexWriter writer)
|
||||||
{
|
{
|
||||||
|
this.metadata = metadata;
|
||||||
|
this.customFields = customFields;
|
||||||
this.writer = writer;
|
this.writer = writer;
|
||||||
doc = new Document();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public DocumentHandler(IndexWriter writer, Document parentDoc)
|
/**
|
||||||
|
* Handles the actual processing of the document.
|
||||||
|
*/
|
||||||
|
public void process() throws IOException, Exception
|
||||||
{
|
{
|
||||||
this(writer);
|
String objectid = (String) metadata.get(DataSource.OBJECT_IDENTIFIER);
|
||||||
this.parentDoc = parentDoc;
|
if (objectid == null)
|
||||||
}
|
return;
|
||||||
|
doc = createDocument();
|
||||||
public void process(Map metadata) throws IOException
|
addMapToDoc(metadata);
|
||||||
{
|
addNestedDataSource(metadata);
|
||||||
File contentFile = new File((String) metadata.get("filePath"));
|
doc.add(Field.Text(ALL_DOCUMENTS_FIELD, ALL_DOCUMENTS_FIELD));
|
||||||
|
//documents.add(doc);
|
||||||
// add the standard fields
|
if (writer != null)
|
||||||
doc.add(Field.Keyword("filePath", contentFile.toString()));
|
|
||||||
doc.add(Field.Text("fileName", contentFile.getName()));
|
|
||||||
doc.add(Field.Keyword("fileLastModifiedDate", DateField.timeToString(contentFile.lastModified())));
|
|
||||||
doc.add(Field.Keyword("fileSize", String.valueOf(contentFile.length())));
|
|
||||||
doc.add(Field.Text("fileFormat", IOUtils.getFileExtension(contentFile)));
|
|
||||||
|
|
||||||
// check if this is a document from datasource where
|
|
||||||
// custom fields need to be added
|
|
||||||
if (parentDoc == null)
|
|
||||||
{
|
{
|
||||||
// add the custom fields
|
addToWriter();
|
||||||
for (Iterator it = customFields.keySet().iterator(); it.hasNext();)
|
|
||||||
{
|
|
||||||
String field = (String) it.next();
|
|
||||||
String value = (String) metadata.get(field);
|
|
||||||
String type = (String) customFields.get(field);
|
|
||||||
addFieldToDoc(type, field, value);
|
|
||||||
}
|
|
||||||
// Add OBJECT_CLASS_FIELD and OBJECT_IDENTIFIER
|
|
||||||
// to populate the result templates with the proper
|
|
||||||
// objects
|
|
||||||
doc.add(Field.UnIndexed(DataSource.OBJECT_CLASS,
|
|
||||||
(String) metadata.get(DataSource.OBJECT_CLASS)));
|
|
||||||
doc.add(Field.Text(DataSource.OBJECT_IDENTIFIER,
|
|
||||||
(String) metadata.get(DataSource.OBJECT_IDENTIFIER)));
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
for (Iterator it = customFields.keySet().iterator(); it.hasNext();)
|
documents.add(doc);
|
||||||
{
|
|
||||||
String field = (String) it.next();
|
|
||||||
String value = parentDoc.get(field);
|
|
||||||
String type = (String) customFields.get(field);
|
|
||||||
addFieldToDoc(type, field, value);
|
|
||||||
}
|
|
||||||
// Add OBJECT_CLASS_FIELD and OBJECT_IDENTIFIER
|
|
||||||
// to populate the result templates with the proper
|
|
||||||
// objects
|
|
||||||
doc.add(Field.UnIndexed(DataSource.OBJECT_CLASS,
|
|
||||||
parentDoc.get(DataSource.OBJECT_CLASS)));
|
|
||||||
doc.add(Field.Text(DataSource.OBJECT_IDENTIFIER,
|
|
||||||
parentDoc.get(DataSource.OBJECT_IDENTIFIER)));
|
|
||||||
}
|
}
|
||||||
if (!metadata.containsKey("fileContents"))
|
}
|
||||||
|
|
||||||
|
private List getDocuments()
|
||||||
|
{
|
||||||
|
return documents;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Document createDocument()
|
||||||
|
{
|
||||||
|
return new Document();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add the contents of a Map to a document.
|
||||||
|
*
|
||||||
|
* @param Map to add.
|
||||||
|
*/
|
||||||
|
private void addMapToDoc(Map map)
|
||||||
|
{
|
||||||
|
for (Iterator it = map.keySet().iterator(); it.hasNext();)
|
||||||
{
|
{
|
||||||
String extension = IOUtils.getFileExtension(contentFile);
|
String field = (String) it.next();
|
||||||
FileContentHandler cHandler = ContentHandlerFactory.getContentHandler(extension);
|
Object value = map.get(field);
|
||||||
if (cHandler != null)
|
if (value instanceof String)
|
||||||
{
|
{
|
||||||
cHandler.parse(doc, contentFile);
|
String type = null;
|
||||||
if (cHandler.isNested())
|
if (customFields != null)
|
||||||
{
|
{
|
||||||
List nestedData = cHandler.getNestedData();
|
type = (String) customFields.get(field);
|
||||||
cat.debug("Nested data list size:" + nestedData.size());
|
|
||||||
for (int i = 0; i < nestedData.size(); i++)
|
|
||||||
{
|
|
||||||
Map dataMap = (Map) nestedData.get(i);
|
|
||||||
DocumentHandler handler = new DocumentHandler(writer, doc);
|
|
||||||
handler.process(dataMap);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
addFieldToDoc(type, field, (String) value);
|
||||||
|
}
|
||||||
|
else if (value instanceof Reader)
|
||||||
|
{
|
||||||
|
addFieldToDoc(field, (Reader) value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add nested datasources.
|
||||||
|
*
|
||||||
|
* @param Map which contains the nested datasources.
|
||||||
|
*/
|
||||||
|
private void addNestedDataSource(Map map) throws Exception
|
||||||
|
{
|
||||||
|
Object o = map.get(DataSource.NESTED_DATASOURCE);
|
||||||
|
if (o == null)
|
||||||
|
return;
|
||||||
|
if (o instanceof List)
|
||||||
|
{
|
||||||
|
List nestedDataSource = (List) o;
|
||||||
|
for (int i = 0; i < nestedDataSource.size(); i++)
|
||||||
|
{
|
||||||
|
DataSource ds = (DataSource) nestedDataSource.get(i);
|
||||||
|
addDataSource(ds);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (o instanceof DataSource)
|
||||||
|
{
|
||||||
|
DataSource ds = (DataSource) o;
|
||||||
|
addDataSource(ds);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Datasources are basically a collection of data maps to be indexed.
|
||||||
|
* addMapToDoc is invoked for each map.
|
||||||
|
*
|
||||||
|
* @param Datasource to add.
|
||||||
|
*/
|
||||||
|
private void addDataSource(DataSource ds) throws Exception
|
||||||
|
{
|
||||||
|
Map[] data = ds.getData();
|
||||||
|
for (int i = 0; i < data.length; i++)
|
||||||
|
{
|
||||||
|
Map map = data[i];
|
||||||
|
if (map.containsKey(DataSource.OBJECT_IDENTIFIER))
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Create a new document because child datasources may need
|
||||||
|
* to be retrieved independently of parent doc.
|
||||||
|
*/
|
||||||
|
DocumentHandler docHandler = new DocumentHandler(map, null, null);
|
||||||
|
docHandler.process();
|
||||||
|
documents.addAll(docHandler.getDocuments());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
cat.warn("FileContentHandler not found for " + contentFile.getName());
|
addMapToDoc(map);
|
||||||
|
/**
|
||||||
|
* Add nested datasources of this datasource's data
|
||||||
|
*/
|
||||||
|
addNestedDataSource(map);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
doc.add(Field.Text("fileContents", (String) metadata.get("fileContents")));
|
|
||||||
addToWriter();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addToWriter() throws IOException
|
|
||||||
{
|
|
||||||
writer.addDocument(this.doc);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a String-based field to a document.
|
||||||
|
*
|
||||||
|
* @param Type of field.
|
||||||
|
* @param Name of field.
|
||||||
|
* @param Value of field.
|
||||||
|
*/
|
||||||
private void addFieldToDoc(String type, String field, String value)
|
private void addFieldToDoc(String type, String field, String value)
|
||||||
{
|
{
|
||||||
if (value == null)
|
if (value == null)
|
||||||
value = EMPTY_STRING;
|
value = StringUtils.EMPTY_STRING;
|
||||||
if (type.equalsIgnoreCase(SearchConfiguration.TEXT_FIELD_TYPE))
|
if (SearchConfiguration.KEYWORD_FIELD_TYPE.equalsIgnoreCase(type))
|
||||||
doc.add(Field.Text(field, value));
|
|
||||||
else if (type.equalsIgnoreCase(SearchConfiguration.KEYWORD_FIELD_TYPE))
|
|
||||||
doc.add(Field.Keyword(field, value));
|
doc.add(Field.Keyword(field, value));
|
||||||
else if (type.equalsIgnoreCase(SearchConfiguration.UNINDEXED_FIELD_TYPE))
|
else if (SearchConfiguration.UNINDEXED_FIELD_TYPE.equalsIgnoreCase(type))
|
||||||
doc.add(Field.UnIndexed(field, value));
|
doc.add(Field.UnIndexed(field, value));
|
||||||
else if (type.equalsIgnoreCase(SearchConfiguration.UNSTORED_FIELD_TYPE))
|
else if (SearchConfiguration.UNSTORED_FIELD_TYPE.equalsIgnoreCase(type))
|
||||||
doc.add(Field.UnStored(field, value));
|
doc.add(Field.UnStored(field, value));
|
||||||
|
else
|
||||||
|
doc.add(Field.Text(field, value));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a Reader-based field to a document.
|
||||||
|
*
|
||||||
|
* @param Name of field.
|
||||||
|
* @param Reader.
|
||||||
|
*/
|
||||||
|
private void addFieldToDoc(String field, Reader reader)
|
||||||
|
{
|
||||||
|
doc.add(Field.Text(field, reader));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds documents to the IndexWriter.
|
||||||
|
*/
|
||||||
|
private void addToWriter() throws IOException
|
||||||
|
{
|
||||||
|
if (parentEncapsulation)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < documents.size(); i++)
|
||||||
|
{
|
||||||
|
Document d = (Document) documents.get(i);
|
||||||
|
for (Enumeration e = d.fields(); e.hasMoreElements();)
|
||||||
|
{
|
||||||
|
Field f = (Field) e.nextElement();
|
||||||
|
String fieldName = f.name();
|
||||||
|
if (!fieldName.equals(DataSource.CONTAINER_IDENTIFIER)
|
||||||
|
&& !fieldName.equals(DataSource.OBJECT_CLASS)
|
||||||
|
&& !fieldName.equals(DataSource.OBJECT_IDENTIFIER))
|
||||||
|
{
|
||||||
|
doc.add(f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
writer.addDocument(doc);
|
||||||
|
for (int i = 0; i < documents.size(); i++)
|
||||||
|
{
|
||||||
|
writer.addDocument((Document) documents.get(i));
|
||||||
|
}
|
||||||
|
//cat.debug((documents.size() + 1) + " documents added.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -55,9 +55,12 @@ package search;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.document.DateField;
|
import org.apache.lucene.document.DateField;
|
||||||
import org.apache.lucene.document.Field;
|
import search.contenthandler.FileContentHandler;
|
||||||
|
import search.contenthandler.FileContentHandlerFactory;
|
||||||
|
import search.util.IOUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.Reader;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -70,23 +73,38 @@ import java.util.Map;
|
||||||
*/
|
*/
|
||||||
public class FSDataSource extends AbstractDataSource
|
public class FSDataSource extends AbstractDataSource
|
||||||
{
|
{
|
||||||
private File targetDirectory;
|
public static final String FILE_PATH_FIELD = "filePath";
|
||||||
|
public static final String FILE_NAME_FIELD = "fileName";
|
||||||
|
public static final String FILE_SIZE_FIELD = "fileSize";
|
||||||
|
public static final String FILE_FORMAT_FIELD = "fileFormat";
|
||||||
|
public static final String FILE_CONTENTS_FIELD = "fileContents";
|
||||||
|
public static final String FILE_LAST_MODIFIED_DATE_FIELD = "fileLastModifiedDate";
|
||||||
|
|
||||||
public FSDataSource(SearchConfiguration config)
|
private File targetFileOrDir;
|
||||||
|
|
||||||
|
public FSDataSource(String targetFileOrDirStr)
|
||||||
{
|
{
|
||||||
super(config);
|
this(new File(targetFileOrDirStr));
|
||||||
}
|
}
|
||||||
|
|
||||||
public List getData()
|
public FSDataSource(File targetFileOrDir)
|
||||||
{
|
{
|
||||||
List returnData = new ArrayList();
|
setTargetDirectory(targetFileOrDir);
|
||||||
loadDataFromFiles(targetDirectory, returnData);
|
}
|
||||||
|
|
||||||
|
public Map[] getData()
|
||||||
|
{
|
||||||
|
Map[] returnData = null;
|
||||||
|
List temp = new ArrayList();
|
||||||
|
loadDataFromFiles(targetFileOrDir, temp);
|
||||||
|
returnData = new Map[temp.size()];
|
||||||
|
returnData = (Map[]) temp.toArray(returnData);
|
||||||
return returnData;
|
return returnData;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setTargetDirectory(File targetDirectory)
|
public void setTargetDirectory(File targetFileOrDir)
|
||||||
{
|
{
|
||||||
this.targetDirectory = targetDirectory;
|
this.targetFileOrDir = targetFileOrDir;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void loadDataFromFiles(File f, List list)
|
private void loadDataFromFiles(File f, List list)
|
||||||
|
@ -102,8 +120,40 @@ public class FSDataSource extends AbstractDataSource
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
Map dataMap = new HashMap();
|
Map dataMap = new HashMap();
|
||||||
dataMap.put("filePath", f.getPath());
|
dataMap.put(FILE_PATH_FIELD, f.getPath());
|
||||||
|
dataMap.put(FILE_NAME_FIELD, f.getName());
|
||||||
|
dataMap.put(FILE_LAST_MODIFIED_DATE_FIELD,
|
||||||
|
DateField.timeToString(f.lastModified()));
|
||||||
|
dataMap.put(FILE_SIZE_FIELD, String.valueOf(f.length()));
|
||||||
|
dataMap.put(FILE_FORMAT_FIELD,
|
||||||
|
IOUtils.getFileExtension(f));
|
||||||
|
addFileContents(f, dataMap);
|
||||||
list.add(dataMap);
|
list.add(dataMap);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void addFileContents(File targetFile, Map dataMap)
|
||||||
|
{
|
||||||
|
FileContentHandler cHandler =
|
||||||
|
FileContentHandlerFactory.getContentHandler(targetFile);
|
||||||
|
if (cHandler != null)
|
||||||
|
{
|
||||||
|
if (cHandler.fileContentIsReadable())
|
||||||
|
{
|
||||||
|
Reader r = cHandler.getReader();
|
||||||
|
if (r != null)
|
||||||
|
{
|
||||||
|
dataMap.put(FILE_CONTENTS_FIELD, r);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cHandler.containsNestedData())
|
||||||
|
{
|
||||||
|
dataMap.put(NESTED_DATASOURCE, cHandler.getNestedDataSource());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//cat.warn("ContentHandler not found for " + contentFile.getName());
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -59,7 +59,7 @@ import org.jdom.Document;
|
||||||
import org.jdom.Element;
|
import org.jdom.Element;
|
||||||
import org.jdom.input.SAXBuilder;
|
import org.jdom.input.SAXBuilder;
|
||||||
import search.util.DataUnformatFilter;
|
import search.util.DataUnformatFilter;
|
||||||
import search.contenthandler.ContentHandlerFactory;
|
import search.contenthandler.FileContentHandlerFactory;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -158,7 +158,7 @@ public class SearchConfiguration
|
||||||
{
|
{
|
||||||
if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
|
if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
|
||||||
{
|
{
|
||||||
contentHandlers.put(ContentHandlerFactory.DEFAULT_HANDLER_KEY
|
contentHandlers.put(FileContentHandlerFactory.DEFAULT_HANDLER_KEY
|
||||||
, generateObject(handlers[i]));
|
, generateObject(handlers[i]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache POI" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -53,80 +53,75 @@ package search;
|
||||||
* information on the Apache Software Foundation, please see
|
* information on the Apache Software Foundation, please see
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
import org.apache.log4j.Category;
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.log4j.Category;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import search.contenthandler.ContentHandlerFactory;
|
import search.contenthandler.FileContentHandlerFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Entry point for search engine indexing.
|
* Entry point for search engine indexing.
|
||||||
* <p>
|
* <p>
|
||||||
* SearchIndexer is responsible for creating the IndexWriter {@see org.apache.lucene.index.IndexWriter}
|
* SearchIndexer is responsible for creating the IndexWriter
|
||||||
* and passing it to DocumentHandlers {@link DocumentHandler} to index individual documents.
|
* {@see org.apache.lucene.index.IndexWriter} and passing it to
|
||||||
|
* DocumentHandlers {@link DocumentHandler} to index individual documents.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
|
||||||
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
|
|
||||||
*/
|
*/
|
||||||
public class SearchIndexer
|
public class SearchIndexer
|
||||||
{
|
{
|
||||||
private static Category cat = Category.getInstance(SearchIndexer.class);
|
private static Category cat = Category.getInstance(SearchIndexer.class);
|
||||||
|
private IndexWriter fsWriter;
|
||||||
private IndexWriter writer;
|
private SearchConfiguration config;
|
||||||
private DataSource source;
|
|
||||||
private int indexedDocuments = 0;
|
private int indexedDocuments = 0;
|
||||||
|
|
||||||
public SearchIndexer() throws IOException
|
public SearchIndexer() throws IOException
|
||||||
{
|
{
|
||||||
writer = new IndexWriter("/usr/local/lucene/index",
|
Analyzer a = new StandardAnalyzer();
|
||||||
new StandardAnalyzer(), true);
|
String indexDirectory = "/usr/path/to/index";
|
||||||
|
fsWriter = new IndexWriter(indexDirectory, a, true);
|
||||||
|
fsWriter.maxFieldLength = 1000000;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void index() throws IOException, Exception
|
/**
|
||||||
|
* Indexes documents.
|
||||||
|
*/
|
||||||
|
public synchronized void index() throws IOException, Exception
|
||||||
{
|
{
|
||||||
cat.debug("Initiating indexing...");
|
cat.debug("Initiating search engine indexing...");
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
init();
|
loadConfig();
|
||||||
List dataMapList = source.getData();
|
fsWriter.optimize();
|
||||||
for (int i = 0; i < dataMapList.size(); i++)
|
fsWriter.close();
|
||||||
{
|
long stop = System.currentTimeMillis();
|
||||||
Map map = (Map) dataMapList.get(i);
|
cat.debug("Indexing took " + (stop - start) + " milliseconds");
|
||||||
DocumentHandler docHandler = new DocumentHandler(writer);
|
|
||||||
try
|
|
||||||
{
|
|
||||||
docHandler.process(map);
|
|
||||||
++indexedDocuments;
|
|
||||||
}
|
|
||||||
catch (IOException ioe)
|
|
||||||
{
|
|
||||||
cat.error("Error encountered indexing:" + ioe.getMessage(),
|
|
||||||
ioe);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
writer.optimize();
|
|
||||||
writer.close();
|
|
||||||
|
|
||||||
cat.debug(indexedDocuments + " documents were indexed.");
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setSource(DataSource source)
|
|
||||||
{
|
|
||||||
this.source = source;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void init()
|
|
||||||
{
|
|
||||||
ContentHandlerFactory.setContentHandlers(source.getConfig().getContentHandlers());
|
|
||||||
DocumentHandler.setCustomFields(source.getConfig().getCustomFields());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getIndexedDocuments()
|
public int getIndexedDocuments()
|
||||||
{
|
{
|
||||||
return this.indexedDocuments;
|
return this.indexedDocuments;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void loadConfig() throws IllegalConfigurationException
|
||||||
|
{
|
||||||
|
config = new SearchConfiguration("/path/to/config");
|
||||||
|
FileContentHandlerFactory.setHandlerRegistry(config.getContentHandlers());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void indexDataSource(DataSource source, Map customFields)
|
||||||
|
throws Exception
|
||||||
|
{
|
||||||
|
Map[] data = source.getData();
|
||||||
|
// here's a good place to spawn a couple of threads for indexing
|
||||||
|
for (int i = 0; i < data.length; i++)
|
||||||
|
{
|
||||||
|
DocumentHandler docHandler =
|
||||||
|
new DocumentHandler(data[i], customFields, fsWriter);
|
||||||
|
docHandler.process();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -54,33 +54,35 @@ package search.contenthandler;
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.document.Document;
|
import java.io.Reader;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A content handler determines how to index a file's contents.
|
* A content handler determines how to index a file's contents.
|
||||||
*
|
|
||||||
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
|
|
||||||
*/
|
*/
|
||||||
public interface FileContentHandler
|
public interface FileContentHandler
|
||||||
{
|
{
|
||||||
/**
|
/**
|
||||||
* Perform filetype-specific actions to index the file's contents and
|
* Do the file contents of this file have any meaning? Should
|
||||||
* add it to the {@link org.apache.lucene.document.Document} object.
|
* its contents be indexed?
|
||||||
*/
|
*/
|
||||||
public void parse(Document doc, File f);
|
public boolean fileContentIsReadable();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Is this a collection of files?
|
* Returns a reader for this file's contents.
|
||||||
*/
|
*/
|
||||||
public boolean isNested();
|
public Reader getReader();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the collection of files contained within the parent file.
|
* Does this file have nested data within?
|
||||||
*/
|
*/
|
||||||
public List getNestedData();
|
public boolean containsNestedData();
|
||||||
|
|
||||||
public Object clone();
|
/**
|
||||||
|
* Return the datasources contained within the parent file.
|
||||||
|
* This can be URLs contained within a HTML file, files
|
||||||
|
* within a ZIP file, basically anything represented by a
|
||||||
|
* DataSource.
|
||||||
|
*/
|
||||||
|
public List getNestedDataSource();
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -54,9 +54,8 @@ package search.contenthandler;
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.document.Document;
|
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.Reader;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -70,12 +69,20 @@ import java.util.List;
|
||||||
*/
|
*/
|
||||||
public abstract class FileContentHandlerAdapter implements FileContentHandler
|
public abstract class FileContentHandlerAdapter implements FileContentHandler
|
||||||
{
|
{
|
||||||
public void parse(Document doc, File f)
|
protected File file;
|
||||||
|
|
||||||
|
protected FileContentHandlerAdapter(File file)
|
||||||
{
|
{
|
||||||
|
this.file = file;
|
||||||
}
|
}
|
||||||
public List getNestedData()
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getNestedDataSource()
|
||||||
{
|
{
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
public abstract Object clone();
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -57,29 +57,123 @@ package search.contenthandler;
|
||||||
import org.apache.log4j.Category;
|
import org.apache.log4j.Category;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.io.File;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.lang.reflect.Constructor;
|
||||||
|
|
||||||
|
import search.util.IOUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory responsible for obtaining ContentHandlers.
|
* Factory responsible for obtaining ContentHandlers.
|
||||||
*
|
*
|
||||||
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
|
* @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
|
||||||
*/
|
*/
|
||||||
public abstract class ContentHandlerFactory
|
public abstract class FileContentHandlerFactory
|
||||||
{
|
{
|
||||||
public static final String DEFAULT_HANDLER_KEY = "DEFAULT";
|
public static final String DEFAULT_HANDLER_KEY = "DEFAULT";
|
||||||
static Category cat = Category.getInstance(ContentHandlerFactory.class.getName());
|
static Category cat = Category.getInstance(FileContentHandlerFactory.class.getName());
|
||||||
private static Map handlerCache = null;
|
private static Map handlerRegistry;
|
||||||
public static FileContentHandler getContentHandler(String extension)
|
|
||||||
|
public static FileContentHandler getContentHandler(File f)
|
||||||
{
|
{
|
||||||
if (handlerCache.containsKey(extension))
|
String extension = IOUtils.getFileExtension(f);
|
||||||
return (FileContentHandler) ((FileContentHandler) handlerCache.get(extension)).clone();
|
if (handlerRegistry.containsKey(extension))
|
||||||
else if (handlerCache.containsKey(DEFAULT_HANDLER_KEY))
|
{
|
||||||
return (FileContentHandler) ((FileContentHandler) handlerCache.get(DEFAULT_HANDLER_KEY)).clone();
|
String handlerClassname = (String) handlerRegistry.get(extension);
|
||||||
|
return (FileContentHandler) generateObject(handlerClassname,
|
||||||
|
new Class[]{File.class},
|
||||||
|
new Object[]{f});
|
||||||
|
}
|
||||||
|
else if (handlerRegistry.containsKey(DEFAULT_HANDLER_KEY))
|
||||||
|
{
|
||||||
|
String handlerClassname = (String) handlerRegistry.get(DEFAULT_HANDLER_KEY);
|
||||||
|
return (FileContentHandler) generateObject(handlerClassname);
|
||||||
|
}
|
||||||
else
|
else
|
||||||
|
{
|
||||||
return NullHandler.getInstance();
|
return NullHandler.getInstance();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void setContentHandlers(Map contentHandlers)
|
public static void setHandlerRegistry(Map handlerRegistry)
|
||||||
{
|
{
|
||||||
handlerCache = contentHandlers;
|
FileContentHandlerFactory.handlerRegistry = handlerRegistry;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility method to return an object based on its class name.
|
||||||
|
* The object needs to have a constructor which accepts no parameters.
|
||||||
|
*
|
||||||
|
* @param className Class name of object to be generated
|
||||||
|
* @return Object
|
||||||
|
*/
|
||||||
|
private static Object generateObject(String className)
|
||||||
|
{
|
||||||
|
Object o = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Class c = Class.forName(className);
|
||||||
|
o = c.newInstance();
|
||||||
|
}
|
||||||
|
catch (ClassNotFoundException cnfe)
|
||||||
|
{
|
||||||
|
cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
|
||||||
|
}
|
||||||
|
catch (InstantiationException ie)
|
||||||
|
{
|
||||||
|
cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
|
||||||
|
}
|
||||||
|
catch (IllegalAccessException iae)
|
||||||
|
{
|
||||||
|
cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
|
||||||
|
}
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility method to return an object based on its class name.
|
||||||
|
*
|
||||||
|
* @param type Class name of object to be generated
|
||||||
|
* @param clazz Class array of parameters.
|
||||||
|
* @param args Object array of arguments.
|
||||||
|
* @return Object
|
||||||
|
*/
|
||||||
|
private static Object generateObject(String className,
|
||||||
|
Class[] clazz,
|
||||||
|
Object[] args)
|
||||||
|
{
|
||||||
|
Object o = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Class c = Class.forName(className);
|
||||||
|
Constructor con = c.getConstructor(clazz);
|
||||||
|
if (con != null)
|
||||||
|
{
|
||||||
|
o = con.newInstance(args);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
throw new InstantiationException("Constructor with arguments:" + clazz.toString() + " non-existent.");
|
||||||
|
}
|
||||||
|
catch (ClassNotFoundException cnfe)
|
||||||
|
{
|
||||||
|
cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
|
||||||
|
}
|
||||||
|
catch (InstantiationException ie)
|
||||||
|
{
|
||||||
|
cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
|
||||||
|
}
|
||||||
|
catch (IllegalAccessException iae)
|
||||||
|
{
|
||||||
|
cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
|
||||||
|
}
|
||||||
|
catch (NoSuchMethodException nsme)
|
||||||
|
{
|
||||||
|
cat.error(nsme.getMessage() + " No method in class named '" + className + "'.", nsme);
|
||||||
|
}
|
||||||
|
catch (InvocationTargetException ite)
|
||||||
|
{
|
||||||
|
cat.error(ite.getMessage() + " in class named '" + className + "'.", ite);
|
||||||
|
}
|
||||||
|
return o;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -55,17 +55,14 @@ package search.contenthandler;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.log4j.Category;
|
import org.apache.log4j.Category;
|
||||||
import org.apache.lucene.document.DateField;
|
import search.DataSource;
|
||||||
import org.apache.lucene.document.Document;
|
import search.FSDataSource;
|
||||||
|
import search.util.IOUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.io.Reader;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import search.util.IOUtils;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handles GZip content.
|
* Handles GZip content.
|
||||||
|
@ -74,51 +71,60 @@ import search.util.IOUtils;
|
||||||
*/
|
*/
|
||||||
public class GZipHandler extends NestedFileContentHandlerAdapter
|
public class GZipHandler extends NestedFileContentHandlerAdapter
|
||||||
{
|
{
|
||||||
static Category cat = Category.getInstance(GZipHandler.class.getName());
|
private static Category cat = Category.getInstance(GZipHandler.class.getName());
|
||||||
|
|
||||||
public void parse(Document doc, File f)
|
public GZipHandler(File file)
|
||||||
{
|
{
|
||||||
if (!f.exists())
|
super(file);
|
||||||
return;
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getNestedDataSource()
|
||||||
|
{
|
||||||
|
if (!file.exists())
|
||||||
|
return null;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
File tempDir = new File(TEMP_FOLDER);
|
File tempDir = new File(TEMP_FOLDER);
|
||||||
tempDir.mkdirs();
|
tempDir.mkdirs();
|
||||||
tempDir.deleteOnExit();
|
tempDir.deleteOnExit();
|
||||||
String filename = f.getName();
|
String filename = file.getName();
|
||||||
File tempFile = new File(tempDir, filename.substring(0, filename.lastIndexOf(".")));
|
File tempFile = new File(tempDir, filename.substring(0, filename.lastIndexOf(".")));
|
||||||
tempFile.deleteOnExit();
|
tempFile.deleteOnExit();
|
||||||
IOUtils.extractGZip(f, tempFile);
|
IOUtils.extractGZip(file, tempFile);
|
||||||
indexGZipDirectory(tempDir, dataMapList);
|
indexGZipDirectory(tempDir);
|
||||||
}
|
}
|
||||||
catch (IOException ioe)
|
catch (IOException ioe)
|
||||||
{
|
{
|
||||||
cat.error("IOException ungzipping " + f.toString(), ioe);
|
cat.error("IOException ungzipping " + file.toString(), ioe);
|
||||||
}
|
}
|
||||||
|
return nestedDataSource;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// only one file, but let's just treat it like a directory anyway
|
// only one file, but let's just treat it like a directory anyway
|
||||||
private void indexGZipDirectory(File dir, List dataMapList)
|
private void indexGZipDirectory(File dir)
|
||||||
{
|
{
|
||||||
if (dir.isDirectory())
|
if (dir.isDirectory())
|
||||||
{
|
{
|
||||||
File[] dirContents = dir.listFiles();
|
File[] dirContents = dir.listFiles();
|
||||||
for (int i = 0; i < dirContents.length; i++)
|
for (int i = 0; i < dirContents.length; i++)
|
||||||
{
|
{
|
||||||
indexGZipDirectory(dirContents[i], dataMapList);
|
indexGZipDirectory(dirContents[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (dir.isFile())
|
else if (dir.isFile())
|
||||||
{
|
{
|
||||||
// here create new DataMap for the gzip entry
|
DataSource ds = new FSDataSource(dir);
|
||||||
Map dataMap = new HashMap();
|
nestedDataSource.add(nestedDataSource);
|
||||||
dataMap.put("filePath", dir.toString());
|
|
||||||
dataMapList.add(dataMap);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object clone()
|
|
||||||
{
|
|
||||||
return new GZipHandler();
|
|
||||||
}
|
|
||||||
}
|
}
|
|
@ -26,12 +26,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -76,17 +76,15 @@ public abstract class NestedFileContentHandlerAdapter
|
||||||
protected final String TEMP_FOLDER = "/usr/temp" + '/'
|
protected final String TEMP_FOLDER = "/usr/temp" + '/'
|
||||||
+ Math.random() + '/';
|
+ Math.random() + '/';
|
||||||
|
|
||||||
protected List dataMapList = new ArrayList();
|
protected List nestedDataSource;
|
||||||
|
|
||||||
public abstract void parse(Document doc, File f);
|
public NestedFileContentHandlerAdapter(File file)
|
||||||
|
{
|
||||||
|
super(file);
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isNested()
|
public boolean containsNestedData()
|
||||||
{
|
{
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List getNestedData()
|
|
||||||
{
|
|
||||||
return this.dataMapList;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
package search.contenthandler;
|
package search.contenthandler;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
/* ====================================================================
|
/* ====================================================================
|
||||||
* The Apache Software License, Version 1.1
|
* The Apache Software License, Version 1.1
|
||||||
*
|
*
|
||||||
|
@ -26,12 +29,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -61,19 +64,29 @@ package search.contenthandler;
|
||||||
*/
|
*/
|
||||||
public class NullHandler extends FileContentHandlerAdapter
|
public class NullHandler extends FileContentHandlerAdapter
|
||||||
{
|
{
|
||||||
static NullHandler singleton = new NullHandler();
|
private static NullHandler singleton = new NullHandler(null);
|
||||||
|
|
||||||
public static FileContentHandler getInstance()
|
public static FileContentHandler getInstance()
|
||||||
{
|
{
|
||||||
return singleton;
|
return singleton;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object clone()
|
private NullHandler(File file)
|
||||||
{
|
{
|
||||||
return this;
|
super(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isNested()
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean containsNestedData()
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -54,17 +54,16 @@ package search.contenthandler;
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import search.util.IOUtils;
|
|
||||||
import org.apache.log4j.Category;
|
import org.apache.log4j.Category;
|
||||||
import org.apache.lucene.document.DateField;
|
import search.DataSource;
|
||||||
import org.apache.lucene.document.Document;
|
import search.FSDataSource;
|
||||||
|
import search.util.IOUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handles Tar files.
|
* Handles Tar files.
|
||||||
|
@ -75,44 +74,58 @@ public class TARHandler extends NestedFileContentHandlerAdapter
|
||||||
{
|
{
|
||||||
static Category cat = Category.getInstance(TARHandler.class.getName());
|
static Category cat = Category.getInstance(TARHandler.class.getName());
|
||||||
|
|
||||||
public void parse(Document doc, File f)
|
public TARHandler(File file)
|
||||||
{
|
{
|
||||||
if (!f.exists())
|
super(file);
|
||||||
return;
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getNestedDataSource()
|
||||||
|
{
|
||||||
|
if (!file.exists())
|
||||||
|
return null;
|
||||||
|
if (nestedDataSource == null)
|
||||||
|
{
|
||||||
|
nestedDataSource = new ArrayList();
|
||||||
|
}
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
File tempDir = new File(TEMP_FOLDER);
|
File tempDir = new File(TEMP_FOLDER);
|
||||||
tempDir.deleteOnExit();
|
tempDir.deleteOnExit();
|
||||||
IOUtils.extractTar(f, tempDir);
|
IOUtils.extractTar(file, tempDir);
|
||||||
indexTarDirectory(tempDir, dataMapList);
|
indexTarDirectory(tempDir);
|
||||||
}
|
}
|
||||||
catch (IOException ioe)
|
catch (IOException ioe)
|
||||||
{
|
{
|
||||||
cat.error(ioe.getMessage(), ioe);
|
cat.error(ioe.getMessage(), ioe);
|
||||||
}
|
}
|
||||||
|
return nestedDataSource;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void indexTarDirectory(File dir, List dataMapList)
|
private void indexTarDirectory(File dir)
|
||||||
{
|
{
|
||||||
if (dir.isDirectory())
|
if (dir.isDirectory())
|
||||||
{
|
{
|
||||||
File[] dirContents = dir.listFiles();
|
File[] dirContents = dir.listFiles();
|
||||||
for (int i = 0; i < dirContents.length; i++)
|
for (int i = 0; i < dirContents.length; i++)
|
||||||
{
|
{
|
||||||
indexTarDirectory(dirContents[i], dataMapList);
|
indexTarDirectory(dirContents[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (dir.isFile())
|
else if (dir.isFile())
|
||||||
{
|
{
|
||||||
// here create new DataMap for the tarred file
|
// here create new DataMap for the tarred file
|
||||||
Map dataMap = new HashMap();
|
DataSource ds = new FSDataSource(dir);
|
||||||
dataMap.put("filePath", dir.toString());
|
nestedDataSource.add(nestedDataSource);
|
||||||
dataMapList.add(dataMap);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object clone()
|
|
||||||
{
|
|
||||||
return new TARHandler();
|
|
||||||
}
|
|
||||||
}
|
}
|
|
@ -26,12 +26,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -71,37 +71,37 @@ public class TextHandler extends FileContentHandlerAdapter
|
||||||
{
|
{
|
||||||
static Category cat = Category.getInstance(TextHandler.class.getName());
|
static Category cat = Category.getInstance(TextHandler.class.getName());
|
||||||
|
|
||||||
public void parse(Document doc, File f)
|
public TextHandler(File file)
|
||||||
{
|
{
|
||||||
if (!f.exists())
|
super(file);
|
||||||
{
|
|
||||||
cat.error(f.toString() + " doesn't exist! Failing silently...");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
doc.add(Field.Text("fileContents", getReader(f)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isNested()
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
if (!file.exists())
|
||||||
|
{
|
||||||
|
cat.error(file.toString() + " doesn't exist! Failing silently...");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return getReader(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean containsNestedData()
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
private Reader getReader(File f)
|
private Reader getReader(File f)
|
||||||
{
|
{
|
||||||
Reader reader = null;
|
Reader reader = null;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
BufferedReader br = new BufferedReader(new FileReader(f));
|
reader = new FileReader(f);
|
||||||
String s = null;
|
|
||||||
StringBuffer strbf = new StringBuffer();
|
|
||||||
while ((s = br.readLine()) != null)
|
|
||||||
{
|
|
||||||
if (s.trim().length() > 0)
|
|
||||||
{
|
|
||||||
strbf.append(StringUtils.removeUnreadableCharacters(s));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
reader = new StringReader(strbf.toString());
|
|
||||||
}
|
}
|
||||||
catch (FileNotFoundException nfe)
|
catch (FileNotFoundException nfe)
|
||||||
{
|
{
|
||||||
|
@ -113,9 +113,4 @@ public class TextHandler extends FileContentHandlerAdapter
|
||||||
}
|
}
|
||||||
return reader;
|
return reader;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Object clone()
|
|
||||||
{
|
|
||||||
return new TextHandler();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,12 +26,12 @@ package search.contenthandler;
|
||||||
* if and wherever such third-party acknowledgments normally appear.
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
*
|
*
|
||||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
* "Apache Turbine" must not be used to endorse or promote products
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
* derived from this software without prior written permission. For
|
* derived from this software without prior written permission. For
|
||||||
* written permission, please contact apache@apache.org.
|
* written permission, please contact apache@apache.org.
|
||||||
*
|
*
|
||||||
* 5. Products derived from this software may not be called "Apache",
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
* "Apache Turbine", nor may "Apache" appear in their name, without
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
* prior written permission of the Apache Software Foundation.
|
* prior written permission of the Apache Software Foundation.
|
||||||
*
|
*
|
||||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
@ -54,15 +54,17 @@ package search.contenthandler;
|
||||||
* <http://www.apache.org/>.
|
* <http://www.apache.org/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import search.util.IOUtils;
|
|
||||||
import org.apache.log4j.Category;
|
import org.apache.log4j.Category;
|
||||||
import org.apache.lucene.document.Document;
|
import search.DataSource;
|
||||||
|
import search.FSDataSource;
|
||||||
|
import search.util.IOUtils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Enumeration;
|
import java.util.Enumeration;
|
||||||
import java.util.HashMap;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.zip.ZipEntry;
|
import java.util.zip.ZipEntry;
|
||||||
import java.util.zip.ZipException;
|
import java.util.zip.ZipException;
|
||||||
import java.util.zip.ZipFile;
|
import java.util.zip.ZipFile;
|
||||||
|
@ -74,15 +76,34 @@ import java.util.zip.ZipFile;
|
||||||
*/
|
*/
|
||||||
public class ZIPHandler extends NestedFileContentHandlerAdapter
|
public class ZIPHandler extends NestedFileContentHandlerAdapter
|
||||||
{
|
{
|
||||||
static Category cat = Category.getInstance(ZIPHandler.class.getName());
|
private static Category cat = Category.getInstance(ZIPHandler.class);
|
||||||
|
|
||||||
public void parse(Document doc, File f)
|
public ZIPHandler(File file)
|
||||||
{
|
{
|
||||||
if (!f.exists())
|
super(file);
|
||||||
return;
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getNestedDataSource()
|
||||||
|
{
|
||||||
|
if (!file.exists())
|
||||||
|
return null;
|
||||||
|
if (nestedDataSource == null)
|
||||||
|
{
|
||||||
|
nestedDataSource = new ArrayList();
|
||||||
|
}
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
ZipFile zFile = new ZipFile(f);
|
ZipFile zFile = new ZipFile(file);
|
||||||
for (Enumeration e = zFile.entries(); e.hasMoreElements();)
|
for (Enumeration e = zFile.entries(); e.hasMoreElements();)
|
||||||
{
|
{
|
||||||
ZipEntry entry = (ZipEntry) e.nextElement();
|
ZipEntry entry = (ZipEntry) e.nextElement();
|
||||||
|
@ -92,9 +113,8 @@ public class ZIPHandler extends NestedFileContentHandlerAdapter
|
||||||
if (!entry.isDirectory())
|
if (!entry.isDirectory())
|
||||||
{
|
{
|
||||||
// create a new DataMap for each zip entry
|
// create a new DataMap for each zip entry
|
||||||
Map dataMap = new HashMap();
|
DataSource ds = new FSDataSource(TEMP_FOLDER + entryName);
|
||||||
dataMap.put("filePath", TEMP_FOLDER + entryName);
|
nestedDataSource.add(ds);
|
||||||
dataMapList.add(dataMap);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
zFile.close();
|
zFile.close();
|
||||||
|
@ -107,10 +127,6 @@ public class ZIPHandler extends NestedFileContentHandlerAdapter
|
||||||
{
|
{
|
||||||
cat.error("IOException parsing zip:" + ioe.getMessage(), ioe);
|
cat.error("IOException parsing zip:" + ioe.getMessage(), ioe);
|
||||||
}
|
}
|
||||||
}
|
return nestedDataSource;
|
||||||
|
|
||||||
public Object clone()
|
|
||||||
{
|
|
||||||
return new ZIPHandler();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue