Importing the classes seems to have warped the whitespace. Here's my attempt to get things back to normal.

Introduced new datasource and contenthandler mechanism. The alteration is too extensive for individual changes to be enumerated. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150758 13f79535-47bb-0310-9956-ffa450edef68
2002-05-08 15:52:37 +00:00 · 2002-05-08 15:52:37 +00:00 · a716edd6d1
parent 5b5ea958c9
commit a716edd6d1
16 changed files with 662 additions and 354 deletions
--- a/sandbox/projects/appex/src/java/search/AbstractDataSource.java
+++ b/sandbox/projects/appex/src/java/search/AbstractDataSource.java
@ -26,12 +26,12 @@ package search;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,22 +54,35 @@ package search;
 * <http://www.apache.org/>.
 */

+import java.util.Map;
+import java.util.Set;
+
 /**
 * Generic implementation of a datasource.
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
 */
 public abstract class AbstractDataSource implements DataSource
 {
-    protected SearchConfiguration config;
-
-    public AbstractDataSource(SearchConfiguration config)
+    protected AbstractDataSource()
    {
-        this.config = config;
    }

-    public SearchConfiguration getConfig()
+    protected AbstractDataSource(Map map)
    {
-        return this.config;
+        loadFields(map);
+    }
+
+    /**
+     * Fields to index.
+     */
+    protected String[] fields;
+
+    /**
+     * Convenience method to load the fields to index from the keys of a Map.
+     */
+    protected void loadFields(Map map)
+    {
+        Set fieldSet = map.keySet();
+        fields = new String[fieldSet.size()];
+        fieldSet.toArray(fields);
    }
 }
--- a/sandbox/projects/appex/src/java/search/DataSource.java
+++ b/sandbox/projects/appex/src/java/search/DataSource.java
@ -1,5 +1,4 @@
 package search;
-
 /* ====================================================================
 * The Apache Software License, Version 1.1
 *
@ -26,12 +25,12 @@ package search;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,28 +53,49 @@ package search;
 * <http://www.apache.org/>.
 */

-import java.util.List;
+import java.util.Map;

 /**
 * A datasource is any source of data (filesystem, database, URL, etc)
 * which is indexed by SearchIndexer.
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
 */
 public interface DataSource
 {
+    /**
+     * Key in the map (located in the array returned by getData)
+     * to represent the class name of the object being indexed.
+     */
    public static final String OBJECT_CLASS = "objectClass";
-    public static final String OBJECT_IDENTIFIER = "objectid";

    /**
-     * Retrieve a list of Maps. Each map represents the
+     * Key in the map (located in the array returned by getData)
+     * to represent the uuid of the object being indexed.
+     */
+    public static final String OBJECT_IDENTIFIER = "objectId";
+
+    /**
+     * The key in the map (located in the array returned by getData)
+     * to represent nested datasources.
+     */
+    public static final String NESTED_DATASOURCE = "nestedDataSource";
+
+    /**
+     * Key in the map (located in the array returned by getData)
+     * to represent the id of the datasource's container. Applies to
+     * nested datasources.
+     */
+    public static final String CONTAINER_IDENTIFIER = "containerId";
+
+    /**
+     * Key in the map to represent the class name of the Search Result
+     * object for this datasource (if any).
+     */
+    public static final String SEARCH_RESULT_CLASSNAME = "resultClassname";
+
+    /**
+     * Retrieve an array of Maps. Each map represents
     * a document to be indexed. The key:value pair of the map
-     * is the data of the document.
+     * is the metadata of the document.
     */
-    public List getData() throws Exception;
-
-    /**
-     * Obtain the SearchConfiguration object used to configure the datasource.
-     */
-    public SearchConfiguration getConfig();
+    public Map[] getData() throws Exception;
 }
--- a/sandbox/projects/appex/src/java/search/DocumentHandler.java
+++ b/sandbox/projects/appex/src/java/search/DocumentHandler.java
@ -26,12 +26,12 @@ package search;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -55,177 +55,263 @@ package search;
 */

 import org.apache.log4j.Category;
-import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
+import search.util.StringUtils;

-import java.io.File;
 import java.io.IOException;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import search.util.IOUtils;
-import search.contenthandler.FileContentHandler;
-import search.contenthandler.ContentHandlerFactory;
+import java.io.Reader;
+import java.util.*;

 /**
 * <p>
 * A document is the atomic unit used for indexing purposes. It consists of
- * metadata as well as its file contents. File contents are handled by {@link FileContentHandler}.
+ * metadata as well as its file contents. File contents are handled by
+ * {@link ContentHandler}.
 * </p>
 * <p>
 * DocumentHandler creates the {@link org.apache.lucene.document.Document},
- * adds the standard fields to it, delegates to {@link FileContentHandler} to handle
- * file contents, then adds to the {@link org.apache.lucene.index.IndexWriter}.
+ * adds fields to it, delegates to {@link ContentHandler} to handle
+ * file contents.
 * </p>
- * <p>
- * The standard fields are:<br>
- * <ul>
- * <li>filePath : Full filesystem path to the document
- * <li>fileName : File name of the document
- * <li>fileLastModifiedDate : Date the file was last modified
- * <li>fileSize : Size of the file in bytes
- * <li>fileFormat : Extension of the file {@see com.marketingbright.core.util.IOUtils#getFileExtension}
- * </ul>
- * </p>
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
 */
 public class DocumentHandler
 {
-    public static final String[] STANDARD_SEARCH_FIELDS =
-            {"filePath", "fileName", "fileLastModifiedDate", "fileSize", "fileFormat"};
-    private static Category cat = Category.getInstance(DocumentHandler.class.getName());
-    private static Map customFields;
-    private static final String EMPTY_STRING = "";
+    /**
+     * Field to retrieve all documents.
+     */
+    public static final String ALL_DOCUMENTS_FIELD = "AllDocuments";

+    private static Category cat = Category.getInstance(DocumentHandler.class);
+
+    private static boolean isDebugEnabled = cat.isDebugEnabled();
+
+    /**
+     * Should parent documents include data of their children?
+     */
+    private static boolean parentEncapsulation = false;
    /**
     * Document object this DocumentHandler is handling.
     */
    private Document doc;

    /**
-     * Parent Document (null if none).
+     * Map of metadata for this document. Contains the field:value pair
+     * to be added to the document.
     */
-    private Document parentDoc;
+    private Map metadata;

    /**
-     * IndexWriter to add this document to.
+     * Map of fields. Contains field:type_of_field pair.
+     */
+    private Map customFields;
+
+    /**
+     * IndexWriter.
     */
    private IndexWriter writer;

-    public static void setCustomFields(Map aCustomFields)
-    {
-        customFields = aCustomFields;
-    }
+    /**
+     * A collection of documents to be added to the writer.
+     */
+    private List documents = new ArrayList();

-    public DocumentHandler(IndexWriter writer)
+    /**
+     * Ctor.
+     *
+     * @param metadata Map of metadata for this document.
+     * @param customFields Map of fields.
+     * @param writer IndexWriter.
+     */
+    public DocumentHandler(Map metadata,
+                           Map customFields,
+                           IndexWriter writer)
    {
+        this.metadata = metadata;
+        this.customFields = customFields;
        this.writer = writer;
-        doc = new Document();
    }

-    public DocumentHandler(IndexWriter writer, Document parentDoc)
+    /**
+     * Handles the actual processing of the document.
+     */
+    public void process() throws IOException, Exception
    {
-        this(writer);
-        this.parentDoc = parentDoc;
-    }
-
-    public void process(Map metadata) throws IOException
+        String objectid = (String) metadata.get(DataSource.OBJECT_IDENTIFIER);
+        if (objectid == null)
+            return;
+        doc = createDocument();
+        addMapToDoc(metadata);
+        addNestedDataSource(metadata);
+        doc.add(Field.Text(ALL_DOCUMENTS_FIELD, ALL_DOCUMENTS_FIELD));
+        //documents.add(doc);
+        if (writer != null)
        {
-        File contentFile = new File((String) metadata.get("filePath"));
-
-        // add the standard fields
-        doc.add(Field.Keyword("filePath", contentFile.toString()));
-        doc.add(Field.Text("fileName", contentFile.getName()));
-        doc.add(Field.Keyword("fileLastModifiedDate", DateField.timeToString(contentFile.lastModified())));
-        doc.add(Field.Keyword("fileSize", String.valueOf(contentFile.length())));
-        doc.add(Field.Text("fileFormat", IOUtils.getFileExtension(contentFile)));
-
-        // check if this is a document from datasource where
-        // custom fields need to be added
-        if (parentDoc == null)
-        {
-            // add the custom fields
-            for (Iterator it = customFields.keySet().iterator(); it.hasNext();)
-            {
-                String field = (String) it.next();
-                String value = (String) metadata.get(field);
-                String type = (String) customFields.get(field);
-                addFieldToDoc(type, field, value);
-            }
-            // Add OBJECT_CLASS_FIELD and OBJECT_IDENTIFIER
-            // to populate the result templates with the proper
-            // objects
-            doc.add(Field.UnIndexed(DataSource.OBJECT_CLASS,
-                                    (String) metadata.get(DataSource.OBJECT_CLASS)));
-            doc.add(Field.Text(DataSource.OBJECT_IDENTIFIER,
-                               (String) metadata.get(DataSource.OBJECT_IDENTIFIER)));
-        }
-        else
-        {
-            for (Iterator it = customFields.keySet().iterator(); it.hasNext();)
-            {
-                String field = (String) it.next();
-                String value = parentDoc.get(field);
-                String type = (String) customFields.get(field);
-                addFieldToDoc(type, field, value);
-            }
-            // Add OBJECT_CLASS_FIELD and OBJECT_IDENTIFIER
-            // to populate the result templates with the proper
-            // objects
-            doc.add(Field.UnIndexed(DataSource.OBJECT_CLASS,
-                                    parentDoc.get(DataSource.OBJECT_CLASS)));
-            doc.add(Field.Text(DataSource.OBJECT_IDENTIFIER,
-                               parentDoc.get(DataSource.OBJECT_IDENTIFIER)));
-        }
-        if (!metadata.containsKey("fileContents"))
-        {
-            String extension = IOUtils.getFileExtension(contentFile);
-            FileContentHandler cHandler = ContentHandlerFactory.getContentHandler(extension);
-            if (cHandler != null)
-            {
-                cHandler.parse(doc, contentFile);
-                if (cHandler.isNested())
-                {
-                    List nestedData = cHandler.getNestedData();
-                    cat.debug("Nested data list size:" + nestedData.size());
-                    for (int i = 0; i < nestedData.size(); i++)
-                    {
-                        Map dataMap = (Map) nestedData.get(i);
-                        DocumentHandler handler = new DocumentHandler(writer, doc);
-                        handler.process(dataMap);
-                    }
-                }
-            }
-            else
-            {
-                cat.warn("FileContentHandler not found for " + contentFile.getName());
-            }
-        }
-        else
-            doc.add(Field.Text("fileContents", (String) metadata.get("fileContents")));
            addToWriter();
        }
-
-    public void addToWriter() throws IOException
+        else
        {
-        writer.addDocument(this.doc);
+            documents.add(doc);
+        }
    }

+    private List getDocuments()
+    {
+        return documents;
+    }
+
+    private Document createDocument()
+    {
+        return new Document();
+    }
+
+    /**
+     * Add the contents of a Map to a document.
+     *
+     * @param map Map to add.
+     */
+    private void addMapToDoc(Map map)
+    {
+        for (Iterator it = map.keySet().iterator(); it.hasNext();)
+        {
+            String field = (String) it.next();
+            Object value = map.get(field);
+            if (value instanceof String)
+            {
+                String type = null;
+                if (customFields != null)
+                {
+                    type = (String) customFields.get(field);
+                }
+                addFieldToDoc(type, field, (String) value);
+            }
+            else if (value instanceof Reader)
+            {
+                addFieldToDoc(field, (Reader) value);
+            }
+        }
+    }
+
+    /**
+     * Add nested datasources.
+     *
+     * @param map Map which contains the nested datasources.
+     */
+    private void addNestedDataSource(Map map) throws Exception
+    {
+        Object o = map.get(DataSource.NESTED_DATASOURCE);
+        if (o == null)
+            return;
+        if (o instanceof List)
+        {
+            List nestedDataSource = (List) o;
+            for (int i = 0; i < nestedDataSource.size(); i++)
+            {
+                DataSource ds = (DataSource) nestedDataSource.get(i);
+                addDataSource(ds);
+            }
+        }
+        else if (o instanceof DataSource)
+        {
+            DataSource ds = (DataSource) o;
+            addDataSource(ds);
+        }
+    }
+
+    /**
+     * A datasource is a collection of data maps to be indexed. Maps with an
+     * object identifier become separate documents; others go to addMapToDoc.
+     *
+     * @param ds Datasource to add.
+     */
+    private void addDataSource(DataSource ds) throws Exception
+    {
+        Map[] data = ds.getData();
+        for (int i = 0; i < data.length; i++)
+        {
+            Map map = data[i];
+            if (map.containsKey(DataSource.OBJECT_IDENTIFIER))
+            {
+                /**
+                 * Create a new document because child datasources may need
+                 * to be retrieved independently of parent doc.
+                 */
+                DocumentHandler docHandler = new DocumentHandler(map, null, null);
+                docHandler.process();
+                documents.addAll(docHandler.getDocuments());
+            }
+            else
+            {
+                addMapToDoc(map);
+                /**
+                 * Add nested datasources of this datasource's data
+                 */
+                addNestedDataSource(map);
+            }
+        }
+    }
+
+    /**
+     * Adds a String-based field to a document.
+     *
+     * @param type Type of field.
+     * @param field Name of field.
+     * @param value Value of field.
+     */
    private void addFieldToDoc(String type, String field, String value)
    {
        if (value == null)
-            value = EMPTY_STRING;
-        if (type.equalsIgnoreCase(SearchConfiguration.TEXT_FIELD_TYPE))
-            doc.add(Field.Text(field, value));
-        else if (type.equalsIgnoreCase(SearchConfiguration.KEYWORD_FIELD_TYPE))
+            value = StringUtils.EMPTY_STRING;
+        if (SearchConfiguration.KEYWORD_FIELD_TYPE.equalsIgnoreCase(type))
            doc.add(Field.Keyword(field, value));
-        else if (type.equalsIgnoreCase(SearchConfiguration.UNINDEXED_FIELD_TYPE))
+        else if (SearchConfiguration.UNINDEXED_FIELD_TYPE.equalsIgnoreCase(type))
            doc.add(Field.UnIndexed(field, value));
-        else if (type.equalsIgnoreCase(SearchConfiguration.UNSTORED_FIELD_TYPE))
+        else if (SearchConfiguration.UNSTORED_FIELD_TYPE.equalsIgnoreCase(type))
            doc.add(Field.UnStored(field, value));
+        else
+            doc.add(Field.Text(field, value));
+    }
+
+    /**
+     * Adds a Reader-based field to a document.
+     *
+     * @param field Name of field.
+     * @param reader Reader.
+     */
+    private void addFieldToDoc(String field, Reader reader)
+    {
+        doc.add(Field.Text(field, reader));
+    }
+
+    /**
+     * Adds documents to the IndexWriter.
+     */
+    private void addToWriter() throws IOException
+    {
+        if (parentEncapsulation)
+        {
+            for (int i = 0; i < documents.size(); i++)
+            {
+                Document d = (Document) documents.get(i);
+                for (Enumeration e = d.fields(); e.hasMoreElements();)
+                {
+                    Field f = (Field) e.nextElement();
+                    String fieldName = f.name();
+                    if (!fieldName.equals(DataSource.CONTAINER_IDENTIFIER)
+                            && !fieldName.equals(DataSource.OBJECT_CLASS)
+                            && !fieldName.equals(DataSource.OBJECT_IDENTIFIER))
+                    {
+                        doc.add(f);
+                    }
+                }
+            }
+        }
+        writer.addDocument(doc);
+        for (int i = 0; i < documents.size(); i++)
+        {
+            writer.addDocument((Document) documents.get(i));
+        }
+        //cat.debug((documents.size() + 1) + " documents added.");
    }
 }
--- a/sandbox/projects/appex/src/java/search/FSDataSource.java
+++ b/sandbox/projects/appex/src/java/search/FSDataSource.java
@ -26,12 +26,12 @@ package search;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -55,9 +55,12 @@ package search;
 */

 import org.apache.lucene.document.DateField;
-import org.apache.lucene.document.Field;
+import search.contenthandler.FileContentHandler;
+import search.contenthandler.FileContentHandlerFactory;
+import search.util.IOUtils;

 import java.io.File;
+import java.io.Reader;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@ -70,23 +73,38 @@ import java.util.Map;
 */
 public class FSDataSource extends AbstractDataSource
 {
-    private File targetDirectory;
+    public static final String FILE_PATH_FIELD = "filePath";
+    public static final String FILE_NAME_FIELD = "fileName";
+    public static final String FILE_SIZE_FIELD = "fileSize";
+    public static final String FILE_FORMAT_FIELD = "fileFormat";
+    public static final String FILE_CONTENTS_FIELD = "fileContents";
+    public static final String FILE_LAST_MODIFIED_DATE_FIELD = "fileLastModifiedDate";

-    public FSDataSource(SearchConfiguration config)
+    private File targetFileOrDir;
+
+    public FSDataSource(String targetFileOrDirStr)
    {
-        super(config);
+        this(new File(targetFileOrDirStr));
    }

-    public List getData()
+    public FSDataSource(File targetFileOrDir)
    {
-        List returnData = new ArrayList();
-        loadDataFromFiles(targetDirectory, returnData);
+        setTargetDirectory(targetFileOrDir);
+    }
+
+    public Map[] getData()
+    {
+        Map[] returnData = null;
+        List temp = new ArrayList();
+        loadDataFromFiles(targetFileOrDir, temp);
+        returnData = new Map[temp.size()];
+        returnData = (Map[]) temp.toArray(returnData);
        return returnData;
    }

-    public void setTargetDirectory(File targetDirectory)
+    public void setTargetDirectory(File targetFileOrDir)
    {
-        this.targetDirectory = targetDirectory;
+        this.targetFileOrDir = targetFileOrDir;
    }

    private void loadDataFromFiles(File f, List list)
@ -102,8 +120,40 @@ public class FSDataSource extends AbstractDataSource
        else
        {
            Map dataMap = new HashMap();
-            dataMap.put("filePath", f.getPath());
+            dataMap.put(FILE_PATH_FIELD, f.getPath());
+            dataMap.put(FILE_NAME_FIELD, f.getName());
+            dataMap.put(FILE_LAST_MODIFIED_DATE_FIELD,
+                        DateField.timeToString(f.lastModified()));
+            dataMap.put(FILE_SIZE_FIELD, String.valueOf(f.length()));
+            dataMap.put(FILE_FORMAT_FIELD,
+                        IOUtils.getFileExtension(f));
+            addFileContents(f, dataMap);
            list.add(dataMap);
        }
    }
+
+    private void addFileContents(File targetFile, Map dataMap)
+    {
+        FileContentHandler cHandler =
+                FileContentHandlerFactory.getContentHandler(targetFile);
+        if (cHandler != null)
+        {
+            if (cHandler.fileContentIsReadable())
+            {
+                Reader r = cHandler.getReader();
+                if (r != null)
+                {
+                    dataMap.put(FILE_CONTENTS_FIELD, r);
+                }
+            }
+            if (cHandler.containsNestedData())
+            {
+                dataMap.put(NESTED_DATASOURCE, cHandler.getNestedDataSource());
+            }
+        }
+        else
+        {
+            //cat.warn("ContentHandler not found for " + contentFile.getName());
+        }
+    }
 }
--- a/sandbox/projects/appex/src/java/search/IllegalConfigurationException.java
+++ b/sandbox/projects/appex/src/java/search/IllegalConfigurationException.java
@ -26,12 +26,12 @@ package search;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
--- a/sandbox/projects/appex/src/java/search/SearchConfiguration.java
+++ b/sandbox/projects/appex/src/java/search/SearchConfiguration.java
@ -26,12 +26,12 @@ package search;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -59,7 +59,7 @@ import org.jdom.Document;
 import org.jdom.Element;
 import org.jdom.input.SAXBuilder;
 import search.util.DataUnformatFilter;
-import search.contenthandler.ContentHandlerFactory;
+import search.contenthandler.FileContentHandlerFactory;

 import java.util.HashMap;
 import java.util.List;
@ -158,7 +158,7 @@ public class SearchConfiguration
        {
            if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
            {
-                contentHandlers.put(ContentHandlerFactory.DEFAULT_HANDLER_KEY
+                contentHandlers.put(FileContentHandlerFactory.DEFAULT_HANDLER_KEY
                                    , generateObject(handlers[i]));
            }
        }
--- a/sandbox/projects/appex/src/java/search/SearchIndexer.java
+++ b/sandbox/projects/appex/src/java/search/SearchIndexer.java
@ -26,12 +26,12 @@ package search;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -53,80 +53,75 @@ package search;
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */
-
+import org.apache.log4j.Category;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.index.IndexWriter;
-import org.apache.log4j.Category;

 import java.io.IOException;
 import java.util.List;
 import java.util.Map;

-import search.contenthandler.ContentHandlerFactory;
+import search.contenthandler.FileContentHandlerFactory;

 /**
 * Entry point for search engine indexing.
 * <p>
- * SearchIndexer is responsible for creating the IndexWriter {@see org.apache.lucene.index.IndexWriter}
- * and passing it to DocumentHandlers {@link DocumentHandler} to index individual documents.
+ * SearchIndexer is responsible for creating the IndexWriter
+ * {@link org.apache.lucene.index.IndexWriter} and passing it to
+ *  DocumentHandlers {@link DocumentHandler} to index individual documents.
 * </p>
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
 */
 public class SearchIndexer
 {
    private static Category cat = Category.getInstance(SearchIndexer.class);
-
-    private IndexWriter writer;
-    private DataSource source;
+    private IndexWriter fsWriter;
+    private SearchConfiguration config;
    private int indexedDocuments = 0;

    public SearchIndexer() throws IOException
    {
-        writer = new IndexWriter("/usr/local/lucene/index",
-                                 new StandardAnalyzer(), true);
+        Analyzer a = new StandardAnalyzer();
+        String indexDirectory = "/usr/path/to/index";
+        fsWriter = new IndexWriter(indexDirectory, a, true);
+        fsWriter.maxFieldLength = 1000000;
    }

-    public void index() throws IOException, Exception
+    /**
+     * Indexes documents.
+     */
+    public synchronized void index() throws IOException, Exception
    {
-        cat.debug("Initiating indexing...");
-
-        init();
-        List dataMapList = source.getData();
-        for (int i = 0; i < dataMapList.size(); i++)
-        {
-            Map map = (Map) dataMapList.get(i);
-            DocumentHandler docHandler = new DocumentHandler(writer);
-            try
-            {
-                docHandler.process(map);
-                ++indexedDocuments;
-            }
-            catch (IOException ioe)
-            {
-                cat.error("Error encountered indexing:" + ioe.getMessage(),
-                          ioe);
-            }
-        }
-        writer.optimize();
-        writer.close();
-
-        cat.debug(indexedDocuments + " documents were indexed.");
-    }
-
-    public void setSource(DataSource source)
-    {
-        this.source = source;
-    }
-
-    public void init()
-    {
-        ContentHandlerFactory.setContentHandlers(source.getConfig().getContentHandlers());
-        DocumentHandler.setCustomFields(source.getConfig().getCustomFields());
+        cat.debug("Initiating search engine indexing...");
+        long start = System.currentTimeMillis();
+        loadConfig();
+        fsWriter.optimize();
+        fsWriter.close();
+        long stop = System.currentTimeMillis();
+        cat.debug("Indexing took " + (stop - start) + " milliseconds");
    }

    public int getIndexedDocuments()
    {
        return this.indexedDocuments;
    }
+
+    private void loadConfig() throws IllegalConfigurationException
+    {
+        config = new SearchConfiguration("/path/to/config");
+        FileContentHandlerFactory.setHandlerRegistry(config.getContentHandlers());
+    }
+
+    private void indexDataSource(DataSource source, Map customFields)
+            throws Exception
+    {
+        Map[] data = source.getData();
+        // here's a good place to spawn a couple of threads for indexing
+        for (int i = 0; i < data.length; i++)
+        {
+            DocumentHandler docHandler =
+                    new DocumentHandler(data[i], customFields, fsWriter);
+            docHandler.process();
+        }
+    }
 }
--- a/sandbox/projects/appex/src/java/search/contenthandler/FileContentHandler.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/FileContentHandler.java
@ -26,12 +26,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,33 +54,35 @@ package search.contenthandler;
 * <http://www.apache.org/>.
 */

-import org.apache.lucene.document.Document;
-
-import java.io.File;
+import java.io.Reader;
 import java.util.List;

 /**
 * A content handler determines how to index a file's contents.
- *
- * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
 */
 public interface FileContentHandler
 {
    /**
-     * Perform filetype-specific actions to index the file's contents and
-     * add it to the {@link org.apache.lucene.document.Document} object.
+     * Do the file contents of this file have any meaning? Should
+     * its contents be indexed?
     */
-    public void parse(Document doc, File f);
+    public boolean fileContentIsReadable();

    /**
-     * Is this a collection of files?
+     * Returns a reader for this file's contents.
     */
-    public boolean isNested();
+    public Reader getReader();

    /**
-     * Return the collection of files contained within the parent file.
+     * Does this file have nested data within?
     */
-    public List getNestedData();
+    public boolean containsNestedData();

-    public Object clone();
+    /**
+     * Return the datasources contained within the parent file.
+     * This can be URLs contained within a HTML file, files
+     * within a ZIP file, basically anything represented by a
+     * DataSource.
+     */
+    public List getNestedDataSource();
 }
--- a/sandbox/projects/appex/src/java/search/contenthandler/FileContentHandlerAdapter.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/FileContentHandlerAdapter.java
@ -26,12 +26,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,9 +54,8 @@ package search.contenthandler;
 * <http://www.apache.org/>.
 */

-import org.apache.lucene.document.Document;
-
 import java.io.File;
+import java.io.Reader;
 import java.util.List;

 /**
@ -70,12 +69,20 @@ import java.util.List;
 */
 public abstract class FileContentHandlerAdapter implements FileContentHandler
 {
-    public void parse(Document doc, File f)
+    protected File file;
+
+    protected FileContentHandlerAdapter(File file)
    {
+        this.file = file;
    }
-    public List getNestedData()
+
+    public Reader getReader()
+    {
+        return null;
+    }
+
+    public List getNestedDataSource()
    {
        return null;
    }
-    public abstract Object clone();
 }
--- a/sandbox/projects/appex/src/java/search/contenthandler/FileContentHandlerFactory.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/FileContentHandlerFactory.java
@ -26,12 +26,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -57,29 +57,123 @@ package search.contenthandler;
 import org.apache.log4j.Category;

 import java.util.Map;
+import java.io.File;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Constructor;
+
+import search.util.IOUtils;

 /**
 * Factory responsible for obtaining ContentHandlers.
 *
 * @author <a href="mailto:kelvin@relevanz.com">Kelvin Tan</a>
 */
-public abstract class ContentHandlerFactory
+public abstract class FileContentHandlerFactory
 {
    public static final String DEFAULT_HANDLER_KEY = "DEFAULT";
-    static Category cat = Category.getInstance(ContentHandlerFactory.class.getName());
-    private static Map handlerCache = null;
-    public static FileContentHandler getContentHandler(String extension)
+    static Category cat = Category.getInstance(FileContentHandlerFactory.class.getName());
+    private static Map handlerRegistry;
+
+    /**
+     * Returns a content handler for the given file, selected by the file's
+     * extension. The registry maps an extension (or DEFAULT_HANDLER_KEY) to
+     * the class name of the handler to instantiate.
+     *
+     * @param f file for which a handler is wanted
+     * @return a handler built for f; the NullHandler singleton when neither
+     *         the extension nor the default key is registered; or null when
+     *         the registered class could not be instantiated (the failure
+     *         is logged by generateObject)
+     */
+    public static FileContentHandler getContentHandler(File f)
    {
-        if (handlerCache.containsKey(extension))
-            return (FileContentHandler) ((FileContentHandler) handlerCache.get(extension)).clone();
-        else if (handlerCache.containsKey(DEFAULT_HANDLER_KEY))
-            return (FileContentHandler) ((FileContentHandler) handlerCache.get(DEFAULT_HANDLER_KEY)).clone();
+        String extension = IOUtils.getFileExtension(f);
+        if (handlerRegistry.containsKey(extension))
+        {
+            String handlerClassname = (String) handlerRegistry.get(extension);
+            return (FileContentHandler) generateObject(handlerClassname,
+                                                                     new Class[]{File.class},
+                                                                     new Object[]{f});
+        }
+        else if (handlerRegistry.containsKey(DEFAULT_HANDLER_KEY))
+        {
+            String handlerClassname = (String) handlerRegistry.get(DEFAULT_HANDLER_KEY);
+            // FileContentHandlerAdapter only offers a (File) constructor, so
+            // the default handler has to be built exactly like an
+            // extension-specific one; a no-argument instantiation would fail.
+            return (FileContentHandler) generateObject(handlerClassname,
+                                                                     new Class[]{File.class},
+                                                                     new Object[]{f});
+        }
        else
+        {
            return NullHandler.getInstance();
        }
+    }

-    public static void setContentHandlers(Map contentHandlers)
+    public static void setHandlerRegistry(Map handlerRegistry)
    {
-        handlerCache = contentHandlers;
+        FileContentHandlerFactory.handlerRegistry = handlerRegistry;
+    }
+
+    /**
+     * Instantiates the named class through its no-argument constructor.
+     * Any failure is logged and reported to the caller as a null result.
+     *
+     * @param className  name of the class to load and instantiate
+     * @return the new instance, or null if the class could not be loaded
+     *         or instantiated
+     */
+    private static Object generateObject(String className)
+    {
+        try
+        {
+            return Class.forName(className).newInstance();
+        }
+        catch (ClassNotFoundException cnfe)
+        {
+            cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
+        }
+        catch (InstantiationException ie)
+        {
+            cat.error(ie.getMessage() + " Class named '" + className + "' could not be  instantiated.", ie);
+        }
+        catch (IllegalAccessException iae)
+        {
+            cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
+        }
+        // reached only after one of the failures above has been logged
+        return null;
+    }
+
+    /**
+     * Instantiates the named class through the public constructor whose
+     * parameter types match the given array. Any failure is logged and
+     * reported to the caller as a null result.
+     *
+     * @param className  Class name of object to be generated
+     * @param clazz Class array of constructor parameter types.
+     * @param args Object array of constructor arguments.
+     * @return the new instance, or null if the class could not be loaded,
+     *         has no matching constructor, or could not be instantiated
+     */
+    private static Object generateObject(String className,
+                                        Class[] clazz,
+                                        Object[] args)
+    {
+        Object o = null;
+        try
+        {
+            Class c = Class.forName(className);
+            // getConstructor never returns null: a missing constructor is
+            // signalled by NoSuchMethodException, which is handled below.
+            Constructor con = c.getConstructor(clazz);
+            o = con.newInstance(args);
+        }
+        catch (ClassNotFoundException cnfe)
+        {
+            cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
+        }
+        catch (InstantiationException ie)
+        {
+            cat.error(ie.getMessage() + " Class named '" + className + "' could not be  instantiated.", ie);
+        }
+        catch (IllegalAccessException iae)
+        {
+            cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
+        }
+        catch (NoSuchMethodException nsme)
+        {
+            cat.error(nsme.getMessage() + " No method in class named '" + className + "'.", nsme);
+        }
+        catch (InvocationTargetException ite)
+        {
+            // The wrapper's own message is normally null; the exception
+            // thrown by the constructor is the useful part to report.
+            cat.error(String.valueOf(ite.getTargetException())
+                      + " in class named '" + className + "'.", ite);
+        }
+        return o;
+    }
    }
 }
--- a/sandbox/projects/appex/src/java/search/contenthandler/GZipHandler.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/GZipHandler.java
@ -26,12 +26,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -55,17 +55,14 @@ package search.contenthandler;
 */

 import org.apache.log4j.Category;
-import org.apache.lucene.document.DateField;
-import org.apache.lucene.document.Document;
+import search.DataSource;
+import search.FSDataSource;
+import search.util.IOUtils;

 import java.io.File;
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
+import java.io.Reader;
 import java.util.List;
-import java.util.Map;
-
-import search.util.IOUtils;

 /**
 * Handles GZip content.
@ -74,51 +71,60 @@ import search.util.IOUtils;
 */
 public class GZipHandler extends NestedFileContentHandlerAdapter
 {
-    static Category cat = Category.getInstance(GZipHandler.class.getName());
+    private static Category cat = Category.getInstance(GZipHandler.class.getName());

-    public void parse(Document doc, File f)
+    public GZipHandler(File file)
    {
-        if (!f.exists())
-            return;
+        super(file);
+    }
+
+    public Reader getReader()
+    {
+        return null;
+    }
+
+    /**
+     * Extracts the gzip file into a temporary folder and collects the
+     * extracted content as nested datasources.
+     *
+     * @return the list of nested datasources (possibly empty if the
+     *         extraction failed, which is logged), or null if the file
+     *         does not exist
+     */
+    public List getNestedDataSource()
+    {
+        if (!file.exists())
+            return null;
+        // The list is not created by the superclass, so it must exist before
+        // indexGZipDirectory adds to it. The class name is fully qualified
+        // because this file does not import java.util.ArrayList.
+        if (nestedDataSource == null)
+        {
+            nestedDataSource = new java.util.ArrayList();
+        }
        try
        {
            File tempDir = new File(TEMP_FOLDER);
            tempDir.mkdirs();
            tempDir.deleteOnExit();
-            String filename = f.getName();
+            String filename = file.getName();
            File tempFile = new File(tempDir, filename.substring(0, filename.lastIndexOf(".")));
            tempFile.deleteOnExit();
-            IOUtils.extractGZip(f, tempFile);
-            indexGZipDirectory(tempDir, dataMapList);
+            IOUtils.extractGZip(file, tempFile);
+            indexGZipDirectory(tempDir);
        }
        catch (IOException ioe)
        {
-            cat.error("IOException ungzipping " + f.toString(), ioe);
+            cat.error("IOException ungzipping " + file.toString(), ioe);
        }
+        return nestedDataSource;
+    }
+
+    public boolean fileContentIsReadable()
+    {
+        return false;
    }

    // only one file, but let's just treat it like a directory anyway
-    private void indexGZipDirectory(File dir, List dataMapList)
+    // Walks dir recursively and adds one FSDataSource to nestedDataSource
+    // for every regular file found.
+    private void indexGZipDirectory(File dir)
    {
        if (dir.isDirectory())
        {
            File[] dirContents = dir.listFiles();
            for (int i = 0; i < dirContents.length; i++)
            {
-                indexGZipDirectory(dirContents[i], dataMapList);
+                indexGZipDirectory(dirContents[i]);
            }
        }
        else if (dir.isFile())
        {
-            // here create new DataMap for the gzip entry
-            Map dataMap = new HashMap();
-            dataMap.put("filePath", dir.toString());
-            dataMapList.add(dataMap);
+            // add the datasource for the extracted file, not the list itself
+            DataSource ds = new FSDataSource(dir);
+            nestedDataSource.add(ds);
        }
    }
-
-    public Object clone()
-    {
-        return new GZipHandler();
-    }
 }
--- a/sandbox/projects/appex/src/java/search/contenthandler/NestedFileContentHandlerAdapter.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/NestedFileContentHandlerAdapter.java
@ -26,12 +26,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -76,17 +76,15 @@ public abstract class NestedFileContentHandlerAdapter
    protected final String TEMP_FOLDER = "/usr/temp" + '/'
            + Math.random() + '/';

-    protected List dataMapList = new ArrayList();
+    protected List nestedDataSource;

-    public abstract void parse(Document doc, File f);
+    public NestedFileContentHandlerAdapter(File file)
+    {
+        super(file);
+    }

-    public boolean isNested()
+    public boolean containsNestedData()
    {
        return true;
    }
-
-    public List getNestedData()
-    {
-        return this.dataMapList;
-    }
 }
--- a/sandbox/projects/appex/src/java/search/contenthandler/NullHandler.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/NullHandler.java
@ -1,5 +1,8 @@
 package search.contenthandler;

+import java.io.File;
+import java.io.Reader;
+
 /* ====================================================================
 * The Apache Software License, Version 1.1
 *
@ -26,12 +29,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -61,19 +64,29 @@ package search.contenthandler;
 */
 public class NullHandler extends FileContentHandlerAdapter
 {
-    static NullHandler singleton = new NullHandler();
+    private static NullHandler singleton = new NullHandler(null);

    public static FileContentHandler getInstance()
    {
        return singleton;
    }

-    public Object clone()
+    private NullHandler(File file)
    {
-        return this;
+        super(file);
    }

-    public boolean isNested()
+    public boolean fileContentIsReadable()
+    {
+        return false;
+    }
+
+    public Reader getReader()
+    {
+        return null;
+    }
+
+    public boolean containsNestedData()
    {
        return false;
    }
--- a/sandbox/projects/appex/src/java/search/contenthandler/TARHandler.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/TARHandler.java
@ -26,12 +26,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,17 +54,16 @@ package search.contenthandler;
 * <http://www.apache.org/>.
 */

-import search.util.IOUtils;
 import org.apache.log4j.Category;
-import org.apache.lucene.document.DateField;
-import org.apache.lucene.document.Document;
+import search.DataSource;
+import search.FSDataSource;
+import search.util.IOUtils;

 import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;

 /**
 * Handles Tar files.
@ -75,44 +74,58 @@ public class TARHandler extends NestedFileContentHandlerAdapter
 {
    static Category cat = Category.getInstance(TARHandler.class.getName());

-    public void parse(Document doc, File f)
+    public TARHandler(File file)
    {
-        if (!f.exists())
-            return;
+        super(file);
+    }
+
+    public Reader getReader()
+    {
+        return null;
+    }
+
+    public boolean fileContentIsReadable()
+    {
+        return false;
+    }
+
+    public List getNestedDataSource()
+    {
+        if (!file.exists())
+            return null;
+        if (nestedDataSource == null)
+        {
+            nestedDataSource = new ArrayList();
+        }
        try
        {
            File tempDir = new File(TEMP_FOLDER);
            tempDir.deleteOnExit();
-            IOUtils.extractTar(f, tempDir);
-            indexTarDirectory(tempDir, dataMapList);
+            IOUtils.extractTar(file, tempDir);
+            indexTarDirectory(tempDir);
        }
        catch (IOException ioe)
        {
            cat.error(ioe.getMessage(), ioe);
        }
+        return nestedDataSource;
    }

-    private void indexTarDirectory(File dir, List dataMapList)
+    // Walks dir recursively and adds one FSDataSource to nestedDataSource
+    // for every regular file found.
+    private void indexTarDirectory(File dir)
    {
        if (dir.isDirectory())
        {
            File[] dirContents = dir.listFiles();
            for (int i = 0; i < dirContents.length; i++)
            {
-                indexTarDirectory(dirContents[i], dataMapList);
+                indexTarDirectory(dirContents[i]);
            }
        }
        else if (dir.isFile())
        {
            // here create new DataMap for the tarred file
-            Map dataMap = new HashMap();
-            dataMap.put("filePath", dir.toString());
-            dataMapList.add(dataMap);
+            // add the datasource for the extracted file, not the list itself
+            DataSource ds = new FSDataSource(dir);
+            nestedDataSource.add(ds);
        }
    }
-
-    public Object clone()
-    {
-        return new TARHandler();
-    }
 }
--- a/sandbox/projects/appex/src/java/search/contenthandler/TextHandler.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/TextHandler.java
@ -26,12 +26,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -71,37 +71,37 @@ public class TextHandler extends FileContentHandlerAdapter
 {
    static Category cat = Category.getInstance(TextHandler.class.getName());

-    public void parse(Document doc, File f)
+    public TextHandler(File file)
    {
-        if (!f.exists())
-        {
-            cat.error(f.toString() + " doesn't exist! Failing silently...");
-            return;
-        }
-        doc.add(Field.Text("fileContents", getReader(f)));
+        super(file);
    }

-    public boolean isNested()
+    /**
+     * Returns a reader over the handled file's contents.
+     *
+     * @return the reader produced by getReader(File), or null (after
+     *         logging an error) when the file does not exist
+     */
+    public Reader getReader()
+    {
+        if (file.exists())
+        {
+            return getReader(file);
+        }
+        cat.error(file.toString() + " doesn't exist! Failing silently...");
+        return null;
+    }
+
+    public boolean containsNestedData()
    {
        return false;
    }

+    public boolean fileContentIsReadable()
+    {
+        return true;
+    }
+
    private Reader getReader(File f)
    {
        Reader reader = null;
        try
        {
-            BufferedReader br = new BufferedReader(new FileReader(f));
-            String s = null;
-            StringBuffer strbf = new StringBuffer();
-            while ((s = br.readLine()) != null)
-            {
-                if (s.trim().length() > 0)
-                {
-                    strbf.append(StringUtils.removeUnreadableCharacters(s));
-                }
-            }
-            reader = new StringReader(strbf.toString());
+            reader = new FileReader(f);
        }
        catch (FileNotFoundException nfe)
        {
@ -113,9 +113,4 @@ public class TextHandler extends FileContentHandlerAdapter
        }
        return reader;
    }
-
-    public Object clone()
-    {
-        return new TextHandler();
-    }
 }
--- a/sandbox/projects/appex/src/java/search/contenthandler/ZIPHandler.java
+++ b/sandbox/projects/appex/src/java/search/contenthandler/ZIPHandler.java
@ -26,12 +26,12 @@ package search.contenthandler;
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
- *    "Apache Turbine" must not be used to endorse or promote products
+ *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
- *    "Apache Turbine", nor may "Apache" appear in their name, without
+ *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
@ -54,15 +54,17 @@ package search.contenthandler;
 * <http://www.apache.org/>.
 */

-import search.util.IOUtils;
 import org.apache.log4j.Category;
-import org.apache.lucene.document.Document;
+import search.DataSource;
+import search.FSDataSource;
+import search.util.IOUtils;

 import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
 import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.List;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipException;
 import java.util.zip.ZipFile;
@ -74,15 +76,34 @@ import java.util.zip.ZipFile;
 */
 public class ZIPHandler extends NestedFileContentHandlerAdapter
 {
-    static Category cat = Category.getInstance(ZIPHandler.class.getName());
+    private static Category cat = Category.getInstance(ZIPHandler.class);

-    public void parse(Document doc, File f)
+    public ZIPHandler(File file)
    {
-        if (!f.exists())
-            return;
+        super(file);
+    }
+
+    public boolean fileContentIsReadable()
+    {
+        return false;
+    }
+
+    public Reader getReader()
+    {
+        return null;
+    }
+
+    public List getNestedDataSource()
+    {
+        if (!file.exists())
+            return null;
+        if (nestedDataSource == null)
+        {
+            nestedDataSource = new ArrayList();
+        }
        try
        {
-            ZipFile zFile = new ZipFile(f);
+            ZipFile zFile = new ZipFile(file);
            for (Enumeration e = zFile.entries(); e.hasMoreElements();)
            {
                ZipEntry entry = (ZipEntry) e.nextElement();
@ -92,9 +113,8 @@ public class ZIPHandler extends NestedFileContentHandlerAdapter
                if (!entry.isDirectory())
                {
                    // create a new DataMap for each zip entry
-                    Map dataMap = new HashMap();
-                    dataMap.put("filePath", TEMP_FOLDER + entryName);
-                    dataMapList.add(dataMap);
+                    DataSource ds = new FSDataSource(TEMP_FOLDER + entryName);
+                    nestedDataSource.add(ds);
                }
            }
            zFile.close();
@ -107,10 +127,6 @@ public class ZIPHandler extends NestedFileContentHandlerAdapter
        {
            cat.error("IOException parsing zip:" + ioe.getMessage(), ioe);
        }
-    }
-
-    public Object clone()
-    {
-        return new ZIPHandler();
+        return nestedDataSource;
    }
 }