Indyo is no longer supported or actively developed. Perhaps the code will live on, but in another form.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150882 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Kelvin Tan 2003-05-02 01:28:55 +00:00
parent 75cb1474ba
commit b7db8168cc
27 changed files with 0 additions and 3309 deletions

View File

@ -1,2 +0,0 @@
AnyObjectId[346504c6d4bd7232f0776a4a0f8a32333cedd93e] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[93e77a4a4476afff71a110dda1e96465cb7f25a9] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[be4a9176c35a7feeecf5b70edf070ecb5d13ac5d] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[ff9b90061b65c32122fcdde27bfe7f1e61fbd7bd] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[329aef393bece9d77eef16279910f6cd73113c39] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +0,0 @@
AnyObjectId[c1fa1d645474eee07f085a8ee29e38422f7614cf] was removed in git history.
Apache SVN contains full history.

View File

@ -1,19 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Indyo indexing configuration.
Each ContentHandler entry under Search/ContentHandlers associates a file
extension ("extension") with the fully-qualified handler class ("handler")
used for files of that type; "name" is a human-readable label.
-->
<Indyo>
<Search>
<ContentHandlers>
<ContentHandler name="Zip" extension="zip" handler="com.relevanz.indyo.contenthandler.ZIPHandler"/>
<ContentHandler name="Jar" extension="jar" handler="com.relevanz.indyo.contenthandler.ZIPHandler"/>
<ContentHandler name="GZip" extension="gz" handler="com.relevanz.indyo.contenthandler.GZipHandler"/>
<ContentHandler name="Tar" extension="tar" handler="com.relevanz.indyo.contenthandler.TARHandler"/>
<ContentHandler name="Htm" extension="htm" handler="com.relevanz.indyo.contenthandler.HTMLHandler"/>
<ContentHandler name="Html" extension="html" handler="com.relevanz.indyo.contenthandler.HTMLHandler"/>
<ContentHandler name="Text" extension="txt" handler="com.relevanz.indyo.contenthandler.TextHandler"/>
<ContentHandler name="MSWord" extension="doc" handler="com.relevanz.indyo.contenthandler.ReadableTextFilterHandler"/>
<ContentHandler name="MSPowerpoint" extension="ppt" handler="com.relevanz.indyo.contenthandler.ReadableTextFilterHandler"/>
<ContentHandler name="MSExcel" extension="xls" handler="com.relevanz.indyo.contenthandler.ReadableTextFilterHandler"/>
</ContentHandlers>
</Search>
</Indyo>

View File

@ -1,90 +0,0 @@
package com.relevanz.indyo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache POI" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Map;
import java.util.Set;
/**
* Generic implementation of an index datasource.
*
* @version $Id$
*/
public abstract class AbstractDataSource implements IndexDataSource
{
    /**
     * Names of the fields to index. Remains unset until
     * {@link #loadFields(Map)} has been invoked.
     */
    protected String[] fields;

    /**
     * Creates a datasource without loading any field names.
     */
    protected AbstractDataSource()
    {
    }

    /**
     * Creates a datasource and loads its field names from the given map.
     *
     * @param map map whose keys are the names of the fields to index
     */
    protected AbstractDataSource(Map map)
    {
        loadFields(map);
    }

    /**
     * Replaces {@link #fields} with the keys of the given map.
     *
     * @param map map whose keys are the names of the fields to index
     */
    protected void loadFields(Map map)
    {
        Set names = map.keySet();
        // Allocate first, then let the key set fill the array in place.
        fields = new String[names.size()];
        names.toArray(fields);
    }
}

View File

@ -1,332 +0,0 @@
package com.relevanz.indyo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import com.relevanz.indyo.util.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
/**
* <p>
* A document is the atomic unit used for indexing purposes. It consists of
* metadata as well as its file contents. File contents are handled by
* {@link ContentHandler}.
* </p>
* <p>
* DocumentHandler creates the {@link org.apache.lucene.document.Document},
* adds fields to it, delegates to {@link ContentHandler} to handle
* file contents.
* </p>
*
* @version $Id$
*/
public class DocumentHandler
{
    /**
     * Field to retrieve all documents. Every processed document receives a
     * text field with this name and this value.
     */
    public static final String ALL_DOCUMENTS_FIELD = "AllDocuments";

    private static final Logger log = Logger.getLogger(DocumentHandler.class);

    /**
     * Should parent documents include data of its children?
     */
    private static boolean parentEncapsulation = false;

    /**
     * Document object this DocumentHandler is handling.
     */
    private Document doc;

    /**
     * Map of metadata for this document. Contains the field:value pair
     * to be added to the document.
     */
    private Map metadata;

    /**
     * Map of fields. Contains field:type_of_field pair. May be null.
     */
    private Map customFields;

    /**
     * IndexWriter. When null, processed documents are only collected in
     * {@link #documents} and not written anywhere.
     */
    private IndexWriter writer;

    /**
     * A collection of documents to be added to the writer.
     */
    private List documents = new ArrayList();

    /**
     * Ctor.
     *
     * @param metadata map of metadata (field:value) for this document
     * @param customFields map of field:type_of_field pairs; may be null
     * @param writer writer to add documents to; may be null, in which case
     *        documents are only collected
     */
    public DocumentHandler(Map metadata,
                           Map customFields,
                           IndexWriter writer)
    {
        this.metadata = metadata;
        this.customFields = customFields;
        this.writer = writer;
    }

    /**
     * Handles the actual processing of the document: builds a document from
     * the metadata map and its nested datasources, then either adds it (and
     * any collected child documents) to the writer or, when there is no
     * writer, stores it in the internal document list.
     * <p>
     * Nothing happens when the metadata has no
     * {@link IndexDataSource#OBJECT_IDENTIFIER} entry.
     * </p>
     *
     * @throws IOException if adding to the writer fails
     * @throws Exception if a nested datasource fails to supply its data
     */
    public void process() throws IOException, Exception
    {
        String objectid = (String) metadata.get(IndexDataSource.OBJECT_IDENTIFIER);
        if (objectid == null)
        {
            // Maps without an object identifier are not indexed on their own.
            return;
        }
        doc = createDocument();
        addMapToDoc(metadata);
        addNestedDataSource(metadata);
        doc.add(Field.Text(ALL_DOCUMENTS_FIELD, ALL_DOCUMENTS_FIELD));
        if (writer != null)
        {
            addToWriter();
        }
        else
        {
            documents.add(doc);
        }
    }

    /**
     * @return the documents collected by this handler
     */
    private List getDocuments()
    {
        return documents;
    }

    /**
     * @return a new, empty document
     */
    private Document createDocument()
    {
        return new Document();
    }

    /**
     * Add the contents of a Map to the current document. String values are
     * added using the field type registered in the custom fields map (if
     * any); Reader values are added as reader-based text fields. Values of
     * any other type, including null, are skipped.
     *
     * @param map map of field name to value
     */
    private void addMapToDoc(Map map)
    {
        for (Iterator it = map.entrySet().iterator(); it.hasNext();)
        {
            Map.Entry entry = (Map.Entry) it.next();
            String field = (String) entry.getKey();
            Object value = entry.getValue();
            if (value instanceof String)
            {
                String type = null;
                if (customFields != null)
                {
                    type = (String) customFields.get(field);
                }
                addFieldToDoc(type, field, (String) value);
            }
            else if (value instanceof Reader)
            {
                addFieldToDoc(field, (Reader) value);
            }
        }
    }

    /**
     * Add nested datasources. The value stored under
     * {@link IndexDataSource#NESTED_DATASOURCE} may be a single
     * IndexDataSource, a List of them or an array of them; any other
     * non-null value is logged and ignored.
     *
     * @param map map which contains the nested datasources
     * @throws Exception if a nested datasource fails to supply its data
     */
    private void addNestedDataSource(Map map) throws Exception
    {
        Object o = map.get(IndexDataSource.NESTED_DATASOURCE);
        if (o == null)
        {
            return;
        }
        if (o instanceof IndexDataSource)
        {
            addDataSource((IndexDataSource) o);
        }
        else if (o instanceof List)
        {
            List nestedDataSource = (List) o;
            for (int i = 0, n = nestedDataSource.size(); i < n; i++)
            {
                addDataSource((IndexDataSource) nestedDataSource.get(i));
            }
        }
        else if (o instanceof IndexDataSource[])
        {
            IndexDataSource[] nestedDataSource = (IndexDataSource[]) o;
            for (int i = 0, n = nestedDataSource.length; i < n; i++)
            {
                addDataSource(nestedDataSource[i]);
            }
        }
        else
        {
            log.warn("Unknown object found as nested datasource:" + o);
        }
    }

    /**
     * Datasources are basically a collection of data maps to be indexed.
     * A map carrying an object identifier becomes a document of its own;
     * any other map is merged into the current document via addMapToDoc,
     * together with its own nested datasources.
     * <p>
     * A null datasource, a null data array and null entries in the array
     * are ignored.
     * </p>
     *
     * @param ds datasource to add
     * @throws Exception if the datasource fails to supply its data
     */
    private void addDataSource(IndexDataSource ds) throws Exception
    {
        if (ds == null)
        {
            return;
        }
        Map[] data = ds.getData();
        if (data == null)
        {
            return;
        }
        for (int i = 0; i < data.length; i++)
        {
            Map map = data[i];
            if (map == null)
            {
                continue;
            }
            if (map.containsKey(IndexDataSource.OBJECT_IDENTIFIER))
            {
                /*
                 * Create a new document because child datasources may need
                 * to be retrieved independently of parent doc. The child
                 * handler has no writer, so it only collects its documents.
                 */
                DocumentHandler docHandler = new DocumentHandler(map, null, null);
                docHandler.process();
                documents.addAll(docHandler.getDocuments());
            }
            else
            {
                addMapToDoc(map);
                // Add nested datasources of this datasource's data.
                addNestedDataSource(map);
            }
        }
    }

    /**
     * Adds a String-based field to the current document. The type is
     * compared case-insensitively against the keyword, unindexed and
     * unstored field types of SearchConfiguration; any other type,
     * including null, results in a text field.
     *
     * @param type type of field; may be null
     * @param field name of field
     * @param value value of field; null is replaced by the empty string
     */
    private void addFieldToDoc(String type, String field, String value)
    {
        if (value == null)
        {
            value = StringUtils.EMPTY_STRING;
        }
        if (SearchConfiguration.KEYWORD_FIELD_TYPE.equalsIgnoreCase(type))
        {
            doc.add(Field.Keyword(field, value));
        }
        else if (SearchConfiguration.UNINDEXED_FIELD_TYPE.equalsIgnoreCase(type))
        {
            doc.add(Field.UnIndexed(field, value));
        }
        else if (SearchConfiguration.UNSTORED_FIELD_TYPE.equalsIgnoreCase(type))
        {
            doc.add(Field.UnStored(field, value));
        }
        else
        {
            doc.add(Field.Text(field, value));
        }
    }

    /**
     * Adds a Reader-based text field to the current document.
     *
     * @param field name of field
     * @param reader reader supplying the field's content
     */
    private void addFieldToDoc(String field, Reader reader)
    {
        doc.add(Field.Text(field, reader));
    }

    /**
     * Adds documents to the IndexWriter: first the current document, then
     * every collected child document. When parent encapsulation is enabled,
     * the fields of each child document (except the container id, object
     * class and object id fields) are copied into the current document
     * beforehand.
     *
     * @throws IOException if the writer fails to add a document
     */
    private void addToWriter() throws IOException
    {
        if (parentEncapsulation)
        {
            for (int i = 0, n = documents.size(); i < n; i++)
            {
                Document d = (Document) documents.get(i);
                for (Enumeration e = d.fields(); e.hasMoreElements();)
                {
                    Field f = (Field) e.nextElement();
                    String fieldName = f.name();
                    if (!fieldName.equals(IndexDataSource.CONTAINER_IDENTIFIER)
                        && !fieldName.equals(IndexDataSource.OBJECT_CLASS)
                        && !fieldName.equals(IndexDataSource.OBJECT_IDENTIFIER))
                    {
                        doc.add(f);
                    }
                }
            }
        }
        writer.addDocument(doc);
        for (int i = 0, n = documents.size(); i < n; i++)
        {
            writer.addDocument((Document) documents.get(i));
        }
    }
}

View File

@ -1,160 +0,0 @@
package com.relevanz.indyo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.document.DateField;
import com.relevanz.indyo.contenthandler.FileContentHandler;
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
import com.relevanz.indyo.util.IOUtils;
import java.io.File;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* A filesystem-based datasource.
*
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
* @version $Id$
*/
public class FSDataSource extends AbstractDataSource
{
    /** Key for the file's path, as returned by File.getPath(). */
    public static final String FILE_PATH_FIELD = "filePath";
    /** Key for the file's name, as returned by File.getName(). */
    public static final String FILE_NAME_FIELD = "fileName";
    /** Key for the file's length in bytes, stored as a String. */
    public static final String FILE_SIZE_FIELD = "fileSize";
    /** Key for the file's extension, as reported by IOUtils.getFileExtension. */
    public static final String FILE_FORMAT_FIELD = "fileFormat";
    /** Key for the Reader over the file's contents (only set when available). */
    public static final String FILE_CONTENTS_FIELD = "fileContents";
    /** Key for the last-modified time, encoded with DateField.timeToString. */
    public static final String FILE_LAST_MODIFIED_DATE_FIELD = "fileLastModifiedDate";

    /** File or directory this datasource reads from. */
    private File targetFileOrDir;

    /**
     * Creates a datasource for the file or directory at the given path.
     *
     * @param targetFileOrDirStr path of the file or directory to index
     */
    public FSDataSource(String targetFileOrDirStr)
    {
        this(new File(targetFileOrDirStr));
    }

    /**
     * Creates a datasource for the given file or directory.
     *
     * @param targetFileOrDir file or directory to index
     */
    public FSDataSource(File targetFileOrDir)
    {
        setTargetDirectory(targetFileOrDir);
    }

    /**
     * Walks the target file or directory recursively and builds one data
     * map per non-directory entry found.
     *
     * @return one map per file; empty when nothing was found
     */
    public Map[] getData()
    {
        List collected = new ArrayList();
        loadDataFromFiles(targetFileOrDir, collected);
        return (Map[]) collected.toArray(new Map[collected.size()]);
    }

    /**
     * Sets the file or directory this datasource reads from.
     *
     * @param targetFileOrDir file or directory to index
     */
    public void setTargetDirectory(File targetFileOrDir)
    {
        this.targetFileOrDir = targetFileOrDir;
    }

    /**
     * Recursively collects data maps for the given file or directory.
     *
     * @param f file or directory to visit
     * @param list list receiving one map per file
     */
    private void loadDataFromFiles(File f, List list)
    {
        if (f.isDirectory())
        {
            File[] directoryTree = f.listFiles();
            if (directoryTree == null)
            {
                // listFiles() returns null when the directory cannot be
                // listed (e.g. I/O error); skip it instead of failing.
                return;
            }
            for (int i = 0; i < directoryTree.length; i++)
            {
                loadDataFromFiles(directoryTree[i], list);
            }
        }
        else
        {
            Map dataMap = new HashMap();
            dataMap.put(FILE_PATH_FIELD, f.getPath());
            dataMap.put(FILE_NAME_FIELD, f.getName());
            dataMap.put(FILE_LAST_MODIFIED_DATE_FIELD,
                        DateField.timeToString(f.lastModified()));
            dataMap.put(FILE_SIZE_FIELD, String.valueOf(f.length()));
            dataMap.put(FILE_FORMAT_FIELD,
                        IOUtils.getFileExtension(f));
            addFileContents(f, dataMap);
            list.add(dataMap);
        }
    }

    /**
     * Adds content-derived entries to a file's data map using the content
     * handler obtained from FileContentHandlerFactory: a Reader under
     * {@link #FILE_CONTENTS_FIELD} when the handler reports readable content
     * and supplies a non-null reader, and the handler's nested datasource
     * under NESTED_DATASOURCE when it reports nested data. Files for which
     * the factory returns no handler contribute metadata only.
     *
     * @param targetFile file whose contents are being added
     * @param dataMap data map of the file
     */
    private void addFileContents(File targetFile, Map dataMap)
    {
        FileContentHandler cHandler =
            FileContentHandlerFactory.getContentHandler(targetFile);
        if (cHandler == null)
        {
            return;
        }
        if (cHandler.fileContentIsReadable())
        {
            Reader r = cHandler.getReader();
            if (r != null)
            {
                dataMap.put(FILE_CONTENTS_FIELD, r);
            }
        }
        if (cHandler.containsNestedData())
        {
            dataMap.put(NESTED_DATASOURCE, cHandler.getNestedDataSource());
        }
    }
}

View File

@ -1,69 +0,0 @@
package com.relevanz.indyo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache POI" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* Thrown when loading SearchConfiguration.
*
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
* @version $Id$
*/
public class IllegalConfigurationException extends Exception
{
    /**
     * Creates an exception describing an invalid configuration.
     *
     * @param msg description of the configuration problem
     */
    public IllegalConfigurationException(String msg)
    {
        super(msg);
    }

    /**
     * Creates an exception describing an invalid configuration while
     * preserving the underlying failure.
     *
     * @param msg description of the configuration problem
     * @param cause failure that made the configuration unusable; may be null
     */
    public IllegalConfigurationException(String msg, Throwable cause)
    {
        super(msg, cause);
    }
}

View File

@ -1,103 +0,0 @@
package com.relevanz.indyo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache POI" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.util.Map;
/**
* A datasource is any source of data (filesystem, database, URL, etc)
* which is indexed by IndyoIndexer.
*
* @version $Id$
*/
public interface IndexDataSource
{
    /**
     * Map key (in the maps returned by getData) holding the class name
     * of the object being indexed.
     */
    String OBJECT_CLASS = "objectClass";

    /**
     * Map key (in the maps returned by getData) holding the uuid of the
     * object being indexed.
     */
    String OBJECT_IDENTIFIER = "objectId";

    /**
     * Map key (in the maps returned by getData) holding nested
     * datasources.
     */
    String NESTED_DATASOURCE = "nestedDataSource";

    /**
     * Map key (in the maps returned by getData) holding the id of the
     * datasource's container. Applies to nested datasources.
     */
    String CONTAINER_IDENTIFIER = "containerId";

    /**
     * Map key holding the class name of the Search Result object for
     * this datasource (if any).
     */
    String SEARCH_RESULT_CLASSNAME = "resultClassname";

    /**
     * Retrieves an array of Maps. Each map represents a document to be
     * indexed; its key:value pairs are the metadata of that document.
     *
     * @return the documents to index, one map per document
     * @throws Exception if the data cannot be retrieved
     */
    Map[] getData() throws Exception;
}

View File

@ -1,125 +0,0 @@
package com.relevanz.indyo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache POI" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
/**
* Entry point for search engine indexing.
* <p>
* IndyoIndexer is responsible for creating the
* {@link org.apache.lucene.index.IndexWriter} and passing it to
* {@link DocumentHandler}s to index individual documents.
* </p>
*
* @version $Id$
*/
public class IndyoIndexer
{
    private static final Logger log = Logger.getLogger(IndyoIndexer.class);

    /** Writer for the index; closed at the end of {@link #index}. */
    private IndexWriter fsWriter;

    /** Configuration loaded from the XML config file. */
    private SearchConfiguration config;

    /**
     * Loads the configuration and opens a writer on the index directory.
     *
     * @param indexDirectory directory of the index to write
     * @param configFile name of the XML configuration file
     * @throws IOException if the index writer cannot be opened
     * @throws IllegalConfigurationException if the configuration is invalid
     */
    public IndyoIndexer(String indexDirectory, String configFile)
        throws IOException, IllegalConfigurationException
    {
        // Load the configuration before touching the index: the writer is
        // opened with its create flag set, so a configuration failure must
        // not leave a freshly created index and an open writer behind.
        loadConfig(configFile);
        Analyzer a = new StandardAnalyzer();
        fsWriter = new IndexWriter(indexDirectory, a, true);
        fsWriter.maxFieldLength = 1000000;
    }

    /**
     * Indexes the documents of a datasource, then optimizes and closes the
     * index writer. The writer is closed even when indexing fails, so an
     * instance can serve a single call only.
     *
     * @param ds datasource supplying the documents to index
     * @throws IOException if writing to the index fails
     * @throws Exception if the datasource fails to supply its data
     */
    public synchronized void index(IndexDataSource ds) throws IOException, Exception
    {
        log.debug("Initiating search engine indexing...");
        long start = System.currentTimeMillis();
        boolean completed = false;
        try
        {
            // temporarily use an empty map whilst custom fields get implemented
            indexDataSource(ds, Collections.EMPTY_MAP);
            fsWriter.optimize();
            completed = true;
        }
        finally
        {
            closeWriter(completed);
        }
        long stop = System.currentTimeMillis();
        log.debug("Indexing took " + (stop - start) + " milliseconds");
    }

    /**
     * Closes the index writer. A close failure is propagated only when
     * indexing itself succeeded; otherwise it is logged so that it does not
     * mask the exception that aborted indexing.
     *
     * @param propagateFailure whether a close failure should be thrown
     * @throws IOException if closing fails and propagateFailure is true
     */
    private void closeWriter(boolean propagateFailure) throws IOException
    {
        try
        {
            fsWriter.close();
        }
        catch (IOException e)
        {
            if (propagateFailure)
            {
                throw e;
            }
            log.error("Error closing index writer:" + e.getMessage(), e);
        }
    }

    /**
     * Loads the search configuration and registers its content handlers
     * with FileContentHandlerFactory.
     *
     * @param configFile name of the XML configuration file
     * @throws IllegalConfigurationException if the configuration is invalid
     */
    private void loadConfig(String configFile) throws IllegalConfigurationException
    {
        config = new SearchConfiguration(configFile);
        FileContentHandlerFactory.setHandlerRegistry(config.getContentHandlers());
    }

    /**
     * Processes every data map of a datasource with its own DocumentHandler.
     *
     * @param source datasource supplying the data maps
     * @param customFields map of field:type_of_field pairs
     * @throws Exception if the datasource or a document handler fails
     */
    private void indexDataSource(IndexDataSource source, Map customFields)
        throws Exception
    {
        Map[] data = source.getData();
        // here's a good place to spawn a couple of threads for indexing
        for (int i = 0; i < data.length; i++)
        {
            DocumentHandler docHandler =
                new DocumentHandler(data[i], customFields, fsWriter);
            docHandler.process();
        }
    }
}

View File

@ -1,259 +0,0 @@
package com.relevanz.indyo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
import com.relevanz.indyo.util.DataUnformatFilter;
import org.apache.log4j.Category;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.input.SAXBuilder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
/**
 * Configures the indexing process using an XML file.
 * <p>
 * The file is parsed once, in the constructor. Content handlers are read from
 * the children of the <code>Search.ContentHandlers</code> element and custom
 * fields from the children of the <code>Search.Fields</code> element (both
 * paths are relative to the document's root element).
 * </p>
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public class SearchConfiguration
{
    /** Field type name "text", as used in the <code>type</code> attribute of a custom field. */
    public static final String TEXT_FIELD_TYPE = "text";

    /** Field type name "keyword". */
    public static final String KEYWORD_FIELD_TYPE = "keyword";

    /** Field type name "unindexed". */
    public static final String UNINDEXED_FIELD_TYPE = "unindexed";

    /** Field type name "unstored". */
    public static final String UNSTORED_FIELD_TYPE = "unstored";

    /**
     * Log4j logger.
     */
    static Logger log = Logger.getLogger(SearchConfiguration.class.getName());

    /**
     * Key in the config file to declare content handlers.
     */
    private static final String CONTENT_HANDLER_KEY = "Search.ContentHandlers";

    /**
     * Key in the config file to declare custom fields.
     */
    private static final String FIELD_KEY = "Search.Fields";

    /**
     * Map of content handlers, keyed by file extension. Values are the
     * objects returned by {@link #generateObject(String)}, which may be
     * <code>null</code> when a handler class could not be instantiated.
     */
    private Map contentHandlers = new HashMap();

    /**
     * Map of (non-standard) custom fields to index: field name to field type.
     */
    private Map customFields = new HashMap();

    /**
     * Document object which represents the xml configuration file.
     */
    private Document doc;

    /**
     * Creates a new SearchConfiguration by parsing the given file and loading
     * the content handlers and custom fields it declares.
     *
     * @param configFile Name of the xml configuration file.
     * @throws IllegalConfigurationException if the file cannot be parsed, or
     *         if the handlers or fields it declares are inconsistent.
     */
    public SearchConfiguration(String configFile) throws IllegalConfigurationException
    {
        try
        {
            SAXBuilder builder = new SAXBuilder();
            DataUnformatFilter format = new DataUnformatFilter();
            builder.setXMLFilter(format);
            doc = builder.build(configFile);
        }
        catch (Exception e)
        {
            log.error("Error creating XML parser:" + e.getMessage(), e);
            // Without a parsed document every later lookup would dereference
            // null, so report the failure through the declared exception
            // instead of continuing.
            throw new IllegalConfigurationException(
                    "Unable to read configuration file '" + configFile + "': "
                    + e.getMessage());
        }
        loadContentHandlers();
        loadCustomFields();
    }

    /**
     * @return the live map of content handlers keyed by extension.
     */
    public Map getContentHandlers()
    {
        return this.contentHandlers;
    }

    /**
     * @return the live map of custom field names to field types.
     */
    public Map getCustomFields()
    {
        return this.customFields;
    }

    /**
     * Loads the content handlers. Every child of the content handler element
     * contributes its <code>extension</code> and <code>handler</code>
     * attributes; a child whose <code>default</code> attribute is "true" is
     * additionally registered under
     * {@link FileContentHandlerFactory#DEFAULT_HANDLER_KEY}.
     *
     * @throws IllegalConfigurationException if the extension and handler
     *         lists differ in length, or a child has no handler attribute.
     */
    protected void loadContentHandlers() throws IllegalConfigurationException
    {
        String[] extensions = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "extension");
        String[] handlers = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "handler");
        if (extensions.length != handlers.length)
        {
            throw new IllegalConfigurationException(
                    "Illegal configuration of Search Content Handlers!");
        }
        for (int i = 0; i < extensions.length; i++)
        {
            if (handlers[i] == null)
            {
                // A missing attribute would otherwise reach Class.forName(null).
                throw new IllegalConfigurationException(
                        "Illegal configuration of Search Content Handlers!"
                        + " No handler declared for extension '" + extensions[i] + "'.");
            }
            contentHandlers.put(extensions[i], generateObject(handlers[i]));
        }
        String[] defaultExtension = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "default");
        for (int i = 0; i < defaultExtension.length; i++)
        {
            if ("true".equals(defaultExtension[i]))
            {
                contentHandlers.put(FileContentHandlerFactory.DEFAULT_HANDLER_KEY,
                        generateObject(handlers[i]));
            }
        }
    }

    /**
     * Loads the custom fields to index, mapping each child's
     * <code>name</code> attribute to its <code>type</code> attribute.
     *
     * @throws IllegalConfigurationException if the name and type lists
     *         differ in length.
     */
    protected void loadCustomFields() throws IllegalConfigurationException
    {
        String[] fields = getChildPropertyAttributeValues(FIELD_KEY, "name");
        String[] fieldtypes = getChildPropertyAttributeValues(FIELD_KEY, "type");
        if (fields.length != fieldtypes.length)
        {
            throw new IllegalConfigurationException(
                    "Illegal configuration of custom search fields!");
        }
        for (int i = 0; i < fields.length; i++)
        {
            customFields.put(fields[i], fieldtypes[i]);
        }
    }

    /**
     * Return attribute values for all child nodes.
     *
     * @param parent dotted path ("x.y.z") of the parent element, relative to
     *        the root element.
     * @param attributeName attribute to read from each child.
     * @return one entry per child, in document order; an entry is
     *         <code>null</code> when the child lacks the attribute. An empty
     *         array is returned when any element on the path is missing.
     */
    private String[] getChildPropertyAttributeValues(String parent,
                                                     String attributeName)
    {
        String[] nodeName = parseNodeName(parent);
        Element element = doc.getRootElement();
        for (int i = 0; i < nodeName.length; i++)
        {
            element = element.getChild(nodeName[i]);
            if (element == null)
            {
                return new String[]{};
            }
        }
        List children = element.getChildren();
        int childCount = children.size();
        String[] childrenAttributeValue = new String[childCount];
        for (int i = 0; i < childCount; i++)
        {
            childrenAttributeValue[i] =
                    ((Element) children.get(i)).getAttributeValue(attributeName);
        }
        return childrenAttributeValue;
    }

    /**
     * Node names are in the form "x.y.z". Returns a String array
     * representation of the node elements.
     */
    private String[] parseNodeName(String nodeName)
    {
        StringTokenizer st = new StringTokenizer(nodeName, ".");
        String[] nodeElements = new String[st.countTokens()];
        int i = 0;
        while (st.hasMoreTokens())
        {
            nodeElements[i] = st.nextToken();
            ++i;
        }
        return nodeElements;
    }

    /**
     * Utility method to return an object based on its class name.
     * The object needs to have a constructor which accepts no parameters.
     * Failures are logged rather than thrown.
     *
     * @param className Class name of object to be generated
     * @return the new instance, or <code>null</code> if the class could not
     *         be found, instantiated or accessed.
     */
    private static Object generateObject(String className)
    {
        Object o = null;
        try
        {
            Class c = Class.forName(className);
            o = c.newInstance();
        }
        catch (ClassNotFoundException cnfe)
        {
            log.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
        }
        catch (InstantiationException ie)
        {
            log.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
        }
        catch (IllegalAccessException iae)
        {
            log.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
        }
        return o;
    }
}

View File

@ -1,90 +0,0 @@
package com.relevanz.indyo.contenthandler;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.Reader;
import java.util.List;
/**
 * Contract for objects that decide how the contents of a file are indexed.
 *
 * @version $Id$
 */
public interface FileContentHandler
{
    /**
     * Tells whether the contents of the file carry any meaning, i.e.
     * whether they should be indexed.
     */
    boolean fileContentIsReadable();

    /**
     * Supplies a reader over the contents of the file.
     */
    Reader getReader();

    /**
     * Tells whether the file holds further data nested inside it.
     */
    boolean containsNestedData();

    /**
     * Supplies the datasources contained within the parent file, for
     * example the URLs found in an HTML file or the files inside a ZIP
     * archive: anything that can be represented by a DataSource.
     */
    List getNestedDataSource();
}

View File

@ -1,89 +0,0 @@
package com.relevanz.indyo.contenthandler;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.File;
import java.io.Reader;
import java.util.List;
/**
 * Skeleton FileContentHandler that supplies do-nothing defaults.
 * <p>
 * Implementations of the FileContentHandler interface are expected to
 * subclass either this class or {@link NestedFileContentHandlerAdapter}
 * and override only what they need.
 * </p>
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public abstract class FileContentHandlerAdapter implements FileContentHandler
{
    /** The file handed to the constructor. */
    protected File file;

    /**
     * Remembers the file this handler is responsible for.
     *
     * @param file file to handle
     */
    protected FileContentHandlerAdapter(File file)
    {
        this.file = file;
    }

    /**
     * Default: no nested datasources.
     *
     * @return always <code>null</code>
     */
    public List getNestedDataSource()
    {
        return null;
    }

    /**
     * Default: no reader.
     *
     * @return always <code>null</code>
     */
    public Reader getReader()
    {
        return null;
    }
}

View File

@ -1,180 +0,0 @@
package com.relevanz.indyo.contenthandler;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.log4j.Category;
import java.util.Map;
import java.io.File;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Constructor;
import com.relevanz.indyo.util.IOUtils;
/**
 * Factory responsible for obtaining ContentHandlers.
 * <p>
 * Handlers are looked up in a registry that maps a file extension (or
 * {@link #DEFAULT_HANDLER_KEY}) to the class name of the handler. The
 * registry must be supplied through {@link #setHandlerRegistry(Map)}.
 * </p>
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public abstract class FileContentHandlerFactory
{
    /** Registry key under which the fallback handler is stored. */
    public static final String DEFAULT_HANDLER_KEY = "DEFAULT";

    static Category cat = Category.getInstance(FileContentHandlerFactory.class.getName());

    /** Extension to handler class name; null until a registry is set. */
    private static Map handlerRegistry;

    /**
     * Returns a content handler for the given file, chosen by extension.
     * <p>
     * A handler registered for the extension is created through its
     * <code>(File)</code> constructor. Otherwise the default handler, if
     * registered, is created through its no-argument constructor.
     * NOTE(review): the default handler therefore never receives the file;
     * confirm that this is intended. When neither is registered, or no
     * registry has been set at all, the {@link NullHandler} is returned.
     * </p>
     *
     * @param f file to obtain a handler for
     * @return the handler, or <code>null</code> if the registered handler
     *         class could not be instantiated (the failure is logged).
     */
    public static FileContentHandler getContentHandler(File f)
    {
        Map registry = handlerRegistry;
        if (registry == null)
        {
            // An unset registry is treated like an empty one instead of
            // failing with a NullPointerException.
            cat.warn("No handler registry has been set. Using the null handler for " + f);
            return NullHandler.getInstance();
        }
        String extension = IOUtils.getFileExtension(f);
        if (registry.containsKey(extension))
        {
            String handlerClassname = (String) registry.get(extension);
            return (FileContentHandler) generateObject(handlerClassname,
                    new Class[]{File.class},
                    new Object[]{f});
        }
        else if (registry.containsKey(DEFAULT_HANDLER_KEY))
        {
            String handlerClassname = (String) registry.get(DEFAULT_HANDLER_KEY);
            return (FileContentHandler) generateObject(handlerClassname);
        }
        else
        {
            return NullHandler.getInstance();
        }
    }

    /**
     * Replaces the registry used by {@link #getContentHandler(File)}.
     *
     * @param handlerRegistry map whose values are read as handler class
     *        names (they are cast to String on lookup)
     */
    public static void setHandlerRegistry(Map handlerRegistry)
    {
        FileContentHandlerFactory.handlerRegistry = handlerRegistry;
    }

    /**
     * Utility method to return an object based on its class name.
     * The object needs to have a constructor which accepts no parameters.
     * Failures are logged rather than thrown.
     *
     * @param className Class name of object to be generated
     * @return the new instance, or <code>null</code> on failure
     */
    private static Object generateObject(String className)
    {
        Object o = null;
        try
        {
            Class c = Class.forName(className);
            o = c.newInstance();
        }
        catch (ClassNotFoundException cnfe)
        {
            cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
        }
        catch (InstantiationException ie)
        {
            cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
        }
        catch (IllegalAccessException iae)
        {
            cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
        }
        return o;
    }

    /**
     * Utility method to return an object based on its class name, using the
     * public constructor that matches the given parameter types.
     * Failures are logged rather than thrown.
     *
     * @param className Class name of object to be generated
     * @param clazz Class array of parameters.
     * @param args Object array of arguments.
     * @return the new instance, or <code>null</code> on failure
     */
    private static Object generateObject(String className,
                                         Class[] clazz,
                                         Object[] args)
    {
        Object o = null;
        try
        {
            Class c = Class.forName(className);
            // getConstructor never returns null; a missing constructor is
            // reported through NoSuchMethodException, handled below.
            Constructor con = c.getConstructor(clazz);
            o = con.newInstance(args);
        }
        catch (ClassNotFoundException cnfe)
        {
            cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
        }
        catch (InstantiationException ie)
        {
            cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
        }
        catch (IllegalAccessException iae)
        {
            cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
        }
        catch (NoSuchMethodException nsme)
        {
            cat.error(nsme.getMessage() + " No method in class named '" + className + "'.", nsme);
        }
        catch (InvocationTargetException ite)
        {
            cat.error(ite.getMessage() + " in class named '" + className + "'.", ite);
        }
        return o;
    }
}

View File

@ -1,131 +0,0 @@
package com.relevanz.indyo.contenthandler;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.log4j.Category;
import com.relevanz.indyo.IndexDataSource;
import com.relevanz.indyo.FSDataSource;
import com.relevanz.indyo.util.IOUtils;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
/**
 * Handles GZip content by extracting the archive into a temporary folder
 * and exposing the extracted file as a nested datasource.
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public class GZipHandler extends NestedFileContentHandlerAdapter
{
    private static Category cat = Category.getInstance(GZipHandler.class.getName());

    /**
     * @param file the gzip file to handle
     */
    public GZipHandler(File file)
    {
        super(file);
    }

    /**
     * @return always <code>null</code>; the archive itself is not read as text
     */
    public Reader getReader()
    {
        return null;
    }

    /**
     * Extracts the gzip file into the temporary folder and collects a
     * datasource for every file found there.
     *
     * @return <code>null</code> if the file does not exist; otherwise the
     *         list of nested datasources (left unchanged by this call if the
     *         extraction fails, in which case the error is logged).
     */
    public List getNestedDataSource()
    {
        if (!file.exists())
            return null;
        if (nestedDataSource == null)
        {
            // The adapter leaves the list unset. Fully qualified because this
            // file does not import ArrayList.
            nestedDataSource = new java.util.ArrayList();
        }
        try
        {
            File tempDir = new File(TEMP_FOLDER);
            tempDir.mkdirs();
            tempDir.deleteOnExit();
            String filename = file.getName();
            // Strip the last extension; names without one (or starting with
            // the only dot) are used unchanged instead of failing in substring.
            int dot = filename.lastIndexOf(".");
            String extractedName = dot > 0 ? filename.substring(0, dot) : filename;
            File tempFile = new File(tempDir, extractedName);
            tempFile.deleteOnExit();
            IOUtils.extractGZip(file, tempFile);
            indexGZipDirectory(tempDir);
        }
        catch (IOException ioe)
        {
            cat.error("IOException ungzipping " + file.toString(), ioe);
        }
        return nestedDataSource;
    }

    /**
     * @return always <code>false</code>
     */
    public boolean fileContentIsReadable()
    {
        return false;
    }

    /**
     * Recursively adds a datasource for every regular file below
     * <code>dir</code>. Only one file is expected, but it is treated like a
     * directory anyway.
     */
    private void indexGZipDirectory(File dir)
    {
        if (dir.isDirectory())
        {
            File[] dirContents = dir.listFiles();
            if (dirContents == null)
            {
                // listFiles() returns null when the directory cannot be read.
                return;
            }
            for (int i = 0; i < dirContents.length; i++)
            {
                indexGZipDirectory(dirContents[i]);
            }
        }
        else if (dir.isFile())
        {
            IndexDataSource ds = new FSDataSource(dir);
            // Add the datasource itself (the list used to be added to itself).
            nestedDataSource.add(ds);
        }
    }
}

View File

@ -1,91 +0,0 @@
package com.relevanz.indyo.contenthandler;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.document.Document;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
 * A no-op implementation to make FileContentHandler creation easier.
 * <p>
 * Classes which need to implement the FileContentHandler interface
 * and need to handle nested content (example: zip, tar, rar, etc) should
 * extend this class.
 * </p>
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public abstract class NestedFileContentHandlerAdapter
        extends FileContentHandlerAdapter
{
    /**
     * Per-instance scratch folder path, ending with a separator. It lives
     * below the platform temporary directory (system property
     * <code>java.io.tmpdir</code>) instead of a hard-coded location, and has
     * a random name. The folder itself is not created here.
     */
    protected final String TEMP_FOLDER =
            new File(System.getProperty("java.io.tmpdir"),
                    String.valueOf(Math.random())).getPath() + File.separator;

    /**
     * Datasources nested in the file. Not initialised here; subclasses
     * create and fill the list.
     */
    protected List nestedDataSource;

    /**
     * @param file the file containing nested data
     */
    public NestedFileContentHandlerAdapter(File file)
    {
        super(file);
    }

    /**
     * @return always <code>true</code>
     */
    public boolean containsNestedData()
    {
        return true;
    }
}

View File

@ -1,94 +0,0 @@
package com.relevanz.indyo.contenthandler;
import java.io.File;
import java.io.Reader;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
 * Content handler that does nothing: it reports no readable content,
 * no reader and no nested data. A single shared instance is handed out
 * through {@link #getInstance()}.
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public class NullHandler extends FileContentHandlerAdapter
{
    /** The one shared instance; it is not bound to any file. */
    private static final NullHandler INSTANCE = new NullHandler(null);

    /** Instances are only created internally. */
    private NullHandler(File file)
    {
        super(file);
    }

    /**
     * @return the shared do-nothing handler
     */
    public static FileContentHandler getInstance()
    {
        return INSTANCE;
    }

    /**
     * @return always <code>false</code>
     */
    public boolean containsNestedData()
    {
        return false;
    }

    /**
     * @return always <code>false</code>
     */
    public boolean fileContentIsReadable()
    {
        return false;
    }

    /**
     * @return always <code>null</code>
     */
    public Reader getReader()
    {
        return null;
    }
}

View File

@ -1,132 +0,0 @@
package com.relevanz.indyo.contenthandler;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.log4j.Category;
import com.relevanz.indyo.IndexDataSource;
import com.relevanz.indyo.FSDataSource;
import com.relevanz.indyo.util.IOUtils;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
/**
 * Handles Tar files by extracting the archive into a temporary folder and
 * exposing every extracted file as a nested datasource.
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public class TARHandler extends NestedFileContentHandlerAdapter
{
    static Category cat = Category.getInstance(TARHandler.class.getName());

    /**
     * @param file the tar file to handle
     */
    public TARHandler(File file)
    {
        super(file);
    }

    /**
     * @return always <code>null</code>; the archive itself is not read as text
     */
    public Reader getReader()
    {
        return null;
    }

    /**
     * @return always <code>false</code>
     */
    public boolean fileContentIsReadable()
    {
        return false;
    }

    /**
     * Extracts the tar file into the temporary folder and collects a
     * datasource for every file found there.
     *
     * @return <code>null</code> if the file does not exist; otherwise the
     *         list of nested datasources (left unchanged by this call if the
     *         extraction fails, in which case the error is logged).
     */
    public List getNestedDataSource()
    {
        if (!file.exists())
            return null;
        if (nestedDataSource == null)
        {
            nestedDataSource = new ArrayList();
        }
        try
        {
            File tempDir = new File(TEMP_FOLDER);
            tempDir.deleteOnExit();
            IOUtils.extractTar(file, tempDir);
            indexTarDirectory(tempDir);
        }
        catch (IOException ioe)
        {
            cat.error(ioe.getMessage(), ioe);
        }
        return nestedDataSource;
    }

    /**
     * Recursively adds a datasource for every regular file below
     * <code>dir</code>.
     */
    private void indexTarDirectory(File dir)
    {
        if (dir.isDirectory())
        {
            File[] dirContents = dir.listFiles();
            if (dirContents == null)
            {
                // listFiles() returns null when the directory cannot be read.
                return;
            }
            for (int i = 0; i < dirContents.length; i++)
            {
                indexTarDirectory(dirContents[i]);
            }
        }
        else if (dir.isFile())
        {
            // here create new datasource for the tarred file
            IndexDataSource ds = new FSDataSource(dir);
            // Add the datasource itself (the list used to be added to itself).
            nestedDataSource.add(ds);
        }
    }
}

View File

@ -1,117 +0,0 @@
package com.relevanz.indyo.contenthandler;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.log4j.Category;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.*;
import com.relevanz.indyo.util.StringUtils;
/**
 * Content handler for plain text files: the file itself is the readable
 * content and it never contains nested data.
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public class TextHandler extends FileContentHandlerAdapter
{
    static Category cat = Category.getInstance(TextHandler.class.getName());

    /**
     * Creates a handler for the given text file.
     *
     * @param file Text file to handle.
     */
    public TextHandler(File file)
    {
        super(file);
    }

    /**
     * Opens the handled file for reading.
     *
     * @return A reader over the file, or <code>null</code> (after logging
     * an error) if the file is missing or cannot be opened.
     */
    public Reader getReader()
    {
        if (file.exists())
        {
            return getReader(file);
        }
        cat.error(file.toString() + " doesn't exist! Failing silently...");
        return null;
    }

    /**
     * @return Always <code>false</code>.
     */
    public boolean containsNestedData()
    {
        return false;
    }

    /**
     * @return Always <code>true</code>.
     */
    public boolean fileContentIsReadable()
    {
        return true;
    }

    /**
     * Opens a reader on <code>f</code>, logging instead of propagating any
     * failure.
     *
     * @param f File to open.
     * @return The reader, or <code>null</code> if opening failed.
     */
    private Reader getReader(File f)
    {
        try
        {
            return new FileReader(f);
        }
        catch (FileNotFoundException nfe)
        {
            cat.error("File Not Found Exception:" + f.toString(), nfe);
        }
        catch (IOException ioe)
        {
            cat.error(ioe.getMessage(), ioe);
        }
        return null;
    }
}

View File

@ -1,133 +0,0 @@
package com.relevanz.indyo.contenthandler;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.log4j.Category;
import com.relevanz.indyo.IndexDataSource;
import com.relevanz.indyo.FSDataSource;
import com.relevanz.indyo.util.IOUtils;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;
/**
 * Handles Zip files.
 *
 * <p>A zip archive has no directly readable content. Each entry is copied
 * below <code>TEMP_FOLDER</code> and every non-directory entry is exposed
 * as a nested {@link FSDataSource}.</p>
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public class ZIPHandler extends NestedFileContentHandlerAdapter
{
    private static Category cat = Category.getInstance(ZIPHandler.class);

    /**
     * Creates a handler for the given zip file.
     *
     * @param file Zip archive to handle.
     */
    public ZIPHandler(File file)
    {
        super(file);
    }

    /**
     * @return Always <code>false</code>; only the nested entries are indexed.
     */
    public boolean fileContentIsReadable()
    {
        return false;
    }

    /**
     * The archive itself is never read as text.
     *
     * @return Always <code>null</code>.
     */
    public Reader getReader()
    {
        return null;
    }

    /**
     * Copies every entry of the archive below <code>TEMP_FOLDER</code> and
     * collects one data source per non-directory entry.
     *
     * <p>Entries whose name contains a <code>..</code> path segment are
     * skipped and logged, because writing them could place files outside
     * the temporary folder.</p>
     *
     * @return <code>null</code> if the archive does not exist, otherwise the
     * list of nested data sources. If reading fails the error is logged and
     * the entries collected so far are returned.
     */
    public List getNestedDataSource()
    {
        if (!file.exists())
            return null;
        if (nestedDataSource == null)
        {
            nestedDataSource = new ArrayList();
        }
        ZipFile zFile = null;
        try
        {
            zFile = new ZipFile(file);
            for (Enumeration e = zFile.entries(); e.hasMoreElements();)
            {
                ZipEntry entry = (ZipEntry) e.nextElement();
                String entryName = entry.getName();
                if (hasParentTraversal(entryName))
                {
                    cat.error("Skipping zip entry with unsafe path:" + entryName);
                    continue;
                }
                IOUtils.writeToTempFile(zFile.getInputStream(entry),
                        TEMP_FOLDER + entryName);
                if (!entry.isDirectory())
                {
                    // create a new DataMap for each zip entry
                    IndexDataSource ds = new FSDataSource(TEMP_FOLDER + entryName);
                    nestedDataSource.add(ds);
                }
            }
        }
        catch (ZipException ze)
        {
            cat.error("ZipException parsing zip:" + ze.getMessage(), ze);
        }
        catch (IOException ioe)
        {
            cat.error("IOException parsing zip:" + ioe.getMessage(), ioe);
        }
        finally
        {
            // Closing the ZipFile also closes every entry stream obtained
            // from it, so this releases all resources even on failure.
            if (zFile != null)
            {
                try
                {
                    zFile.close();
                }
                catch (IOException ioe)
                {
                    cat.error("IOException closing zip:" + ioe.getMessage(), ioe);
                }
            }
        }
        return nestedDataSource;
    }

    /**
     * Tells whether an entry name contains a <code>..</code> path segment,
     * using either <code>/</code> or <code>\</code> as separator.
     *
     * @param entryName Name of the zip entry.
     * @return <code>true</code> if the name could climb out of the folder
     * it is appended to.
     */
    private static boolean hasParentTraversal(String entryName)
    {
        String normalized = entryName.replace('\\', '/');
        return normalized.equals("..")
                || normalized.startsWith("../")
                || normalized.endsWith("/..")
                || normalized.indexOf("/../") != -1;
    }
}

View File

@ -1,312 +0,0 @@
/*--
Copyright (C) 2000 Brett McLaughlin & Jason Hunter.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer that follows
these conditions in the documentation and/or other materials
provided with the distribution.
3. The name "JDOM" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact license@jdom.org.
4. Products derived from this software may not be called "JDOM", nor
may "JDOM" appear in their name, without prior written permission
from the JDOM Project Management (pm@jdom.org).
In addition, we request (but do not require) that you include in the
end-user documentation provided with the redistribution and/or in the
software itself an acknowledgement equivalent to the following:
"This product includes software developed by the
JDOM Project (http://www.jdom.org/)."
Alternatively, the acknowledgment may be graphical using the logos
available at http://www.jdom.org/images/logos.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the JDOM Project and was originally
created by Brett McLaughlin <brett@jdom.org> and
Jason Hunter <jhunter@jdom.org>. For more information on the
JDOM Project, please see <http://www.jdom.org/>.
*/
package com.relevanz.indyo.util;
import java.util.Stack;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
* Filter for removing formatting from data- or field-oriented XML.
*
* <i>Code and comments adapted from DataWriter-0.2, written
* by David Megginson and released into the public domain,
* without warranty.</i>
*
* <p>This filter removes leading and trailing whitespace from
* field-oriented XML without mixed content. Note that this class will
* likely not yield appropriate results for document-oriented XML like
* XHTML pages, which mix character data and elements together.</p>
*
* <p>Implementation overview: whitespace-only character events are
* buffered instead of being forwarded. The buffer is discarded when a
* start tag follows, or when an end tag follows a child element; it is
* forwarded when real character data, a processing instruction or an
* ignorable-whitespace event follows, or when an end tag closes an
* element in which no child element has just ended. The current state
* and a stack of saved states (one per open element) drive those
* decisions.</p>
*
* @see DataFormatFilter
*/
public class DataUnformatFilter extends XMLFilterBase
{
////////////////////////////////////////////////////////////////////
// Constructors.
////////////////////////////////////////////////////////////////////
/**
* Create a new filter.
*/
public DataUnformatFilter()
{
}
/**
* Create a new filter.
*
* <p>Use the XMLReader provided as the source of events.</p>
*
* @param xmlreader The parent in the filter chain.
*/
public DataUnformatFilter(XMLReader xmlreader)
{
super(xmlreader);
}
////////////////////////////////////////////////////////////////////
// Public methods.
////////////////////////////////////////////////////////////////////
/**
* Reset the filter so that it can be reused.
*
* <p>This method is especially useful if the filter failed
* with an exception the last time through.</p>
*
* <p>The state returns to SEEN_NOTHING and both the state stack and
* the whitespace buffer are replaced with fresh, empty instances.</p>
*/
public void reset ()
{
state = SEEN_NOTHING;
stateStack = new Stack();
whitespace = new StringBuffer();
}
/**
* Filter a start document event.
*
* <p>Reset state and pass the event on for further processing.</p>
*
* @exception org.xml.sax.SAXException If a filter
* further down the chain raises an exception.
* @see org.xml.sax.ContentHandler#startDocument
*/
public void startDocument ()
throws SAXException
{
reset();
super.startDocument();
}
/**
* Filter a start element event.
*
* <p>Any whitespace buffered before the start tag is discarded.</p>
*
* @param uri The element's Namespace URI.
* @param localName The element's local name.
* @param qName The element's qualified (prefixed) name.
* @param atts The element's attribute list.
* @exception org.xml.sax.SAXException If a filter
* further down the chain raises an exception.
* @see org.xml.sax.ContentHandler#startElement
*/
public void startElement (String uri, String localName,
String qName, Attributes atts)
throws SAXException
{
// whitespace directly in front of a start tag is formatting: drop it
clearWhitespace();
// the value pushed here is what the matching endElement() pops, so
// the parent resumes in state SEEN_ELEMENT once this element closes
stateStack.push(SEEN_ELEMENT);
// nothing has been seen yet inside the new element
state = SEEN_NOTHING;
super.startElement(uri, localName, qName, atts);
}
/**
* Filter an end element event.
*
* <p>Buffered whitespace is discarded if a child element was the last
* thing seen inside this element, and forwarded otherwise.</p>
*
* @param uri The element's Namespace URI.
* @param localName The element's local name.
* @param qName The element's qualified (prefixed) name.
* @exception org.xml.sax.SAXException If a filter
* further down the chain raises an exception.
* @see org.xml.sax.ContentHandler#endElement
*/
public void endElement (String uri, String localName, String qName)
throws SAXException
{
if (state == SEEN_ELEMENT) {
// whitespace between a child's end tag and this end tag: drop it
clearWhitespace();
} else {
// state is SEEN_NOTHING or SEEN_DATA: forward what was buffered
emitWhitespace();
}
// restore the state saved by the matching startElement()
state = stateStack.pop();
super.endElement(uri, localName, qName);
}
/**
* Filter a character data event.
*
* <p>Whitespace-only events are buffered until it is known whether
* they are formatting; once non-whitespace data has been seen in the
* current element, all character events are passed straight on.</p>
*
* @param ch The characters to write.
* @param start The starting position in the array.
* @param length The number of characters to use.
* @exception org.xml.sax.SAXException If a filter
* further down the chain raises an exception.
* @see org.xml.sax.ContentHandler#characters
*/
public void characters (char ch[], int start, int length)
throws SAXException
{
if (state != SEEN_DATA) {
/* Look for non-whitespace. */
// scan backwards; 'end' stops on the last non-whitespace character,
// or falls below 'start' when the whole range is whitespace
int end = start + length;
while (end-- > start) {
if (!isXMLWhitespace(ch[end]))
break;
}
/*
* If all the characters are whitespace, save them for later.
* If we've got some data, emit any saved whitespace and update
* our state to show we've seen data.
*/
if (end < start) {
saveWhitespace(ch, start, length);
} else {
state = SEEN_DATA;
emitWhitespace();
}
}
/* Pass on everything inside a data field. */
if (state == SEEN_DATA) {
super.characters(ch, start, length);
}
}
/**
* Filter an ignorable whitespace event.
*
* <p>Previously buffered whitespace is forwarded as character data;
* the ignorable whitespace passed to this method is itself dropped.</p>
*
* @param ch The array of characters to write.
* @param start The starting position in the array.
* @param length The number of characters to write.
* @exception org.xml.sax.SAXException If a filter
* further down the chain raises an exception.
* @see org.xml.sax.ContentHandler#ignorableWhitespace
*/
public void ignorableWhitespace (char ch[], int start, int length)
throws SAXException
{
emitWhitespace();
// ignore
}
/**
* Filter a processing instruction event.
*
* <p>Buffered whitespace is forwarded before the instruction.</p>
*
* @param target The PI target.
* @param data The PI data.
* @exception org.xml.sax.SAXException If a filter
* further down the chain raises an exception.
* @see org.xml.sax.ContentHandler#processingInstruction
*/
public void processingInstruction (String target, String data)
throws SAXException
{
emitWhitespace();
super.processingInstruction(target, data);
}
////////////////////////////////////////////////////////////////////
// Internal methods.
////////////////////////////////////////////////////////////////////
/**
* Saves trailing whitespace.
*
* @param ch The array holding the whitespace.
* @param start The starting position in the array.
* @param length The number of characters to append to the buffer.
*/
protected void saveWhitespace (char[] ch, int start, int length) {
whitespace.append(ch, start, length);
}
/**
* Passes saved whitespace down the filter chain.
*
* <p>Does nothing when the buffer is empty; otherwise the buffer is
* emptied and its content forwarded as one character event.</p>
*
* @exception org.xml.sax.SAXException If a filter
* further down the chain raises an exception.
*/
protected void emitWhitespace ()
throws SAXException
{
char[] data = new char[whitespace.length()];
if (whitespace.length() > 0) {
whitespace.getChars(0, data.length, data, 0);
// empty the buffer before forwarding
whitespace.setLength(0);
super.characters(data, 0, data.length);
}
}
/**
* Discards saved whitespace.
*/
protected void clearWhitespace () {
whitespace.setLength(0);
}
/**
* Returns <var>true</var> if character is XML whitespace.
*
* @param c The character to test.
*/
private boolean isXMLWhitespace (char c)
{
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
////////////////////////////////////////////////////////////////////
// Constants.
////////////////////////////////////////////////////////////////////
// State markers; they are compared by identity (==) only.
// Nothing seen yet inside the current element.
private static final Object SEEN_NOTHING = new Object();
// A child element has just ended inside the current element.
private static final Object SEEN_ELEMENT = new Object();
// Non-whitespace character data has been seen in the current element.
private static final Object SEEN_DATA = new Object();
////////////////////////////////////////////////////////////////////
// Internal state.
////////////////////////////////////////////////////////////////////
// State for the element currently being processed.
private Object state = SEEN_NOTHING;
// One saved state per open element, pushed in startElement().
private Stack stateStack = new Stack();
// Whitespace held back until it is known whether it is formatting.
private StringBuffer whitespace = new StringBuffer();
}
// end of DataUnformatFilter.java

View File

@ -1,274 +0,0 @@
package com.relevanz.indyo.util;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import com.ice.tar.TarArchive;
import org.apache.log4j.Category;
import java.io.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
/**
 * Utility IO-related methods.
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public final class IOUtils
{
    /**
     * Log4j category.
     */
    private static Category cat = Category.getInstance(IOUtils.class.getName());

    /**
     * Writes data from the inputstream to the outputstream.
     * Neither stream is closed.
     *
     * @param in InputStream to read from.
     * @param out OutputStream to write to.
     * @throws IOException I/O error.
     */
    public static void transferData(InputStream in, OutputStream out)
            throws IOException
    {
        byte[] data = new byte[10000];
        int len;
        while ((len = in.read(data)) != -1)
        {
            out.write(data, 0, len);
        }
    }

    /**
     * Recursively deletes a directory.
     *
     * @param directory Directory to delete.
     */
    public static void deleteDirectory(File directory)
    {
        File[] fArray = directory.listFiles();
        // listFiles() returns null if the path is not a directory or
        // cannot be read; in that case only the path itself is deleted
        if (fArray != null)
        {
            for (int i = 0; i < fArray.length; i++)
            {
                if (fArray[i].isDirectory())
                {
                    deleteDirectory(fArray[i]);
                }
                fArray[i].delete();
            }
        }
        directory.delete();
    }

    /**
     * Writes an input stream to a temporary file which is set
     * to delete when the VM exits. If the path denotes a directory
     * (it exists as one, or ends with a path separator) the directory
     * is created and nothing is read from the stream.
     * The input stream is not closed.
     *
     * @param in Inputstream to read data from
     * @param tempfile Temporary file to write to
     * @throws IOException I/O error.
     */
    public static void writeToTempFile(InputStream in, String tempfile)
            throws IOException
    {
        OutputStream out = null;
        try
        {
            File f = new File(tempfile);
            f.deleteOnExit();
            char lastChar = tempfile.charAt(tempfile.length() - 1);
            // make no assumptions that java.io.File detects directories
            // in a cross-platform manner
            if (f.isDirectory() || lastChar == '\\' || lastChar == '/')
                f.mkdirs();
            else
            {
                // ensure that all necessary directories are created;
                // a bare file name has no parent to create
                File parent = f.getParentFile();
                if (parent != null)
                {
                    parent.deleteOnExit();
                    parent.mkdirs();
                }
                out = new FileOutputStream(tempfile);
                transferData(in, out);
            }
        }
        finally
        {
            if (out != null)
                out.close();
        }
    }

    /**
     * Writes a file to a ZipOutputStream. The entry is started with
     * <code>putNextEntry</code>; the zip stream itself is left open.
     *
     * @param file File to read data from
     * @param zipPath Path of the ZipEntry
     * @param out ZipOutputStream to write to
     * @throws FileNotFoundException If the file cannot be opened.
     * @throws IOException I/O error.
     */
    public static void addToZipOutputStream(String file,
                                            String zipPath,
                                            ZipOutputStream out)
            throws FileNotFoundException, IOException
    {
        FileInputStream in = null;
        try
        {
            in = new FileInputStream(file); // Stream to read file
            out.putNextEntry(new ZipEntry(zipPath)); // Store entry in zipfile
            transferData(in, out); // Copy bytes to zipfile
        }
        finally
        {
            if (in != null)
                in.close(); // Close input stream
        }
    }

    /**
     * Extracts a tar file to a directory.
     *
     * @param tarFile Tar file to read data from
     * @param destDir Directory to write to
     * @throws IOException I/O error.
     */
    public static void extractTar(File tarFile, File destDir)
            throws IOException
    {
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(tarFile);
            TarArchive ta = new TarArchive(fis);
            ta.extractContents(destDir);
            ta.closeArchive();
        }
        finally
        {
            if (fis != null)
                fis.close();
        }
    }

    /**
     * Extracts a GZip file to a file.
     *
     * @param f GZip file to read data from
     * @param destFile File to write to
     * @throws IOException I/O error.
     */
    public static void extractGZip(File f, File destFile) throws IOException
    {
        // Nested try/finally blocks guarantee that each stream is closed
        // even when closing another one fails.
        FileOutputStream out = new FileOutputStream(destFile);
        try
        {
            InputStream in = new FileInputStream(f);
            try
            {
                // closing the gzip stream also closes the file stream; if
                // the gzip header is invalid 'in' still refers to the file
                // stream, which is then closed directly
                in = new GZIPInputStream(in);
                transferData(in, out);
                out.flush();
            }
            finally
            {
                in.close();
            }
        }
        finally
        {
            out.close();
        }
    }

    /**
     * Reads all bytes from the given stream. The stream is not closed.
     *
     * @param is the stream to read from
     * @return Every byte read up to end of stream; an empty array if the
     * stream was already at its end.
     * @throws IOException I/O error.
     */
    public static final byte[] loadBytes(InputStream is) throws IOException
    {
        // ByteArrayOutputStream grows geometrically, avoiding a full copy
        // of everything read so far for each chunk
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        byte[] chunk = new byte[4096];
        int count;
        while ((count = is.read(chunk)) >= 0)
        {
            buffer.write(chunk, 0, count);
        }
        return buffer.toByteArray();
    }

    /** Returns the file extension of a file.
     * If the name contains no "." the whole name is returned.
     *
     * @param filename Filename to obtain the file extension.
     * @return File extension (without the ".").
     */
    public static String getFileExtension(String filename)
    {
        return filename.substring(filename.lastIndexOf(".") + 1); // + 1 to remove the "."
    }

    /** Returns the file extension of a file.
     * If the name contains no "." the whole name is returned.
     *
     * @param f File object to obtain the file extension.
     * @return File extension (without the ".").
     */
    public static String getFileExtension(File f)
    {
        return getFileExtension(f.getName());
    }
}

View File

@ -1,93 +0,0 @@
package com.relevanz.indyo.util;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.oro.text.perl.Perl5Util;
/**
 * String helpers used while preparing text for indexing.
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public final class StringUtils
{
    public static final String EMPTY_STRING = "";
    private static final char[] QUOTE_ENCODE = "&quot;".toCharArray();
    private static final char[] AMP_ENCODE = "&amp;".toCharArray();
    private static final char[] LT_ENCODE = "&lt;".toCharArray();
    private static final char[] GT_ENCODE = "&gt;".toCharArray();
    private static final char[] APOS_ENCODE = "&apos;".toCharArray();

    // Shared regular expression engine
    private static Perl5Util perl5Util = new Perl5Util();

    /**
     * Strips characters that are of no use to a reader. When the text
     * contains at least one non-word character, every run of characters
     * other than letters, digits, "_" and "@" becomes a single space and
     * words of one or two characters are dropped. The result is always
     * passed through {@link #trimWhitespace(String)}.
     *
     * @param s Text to clean.
     * @return The cleaned text.
     */
    public static final String removeUnreadableCharacters(String s)
    {
        if (!perl5Util.match("/\\W+/", s))
        {
            // only word characters: nothing to replace
            return trimWhitespace(s);
        }
        // unreadable characters become a space
        String spaced = perl5Util.substitute("s#[^a-zA-Z0-9_@]+# #gm", s);
        // words of one or two characters are removed
        String pruned = perl5Util.substitute("s#\\b[a-zA-Z0-9_]{1,2}\\b##gm", spaced);
        return trimWhitespace(pruned);
    }

    /**
     * Replaces a run of three or more whitespace characters with a
     * single space.
     *
     * NOTE(review): the expression carries no "g" modifier, so presumably
     * only the first such run is replaced - confirm against Perl5Util.
     *
     * @param s Text to process.
     * @return The processed text.
     */
    public static final String trimWhitespace(String s)
    {
        return perl5Util.substitute("s#[\\s]{3,}# #m", s);
    }
}

View File

@ -1,404 +0,0 @@
/*--
Copyright (C) 2000 Brett McLaughlin & Jason Hunter.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions, and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer that follows
these conditions in the documentation and/or other materials
provided with the distribution.
3. The name "JDOM" must not be used to endorse or promote products
derived from this software without prior written permission. For
written permission, please contact license@jdom.org.
4. Products derived from this software may not be called "JDOM", nor
may "JDOM" appear in their name, without prior written permission
from the JDOM Project Management (pm@jdom.org).
In addition, we request (but do not require) that you include in the
end-user documentation provided with the redistribution and/or in the
software itself an acknowledgement equivalent to the following:
"This product includes software developed by the
JDOM Project (http://www.jdom.org/)."
Alternatively, the acknowledgment may be graphical using the logos
available at http://www.jdom.org/images/logos.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
This software consists of voluntary contributions made by many
individuals on behalf of the JDOM Project and was originally
created by Brett McLaughlin <brett@jdom.org> and
Jason Hunter <jhunter@jdom.org>. For more information on the
JDOM Project, please see <http://www.jdom.org/>.
*/
package com.relevanz.indyo.util;
import java.io.IOException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.XMLFilterImpl;
/**
 * Adds convenience methods to the base SAX2 filter implementation.
 *
 * <i>Code and comments adapted from XMLWriter-0.2, written
 * by David Megginson and released into the public domain,
 * without warranty.</i>
 *
 * <p>The convenience methods exist so that clients do not have to
 * create empty attribute lists or pass empty strings as arguments;
 * for example, the method invocation</p>
 *
 * <pre>
 * w.startElement("foo");
 * </pre>
 *
 * <p>is equivalent to the regular SAX2 ContentHandler call</p>
 *
 * <pre>
 * w.startElement("", "foo", "", new AttributesImpl());
 * </pre>
 *
 * <p>except that it does not allocate a new empty attribute list on
 * every call: the shared {@link #EMPTY_ATTS} instance is passed instead.</p>
 *
 * <p>There is an even simpler convenience method,
 * <var>dataElement</var>, for elements that contain only character
 * data:</p>
 *
 * <pre>
 * w.dataElement("greeting", "Hello, world!");
 * </pre>
 *
 * <p>is equivalent to</p>
 *
 * <pre>
 * w.startElement("greeting");
 * w.characters("Hello, world!");
 * w.endElement("greeting");
 * </pre>
 *
 * @see org.xml.sax.helpers.XMLFilterImpl
 */
class XMLFilterBase extends XMLFilterImpl
{

    ////////////////////////////////////////////////////////////////////
    // Constants.
    ////////////////////////////////////////////////////////////////////

    /**
     * Shared empty attribute list handed to every convenience method
     * that takes no attributes. It is a single instance for the whole
     * class, so it must never be modified.
     */
    protected static final Attributes EMPTY_ATTS = new AttributesImpl();

    ////////////////////////////////////////////////////////////////////
    // Constructors.
    ////////////////////////////////////////////////////////////////////

    /**
     * Construct an XML filter with no parent.
     *
     * <p>This filter will have no parent: you must assign a parent
     * before you start a parse or do any configuration with
     * setFeature or setProperty.</p>
     *
     * @see org.xml.sax.XMLReader#setFeature
     * @see org.xml.sax.XMLReader#setProperty
     */
    public XMLFilterBase()
    {
    }

    /**
     * Create an XML filter with the specified parent.
     *
     * <p>The given XMLReader is used as the source of events.</p>
     *
     * @param parent The parent in the filter chain.
     */
    public XMLFilterBase(XMLReader parent)
    {
        super(parent);
    }

    ////////////////////////////////////////////////////////////////////
    // Convenience methods.
    ////////////////////////////////////////////////////////////////////

    /**
     * Start a new element without a qname or attributes.
     *
     * <p>This method supplies an empty string for the qualified name
     * and the shared empty attribute list, then invokes
     * {@link #startElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     */
    public void startElement(String uri, String localName) throws SAXException
    {
        startElement(uri, localName, "", EMPTY_ATTS);
    }

    /**
     * Start a new element without a qname, attributes or a Namespace URI.
     *
     * <p>This method supplies an empty string for the Namespace URI,
     * an empty string for the qualified name, and the shared empty
     * attribute list, then invokes
     * {@link #startElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     */
    public void startElement(String localName) throws SAXException
    {
        startElement("", localName, "", EMPTY_ATTS);
    }

    /**
     * End an element without a qname.
     *
     * <p>This method supplies an empty string for the qName, then
     * invokes {@link #endElement(String, String, String)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void endElement(String uri, String localName) throws SAXException
    {
        endElement(uri, localName, "");
    }

    /**
     * End an element without a Namespace URI or qname.
     *
     * <p>This method supplies an empty string for the qName
     * and an empty string for the Namespace URI, then invokes
     * {@link #endElement(String, String, String)} directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void endElement(String localName) throws SAXException
    {
        endElement("", localName, "");
    }

    /**
     * Add an empty element.
     *
     * <p>Both a {@link #startElement startElement} and an
     * {@link #endElement endElement} event will be passed on down
     * the filter chain, in that order.</p>
     *
     * @param uri The element's Namespace URI, or the empty string
     *        if the element has no Namespace or if Namespace
     *        processing is not being performed.
     * @param localName The element's local name (without prefix). This
     *        parameter must be provided.
     * @param qName The element's qualified name (with prefix), or
     *        the empty string if none is available. This parameter
     *        is strictly advisory: the writer may or may not use
     *        the prefix attached.
     * @param atts The element's attribute list.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void emptyElement(String uri, String localName, String qName,
                             Attributes atts) throws SAXException
    {
        startElement(uri, localName, qName, atts);
        endElement(uri, localName, qName);
    }

    /**
     * Add an empty element without a qname or attributes.
     *
     * <p>This method supplies an empty string for the qname
     * and the shared empty attribute list, then invokes
     * {@link #emptyElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see #emptyElement(String, String, String, Attributes)
     */
    public void emptyElement(String uri, String localName) throws SAXException
    {
        emptyElement(uri, localName, "", EMPTY_ATTS);
    }

    /**
     * Add an empty element without a Namespace URI, qname or attributes.
     *
     * <p>This method supplies an empty string for the qname,
     * an empty string for the Namespace URI, and the shared empty
     * attribute list, then invokes
     * {@link #emptyElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see #emptyElement(String, String, String, Attributes)
     */
    public void emptyElement(String localName) throws SAXException
    {
        emptyElement("", localName, "", EMPTY_ATTS);
    }

    /**
     * Add an element with character data content.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag.</p>
     *
     * <p>This method invokes
     * {@link org.xml.sax.ContentHandler#startElement},
     * followed by
     * {@link #characters(String)}, followed by
     * {@link org.xml.sax.ContentHandler#endElement}.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @param qName The element's default qualified name.
     * @param atts The element's attributes.
     * @param content The character data content; if <code>null</code>,
     *        only the start and end events are sent.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement(String uri, String localName, String qName,
                            Attributes atts, String content) throws SAXException
    {
        startElement(uri, localName, qName, atts);
        characters(content);
        endElement(uri, localName, qName);
    }

    /**
     * Add an element with character data content but no attributes.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag. This method supplies an empty string
     * for the qname and the shared empty attribute list, then invokes
     * {@link #dataElement(String, String, String, Attributes, String)}.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @param content The character data content; if <code>null</code>,
     *        only the start and end events are sent.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement(String uri, String localName, String content)
        throws SAXException
    {
        dataElement(uri, localName, "", EMPTY_ATTS, content);
    }

    /**
     * Add an element with character data content but no attributes or
     * Namespace URI.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag. The method supplies an empty string for the
     * Namespace URI, an empty string for the qualified name,
     * and the shared empty attribute list, then invokes
     * {@link #dataElement(String, String, String, Attributes, String)}.</p>
     *
     * @param localName The element's local name.
     * @param content The character data content; if <code>null</code>,
     *        only the start and end events are sent.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement(String localName, String content)
        throws SAXException
    {
        dataElement("", localName, "", EMPTY_ATTS, content);
    }

    /**
     * Add a string of character data.
     *
     * <p>This is a convenience method that converts the string to a
     * character array and invokes
     * {@link org.xml.sax.ContentHandler#characters} with the whole
     * array. No escaping is performed here; the characters are
     * forwarded unchanged.</p>
     *
     * <p>A <code>null</code> argument produces no event at all, while an
     * empty string still produces a zero-length character event.</p>
     *
     * @param data The character data, or <code>null</code> for none.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#characters
     */
    public void characters(String data) throws SAXException
    {
        // Guard against null: without it dataElement(..., null) would fail
        // after the start-element event had already gone downstream,
        // leaving the event stream unbalanced.
        if (data == null)
        {
            return;
        }
        char[] chars = data.toCharArray();
        characters(chars, 0, chars.length);
    }
}
// end of XMLFilterBase.java