mirror of https://github.com/apache/lucene.git
Initial import of source and libs.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150808 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6a2e1270e2
commit
4474e5fd64
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[346504c6d4bd7232f0776a4a0f8a32333cedd93e] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[93e77a4a4476afff71a110dda1e96465cb7f25a9] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[be4a9176c35a7feeecf5b70edf070ecb5d13ac5d] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[ff9b90061b65c32122fcdde27bfe7f1e61fbd7bd] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[329aef393bece9d77eef16279910f6cd73113c39] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
||||||
|
AnyObjectId[c1fa1d645474eee07f085a8ee29e38422f7614cf] was removed in git history.
|
||||||
|
Apache SVN contains full history.
|
|
@ -0,0 +1,90 @@
|
||||||
|
package com.relevanz.indyo;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generic implementation of an index datasource.
|
||||||
|
*
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public abstract class AbstractDataSource implements IndexDataSource
|
||||||
|
{
|
||||||
|
protected AbstractDataSource()
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
protected AbstractDataSource(Map map)
|
||||||
|
{
|
||||||
|
loadFields(map);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fields to index.
|
||||||
|
*/
|
||||||
|
protected String[] fields;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convenience method to load fields to index into a Map.
|
||||||
|
*/
|
||||||
|
protected void loadFields(Map map)
|
||||||
|
{
|
||||||
|
Set fieldSet = map.keySet();
|
||||||
|
fields = new String[fieldSet.size()];
|
||||||
|
fieldSet.toArray(fields);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,332 @@
|
||||||
|
package com.relevanz.indyo;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import com.relevanz.indyo.util.StringUtils;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* A document is the atomic unit used for indexing purposes. It consists of
|
||||||
|
* metadata as well as its file contents. File contents are handled by
|
||||||
|
* {@link ContentHandler}.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* DocumentHandler creates the {@link org.apache.lucene.document.Document},
|
||||||
|
* adds fields to it, delegates to {@link ContentHandler} to handle
|
||||||
|
* file contents.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class DocumentHandler
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Field to retrieve all documents.
|
||||||
|
*/
|
||||||
|
public static final String ALL_DOCUMENTS_FIELD = "AllDocuments";
|
||||||
|
|
||||||
|
private static Logger log = Logger.getLogger(DocumentHandler.class);
|
||||||
|
|
||||||
|
private static boolean isDebugEnabled = log.isDebugEnabled();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should parent documents include data of its children?
|
||||||
|
*/
|
||||||
|
private static boolean parentEncapsulation = false;
|
||||||
|
/**
|
||||||
|
* Document object this DocumentHandler is handling.
|
||||||
|
*/
|
||||||
|
private Document doc;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map of metadata for this document. Contains the field:value pair
|
||||||
|
* to be added to the document.
|
||||||
|
*/
|
||||||
|
private Map metadata;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map of fields. Contains field:type_of_field pair.
|
||||||
|
*/
|
||||||
|
private Map customFields;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* IndexWriter.
|
||||||
|
*/
|
||||||
|
private IndexWriter writer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A collection of documents to be added to the writer.
|
||||||
|
*/
|
||||||
|
private List documents = new ArrayList();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Ctor.
|
||||||
|
*
|
||||||
|
* @param Map of metadata for this document.
|
||||||
|
* @param Map of fields.
|
||||||
|
* @param Writer.
|
||||||
|
*/
|
||||||
|
public DocumentHandler(Map metadata,
|
||||||
|
Map customFields,
|
||||||
|
IndexWriter writer)
|
||||||
|
{
|
||||||
|
this.metadata = metadata;
|
||||||
|
this.customFields = customFields;
|
||||||
|
this.writer = writer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handles the actual processing of the document.
|
||||||
|
*/
|
||||||
|
public void process() throws IOException, Exception
|
||||||
|
{
|
||||||
|
String objectid = (String) metadata.get(IndexDataSource.OBJECT_IDENTIFIER);
|
||||||
|
if (objectid == null)
|
||||||
|
return;
|
||||||
|
doc = createDocument();
|
||||||
|
addMapToDoc(metadata);
|
||||||
|
addNestedDataSource(metadata);
|
||||||
|
doc.add(Field.Text(ALL_DOCUMENTS_FIELD, ALL_DOCUMENTS_FIELD));
|
||||||
|
//documents.add(doc);
|
||||||
|
if (writer != null)
|
||||||
|
{
|
||||||
|
addToWriter();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
documents.add(doc);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List getDocuments()
|
||||||
|
{
|
||||||
|
return documents;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Document createDocument()
|
||||||
|
{
|
||||||
|
return new Document();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add the contents of a Map to a document.
|
||||||
|
*
|
||||||
|
* @param Map to add.
|
||||||
|
*/
|
||||||
|
private void addMapToDoc(Map map)
|
||||||
|
{
|
||||||
|
for (Iterator it = map.keySet().iterator(); it.hasNext();)
|
||||||
|
{
|
||||||
|
String field = (String) it.next();
|
||||||
|
Object value = map.get(field);
|
||||||
|
if (value instanceof String)
|
||||||
|
{
|
||||||
|
String type = null;
|
||||||
|
if (customFields != null)
|
||||||
|
{
|
||||||
|
type = (String) customFields.get(field);
|
||||||
|
}
|
||||||
|
addFieldToDoc(type, field, (String) value);
|
||||||
|
}
|
||||||
|
else if (value instanceof Reader)
|
||||||
|
{
|
||||||
|
addFieldToDoc(field, (Reader) value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add nested datasources.
|
||||||
|
*
|
||||||
|
* @param Map which contains the nested datasources.
|
||||||
|
*/
|
||||||
|
private void addNestedDataSource(Map map) throws Exception
|
||||||
|
{
|
||||||
|
Object o = map.get(IndexDataSource.NESTED_DATASOURCE);
|
||||||
|
if (o == null)
|
||||||
|
return;
|
||||||
|
if (o instanceof IndexDataSource)
|
||||||
|
{
|
||||||
|
IndexDataSource ds = (IndexDataSource) o;
|
||||||
|
addDataSource(ds);
|
||||||
|
}
|
||||||
|
else if (o instanceof List)
|
||||||
|
{
|
||||||
|
List nestedDataSource = (List) o;
|
||||||
|
for (int i = 0, n = nestedDataSource.size(); i < n; i++)
|
||||||
|
{
|
||||||
|
IndexDataSource ds = (IndexDataSource) nestedDataSource.get(i);
|
||||||
|
addDataSource(ds);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (o instanceof IndexDataSource[])
|
||||||
|
{
|
||||||
|
IndexDataSource[] nestedDataSource = (IndexDataSource[]) o;
|
||||||
|
for (int i = 0, n = nestedDataSource.length; i < n; i++)
|
||||||
|
{
|
||||||
|
IndexDataSource ds = (IndexDataSource) nestedDataSource[i];
|
||||||
|
addDataSource(ds);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
log.warn("Unknown object found as nested datasource:" + o);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Datasources are basically a collection of data maps to be indexed.
|
||||||
|
* addMapToDoc is invoked for each map.
|
||||||
|
*
|
||||||
|
* @param Datasource to add.
|
||||||
|
*/
|
||||||
|
private void addDataSource(IndexDataSource ds) throws Exception
|
||||||
|
{
|
||||||
|
Map[] data = ds.getData();
|
||||||
|
for (int i = 0; i < data.length; i++)
|
||||||
|
{
|
||||||
|
Map map = data[i];
|
||||||
|
if (map.containsKey(IndexDataSource.OBJECT_IDENTIFIER))
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Create a new document because child datasources may need
|
||||||
|
* to be retrieved independently of parent doc.
|
||||||
|
*/
|
||||||
|
DocumentHandler docHandler = new DocumentHandler(map, null, null);
|
||||||
|
docHandler.process();
|
||||||
|
documents.addAll(docHandler.getDocuments());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
addMapToDoc(map);
|
||||||
|
/**
|
||||||
|
* Add nested datasources of this datasource's data
|
||||||
|
*/
|
||||||
|
addNestedDataSource(map);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a String-based field to a document.
|
||||||
|
*
|
||||||
|
* @param Type of field.
|
||||||
|
* @param Name of field.
|
||||||
|
* @param Value of field.
|
||||||
|
*/
|
||||||
|
private void addFieldToDoc(String type, String field, String value)
|
||||||
|
{
|
||||||
|
if (value == null)
|
||||||
|
value = StringUtils.EMPTY_STRING;
|
||||||
|
if (SearchConfiguration.KEYWORD_FIELD_TYPE.equalsIgnoreCase(type))
|
||||||
|
doc.add(Field.Keyword(field, value));
|
||||||
|
else if (SearchConfiguration.UNINDEXED_FIELD_TYPE.equalsIgnoreCase(type))
|
||||||
|
doc.add(Field.UnIndexed(field, value));
|
||||||
|
else if (SearchConfiguration.UNSTORED_FIELD_TYPE.equalsIgnoreCase(type))
|
||||||
|
doc.add(Field.UnStored(field, value));
|
||||||
|
else
|
||||||
|
doc.add(Field.Text(field, value));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a Reader-based field to a document.
|
||||||
|
*
|
||||||
|
* @param Name of field.
|
||||||
|
* @param Reader.
|
||||||
|
*/
|
||||||
|
private void addFieldToDoc(String field, Reader reader)
|
||||||
|
{
|
||||||
|
doc.add(Field.Text(field, reader));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds documents to the IndexWriter.
|
||||||
|
*/
|
||||||
|
private void addToWriter() throws IOException
|
||||||
|
{
|
||||||
|
if (parentEncapsulation)
|
||||||
|
{
|
||||||
|
for (int i = 0, n = documents.size(); i < n; i++)
|
||||||
|
{
|
||||||
|
Document d = (Document) documents.get(i);
|
||||||
|
for (Enumeration e = d.fields(); e.hasMoreElements();)
|
||||||
|
{
|
||||||
|
Field f = (Field) e.nextElement();
|
||||||
|
String fieldName = f.name();
|
||||||
|
if (!fieldName.equals(IndexDataSource.CONTAINER_IDENTIFIER)
|
||||||
|
&& !fieldName.equals(IndexDataSource.OBJECT_CLASS)
|
||||||
|
&& !fieldName.equals(IndexDataSource.OBJECT_IDENTIFIER))
|
||||||
|
{
|
||||||
|
doc.add(f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
writer.addDocument(doc);
|
||||||
|
|
||||||
|
for (int i = 0, n = documents.size(); i < n; i++)
|
||||||
|
{
|
||||||
|
writer.addDocument((Document) documents.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,160 @@
|
||||||
|
package com.relevanz.indyo;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.document.DateField;
|
||||||
|
import com.relevanz.indyo.contenthandler.FileContentHandler;
|
||||||
|
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
|
||||||
|
import com.relevanz.indyo.util.IOUtils;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A filesystem-based datasource.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class FSDataSource extends AbstractDataSource
|
||||||
|
{
|
||||||
|
public static final String FILE_PATH_FIELD = "filePath";
|
||||||
|
public static final String FILE_NAME_FIELD = "fileName";
|
||||||
|
public static final String FILE_SIZE_FIELD = "fileSize";
|
||||||
|
public static final String FILE_FORMAT_FIELD = "fileFormat";
|
||||||
|
public static final String FILE_CONTENTS_FIELD = "fileContents";
|
||||||
|
public static final String FILE_LAST_MODIFIED_DATE_FIELD = "fileLastModifiedDate";
|
||||||
|
|
||||||
|
private File targetFileOrDir;
|
||||||
|
|
||||||
|
public FSDataSource(String targetFileOrDirStr)
|
||||||
|
{
|
||||||
|
this(new File(targetFileOrDirStr));
|
||||||
|
}
|
||||||
|
|
||||||
|
public FSDataSource(File targetFileOrDir)
|
||||||
|
{
|
||||||
|
setTargetDirectory(targetFileOrDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map[] getData()
|
||||||
|
{
|
||||||
|
Map[] returnData = null;
|
||||||
|
List temp = new ArrayList();
|
||||||
|
loadDataFromFiles(targetFileOrDir, temp);
|
||||||
|
returnData = new Map[temp.size()];
|
||||||
|
returnData = (Map[]) temp.toArray(returnData);
|
||||||
|
return returnData;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTargetDirectory(File targetFileOrDir)
|
||||||
|
{
|
||||||
|
this.targetFileOrDir = targetFileOrDir;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadDataFromFiles(File f, List list)
|
||||||
|
{
|
||||||
|
if (f.isDirectory())
|
||||||
|
{
|
||||||
|
File[] directoryTree = f.listFiles();
|
||||||
|
for (int i = 0; i < directoryTree.length; i++)
|
||||||
|
{
|
||||||
|
loadDataFromFiles(directoryTree[i], list);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Map dataMap = new HashMap();
|
||||||
|
dataMap.put(FILE_PATH_FIELD, f.getPath());
|
||||||
|
dataMap.put(FILE_NAME_FIELD, f.getName());
|
||||||
|
dataMap.put(FILE_LAST_MODIFIED_DATE_FIELD,
|
||||||
|
DateField.timeToString(f.lastModified()));
|
||||||
|
dataMap.put(FILE_SIZE_FIELD, String.valueOf(f.length()));
|
||||||
|
dataMap.put(FILE_FORMAT_FIELD,
|
||||||
|
IOUtils.getFileExtension(f));
|
||||||
|
addFileContents(f, dataMap);
|
||||||
|
list.add(dataMap);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addFileContents(File targetFile, Map dataMap)
|
||||||
|
{
|
||||||
|
FileContentHandler cHandler =
|
||||||
|
FileContentHandlerFactory.getContentHandler(targetFile);
|
||||||
|
if (cHandler != null)
|
||||||
|
{
|
||||||
|
if (cHandler.fileContentIsReadable())
|
||||||
|
{
|
||||||
|
Reader r = cHandler.getReader();
|
||||||
|
if (r != null)
|
||||||
|
{
|
||||||
|
dataMap.put(FILE_CONTENTS_FIELD, r);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (cHandler.containsNestedData())
|
||||||
|
{
|
||||||
|
dataMap.put(NESTED_DATASOURCE, cHandler.getNestedDataSource());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//cat.warn("ContentHandler not found for " + contentFile.getName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,69 @@
|
||||||
|
package com.relevanz.indyo;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Thrown when loading SearchConfiguration.
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Id$
 */
public class IllegalConfigurationException extends Exception
{
    /**
     * Creates an exception with a detail message and no cause.
     *
     * @param msg Description of the configuration problem.
     */
    public IllegalConfigurationException(String msg)
    {
        super(msg);
    }

    /**
     * Creates an exception with a detail message and the underlying cause,
     * so that the original failure is not lost when it is wrapped.
     *
     * @param msg Description of the configuration problem.
     * @param cause Failure which led to this exception; may be
     *              <code>null</code>.
     */
    public IllegalConfigurationException(String msg, Throwable cause)
    {
        super(msg, cause);
    }
}
|
|
@ -0,0 +1,103 @@
|
||||||
|
package com.relevanz.indyo;
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
 * A datasource is any source of data (filesystem, database, URL, etc)
 * which is indexed by SearchIndexer.
 *
 * @version $Id$
 */
public interface IndexDataSource
{
    /**
     * Key in a map returned by {@link #getData()} which represents the
     * class name of the object being indexed.
     */
    String OBJECT_CLASS = "objectClass";

    /**
     * Key in a map returned by {@link #getData()} which represents the
     * uuid of the object being indexed.
     */
    String OBJECT_IDENTIFIER = "objectId";

    /**
     * Key in a map returned by {@link #getData()} which represents
     * nested datasources.
     */
    String NESTED_DATASOURCE = "nestedDataSource";

    /**
     * Key in a map returned by {@link #getData()} which represents the
     * id of the datasource's container. Applies to nested datasources.
     */
    String CONTAINER_IDENTIFIER = "containerId";

    /**
     * Key in the map which represents the class name of the Search Result
     * object for this datasource (if any).
     */
    String SEARCH_RESULT_CLASSNAME = "resultClassname";

    /**
     * Retrieves an array of Maps. Each map represents a document to be
     * indexed; the key:value pairs of the map are the metadata of the
     * document.
     *
     * @return The data maps of this datasource.
     * @throws Exception if the data cannot be retrieved.
     */
    Map[] getData() throws Exception;
}
|
|
@ -0,0 +1,125 @@
|
||||||
|
package com.relevanz.indyo;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache POI" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Entry point for search engine indexing.
|
||||||
|
* <p>
|
||||||
|
* SearchIndexer is responsible for creating the IndexWriter
|
||||||
|
* {@see org.apache.lucene.index.IndexWriter} and passing it to
|
||||||
|
* DocumentHandlers {@link DocumentHandler} to index individual documents.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class IndyoIndexer
|
||||||
|
{
|
||||||
|
private static Logger log = Logger.getLogger(IndyoIndexer.class);
|
||||||
|
private IndexWriter fsWriter;
|
||||||
|
private SearchConfiguration config;
|
||||||
|
|
||||||
|
public IndyoIndexer(String indexDirectory, String configFile)
|
||||||
|
throws IOException, IllegalConfigurationException
|
||||||
|
{
|
||||||
|
Analyzer a = new StandardAnalyzer();
|
||||||
|
fsWriter = new IndexWriter(indexDirectory, a, true);
|
||||||
|
fsWriter.maxFieldLength = 1000000;
|
||||||
|
loadConfig(configFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indexes documents.
|
||||||
|
*/
|
||||||
|
public synchronized void index(IndexDataSource ds) throws IOException, Exception
|
||||||
|
{
|
||||||
|
log.debug("Initiating search engine indexing...");
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
// temporarily use an empty map whilst custom fields get implemented
|
||||||
|
indexDataSource(ds, Collections.EMPTY_MAP);
|
||||||
|
fsWriter.optimize();
|
||||||
|
fsWriter.close();
|
||||||
|
long stop = System.currentTimeMillis();
|
||||||
|
log.debug("Indexing took " + (stop - start) + " milliseconds");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadConfig(String configFile) throws IllegalConfigurationException
|
||||||
|
{
|
||||||
|
config = new SearchConfiguration(configFile);
|
||||||
|
FileContentHandlerFactory.setHandlerRegistry(config.getContentHandlers());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void indexDataSource(IndexDataSource source, Map customFields)
|
||||||
|
throws Exception
|
||||||
|
{
|
||||||
|
Map[] data = source.getData();
|
||||||
|
// here's a good place to spawn a couple of threads for indexing
|
||||||
|
for (int i = 0; i < data.length; i++)
|
||||||
|
{
|
||||||
|
DocumentHandler docHandler =
|
||||||
|
new DocumentHandler(data[i], customFields, fsWriter);
|
||||||
|
docHandler.process();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,259 @@
|
||||||
|
package com.relevanz.indyo;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
|
||||||
|
import com.relevanz.indyo.util.DataUnformatFilter;
|
||||||
|
import org.apache.log4j.Category;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.jdom.Document;
|
||||||
|
import org.jdom.Element;
|
||||||
|
import org.jdom.input.SAXBuilder;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Configures the indexing process using an XML file.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class SearchConfiguration
|
||||||
|
{
|
||||||
|
public static final String TEXT_FIELD_TYPE = "text";
|
||||||
|
public static final String KEYWORD_FIELD_TYPE = "keyword";
|
||||||
|
public static final String UNINDEXED_FIELD_TYPE = "unindexed";
|
||||||
|
public static final String UNSTORED_FIELD_TYPE = "unstored";
|
||||||
|
|
||||||
|
/** Log4j category.
|
||||||
|
*/
|
||||||
|
static Logger log = Logger.getLogger(SearchConfiguration.class.getName());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Key in the config file to declare content handlers.
|
||||||
|
*/
|
||||||
|
private static final String CONTENT_HANDLER_KEY = "Search.ContentHandlers";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Key in the config file to declare custom fields.
|
||||||
|
*/
|
||||||
|
private static final String FIELD_KEY = "Search.Fields";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map of content handlers.
|
||||||
|
*/
|
||||||
|
private Map contentHandlers = new HashMap();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map of (non-standard) custom fields to index.
|
||||||
|
*/
|
||||||
|
private Map customFields = new HashMap();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Document object which represents the xml configuration file.
|
||||||
|
*/
|
||||||
|
private Document doc;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new SearchConfiguration.
|
||||||
|
*
|
||||||
|
* @param configFile Name of the xml configuration file.
|
||||||
|
*/
|
||||||
|
public SearchConfiguration(String configFile) throws IllegalConfigurationException
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
SAXBuilder builder = new SAXBuilder();
|
||||||
|
DataUnformatFilter format = new DataUnformatFilter();
|
||||||
|
builder.setXMLFilter(format);
|
||||||
|
doc = builder.build(configFile);
|
||||||
|
}
|
||||||
|
catch (Exception e)
|
||||||
|
{
|
||||||
|
log.error("Error creating XML parser:" + e.getMessage(), e);
|
||||||
|
}
|
||||||
|
loadContentHandlers();
|
||||||
|
loadCustomFields();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map getContentHandlers()
|
||||||
|
{
|
||||||
|
return this.contentHandlers;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map getCustomFields()
|
||||||
|
{
|
||||||
|
return this.customFields;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads the content handlers.
|
||||||
|
*/
|
||||||
|
protected void loadContentHandlers() throws IllegalConfigurationException
|
||||||
|
{
|
||||||
|
String[] extensions = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "extension");
|
||||||
|
String[] handlers = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "handler");
|
||||||
|
if (extensions.length != handlers.length)
|
||||||
|
throw new IllegalConfigurationException(
|
||||||
|
"Illegal configuration of Search Content Handlers!");
|
||||||
|
for (int i = 0; i < extensions.length; i++)
|
||||||
|
{
|
||||||
|
contentHandlers.put(extensions[i], generateObject(handlers[i]));
|
||||||
|
}
|
||||||
|
String[] defaultExtension = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "default");
|
||||||
|
for (int i = 0; i < defaultExtension.length; i++)
|
||||||
|
{
|
||||||
|
if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
|
||||||
|
{
|
||||||
|
contentHandlers.put(FileContentHandlerFactory.DEFAULT_HANDLER_KEY
|
||||||
|
, generateObject(handlers[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Loads the custom fields to index.
|
||||||
|
*/
|
||||||
|
protected void loadCustomFields() throws IllegalConfigurationException
|
||||||
|
{
|
||||||
|
String[] fields = getChildPropertyAttributeValues(FIELD_KEY, "name");
|
||||||
|
String[] fieldtypes = getChildPropertyAttributeValues(FIELD_KEY, "type");
|
||||||
|
if (fields.length != fieldtypes.length)
|
||||||
|
throw new IllegalConfigurationException(
|
||||||
|
"Illegal configuration of custom search fields!");
|
||||||
|
for (int i = 0; i < fields.length; i++)
|
||||||
|
{
|
||||||
|
customFields.put(fields[i], fieldtypes[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return attribute values for all child nodes.
|
||||||
|
*/
|
||||||
|
private String[] getChildPropertyAttributeValues(String parent,
|
||||||
|
String attributeName)
|
||||||
|
{
|
||||||
|
String[] nodeName = parseNodeName(parent);
|
||||||
|
Element element = doc.getRootElement();
|
||||||
|
for (int i = 0; i < nodeName.length; i++)
|
||||||
|
{
|
||||||
|
element = element.getChild(nodeName[i]);
|
||||||
|
if (element == null)
|
||||||
|
{
|
||||||
|
return new String[]{};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
List children = element.getChildren();
|
||||||
|
int childCount = children.size();
|
||||||
|
String[] childrenAttributeValue = new String[childCount];
|
||||||
|
for (int i = 0; i < childCount; i++)
|
||||||
|
{
|
||||||
|
childrenAttributeValue[i] =
|
||||||
|
((Element) children.get(i)).getAttributeValue(attributeName);
|
||||||
|
}
|
||||||
|
return childrenAttributeValue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Node names are in the form "x.y.z". Returns a String array
|
||||||
|
* representation of the node elements.
|
||||||
|
*/
|
||||||
|
private String[] parseNodeName(String nodeName)
|
||||||
|
{
|
||||||
|
StringTokenizer st = new StringTokenizer(nodeName, ".");
|
||||||
|
String[] nodeElements = new String[st.countTokens()];
|
||||||
|
int i = 0;
|
||||||
|
while (st.hasMoreTokens())
|
||||||
|
{
|
||||||
|
nodeElements[i] = st.nextToken();
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
return nodeElements;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility method to return an object based on its class name.
|
||||||
|
* The object needs to have a constructor which accepts no parameters.
|
||||||
|
*
|
||||||
|
* @param className Class name of object to be generated
|
||||||
|
* @return Object
|
||||||
|
*/
|
||||||
|
private static Object generateObject(String className)
|
||||||
|
{
|
||||||
|
Object o = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Class c = Class.forName(className);
|
||||||
|
o = c.newInstance();
|
||||||
|
}
|
||||||
|
catch (ClassNotFoundException cnfe)
|
||||||
|
{
|
||||||
|
log.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
|
||||||
|
}
|
||||||
|
catch (InstantiationException ie)
|
||||||
|
{
|
||||||
|
log.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
|
||||||
|
}
|
||||||
|
catch (IllegalAccessException iae)
|
||||||
|
{
|
||||||
|
log.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
|
||||||
|
}
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,90 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Decides how the contents of a file are indexed.
 *
 * @version $Id$
 */
public interface FileContentHandler
{
    /**
     * Tells whether the contents of this file carry any meaning, i.e.
     * whether they should be indexed.
     *
     * @return <code>true</code> if the contents should be indexed
     */
    boolean fileContentIsReadable();

    /**
     * Supplies a reader over this file's contents.
     *
     * @return reader for the file contents
     */
    Reader getReader();

    /**
     * Tells whether this file has nested data within.
     *
     * @return <code>true</code> if the file contains nested data
     */
    boolean containsNestedData();

    /**
     * Returns the datasources contained within the parent file, for
     * example URLs contained within a HTML file or files within a ZIP
     * file: basically anything represented by a DataSource.
     *
     * @return the nested datasources
     */
    List getNestedDataSource();
}
|
|
@ -0,0 +1,89 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A no-op implementation to make FileContentHandler creation easier.
|
||||||
|
* <p>
|
||||||
|
* Classes which need to implement the FileContentHandler interface should
|
||||||
|
* extend this class or {@link NestedFileContentHandlerAdapter}.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public abstract class FileContentHandlerAdapter implements FileContentHandler
|
||||||
|
{
|
||||||
|
protected File file;
|
||||||
|
|
||||||
|
protected FileContentHandlerAdapter(File file)
|
||||||
|
{
|
||||||
|
this.file = file;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getNestedDataSource()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,180 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.log4j.Category;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.io.File;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.lang.reflect.Constructor;
|
||||||
|
|
||||||
|
import com.relevanz.indyo.util.IOUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory responsible for obtaining ContentHandlers.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public abstract class FileContentHandlerFactory
|
||||||
|
{
|
||||||
|
public static final String DEFAULT_HANDLER_KEY = "DEFAULT";
|
||||||
|
static Category cat = Category.getInstance(FileContentHandlerFactory.class.getName());
|
||||||
|
private static Map handlerRegistry;
|
||||||
|
|
||||||
|
public static FileContentHandler getContentHandler(File f)
|
||||||
|
{
|
||||||
|
String extension = IOUtils.getFileExtension(f);
|
||||||
|
if (handlerRegistry.containsKey(extension))
|
||||||
|
{
|
||||||
|
String handlerClassname = (String) handlerRegistry.get(extension);
|
||||||
|
return (FileContentHandler) generateObject(handlerClassname,
|
||||||
|
new Class[]{File.class},
|
||||||
|
new Object[]{f});
|
||||||
|
}
|
||||||
|
else if (handlerRegistry.containsKey(DEFAULT_HANDLER_KEY))
|
||||||
|
{
|
||||||
|
String handlerClassname = (String) handlerRegistry.get(DEFAULT_HANDLER_KEY);
|
||||||
|
return (FileContentHandler) generateObject(handlerClassname);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return NullHandler.getInstance();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void setHandlerRegistry(Map handlerRegistry)
|
||||||
|
{
|
||||||
|
FileContentHandlerFactory.handlerRegistry = handlerRegistry;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility method to return an object based on its class name.
|
||||||
|
* The object needs to have a constructor which accepts no parameters.
|
||||||
|
*
|
||||||
|
* @param className Class name of object to be generated
|
||||||
|
* @return Object
|
||||||
|
*/
|
||||||
|
private static Object generateObject(String className)
|
||||||
|
{
|
||||||
|
Object o = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Class c = Class.forName(className);
|
||||||
|
o = c.newInstance();
|
||||||
|
}
|
||||||
|
catch (ClassNotFoundException cnfe)
|
||||||
|
{
|
||||||
|
cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
|
||||||
|
}
|
||||||
|
catch (InstantiationException ie)
|
||||||
|
{
|
||||||
|
cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
|
||||||
|
}
|
||||||
|
catch (IllegalAccessException iae)
|
||||||
|
{
|
||||||
|
cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
|
||||||
|
}
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility method to return an object based on its class name.
|
||||||
|
*
|
||||||
|
* @param type Class name of object to be generated
|
||||||
|
* @param clazz Class array of parameters.
|
||||||
|
* @param args Object array of arguments.
|
||||||
|
* @return Object
|
||||||
|
*/
|
||||||
|
private static Object generateObject(String className,
|
||||||
|
Class[] clazz,
|
||||||
|
Object[] args)
|
||||||
|
{
|
||||||
|
Object o = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
Class c = Class.forName(className);
|
||||||
|
Constructor con = c.getConstructor(clazz);
|
||||||
|
if (con != null)
|
||||||
|
{
|
||||||
|
o = con.newInstance(args);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
throw new InstantiationException("Constructor with arguments:" + clazz.toString() + " non-existent.");
|
||||||
|
}
|
||||||
|
catch (ClassNotFoundException cnfe)
|
||||||
|
{
|
||||||
|
cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
|
||||||
|
}
|
||||||
|
catch (InstantiationException ie)
|
||||||
|
{
|
||||||
|
cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
|
||||||
|
}
|
||||||
|
catch (IllegalAccessException iae)
|
||||||
|
{
|
||||||
|
cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
|
||||||
|
}
|
||||||
|
catch (NoSuchMethodException nsme)
|
||||||
|
{
|
||||||
|
cat.error(nsme.getMessage() + " No method in class named '" + className + "'.", nsme);
|
||||||
|
}
|
||||||
|
catch (InvocationTargetException ite)
|
||||||
|
{
|
||||||
|
cat.error(ite.getMessage() + " in class named '" + className + "'.", ite);
|
||||||
|
}
|
||||||
|
return o;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,131 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.log4j.Category;
|
||||||
|
import com.relevanz.indyo.IndexDataSource;
|
||||||
|
import com.relevanz.indyo.FSDataSource;
|
||||||
|
import com.relevanz.indyo.util.IOUtils;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handles GZip content.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class GZipHandler extends NestedFileContentHandlerAdapter
|
||||||
|
{
|
||||||
|
private static Category cat = Category.getInstance(GZipHandler.class.getName());
|
||||||
|
|
||||||
|
public GZipHandler(File file)
|
||||||
|
{
|
||||||
|
super(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getNestedDataSource()
|
||||||
|
{
|
||||||
|
if (!file.exists())
|
||||||
|
return null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
File tempDir = new File(TEMP_FOLDER);
|
||||||
|
tempDir.mkdirs();
|
||||||
|
tempDir.deleteOnExit();
|
||||||
|
String filename = file.getName();
|
||||||
|
File tempFile = new File(tempDir, filename.substring(0, filename.lastIndexOf(".")));
|
||||||
|
tempFile.deleteOnExit();
|
||||||
|
IOUtils.extractGZip(file, tempFile);
|
||||||
|
indexGZipDirectory(tempDir);
|
||||||
|
}
|
||||||
|
catch (IOException ioe)
|
||||||
|
{
|
||||||
|
cat.error("IOException ungzipping " + file.toString(), ioe);
|
||||||
|
}
|
||||||
|
return nestedDataSource;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// only one file, but let's just treat it like a directory anyway
|
||||||
|
private void indexGZipDirectory(File dir)
|
||||||
|
{
|
||||||
|
if (dir.isDirectory())
|
||||||
|
{
|
||||||
|
File[] dirContents = dir.listFiles();
|
||||||
|
for (int i = 0; i < dirContents.length; i++)
|
||||||
|
{
|
||||||
|
indexGZipDirectory(dirContents[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (dir.isFile())
|
||||||
|
{
|
||||||
|
IndexDataSource ds = new FSDataSource(dir);
|
||||||
|
nestedDataSource.add(nestedDataSource);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,91 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A no-op implementation to make FileContentHandler creation easier.
|
||||||
|
* <p>
|
||||||
|
* Classes which need to implement the FileContentHandler interface
|
||||||
|
* and need to handle nested content (example: zip, tar, rar, etc) should
|
||||||
|
* extend this class.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public abstract class NestedFileContentHandlerAdapter
|
||||||
|
extends FileContentHandlerAdapter
|
||||||
|
{
|
||||||
|
protected final String TEMP_FOLDER = "/usr/temp" + '/'
|
||||||
|
+ Math.random() + '/';
|
||||||
|
|
||||||
|
protected List nestedDataSource;
|
||||||
|
|
||||||
|
public NestedFileContentHandlerAdapter(File file)
|
||||||
|
{
|
||||||
|
super(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean containsNestedData()
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,94 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.Reader;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Do-nothing content handler.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class NullHandler extends FileContentHandlerAdapter
|
||||||
|
{
|
||||||
|
private static NullHandler singleton = new NullHandler(null);
|
||||||
|
|
||||||
|
public static FileContentHandler getInstance()
|
||||||
|
{
|
||||||
|
return singleton;
|
||||||
|
}
|
||||||
|
|
||||||
|
private NullHandler(File file)
|
||||||
|
{
|
||||||
|
super(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean containsNestedData()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,132 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.log4j.Category;
|
||||||
|
import com.relevanz.indyo.IndexDataSource;
|
||||||
|
import com.relevanz.indyo.FSDataSource;
|
||||||
|
import com.relevanz.indyo.util.IOUtils;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handles Tar files.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class TARHandler extends NestedFileContentHandlerAdapter
|
||||||
|
{
|
||||||
|
static Category cat = Category.getInstance(TARHandler.class.getName());
|
||||||
|
|
||||||
|
public TARHandler(File file)
|
||||||
|
{
|
||||||
|
super(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getNestedDataSource()
|
||||||
|
{
|
||||||
|
if (!file.exists())
|
||||||
|
return null;
|
||||||
|
if (nestedDataSource == null)
|
||||||
|
{
|
||||||
|
nestedDataSource = new ArrayList();
|
||||||
|
}
|
||||||
|
try
|
||||||
|
{
|
||||||
|
File tempDir = new File(TEMP_FOLDER);
|
||||||
|
tempDir.deleteOnExit();
|
||||||
|
IOUtils.extractTar(file, tempDir);
|
||||||
|
indexTarDirectory(tempDir);
|
||||||
|
}
|
||||||
|
catch (IOException ioe)
|
||||||
|
{
|
||||||
|
cat.error(ioe.getMessage(), ioe);
|
||||||
|
}
|
||||||
|
return nestedDataSource;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void indexTarDirectory(File dir)
|
||||||
|
{
|
||||||
|
if (dir.isDirectory())
|
||||||
|
{
|
||||||
|
File[] dirContents = dir.listFiles();
|
||||||
|
for (int i = 0; i < dirContents.length; i++)
|
||||||
|
{
|
||||||
|
indexTarDirectory(dirContents[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (dir.isFile())
|
||||||
|
{
|
||||||
|
// here create new DataMap for the tarred file
|
||||||
|
IndexDataSource ds = new FSDataSource(dir);
|
||||||
|
nestedDataSource.add(nestedDataSource);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,117 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.log4j.Category;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
import com.relevanz.indyo.util.StringUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handles text-based content.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class TextHandler extends FileContentHandlerAdapter
|
||||||
|
{
|
||||||
|
static Category cat = Category.getInstance(TextHandler.class.getName());
|
||||||
|
|
||||||
|
public TextHandler(File file)
|
||||||
|
{
|
||||||
|
super(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
if (!file.exists())
|
||||||
|
{
|
||||||
|
cat.error(file.toString() + " doesn't exist! Failing silently...");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return getReader(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean containsNestedData()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Reader getReader(File f)
|
||||||
|
{
|
||||||
|
Reader reader = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
reader = new FileReader(f);
|
||||||
|
}
|
||||||
|
catch (FileNotFoundException nfe)
|
||||||
|
{
|
||||||
|
cat.error("File Not Found Exception:" + f.toString(), nfe);
|
||||||
|
}
|
||||||
|
catch (IOException ioe)
|
||||||
|
{
|
||||||
|
cat.error(ioe.getMessage(), ioe);
|
||||||
|
}
|
||||||
|
return reader;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,133 @@
|
||||||
|
package com.relevanz.indyo.contenthandler;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.log4j.Category;
|
||||||
|
import com.relevanz.indyo.IndexDataSource;
|
||||||
|
import com.relevanz.indyo.FSDataSource;
|
||||||
|
import com.relevanz.indyo.util.IOUtils;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Enumeration;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipException;
|
||||||
|
import java.util.zip.ZipFile;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Handles Zip files.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public class ZIPHandler extends NestedFileContentHandlerAdapter
|
||||||
|
{
|
||||||
|
private static Category cat = Category.getInstance(ZIPHandler.class);
|
||||||
|
|
||||||
|
public ZIPHandler(File file)
|
||||||
|
{
|
||||||
|
super(file);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean fileContentIsReadable()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Reader getReader()
|
||||||
|
{
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List getNestedDataSource()
|
||||||
|
{
|
||||||
|
if (!file.exists())
|
||||||
|
return null;
|
||||||
|
if (nestedDataSource == null)
|
||||||
|
{
|
||||||
|
nestedDataSource = new ArrayList();
|
||||||
|
}
|
||||||
|
try
|
||||||
|
{
|
||||||
|
ZipFile zFile = new ZipFile(file);
|
||||||
|
for (Enumeration e = zFile.entries(); e.hasMoreElements();)
|
||||||
|
{
|
||||||
|
ZipEntry entry = (ZipEntry) e.nextElement();
|
||||||
|
String entryName = entry.getName();
|
||||||
|
IOUtils.writeToTempFile(zFile.getInputStream(entry),
|
||||||
|
TEMP_FOLDER + entryName);
|
||||||
|
if (!entry.isDirectory())
|
||||||
|
{
|
||||||
|
// create a new DataMap for each zip entry
|
||||||
|
IndexDataSource ds = new FSDataSource(TEMP_FOLDER + entryName);
|
||||||
|
nestedDataSource.add(ds);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
zFile.close();
|
||||||
|
}
|
||||||
|
catch (ZipException ze)
|
||||||
|
{
|
||||||
|
cat.error("ZipException parsing zip:" + ze.getMessage(), ze);
|
||||||
|
}
|
||||||
|
catch (IOException ioe)
|
||||||
|
{
|
||||||
|
cat.error("IOException parsing zip:" + ioe.getMessage(), ioe);
|
||||||
|
}
|
||||||
|
return nestedDataSource;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,312 @@
|
||||||
|
/*--
|
||||||
|
|
||||||
|
Copyright (C) 2000 Brett McLaughlin & Jason Hunter.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions
|
||||||
|
are met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer that follows
|
||||||
|
these conditions in the documentation and/or other materials
|
||||||
|
provided with the distribution.
|
||||||
|
|
||||||
|
3. The name "JDOM" must not be used to endorse or promote products
|
||||||
|
derived from this software without prior written permission. For
|
||||||
|
written permission, please contact license@jdom.org.
|
||||||
|
|
||||||
|
4. Products derived from this software may not be called "JDOM", nor
|
||||||
|
may "JDOM" appear in their name, without prior written permission
|
||||||
|
from the JDOM Project Management (pm@jdom.org).
|
||||||
|
|
||||||
|
In addition, we request (but do not require) that you include in the
|
||||||
|
end-user documentation provided with the redistribution and/or in the
|
||||||
|
software itself an acknowledgement equivalent to the following:
|
||||||
|
"This product includes software developed by the
|
||||||
|
JDOM Project (http://www.jdom.org/)."
|
||||||
|
Alternatively, the acknowledgment may be graphical using the logos
|
||||||
|
available at http://www.jdom.org/images/logos.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
|
||||||
|
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
SUCH DAMAGE.
|
||||||
|
|
||||||
|
This software consists of voluntary contributions made by many
|
||||||
|
individuals on behalf of the JDOM Project and was originally
|
||||||
|
created by Brett McLaughlin <brett@jdom.org> and
|
||||||
|
Jason Hunter <jhunter@jdom.org>. For more information on the
|
||||||
|
JDOM Project, please see <http://www.jdom.org/>.
|
||||||
|
|
||||||
|
*/
|
||||||
|
package com.relevanz.indyo.util;
|
||||||
|
|
||||||
|
import java.util.Stack;
|
||||||
|
|
||||||
|
import org.xml.sax.Attributes;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
import org.xml.sax.XMLReader;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter for removing formatting from data- or field-oriented XML.
|
||||||
|
*
|
||||||
|
* <i>Code and comments adapted from DataWriter-0.2, written
|
||||||
|
* by David Megginson and released into the public domain,
|
||||||
|
* without warranty.</i>
|
||||||
|
*
|
||||||
|
* <p>This filter removes leading and trailing whitespace from
|
||||||
|
* field-oriented XML without mixed content. Note that this class will
|
||||||
|
* likely not yield appropriate results for document-oriented XML like
|
||||||
|
* XHTML pages, which mix character data and elements together.</p>
|
||||||
|
*
|
||||||
|
* @see DataFormatFilter
|
||||||
|
*/
|
||||||
|
public class DataUnformatFilter extends XMLFilterBase
|
||||||
|
{
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
// Constructors.
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new filter.
|
||||||
|
*/
|
||||||
|
public DataUnformatFilter()
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new filter.
|
||||||
|
*
|
||||||
|
* <p>Use the XMLReader provided as the source of events.</p>
|
||||||
|
*
|
||||||
|
* @param xmlreader The parent in the filter chain.
|
||||||
|
*/
|
||||||
|
public DataUnformatFilter(XMLReader xmlreader)
|
||||||
|
{
|
||||||
|
super(xmlreader);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
// Public methods.
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset the filter so that it can be reused.
|
||||||
|
*
|
||||||
|
* <p>This method is especially useful if the filter failed
|
||||||
|
* with an exception the last time through.</p>
|
||||||
|
*/
|
||||||
|
public void reset ()
|
||||||
|
{
|
||||||
|
state = SEEN_NOTHING;
|
||||||
|
stateStack = new Stack();
|
||||||
|
whitespace = new StringBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter a start document event.
|
||||||
|
*
|
||||||
|
* <p>Reset state and pass the event on for further processing.</p>
|
||||||
|
*
|
||||||
|
* @exception org.xml.sax.SAXException If a filter
|
||||||
|
* further down the chain raises an exception.
|
||||||
|
* @see org.xml.sax.ContentHandler#startDocument
|
||||||
|
*/
|
||||||
|
public void startDocument ()
|
||||||
|
throws SAXException
|
||||||
|
{
|
||||||
|
reset();
|
||||||
|
super.startDocument();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter a start element event.
|
||||||
|
*
|
||||||
|
* @param uri The element's Namespace URI.
|
||||||
|
* @param localName The element's local name.
|
||||||
|
* @param qName The element's qualified (prefixed) name.
|
||||||
|
* @param atts The element's attribute list.
|
||||||
|
* @exception org.xml.sax.SAXException If a filter
|
||||||
|
* further down the chain raises an exception.
|
||||||
|
* @see org.xml.sax.ContentHandler#startElement
|
||||||
|
*/
|
||||||
|
public void startElement (String uri, String localName,
|
||||||
|
String qName, Attributes atts)
|
||||||
|
throws SAXException
|
||||||
|
{
|
||||||
|
clearWhitespace();
|
||||||
|
stateStack.push(SEEN_ELEMENT);
|
||||||
|
state = SEEN_NOTHING;
|
||||||
|
super.startElement(uri, localName, qName, atts);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter an end element event.
|
||||||
|
*
|
||||||
|
* @param uri The element's Namespace URI.
|
||||||
|
* @param localName The element's local name.
|
||||||
|
* @param qName The element's qualified (prefixed) name.
|
||||||
|
* @exception org.xml.sax.SAXException If a filter
|
||||||
|
* further down the chain raises an exception.
|
||||||
|
* @see org.xml.sax.ContentHandler#endElement
|
||||||
|
*/
|
||||||
|
public void endElement (String uri, String localName, String qName)
|
||||||
|
throws SAXException
|
||||||
|
{
|
||||||
|
if (state == SEEN_ELEMENT) {
|
||||||
|
clearWhitespace();
|
||||||
|
} else {
|
||||||
|
emitWhitespace();
|
||||||
|
}
|
||||||
|
state = stateStack.pop();
|
||||||
|
super.endElement(uri, localName, qName);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter a character data event.
|
||||||
|
*
|
||||||
|
* @param ch The characters to write.
|
||||||
|
* @param start The starting position in the array.
|
||||||
|
* @param length The number of characters to use.
|
||||||
|
* @exception org.xml.sax.SAXException If a filter
|
||||||
|
* further down the chain raises an exception.
|
||||||
|
* @see org.xml.sax.ContentHandler#characters
|
||||||
|
*/
|
||||||
|
public void characters (char ch[], int start, int length)
|
||||||
|
throws SAXException
|
||||||
|
{
|
||||||
|
if (state != SEEN_DATA) {
|
||||||
|
|
||||||
|
/* Look for non-whitespace. */
|
||||||
|
int end = start + length;
|
||||||
|
while (end-- > start) {
|
||||||
|
if (!isXMLWhitespace(ch[end]))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If all the characters are whitespace, save them for later.
|
||||||
|
* If we've got some data, emit any saved whitespace and update
|
||||||
|
* our state to show we've seen data.
|
||||||
|
*/
|
||||||
|
if (end < start) {
|
||||||
|
saveWhitespace(ch, start, length);
|
||||||
|
} else {
|
||||||
|
state = SEEN_DATA;
|
||||||
|
emitWhitespace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Pass on everything inside a data field. */
|
||||||
|
if (state == SEEN_DATA) {
|
||||||
|
super.characters(ch, start, length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter an ignorable whitespace event.
|
||||||
|
*
|
||||||
|
* @param ch The array of characters to write.
|
||||||
|
* @param start The starting position in the array.
|
||||||
|
* @param length The number of characters to write.
|
||||||
|
* @exception org.xml.sax.SAXException If a filter
|
||||||
|
* further down the chain raises an exception.
|
||||||
|
* @see org.xml.sax.ContentHandler#ignorableWhitespace
|
||||||
|
*/
|
||||||
|
public void ignorableWhitespace (char ch[], int start, int length)
|
||||||
|
throws SAXException
|
||||||
|
{
|
||||||
|
emitWhitespace();
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter a processing instruction event.
|
||||||
|
*
|
||||||
|
* @param target The PI target.
|
||||||
|
* @param data The PI data.
|
||||||
|
* @exception org.xml.sax.SAXException If a filter
|
||||||
|
* further down the chain raises an exception.
|
||||||
|
* @see org.xml.sax.ContentHandler#processingInstruction
|
||||||
|
*/
|
||||||
|
public void processingInstruction (String target, String data)
|
||||||
|
throws SAXException
|
||||||
|
{
|
||||||
|
emitWhitespace();
|
||||||
|
super.processingInstruction(target, data);
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
// Internal methods.
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Saves trailing whitespace.
|
||||||
|
*/
|
||||||
|
protected void saveWhitespace (char[] ch, int start, int length) {
|
||||||
|
whitespace.append(ch, start, length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Passes saved whitespace down the filter chain.
|
||||||
|
*/
|
||||||
|
protected void emitWhitespace ()
|
||||||
|
throws SAXException
|
||||||
|
{
|
||||||
|
char[] data = new char[whitespace.length()];
|
||||||
|
if (whitespace.length() > 0) {
|
||||||
|
whitespace.getChars(0, data.length, data, 0);
|
||||||
|
whitespace.setLength(0);
|
||||||
|
super.characters(data, 0, data.length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Discards saved whitespace.
|
||||||
|
*/
|
||||||
|
protected void clearWhitespace () {
|
||||||
|
whitespace.setLength(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns <var>true</var> if character is XML whitespace.
|
||||||
|
*/
|
||||||
|
private boolean isXMLWhitespace (char c)
|
||||||
|
{
|
||||||
|
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
// Constants.
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
private static final Object SEEN_NOTHING = new Object();
|
||||||
|
private static final Object SEEN_ELEMENT = new Object();
|
||||||
|
private static final Object SEEN_DATA = new Object();
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
// Internal state.
|
||||||
|
////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
private Object state = SEEN_NOTHING;
|
||||||
|
private Stack stateStack = new Stack();
|
||||||
|
|
||||||
|
private StringBuffer whitespace = new StringBuffer();
|
||||||
|
}
|
||||||
|
|
||||||
|
// end of DataUnformatFilter.java
|
|
@ -0,0 +1,274 @@
|
||||||
|
package com.relevanz.indyo.util;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import com.ice.tar.TarArchive;
|
||||||
|
import org.apache.log4j.Category;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility IO-related methods.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public final class IOUtils
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Log4j category.
|
||||||
|
*/
|
||||||
|
private static Category cat = Category.getInstance(IOUtils.class.getName());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes data from the inputstream to the outputstream.
|
||||||
|
*
|
||||||
|
* @param in InputStream to read from.
|
||||||
|
* @param out OutputStream to write to.
|
||||||
|
* @throws IOException I/O error.
|
||||||
|
*/
|
||||||
|
public static void transferData(InputStream in, OutputStream out)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
byte[] data = new byte[10000];
|
||||||
|
int len;
|
||||||
|
while ((len = in.read(data)) != -1)
|
||||||
|
{
|
||||||
|
out.write(data, 0, len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recursively deletes a directory.
|
||||||
|
* @param File Directory to delete.
|
||||||
|
*/
|
||||||
|
public static void deleteDirectory(File directory)
|
||||||
|
{
|
||||||
|
File[] fArray = directory.listFiles();
|
||||||
|
for (int i = 0; i < fArray.length; i++)
|
||||||
|
{
|
||||||
|
if (fArray[i].isDirectory())
|
||||||
|
{
|
||||||
|
deleteDirectory(fArray[i]);
|
||||||
|
}
|
||||||
|
fArray[i].delete();
|
||||||
|
}
|
||||||
|
directory.delete();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes an input stream to a temporary file which is set
|
||||||
|
* to delete when the VM exits.
|
||||||
|
* @param Inputstream to read data from
|
||||||
|
* @param Temporary file to write to
|
||||||
|
*/
|
||||||
|
public static void writeToTempFile(InputStream in, String tempfile)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
OutputStream out = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
File f = new File(tempfile);
|
||||||
|
f.deleteOnExit();
|
||||||
|
char lastChar = tempfile.charAt(tempfile.length() - 1);
|
||||||
|
// make no assumptions that java.io.File detects directories
|
||||||
|
// in a cross-platform manner
|
||||||
|
if (f.isDirectory() || lastChar == '\\' || lastChar == '/')
|
||||||
|
f.mkdirs();
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// ensure that all necessary directories are created
|
||||||
|
File parent = f.getParentFile();
|
||||||
|
parent.deleteOnExit();
|
||||||
|
parent.mkdirs();
|
||||||
|
out = new FileOutputStream(tempfile);
|
||||||
|
transferData(in, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (out != null)
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes an file to a ZipOutputStream.
|
||||||
|
* @param File to read data from
|
||||||
|
* @param Path of the ZipEntry
|
||||||
|
* @param ZipOutputStream to write to
|
||||||
|
*/
|
||||||
|
public static void addToZipOutputStream(String file,
|
||||||
|
String zipPath,
|
||||||
|
ZipOutputStream out)
|
||||||
|
throws FileNotFoundException, IOException
|
||||||
|
{
|
||||||
|
File f = new File(file);
|
||||||
|
byte[] buffer = new byte[8192]; // Create a buffer for copying
|
||||||
|
int bytes_read;
|
||||||
|
FileInputStream in = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
in = new FileInputStream(f); // Stream to read file
|
||||||
|
ZipEntry entry = new ZipEntry(zipPath); // Make a ZipEntry
|
||||||
|
out.putNextEntry(entry); // Store entry in zipfile
|
||||||
|
while ((bytes_read = in.read(buffer)) != -1) // Copy bytes to zipfile
|
||||||
|
out.write(buffer, 0, bytes_read);
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (in != null)
|
||||||
|
in.close(); // Close input stream
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts a tar file to a directory.
|
||||||
|
* @param Tar file to read data from
|
||||||
|
* @param Directory to write to
|
||||||
|
*/
|
||||||
|
public static void extractTar(File tarFile, File destDir)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
FileInputStream fis = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
fis = new FileInputStream(tarFile);
|
||||||
|
TarArchive ta = new TarArchive(fis);
|
||||||
|
ta.extractContents(destDir);
|
||||||
|
ta.closeArchive();
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (fis != null)
|
||||||
|
fis.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts a GZip file to a file.
|
||||||
|
* @param GZip file to read data from
|
||||||
|
* @param File to write to
|
||||||
|
*/
|
||||||
|
public static void extractGZip(File f, File destFile) throws IOException
|
||||||
|
{
|
||||||
|
FileOutputStream out = null;
|
||||||
|
FileInputStream fis = null;
|
||||||
|
GZIPInputStream gzin = null;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
out = new FileOutputStream(destFile);
|
||||||
|
fis = new FileInputStream(f);
|
||||||
|
gzin = new GZIPInputStream(fis);
|
||||||
|
byte[] data = new byte[10000];
|
||||||
|
int len;
|
||||||
|
while ((len = gzin.read(data)) != -1)
|
||||||
|
{
|
||||||
|
out.write(data, 0, len);
|
||||||
|
}
|
||||||
|
out.flush();
|
||||||
|
}
|
||||||
|
finally
|
||||||
|
{
|
||||||
|
if (gzin != null)
|
||||||
|
gzin.close();
|
||||||
|
if (out != null)
|
||||||
|
out.close();
|
||||||
|
if (fis != null)
|
||||||
|
fis.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* reads all bytes from the given stream
|
||||||
|
* @param is the stream to read from
|
||||||
|
*/
|
||||||
|
public static final byte[] loadBytes(InputStream is) throws IOException
|
||||||
|
{
|
||||||
|
// read in the entry data
|
||||||
|
int count = 0;
|
||||||
|
byte[] buffer = new byte[0];
|
||||||
|
byte[] chunk = new byte[4096];
|
||||||
|
while ((count = is.read(chunk)) >= 0)
|
||||||
|
{
|
||||||
|
byte[] t = new byte[buffer.length + count];
|
||||||
|
System.arraycopy(buffer, 0, t, 0, buffer.length);
|
||||||
|
System.arraycopy(chunk, 0, t, buffer.length, count);
|
||||||
|
buffer = t;
|
||||||
|
}
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the file extension of a file.
|
||||||
|
* @param filename Filename to obtain the file extension.
|
||||||
|
* @return File extension (without the ".").
|
||||||
|
*/
|
||||||
|
public static String getFileExtension(String filename)
|
||||||
|
{
|
||||||
|
return filename.substring(filename.lastIndexOf(".") + 1); // + 1 to remove the "."
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the file extension of a file.
|
||||||
|
* @param f File object to obtain the file extension.
|
||||||
|
* @return File extension (without the ".").
|
||||||
|
*/
|
||||||
|
public static String getFileExtension(File f)
|
||||||
|
{
|
||||||
|
return getFileExtension(f.getName());
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,93 @@
|
||||||
|
package com.relevanz.indyo.util;
|
||||||
|
|
||||||
|
/* ====================================================================
|
||||||
|
* The Apache Software License, Version 1.1
|
||||||
|
*
|
||||||
|
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||||
|
* reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in
|
||||||
|
* the documentation and/or other materials provided with the
|
||||||
|
* distribution.
|
||||||
|
*
|
||||||
|
* 3. The end-user documentation included with the redistribution,
|
||||||
|
* if any, must include the following acknowledgment:
|
||||||
|
* "This product includes software developed by the
|
||||||
|
* Apache Software Foundation (http://www.apache.org/)."
|
||||||
|
* Alternately, this acknowledgment may appear in the software itself,
|
||||||
|
* if and wherever such third-party acknowledgments normally appear.
|
||||||
|
*
|
||||||
|
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||||
|
* "Apache Lucene" must not be used to endorse or promote products
|
||||||
|
* derived from this software without prior written permission. For
|
||||||
|
* written permission, please contact apache@apache.org.
|
||||||
|
*
|
||||||
|
* 5. Products derived from this software may not be called "Apache",
|
||||||
|
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||||
|
* prior written permission of the Apache Software Foundation.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||||
|
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
* ====================================================================
|
||||||
|
*
|
||||||
|
* This software consists of voluntary contributions made by many
|
||||||
|
* individuals on behalf of the Apache Software Foundation. For more
|
||||||
|
* information on the Apache Software Foundation, please see
|
||||||
|
* <http://www.apache.org/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.oro.text.perl.Perl5Util;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility String-related methods.
|
||||||
|
*
|
||||||
|
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||||
|
* @version $Id$
|
||||||
|
*/
|
||||||
|
public final class StringUtils
|
||||||
|
{
|
||||||
|
public static final String EMPTY_STRING = "";
|
||||||
|
private static final char[] QUOTE_ENCODE = """.toCharArray();
|
||||||
|
private static final char[] AMP_ENCODE = "&".toCharArray();
|
||||||
|
private static final char[] LT_ENCODE = "<".toCharArray();
|
||||||
|
private static final char[] GT_ENCODE = ">".toCharArray();
|
||||||
|
private static final char[] APOS_ENCODE = "'".toCharArray();
|
||||||
|
// Create a regular expression engine
|
||||||
|
private static Perl5Util perl5Util = new Perl5Util();
|
||||||
|
|
||||||
|
public static final String removeUnreadableCharacters(String s)
|
||||||
|
{
|
||||||
|
if (perl5Util.match("/\\W+/", s))
|
||||||
|
{
|
||||||
|
// replace unreadable characters with a space
|
||||||
|
s = perl5Util.substitute("s#[^a-zA-Z0-9_@]+# #gm", s);
|
||||||
|
// remove any single/double word characters
|
||||||
|
s = perl5Util.substitute("s#\\b[a-zA-Z0-9_]{1,2}\\b##gm", s);
|
||||||
|
}
|
||||||
|
return trimWhitespace(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final String trimWhitespace(String s)
|
||||||
|
{
|
||||||
|
s = perl5Util.substitute("s#[\\s]{3,}# #m", s);
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,404 @@
|
||||||
|
/*--
|
||||||
|
|
||||||
|
Copyright (C) 2000 Brett McLaughlin & Jason Hunter.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions
|
||||||
|
are met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer that follows
|
||||||
|
these conditions in the documentation and/or other materials
|
||||||
|
provided with the distribution.
|
||||||
|
|
||||||
|
3. The name "JDOM" must not be used to endorse or promote products
|
||||||
|
derived from this software without prior written permission. For
|
||||||
|
written permission, please contact license@jdom.org.
|
||||||
|
|
||||||
|
4. Products derived from this software may not be called "JDOM", nor
|
||||||
|
may "JDOM" appear in their name, without prior written permission
|
||||||
|
from the JDOM Project Management (pm@jdom.org).
|
||||||
|
|
||||||
|
In addition, we request (but do not require) that you include in the
|
||||||
|
end-user documentation provided with the redistribution and/or in the
|
||||||
|
software itself an acknowledgement equivalent to the following:
|
||||||
|
"This product includes software developed by the
|
||||||
|
JDOM Project (http://www.jdom.org/)."
|
||||||
|
Alternatively, the acknowledgment may be graphical using the logos
|
||||||
|
available at http://www.jdom.org/images/logos.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||||
|
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||||
|
DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
|
||||||
|
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||||
|
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||||
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||||
|
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
SUCH DAMAGE.
|
||||||
|
|
||||||
|
This software consists of voluntary contributions made by many
|
||||||
|
individuals on behalf of the JDOM Project and was originally
|
||||||
|
created by Brett McLaughlin <brett@jdom.org> and
|
||||||
|
Jason Hunter <jhunter@jdom.org>. For more information on the
|
||||||
|
JDOM Project, please see <http://www.jdom.org/>.
|
||||||
|
|
||||||
|
*/
|
||||||
|
package com.relevanz.indyo.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.xml.sax.Attributes;
|
||||||
|
import org.xml.sax.InputSource;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
import org.xml.sax.SAXNotRecognizedException;
|
||||||
|
import org.xml.sax.SAXNotSupportedException;
|
||||||
|
import org.xml.sax.XMLReader;
|
||||||
|
import org.xml.sax.ext.LexicalHandler;
|
||||||
|
import org.xml.sax.helpers.AttributesImpl;
|
||||||
|
import org.xml.sax.helpers.XMLFilterImpl;
|
||||||
|
|
||||||
|
/**
 * Adds convenience methods to base SAX2 Filter implementation.
 *
 * <i>Code and comments adapted from XMLWriter-0.2, written
 * by David Megginson and released into the public domain,
 * without warranty.</i>
 *
 * <p>The convenience methods are provided so that clients do not have to
 * create empty attribute lists or provide empty strings as parameters;
 * for example, the method invocation</p>
 *
 * <pre>
 * w.startElement("foo");
 * </pre>
 *
 * <p>is equivalent to the regular SAX2 ContentHandler method</p>
 *
 * <pre>
 * w.startElement("", "foo", "", new AttributesImpl());
 * </pre>
 *
 * <p>Except that it is more efficient because it does not allocate
 * a new empty attribute list each time.</p>
 *
 * <p>In fact, there is an even simpler convenience method,
 * <var>dataElement</var>, designed for writing elements that
 * contain only character data.</p>
 *
 * <pre>
 * w.dataElement("greeting", "Hello, world!");
 * </pre>
 *
 * <p>is equivalent to</p>
 *
 * <pre>
 * w.startElement("greeting");
 * w.characters("Hello, world!");
 * w.endElement("greeting");
 * </pre>
 *
 * @see org.xml.sax.helpers.XMLFilterImpl
 */
class XMLFilterBase extends XMLFilterImpl
{

    ////////////////////////////////////////////////////////////////////
    // Constants.
    ////////////////////////////////////////////////////////////////////

    /**
     * Shared empty attribute list handed to every convenience method
     * that takes no attributes, so that no new list is allocated per call.
     */
    protected static final Attributes EMPTY_ATTS = new AttributesImpl();


    ////////////////////////////////////////////////////////////////////
    // Constructors.
    ////////////////////////////////////////////////////////////////////

    /**
     * Construct an XML filter with no parent.
     *
     * <p>This filter will have no parent: you must assign a parent
     * before you start a parse or do any configuration with
     * setFeature or setProperty.</p>
     *
     * @see org.xml.sax.XMLReader#setFeature
     * @see org.xml.sax.XMLReader#setProperty
     */
    public XMLFilterBase()
    {
    }

    /**
     * Create an XML filter with the specified parent.
     *
     * <p>Use the XMLReader provided as the source of events.</p>
     *
     * @param parent The parent in the filter chain.
     */
    public XMLFilterBase(XMLReader parent)
    {
        super(parent);
    }


    ////////////////////////////////////////////////////////////////////
    // Convenience methods.
    ////////////////////////////////////////////////////////////////////

    /**
     * Start a new element without a qname or attributes.
     *
     * <p>This method will provide a default empty attribute
     * list and an empty string for the qualified name.
     * It invokes {@link
     * #startElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     */
    public void startElement (String uri, String localName) throws SAXException
    {
        startElement(uri, localName, "", EMPTY_ATTS);
    }

    /**
     * Start a new element without a qname, attributes or a Namespace URI.
     *
     * <p>This method will provide an empty string for the
     * Namespace URI, and empty string for the qualified name,
     * and a default empty attribute list. It invokes {@link
     * #startElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     */
    public void startElement (String localName) throws SAXException
    {
        startElement("", localName, "", EMPTY_ATTS);
    }

    /**
     * End an element without a qname.
     *
     * <p>This method will supply an empty string for the qName.
     * It invokes {@link #endElement(String, String, String)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void endElement (String uri, String localName) throws SAXException
    {
        endElement(uri, localName, "");
    }

    /**
     * End an element without a Namespace URI or qname.
     *
     * <p>This method will supply an empty string for the qName
     * and an empty string for the Namespace URI.
     * It invokes {@link #endElement(String, String, String)}
     * directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void endElement (String localName) throws SAXException
    {
        endElement("", localName, "");
    }

    /**
     * Add an empty element.
     *
     * <p>Both a {@link #startElement(String, String, String, Attributes)
     * startElement} and an {@link #endElement(String, String, String)
     * endElement} event will be passed on down the filter chain.</p>
     *
     * @param uri The element's Namespace URI, or the empty string
     *        if the element has no Namespace or if Namespace
     *        processing is not being performed.
     * @param localName The element's local name (without prefix).  This
     *        parameter must be provided.
     * @param qName The element's qualified name (with prefix), or
     *        the empty string if none is available.  This parameter
     *        is strictly advisory: the writer may or may not use
     *        the prefix attached.
     * @param atts The element's attribute list.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void emptyElement (String uri, String localName, String qName,
                              Attributes atts) throws SAXException
    {
        startElement(uri, localName, qName, atts);
        endElement(uri, localName, qName);
    }

    /**
     * Add an empty element without a qname or attributes.
     *
     * <p>This method will supply an empty string for the qname
     * and an empty attribute list.  It invokes
     * {@link #emptyElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see #emptyElement(String, String, String, Attributes)
     */
    public void emptyElement (String uri, String localName) throws SAXException
    {
        emptyElement(uri, localName, "", EMPTY_ATTS);
    }

    /**
     * Add an empty element without a Namespace URI, qname or attributes.
     *
     * <p>This method will supply an empty string for the qname,
     * and empty string for the Namespace URI, and an empty
     * attribute list.  It invokes
     * {@link #emptyElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see #emptyElement(String, String, String, Attributes)
     */
    public void emptyElement (String localName) throws SAXException
    {
        emptyElement("", localName, "", EMPTY_ATTS);
    }

    /**
     * Add an element with character data content.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag.</p>
     *
     * <p>This method invokes
     * {@link org.xml.sax.ContentHandler#startElement},
     * followed by
     * {@link #characters(String)}, followed by
     * {@link org.xml.sax.ContentHandler#endElement}.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @param qName The element's default qualified name.
     * @param atts The element's attributes.
     * @param content The character data content; <code>null</code>
     *        yields an element with no character data.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement (String uri, String localName, String qName,
                             Attributes atts, String content) throws SAXException
    {
        startElement(uri, localName, qName, atts);
        characters(content);
        endElement(uri, localName, qName);
    }

    /**
     * Add an element with character data content but no attributes.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag.  This method provides an empty string
     * for the qname and an empty attribute list.</p>
     *
     * <p>This method invokes
     * {@link org.xml.sax.ContentHandler#startElement},
     * followed by
     * {@link #characters(String)}, followed by
     * {@link org.xml.sax.ContentHandler#endElement}.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @param content The character data content; <code>null</code>
     *        yields an element with no character data.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement (String uri, String localName, String content)
        throws SAXException
    {
        dataElement(uri, localName, "", EMPTY_ATTS, content);
    }

    /**
     * Add an element with character data content but no attributes or
     * Namespace URI.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag.  The method provides an empty string for the
     * Namespace URI, and empty string for the qualified name,
     * and an empty attribute list.</p>
     *
     * <p>This method invokes
     * {@link org.xml.sax.ContentHandler#startElement},
     * followed by
     * {@link #characters(String)}, followed by
     * {@link org.xml.sax.ContentHandler#endElement}.</p>
     *
     * @param localName The element's local name.
     * @param content The character data content; <code>null</code>
     *        yields an element with no character data.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement (String localName, String content)
        throws SAXException
    {
        dataElement("", localName, "", EMPTY_ATTS, content);
    }

    /**
     * Add a string of character data.
     *
     * <p>This is a convenience method that takes a String,
     * converts it to a character array, then invokes
     * {@link org.xml.sax.ContentHandler#characters}.  The data is
     * passed on unchanged; no escaping is performed by this method.</p>
     *
     * <p>A <code>null</code> argument is treated as "no character data"
     * and produces no event, so that a <var>dataElement</var> call with
     * null content still emits a balanced start/end pair instead of
     * failing after the start tag has been sent.</p>
     *
     * @param data The character data, or <code>null</code> for none.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#characters
     */
    public void characters (String data) throws SAXException
    {
        if (data == null) {
            // Nothing to forward; avoids an NPE in the middle of an element.
            return;
        }
        char ch[] = data.toCharArray();
        characters(ch, 0, ch.length);
    }

}
|
||||||
|
|
||||||
|
// end of XMLFilterBase.java
|
Loading…
Reference in New Issue