mirror of https://github.com/apache/lucene.git
Initial import of source and libs.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150808 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
6a2e1270e2
commit
4474e5fd64
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[346504c6d4bd7232f0776a4a0f8a32333cedd93e] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[93e77a4a4476afff71a110dda1e96465cb7f25a9] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[be4a9176c35a7feeecf5b70edf070ecb5d13ac5d] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[ff9b90061b65c32122fcdde27bfe7f1e61fbd7bd] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[329aef393bece9d77eef16279910f6cd73113c39] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[c1fa1d645474eee07f085a8ee29e38422f7614cf] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,90 @@
|
|||
package com.relevanz.indyo;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache POI" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Generic implementation of an index datasource.
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public abstract class AbstractDataSource implements IndexDataSource
|
||||
{
|
||||
protected AbstractDataSource()
|
||||
{
|
||||
}
|
||||
|
||||
protected AbstractDataSource(Map map)
|
||||
{
|
||||
loadFields(map);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fields to index.
|
||||
*/
|
||||
protected String[] fields;
|
||||
|
||||
/**
|
||||
* Convenience method to load fields to index into a Map.
|
||||
*/
|
||||
protected void loadFields(Map map)
|
||||
{
|
||||
Set fieldSet = map.keySet();
|
||||
fields = new String[fieldSet.size()];
|
||||
fieldSet.toArray(fields);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,332 @@
|
|||
package com.relevanz.indyo;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import com.relevanz.indyo.util.StringUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A document is the atomic unit used for indexing purposes. It consists of
|
||||
* metadata as well as its file contents. File contents are handled by
|
||||
* {@link ContentHandler}.
|
||||
* </p>
|
||||
* <p>
|
||||
* DocumentHandler creates the {@link org.apache.lucene.document.Document},
|
||||
* adds fields to it, delegates to {@link ContentHandler} to handle
|
||||
* file contents.
|
||||
* </p>
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class DocumentHandler
|
||||
{
|
||||
/**
|
||||
* Field to retrieve all documents.
|
||||
*/
|
||||
public static final String ALL_DOCUMENTS_FIELD = "AllDocuments";
|
||||
|
||||
private static Logger log = Logger.getLogger(DocumentHandler.class);
|
||||
|
||||
private static boolean isDebugEnabled = log.isDebugEnabled();
|
||||
|
||||
/**
|
||||
* Should parent documents include data of its children?
|
||||
*/
|
||||
private static boolean parentEncapsulation = false;
|
||||
/**
|
||||
* Document object this DocumentHandler is handling.
|
||||
*/
|
||||
private Document doc;
|
||||
|
||||
/**
|
||||
* Map of metadata for this document. Contains the field:value pair
|
||||
* to be added to the document.
|
||||
*/
|
||||
private Map metadata;
|
||||
|
||||
/**
|
||||
* Map of fields. Contains field:type_of_field pair.
|
||||
*/
|
||||
private Map customFields;
|
||||
|
||||
/**
|
||||
* IndexWriter.
|
||||
*/
|
||||
private IndexWriter writer;
|
||||
|
||||
/**
|
||||
* A collection of documents to be added to the writer.
|
||||
*/
|
||||
private List documents = new ArrayList();
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
*
|
||||
* @param Map of metadata for this document.
|
||||
* @param Map of fields.
|
||||
* @param Writer.
|
||||
*/
|
||||
public DocumentHandler(Map metadata,
|
||||
Map customFields,
|
||||
IndexWriter writer)
|
||||
{
|
||||
this.metadata = metadata;
|
||||
this.customFields = customFields;
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Handles the actual processing of the document.
|
||||
*/
|
||||
public void process() throws IOException, Exception
|
||||
{
|
||||
String objectid = (String) metadata.get(IndexDataSource.OBJECT_IDENTIFIER);
|
||||
if (objectid == null)
|
||||
return;
|
||||
doc = createDocument();
|
||||
addMapToDoc(metadata);
|
||||
addNestedDataSource(metadata);
|
||||
doc.add(Field.Text(ALL_DOCUMENTS_FIELD, ALL_DOCUMENTS_FIELD));
|
||||
//documents.add(doc);
|
||||
if (writer != null)
|
||||
{
|
||||
addToWriter();
|
||||
}
|
||||
else
|
||||
{
|
||||
documents.add(doc);
|
||||
}
|
||||
}
|
||||
|
||||
private List getDocuments()
|
||||
{
|
||||
return documents;
|
||||
}
|
||||
|
||||
private Document createDocument()
|
||||
{
|
||||
return new Document();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the contents of a Map to a document.
|
||||
*
|
||||
* @param Map to add.
|
||||
*/
|
||||
private void addMapToDoc(Map map)
|
||||
{
|
||||
for (Iterator it = map.keySet().iterator(); it.hasNext();)
|
||||
{
|
||||
String field = (String) it.next();
|
||||
Object value = map.get(field);
|
||||
if (value instanceof String)
|
||||
{
|
||||
String type = null;
|
||||
if (customFields != null)
|
||||
{
|
||||
type = (String) customFields.get(field);
|
||||
}
|
||||
addFieldToDoc(type, field, (String) value);
|
||||
}
|
||||
else if (value instanceof Reader)
|
||||
{
|
||||
addFieldToDoc(field, (Reader) value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add nested datasources.
|
||||
*
|
||||
* @param Map which contains the nested datasources.
|
||||
*/
|
||||
private void addNestedDataSource(Map map) throws Exception
|
||||
{
|
||||
Object o = map.get(IndexDataSource.NESTED_DATASOURCE);
|
||||
if (o == null)
|
||||
return;
|
||||
if (o instanceof IndexDataSource)
|
||||
{
|
||||
IndexDataSource ds = (IndexDataSource) o;
|
||||
addDataSource(ds);
|
||||
}
|
||||
else if (o instanceof List)
|
||||
{
|
||||
List nestedDataSource = (List) o;
|
||||
for (int i = 0, n = nestedDataSource.size(); i < n; i++)
|
||||
{
|
||||
IndexDataSource ds = (IndexDataSource) nestedDataSource.get(i);
|
||||
addDataSource(ds);
|
||||
}
|
||||
}
|
||||
else if (o instanceof IndexDataSource[])
|
||||
{
|
||||
IndexDataSource[] nestedDataSource = (IndexDataSource[]) o;
|
||||
for (int i = 0, n = nestedDataSource.length; i < n; i++)
|
||||
{
|
||||
IndexDataSource ds = (IndexDataSource) nestedDataSource[i];
|
||||
addDataSource(ds);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log.warn("Unknown object found as nested datasource:" + o);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Datasources are basically a collection of data maps to be indexed.
|
||||
* addMapToDoc is invoked for each map.
|
||||
*
|
||||
* @param Datasource to add.
|
||||
*/
|
||||
private void addDataSource(IndexDataSource ds) throws Exception
|
||||
{
|
||||
Map[] data = ds.getData();
|
||||
for (int i = 0; i < data.length; i++)
|
||||
{
|
||||
Map map = data[i];
|
||||
if (map.containsKey(IndexDataSource.OBJECT_IDENTIFIER))
|
||||
{
|
||||
/**
|
||||
* Create a new document because child datasources may need
|
||||
* to be retrieved independently of parent doc.
|
||||
*/
|
||||
DocumentHandler docHandler = new DocumentHandler(map, null, null);
|
||||
docHandler.process();
|
||||
documents.addAll(docHandler.getDocuments());
|
||||
}
|
||||
else
|
||||
{
|
||||
addMapToDoc(map);
|
||||
/**
|
||||
* Add nested datasources of this datasource's data
|
||||
*/
|
||||
addNestedDataSource(map);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a String-based field to a document.
|
||||
*
|
||||
* @param Type of field.
|
||||
* @param Name of field.
|
||||
* @param Value of field.
|
||||
*/
|
||||
private void addFieldToDoc(String type, String field, String value)
|
||||
{
|
||||
if (value == null)
|
||||
value = StringUtils.EMPTY_STRING;
|
||||
if (SearchConfiguration.KEYWORD_FIELD_TYPE.equalsIgnoreCase(type))
|
||||
doc.add(Field.Keyword(field, value));
|
||||
else if (SearchConfiguration.UNINDEXED_FIELD_TYPE.equalsIgnoreCase(type))
|
||||
doc.add(Field.UnIndexed(field, value));
|
||||
else if (SearchConfiguration.UNSTORED_FIELD_TYPE.equalsIgnoreCase(type))
|
||||
doc.add(Field.UnStored(field, value));
|
||||
else
|
||||
doc.add(Field.Text(field, value));
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a Reader-based field to a document.
|
||||
*
|
||||
* @param Name of field.
|
||||
* @param Reader.
|
||||
*/
|
||||
private void addFieldToDoc(String field, Reader reader)
|
||||
{
|
||||
doc.add(Field.Text(field, reader));
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds documents to the IndexWriter.
|
||||
*/
|
||||
private void addToWriter() throws IOException
|
||||
{
|
||||
if (parentEncapsulation)
|
||||
{
|
||||
for (int i = 0, n = documents.size(); i < n; i++)
|
||||
{
|
||||
Document d = (Document) documents.get(i);
|
||||
for (Enumeration e = d.fields(); e.hasMoreElements();)
|
||||
{
|
||||
Field f = (Field) e.nextElement();
|
||||
String fieldName = f.name();
|
||||
if (!fieldName.equals(IndexDataSource.CONTAINER_IDENTIFIER)
|
||||
&& !fieldName.equals(IndexDataSource.OBJECT_CLASS)
|
||||
&& !fieldName.equals(IndexDataSource.OBJECT_IDENTIFIER))
|
||||
{
|
||||
doc.add(f);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
writer.addDocument(doc);
|
||||
|
||||
for (int i = 0, n = documents.size(); i < n; i++)
|
||||
{
|
||||
writer.addDocument((Document) documents.get(i));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,160 @@
|
|||
package com.relevanz.indyo;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.document.DateField;
|
||||
import com.relevanz.indyo.contenthandler.FileContentHandler;
|
||||
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
|
||||
import com.relevanz.indyo.util.IOUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A filesystem-based datasource.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class FSDataSource extends AbstractDataSource
|
||||
{
|
||||
public static final String FILE_PATH_FIELD = "filePath";
|
||||
public static final String FILE_NAME_FIELD = "fileName";
|
||||
public static final String FILE_SIZE_FIELD = "fileSize";
|
||||
public static final String FILE_FORMAT_FIELD = "fileFormat";
|
||||
public static final String FILE_CONTENTS_FIELD = "fileContents";
|
||||
public static final String FILE_LAST_MODIFIED_DATE_FIELD = "fileLastModifiedDate";
|
||||
|
||||
private File targetFileOrDir;
|
||||
|
||||
public FSDataSource(String targetFileOrDirStr)
|
||||
{
|
||||
this(new File(targetFileOrDirStr));
|
||||
}
|
||||
|
||||
public FSDataSource(File targetFileOrDir)
|
||||
{
|
||||
setTargetDirectory(targetFileOrDir);
|
||||
}
|
||||
|
||||
public Map[] getData()
|
||||
{
|
||||
Map[] returnData = null;
|
||||
List temp = new ArrayList();
|
||||
loadDataFromFiles(targetFileOrDir, temp);
|
||||
returnData = new Map[temp.size()];
|
||||
returnData = (Map[]) temp.toArray(returnData);
|
||||
return returnData;
|
||||
}
|
||||
|
||||
public void setTargetDirectory(File targetFileOrDir)
|
||||
{
|
||||
this.targetFileOrDir = targetFileOrDir;
|
||||
}
|
||||
|
||||
private void loadDataFromFiles(File f, List list)
|
||||
{
|
||||
if (f.isDirectory())
|
||||
{
|
||||
File[] directoryTree = f.listFiles();
|
||||
for (int i = 0; i < directoryTree.length; i++)
|
||||
{
|
||||
loadDataFromFiles(directoryTree[i], list);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
Map dataMap = new HashMap();
|
||||
dataMap.put(FILE_PATH_FIELD, f.getPath());
|
||||
dataMap.put(FILE_NAME_FIELD, f.getName());
|
||||
dataMap.put(FILE_LAST_MODIFIED_DATE_FIELD,
|
||||
DateField.timeToString(f.lastModified()));
|
||||
dataMap.put(FILE_SIZE_FIELD, String.valueOf(f.length()));
|
||||
dataMap.put(FILE_FORMAT_FIELD,
|
||||
IOUtils.getFileExtension(f));
|
||||
addFileContents(f, dataMap);
|
||||
list.add(dataMap);
|
||||
}
|
||||
}
|
||||
|
||||
private void addFileContents(File targetFile, Map dataMap)
|
||||
{
|
||||
FileContentHandler cHandler =
|
||||
FileContentHandlerFactory.getContentHandler(targetFile);
|
||||
if (cHandler != null)
|
||||
{
|
||||
if (cHandler.fileContentIsReadable())
|
||||
{
|
||||
Reader r = cHandler.getReader();
|
||||
if (r != null)
|
||||
{
|
||||
dataMap.put(FILE_CONTENTS_FIELD, r);
|
||||
}
|
||||
}
|
||||
if (cHandler.containsNestedData())
|
||||
{
|
||||
dataMap.put(NESTED_DATASOURCE, cHandler.getNestedDataSource());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
//cat.warn("ContentHandler not found for " + contentFile.getName());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,69 @@
|
|||
package com.relevanz.indyo;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache POI" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Thrown when the SearchConfiguration cannot be loaded.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class IllegalConfigurationException extends Exception
{
    /**
     * Creates an exception with a detail message.
     *
     * @param msg Description of the configuration problem.
     */
    public IllegalConfigurationException(String msg)
    {
        super(msg);
    }

    /**
     * Creates an exception with a detail message and the underlying
     * cause, so that the original failure is not lost when a lower-level
     * error is reported as a configuration problem.
     *
     * @param msg Description of the configuration problem.
     * @param cause Failure which led to this exception, may be null.
     */
    public IllegalConfigurationException(String msg, Throwable cause)
    {
        super(msg, cause);
    }
}
|
|
@ -0,0 +1,103 @@
|
|||
package com.relevanz.indyo;
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache POI" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A datasource is any source of data (filesystem, database, URL, etc)
|
||||
* which is indexed by IndyoIndexer.
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public interface IndexDataSource
{
    /**
     * Map key (in the maps of the array returned by getData) under
     * which the class name of the indexed object is stored.
     */
    String OBJECT_CLASS = "objectClass";

    /**
     * Map key (in the maps of the array returned by getData) under
     * which the uuid of the indexed object is stored.
     */
    String OBJECT_IDENTIFIER = "objectId";

    /**
     * Map key (in the maps of the array returned by getData) under
     * which nested datasources are stored.
     */
    String NESTED_DATASOURCE = "nestedDataSource";

    /**
     * Map key (in the maps of the array returned by getData) under
     * which the id of the datasource's container is stored. Applies to
     * nested datasources.
     */
    String CONTAINER_IDENTIFIER = "containerId";

    /**
     * Map key under which the class name of the Search Result object
     * for this datasource (if any) is stored.
     */
    String SEARCH_RESULT_CLASSNAME = "resultClassname";

    /**
     * Retrieves an array of Maps. Each map represents a document to be
     * indexed; the key:value pairs of the map are the metadata of that
     * document.
     *
     * @return One map of metadata per document.
     * @throws Exception if the data cannot be retrieved.
     */
    Map[] getData() throws Exception;
}
|
|
@ -0,0 +1,125 @@
|
|||
package com.relevanz.indyo;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache POI" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Entry point for search engine indexing.
|
||||
* <p>
|
||||
* SearchIndexer is responsible for creating the IndexWriter
|
||||
* {@see org.apache.lucene.index.IndexWriter} and passing it to
|
||||
* DocumentHandlers {@link DocumentHandler} to index individual documents.
|
||||
* </p>
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class IndyoIndexer
|
||||
{
|
||||
private static Logger log = Logger.getLogger(IndyoIndexer.class);
|
||||
private IndexWriter fsWriter;
|
||||
private SearchConfiguration config;
|
||||
|
||||
public IndyoIndexer(String indexDirectory, String configFile)
|
||||
throws IOException, IllegalConfigurationException
|
||||
{
|
||||
Analyzer a = new StandardAnalyzer();
|
||||
fsWriter = new IndexWriter(indexDirectory, a, true);
|
||||
fsWriter.maxFieldLength = 1000000;
|
||||
loadConfig(configFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Indexes documents.
|
||||
*/
|
||||
public synchronized void index(IndexDataSource ds) throws IOException, Exception
|
||||
{
|
||||
log.debug("Initiating search engine indexing...");
|
||||
long start = System.currentTimeMillis();
|
||||
// temporarily use an empty map whilst custom fields get implemented
|
||||
indexDataSource(ds, Collections.EMPTY_MAP);
|
||||
fsWriter.optimize();
|
||||
fsWriter.close();
|
||||
long stop = System.currentTimeMillis();
|
||||
log.debug("Indexing took " + (stop - start) + " milliseconds");
|
||||
}
|
||||
|
||||
private void loadConfig(String configFile) throws IllegalConfigurationException
|
||||
{
|
||||
config = new SearchConfiguration(configFile);
|
||||
FileContentHandlerFactory.setHandlerRegistry(config.getContentHandlers());
|
||||
}
|
||||
|
||||
private void indexDataSource(IndexDataSource source, Map customFields)
|
||||
throws Exception
|
||||
{
|
||||
Map[] data = source.getData();
|
||||
// here's a good place to spawn a couple of threads for indexing
|
||||
for (int i = 0; i < data.length; i++)
|
||||
{
|
||||
DocumentHandler docHandler =
|
||||
new DocumentHandler(data[i], customFields, fsWriter);
|
||||
docHandler.process();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,259 @@
|
|||
package com.relevanz.indyo;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
|
||||
import com.relevanz.indyo.util.DataUnformatFilter;
|
||||
import org.apache.log4j.Category;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.jdom.Document;
|
||||
import org.jdom.Element;
|
||||
import org.jdom.input.SAXBuilder;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
/**
|
||||
* Configures the indexing process using an XML file.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class SearchConfiguration
|
||||
{
|
||||
public static final String TEXT_FIELD_TYPE = "text";
|
||||
public static final String KEYWORD_FIELD_TYPE = "keyword";
|
||||
public static final String UNINDEXED_FIELD_TYPE = "unindexed";
|
||||
public static final String UNSTORED_FIELD_TYPE = "unstored";
|
||||
|
||||
/** Log4j category.
|
||||
*/
|
||||
static Logger log = Logger.getLogger(SearchConfiguration.class.getName());
|
||||
|
||||
/**
|
||||
* Key in the config file to declare content handlers.
|
||||
*/
|
||||
private static final String CONTENT_HANDLER_KEY = "Search.ContentHandlers";
|
||||
|
||||
/**
|
||||
* Key in the config file to declare custom fields.
|
||||
*/
|
||||
private static final String FIELD_KEY = "Search.Fields";
|
||||
|
||||
/**
|
||||
* Map of content handlers.
|
||||
*/
|
||||
private Map contentHandlers = new HashMap();
|
||||
|
||||
/**
|
||||
* Map of (non-standard) custom fields to index.
|
||||
*/
|
||||
private Map customFields = new HashMap();
|
||||
|
||||
/**
|
||||
* Document object which represents the xml configuration file.
|
||||
*/
|
||||
private Document doc;
|
||||
|
||||
/**
|
||||
* Creates a new SearchConfiguration.
|
||||
*
|
||||
* @param configFile Name of the xml configuration file.
|
||||
*/
|
||||
public SearchConfiguration(String configFile) throws IllegalConfigurationException
|
||||
{
|
||||
try
|
||||
{
|
||||
SAXBuilder builder = new SAXBuilder();
|
||||
DataUnformatFilter format = new DataUnformatFilter();
|
||||
builder.setXMLFilter(format);
|
||||
doc = builder.build(configFile);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
log.error("Error creating XML parser:" + e.getMessage(), e);
|
||||
}
|
||||
loadContentHandlers();
|
||||
loadCustomFields();
|
||||
}
|
||||
|
||||
public Map getContentHandlers()
|
||||
{
|
||||
return this.contentHandlers;
|
||||
}
|
||||
|
||||
public Map getCustomFields()
|
||||
{
|
||||
return this.customFields;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads the content handlers.
|
||||
*/
|
||||
protected void loadContentHandlers() throws IllegalConfigurationException
|
||||
{
|
||||
String[] extensions = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "extension");
|
||||
String[] handlers = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "handler");
|
||||
if (extensions.length != handlers.length)
|
||||
throw new IllegalConfigurationException(
|
||||
"Illegal configuration of Search Content Handlers!");
|
||||
for (int i = 0; i < extensions.length; i++)
|
||||
{
|
||||
contentHandlers.put(extensions[i], generateObject(handlers[i]));
|
||||
}
|
||||
String[] defaultExtension = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "default");
|
||||
for (int i = 0; i < defaultExtension.length; i++)
|
||||
{
|
||||
if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
|
||||
{
|
||||
contentHandlers.put(FileContentHandlerFactory.DEFAULT_HANDLER_KEY
|
||||
, generateObject(handlers[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads the custom fields to index.
|
||||
*/
|
||||
protected void loadCustomFields() throws IllegalConfigurationException
|
||||
{
|
||||
String[] fields = getChildPropertyAttributeValues(FIELD_KEY, "name");
|
||||
String[] fieldtypes = getChildPropertyAttributeValues(FIELD_KEY, "type");
|
||||
if (fields.length != fieldtypes.length)
|
||||
throw new IllegalConfigurationException(
|
||||
"Illegal configuration of custom search fields!");
|
||||
for (int i = 0; i < fields.length; i++)
|
||||
{
|
||||
customFields.put(fields[i], fieldtypes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return attribute values for all child nodes.
|
||||
*/
|
||||
private String[] getChildPropertyAttributeValues(String parent,
|
||||
String attributeName)
|
||||
{
|
||||
String[] nodeName = parseNodeName(parent);
|
||||
Element element = doc.getRootElement();
|
||||
for (int i = 0; i < nodeName.length; i++)
|
||||
{
|
||||
element = element.getChild(nodeName[i]);
|
||||
if (element == null)
|
||||
{
|
||||
return new String[]{};
|
||||
}
|
||||
}
|
||||
List children = element.getChildren();
|
||||
int childCount = children.size();
|
||||
String[] childrenAttributeValue = new String[childCount];
|
||||
for (int i = 0; i < childCount; i++)
|
||||
{
|
||||
childrenAttributeValue[i] =
|
||||
((Element) children.get(i)).getAttributeValue(attributeName);
|
||||
}
|
||||
return childrenAttributeValue;
|
||||
}
|
||||
|
||||
/**
|
||||
* Node names are in the form "x.y.z". Returns a String array
|
||||
* representation of the node elements.
|
||||
*/
|
||||
private String[] parseNodeName(String nodeName)
|
||||
{
|
||||
StringTokenizer st = new StringTokenizer(nodeName, ".");
|
||||
String[] nodeElements = new String[st.countTokens()];
|
||||
int i = 0;
|
||||
while (st.hasMoreTokens())
|
||||
{
|
||||
nodeElements[i] = st.nextToken();
|
||||
++i;
|
||||
}
|
||||
return nodeElements;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method to return an object based on its class name.
|
||||
* The object needs to have a constructor which accepts no parameters.
|
||||
*
|
||||
* @param className Class name of object to be generated
|
||||
* @return Object
|
||||
*/
|
||||
private static Object generateObject(String className)
|
||||
{
|
||||
Object o = null;
|
||||
try
|
||||
{
|
||||
Class c = Class.forName(className);
|
||||
o = c.newInstance();
|
||||
}
|
||||
catch (ClassNotFoundException cnfe)
|
||||
{
|
||||
log.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
|
||||
}
|
||||
catch (InstantiationException ie)
|
||||
{
|
||||
log.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
|
||||
}
|
||||
catch (IllegalAccessException iae)
|
||||
{
|
||||
log.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
|
||||
}
|
||||
return o;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A content handler determines how to index a file's contents.
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public interface FileContentHandler
|
||||
{
|
||||
/**
|
||||
* Do the file contents of this file have any meaning? Should
|
||||
* its contents be indexed?
|
||||
*/
|
||||
public boolean fileContentIsReadable();
|
||||
|
||||
/**
|
||||
* Returns a reader for this file's contents.
|
||||
*/
|
||||
public Reader getReader();
|
||||
|
||||
/**
|
||||
* Does this file have nested data within?
|
||||
*/
|
||||
public boolean containsNestedData();
|
||||
|
||||
/**
|
||||
* Return the datasources contained within the parent file.
|
||||
* This can be URLs contained within a HTML file, files
|
||||
* within a ZIP file, basically anything represented by a
|
||||
* DataSource.
|
||||
*/
|
||||
public List getNestedDataSource();
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Reader;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A no-op implementation to make FileContentHandler creation easier.
|
||||
* <p>
|
||||
* Classes which need to implement the FileContentHandler interface should
|
||||
* extend this class or {@link NestedFileContentHandlerAdapter}.
|
||||
* </p>
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public abstract class FileContentHandlerAdapter implements FileContentHandler
|
||||
{
|
||||
protected File file;
|
||||
|
||||
protected FileContentHandlerAdapter(File file)
|
||||
{
|
||||
this.file = file;
|
||||
}
|
||||
|
||||
public Reader getReader()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
public List getNestedDataSource()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,180 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.log4j.Category;
|
||||
|
||||
import java.util.Map;
|
||||
import java.io.File;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Constructor;
|
||||
|
||||
import com.relevanz.indyo.util.IOUtils;
|
||||
|
||||
/**
|
||||
* Factory responsible for obtaining ContentHandlers.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public abstract class FileContentHandlerFactory
|
||||
{
|
||||
public static final String DEFAULT_HANDLER_KEY = "DEFAULT";
|
||||
static Category cat = Category.getInstance(FileContentHandlerFactory.class.getName());
|
||||
private static Map handlerRegistry;
|
||||
|
||||
public static FileContentHandler getContentHandler(File f)
|
||||
{
|
||||
String extension = IOUtils.getFileExtension(f);
|
||||
if (handlerRegistry.containsKey(extension))
|
||||
{
|
||||
String handlerClassname = (String) handlerRegistry.get(extension);
|
||||
return (FileContentHandler) generateObject(handlerClassname,
|
||||
new Class[]{File.class},
|
||||
new Object[]{f});
|
||||
}
|
||||
else if (handlerRegistry.containsKey(DEFAULT_HANDLER_KEY))
|
||||
{
|
||||
String handlerClassname = (String) handlerRegistry.get(DEFAULT_HANDLER_KEY);
|
||||
return (FileContentHandler) generateObject(handlerClassname);
|
||||
}
|
||||
else
|
||||
{
|
||||
return NullHandler.getInstance();
|
||||
}
|
||||
}
|
||||
|
||||
public static void setHandlerRegistry(Map handlerRegistry)
|
||||
{
|
||||
FileContentHandlerFactory.handlerRegistry = handlerRegistry;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method to return an object based on its class name.
|
||||
* The object needs to have a constructor which accepts no parameters.
|
||||
*
|
||||
* @param className Class name of object to be generated
|
||||
* @return Object
|
||||
*/
|
||||
private static Object generateObject(String className)
|
||||
{
|
||||
Object o = null;
|
||||
try
|
||||
{
|
||||
Class c = Class.forName(className);
|
||||
o = c.newInstance();
|
||||
}
|
||||
catch (ClassNotFoundException cnfe)
|
||||
{
|
||||
cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
|
||||
}
|
||||
catch (InstantiationException ie)
|
||||
{
|
||||
cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
|
||||
}
|
||||
catch (IllegalAccessException iae)
|
||||
{
|
||||
cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
|
||||
}
|
||||
return o;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method to return an object based on its class name.
|
||||
*
|
||||
* @param type Class name of object to be generated
|
||||
* @param clazz Class array of parameters.
|
||||
* @param args Object array of arguments.
|
||||
* @return Object
|
||||
*/
|
||||
private static Object generateObject(String className,
|
||||
Class[] clazz,
|
||||
Object[] args)
|
||||
{
|
||||
Object o = null;
|
||||
try
|
||||
{
|
||||
Class c = Class.forName(className);
|
||||
Constructor con = c.getConstructor(clazz);
|
||||
if (con != null)
|
||||
{
|
||||
o = con.newInstance(args);
|
||||
}
|
||||
else
|
||||
throw new InstantiationException("Constructor with arguments:" + clazz.toString() + " non-existent.");
|
||||
}
|
||||
catch (ClassNotFoundException cnfe)
|
||||
{
|
||||
cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
|
||||
}
|
||||
catch (InstantiationException ie)
|
||||
{
|
||||
cat.error(ie.getMessage() + " Class named '" + className + "' could not be instantiated.", ie);
|
||||
}
|
||||
catch (IllegalAccessException iae)
|
||||
{
|
||||
cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
|
||||
}
|
||||
catch (NoSuchMethodException nsme)
|
||||
{
|
||||
cat.error(nsme.getMessage() + " No method in class named '" + className + "'.", nsme);
|
||||
}
|
||||
catch (InvocationTargetException ite)
|
||||
{
|
||||
cat.error(ite.getMessage() + " in class named '" + className + "'.", ite);
|
||||
}
|
||||
return o;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,131 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.log4j.Category;
|
||||
import com.relevanz.indyo.IndexDataSource;
|
||||
import com.relevanz.indyo.FSDataSource;
|
||||
import com.relevanz.indyo.util.IOUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Handles GZip content.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class GZipHandler extends NestedFileContentHandlerAdapter
|
||||
{
|
||||
private static Category cat = Category.getInstance(GZipHandler.class.getName());
|
||||
|
||||
public GZipHandler(File file)
|
||||
{
|
||||
super(file);
|
||||
}
|
||||
|
||||
public Reader getReader()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
public List getNestedDataSource()
|
||||
{
|
||||
if (!file.exists())
|
||||
return null;
|
||||
try
|
||||
{
|
||||
File tempDir = new File(TEMP_FOLDER);
|
||||
tempDir.mkdirs();
|
||||
tempDir.deleteOnExit();
|
||||
String filename = file.getName();
|
||||
File tempFile = new File(tempDir, filename.substring(0, filename.lastIndexOf(".")));
|
||||
tempFile.deleteOnExit();
|
||||
IOUtils.extractGZip(file, tempFile);
|
||||
indexGZipDirectory(tempDir);
|
||||
}
|
||||
catch (IOException ioe)
|
||||
{
|
||||
cat.error("IOException ungzipping " + file.toString(), ioe);
|
||||
}
|
||||
return nestedDataSource;
|
||||
}
|
||||
|
||||
public boolean fileContentIsReadable()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// only one file, but let's just treat it like a directory anyway
|
||||
private void indexGZipDirectory(File dir)
|
||||
{
|
||||
if (dir.isDirectory())
|
||||
{
|
||||
File[] dirContents = dir.listFiles();
|
||||
for (int i = 0; i < dirContents.length; i++)
|
||||
{
|
||||
indexGZipDirectory(dirContents[i]);
|
||||
}
|
||||
}
|
||||
else if (dir.isFile())
|
||||
{
|
||||
IndexDataSource ds = new FSDataSource(dir);
|
||||
nestedDataSource.add(nestedDataSource);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,91 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A no-op implementation to make FileContentHandler creation easier.
|
||||
* <p>
|
||||
* Classes which need to implement the FileContentHandler interface
|
||||
* and need to handle nested content (example: zip, tar, rar, etc) should
|
||||
* extend this class.
|
||||
* </p>
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public abstract class NestedFileContentHandlerAdapter
|
||||
extends FileContentHandlerAdapter
|
||||
{
|
||||
protected final String TEMP_FOLDER = "/usr/temp" + '/'
|
||||
+ Math.random() + '/';
|
||||
|
||||
protected List nestedDataSource;
|
||||
|
||||
public NestedFileContentHandlerAdapter(File file)
|
||||
{
|
||||
super(file);
|
||||
}
|
||||
|
||||
public boolean containsNestedData()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.Reader;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Do-nothing content handler.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class NullHandler extends FileContentHandlerAdapter
|
||||
{
|
||||
private static NullHandler singleton = new NullHandler(null);
|
||||
|
||||
public static FileContentHandler getInstance()
|
||||
{
|
||||
return singleton;
|
||||
}
|
||||
|
||||
private NullHandler(File file)
|
||||
{
|
||||
super(file);
|
||||
}
|
||||
|
||||
public boolean fileContentIsReadable()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
public Reader getReader()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
public boolean containsNestedData()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,132 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.log4j.Category;
|
||||
import com.relevanz.indyo.IndexDataSource;
|
||||
import com.relevanz.indyo.FSDataSource;
|
||||
import com.relevanz.indyo.util.IOUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Handles Tar files.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class TARHandler extends NestedFileContentHandlerAdapter
|
||||
{
|
||||
static Category cat = Category.getInstance(TARHandler.class.getName());
|
||||
|
||||
public TARHandler(File file)
|
||||
{
|
||||
super(file);
|
||||
}
|
||||
|
||||
public Reader getReader()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
public boolean fileContentIsReadable()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
public List getNestedDataSource()
|
||||
{
|
||||
if (!file.exists())
|
||||
return null;
|
||||
if (nestedDataSource == null)
|
||||
{
|
||||
nestedDataSource = new ArrayList();
|
||||
}
|
||||
try
|
||||
{
|
||||
File tempDir = new File(TEMP_FOLDER);
|
||||
tempDir.deleteOnExit();
|
||||
IOUtils.extractTar(file, tempDir);
|
||||
indexTarDirectory(tempDir);
|
||||
}
|
||||
catch (IOException ioe)
|
||||
{
|
||||
cat.error(ioe.getMessage(), ioe);
|
||||
}
|
||||
return nestedDataSource;
|
||||
}
|
||||
|
||||
private void indexTarDirectory(File dir)
|
||||
{
|
||||
if (dir.isDirectory())
|
||||
{
|
||||
File[] dirContents = dir.listFiles();
|
||||
for (int i = 0; i < dirContents.length; i++)
|
||||
{
|
||||
indexTarDirectory(dirContents[i]);
|
||||
}
|
||||
}
|
||||
else if (dir.isFile())
|
||||
{
|
||||
// here create new DataMap for the tarred file
|
||||
IndexDataSource ds = new FSDataSource(dir);
|
||||
nestedDataSource.add(nestedDataSource);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.log4j.Category;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import com.relevanz.indyo.util.StringUtils;
|
||||
|
||||
/**
|
||||
* Handles text-based content.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class TextHandler extends FileContentHandlerAdapter
|
||||
{
|
||||
static Category cat = Category.getInstance(TextHandler.class.getName());
|
||||
|
||||
public TextHandler(File file)
|
||||
{
|
||||
super(file);
|
||||
}
|
||||
|
||||
public Reader getReader()
|
||||
{
|
||||
if (!file.exists())
|
||||
{
|
||||
cat.error(file.toString() + " doesn't exist! Failing silently...");
|
||||
return null;
|
||||
}
|
||||
return getReader(file);
|
||||
}
|
||||
|
||||
public boolean containsNestedData()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean fileContentIsReadable()
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
private Reader getReader(File f)
|
||||
{
|
||||
Reader reader = null;
|
||||
try
|
||||
{
|
||||
reader = new FileReader(f);
|
||||
}
|
||||
catch (FileNotFoundException nfe)
|
||||
{
|
||||
cat.error("File Not Found Exception:" + f.toString(), nfe);
|
||||
}
|
||||
catch (IOException ioe)
|
||||
{
|
||||
cat.error(ioe.getMessage(), ioe);
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,133 @@
|
|||
package com.relevanz.indyo.contenthandler;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.log4j.Category;
|
||||
import com.relevanz.indyo.IndexDataSource;
|
||||
import com.relevanz.indyo.FSDataSource;
|
||||
import com.relevanz.indyo.util.IOUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Enumeration;
|
||||
import java.util.List;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipException;
|
||||
import java.util.zip.ZipFile;
|
||||
|
||||
/**
|
||||
* Handles Zip files.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public class ZIPHandler extends NestedFileContentHandlerAdapter
|
||||
{
|
||||
private static Category cat = Category.getInstance(ZIPHandler.class);
|
||||
|
||||
public ZIPHandler(File file)
|
||||
{
|
||||
super(file);
|
||||
}
|
||||
|
||||
public boolean fileContentIsReadable()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
public Reader getReader()
|
||||
{
|
||||
return null;
|
||||
}
|
||||
|
||||
public List getNestedDataSource()
|
||||
{
|
||||
if (!file.exists())
|
||||
return null;
|
||||
if (nestedDataSource == null)
|
||||
{
|
||||
nestedDataSource = new ArrayList();
|
||||
}
|
||||
try
|
||||
{
|
||||
ZipFile zFile = new ZipFile(file);
|
||||
for (Enumeration e = zFile.entries(); e.hasMoreElements();)
|
||||
{
|
||||
ZipEntry entry = (ZipEntry) e.nextElement();
|
||||
String entryName = entry.getName();
|
||||
IOUtils.writeToTempFile(zFile.getInputStream(entry),
|
||||
TEMP_FOLDER + entryName);
|
||||
if (!entry.isDirectory())
|
||||
{
|
||||
// create a new DataMap for each zip entry
|
||||
IndexDataSource ds = new FSDataSource(TEMP_FOLDER + entryName);
|
||||
nestedDataSource.add(ds);
|
||||
}
|
||||
}
|
||||
zFile.close();
|
||||
}
|
||||
catch (ZipException ze)
|
||||
{
|
||||
cat.error("ZipException parsing zip:" + ze.getMessage(), ze);
|
||||
}
|
||||
catch (IOException ioe)
|
||||
{
|
||||
cat.error("IOException parsing zip:" + ioe.getMessage(), ioe);
|
||||
}
|
||||
return nestedDataSource;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,312 @@
|
|||
/*--
|
||||
|
||||
Copyright (C) 2000 Brett McLaughlin & Jason Hunter.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer that follows
|
||||
these conditions in the documentation and/or other materials
|
||||
provided with the distribution.
|
||||
|
||||
3. The name "JDOM" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact license@jdom.org.
|
||||
|
||||
4. Products derived from this software may not be called "JDOM", nor
|
||||
may "JDOM" appear in their name, without prior written permission
|
||||
from the JDOM Project Management (pm@jdom.org).
|
||||
|
||||
In addition, we request (but do not require) that you include in the
|
||||
end-user documentation provided with the redistribution and/or in the
|
||||
software itself an acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the
|
||||
JDOM Project (http://www.jdom.org/)."
|
||||
Alternatively, the acknowledgment may be graphical using the logos
|
||||
available at http://www.jdom.org/images/logos.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the JDOM Project and was originally
|
||||
created by Brett McLaughlin <brett@jdom.org> and
|
||||
Jason Hunter <jhunter@jdom.org>. For more information on the
|
||||
JDOM Project, please see <http://www.jdom.org/>.
|
||||
|
||||
*/
|
||||
package com.relevanz.indyo.util;
|
||||
|
||||
import java.util.Stack;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.XMLReader;
|
||||
|
||||
|
||||
/**
|
||||
* Filter for removing formatting from data- or field-oriented XML.
|
||||
*
|
||||
* <i>Code and comments adapted from DataWriter-0.2, written
|
||||
* by David Megginson and released into the public domain,
|
||||
* without warranty.</i>
|
||||
*
|
||||
* <p>This filter removes leading and trailing whitespace from
|
||||
* field-oriented XML without mixed content. Note that this class will
|
||||
* likely not yield appropriate results for document-oriented XML like
|
||||
* XHTML pages, which mix character data and elements together.</p>
|
||||
*
|
||||
* @see DataFormatFilter
|
||||
*/
|
||||
public class DataUnformatFilter extends XMLFilterBase
|
||||
{
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Constructors.
|
||||
////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
|
||||
* Create a new filter.
|
||||
*/
|
||||
public DataUnformatFilter()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new filter.
|
||||
*
|
||||
* <p>Use the XMLReader provided as the source of events.</p>
|
||||
*
|
||||
* @param xmlreader The parent in the filter chain.
|
||||
*/
|
||||
public DataUnformatFilter(XMLReader xmlreader)
|
||||
{
|
||||
super(xmlreader);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Public methods.
|
||||
////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
|
||||
* Reset the filter so that it can be reused.
|
||||
*
|
||||
* <p>This method is especially useful if the filter failed
|
||||
* with an exception the last time through.</p>
|
||||
*/
|
||||
public void reset ()
|
||||
{
|
||||
state = SEEN_NOTHING;
|
||||
stateStack = new Stack();
|
||||
whitespace = new StringBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter a start document event.
|
||||
*
|
||||
* <p>Reset state and pass the event on for further processing.</p>
|
||||
*
|
||||
* @exception org.xml.sax.SAXException If a filter
|
||||
* further down the chain raises an exception.
|
||||
* @see org.xml.sax.ContentHandler#startDocument
|
||||
*/
|
||||
public void startDocument ()
|
||||
throws SAXException
|
||||
{
|
||||
reset();
|
||||
super.startDocument();
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter a start element event.
|
||||
*
|
||||
* @param uri The element's Namespace URI.
|
||||
* @param localName The element's local name.
|
||||
* @param qName The element's qualified (prefixed) name.
|
||||
* @param atts The element's attribute list.
|
||||
* @exception org.xml.sax.SAXException If a filter
|
||||
* further down the chain raises an exception.
|
||||
* @see org.xml.sax.ContentHandler#startElement
|
||||
*/
|
||||
public void startElement (String uri, String localName,
|
||||
String qName, Attributes atts)
|
||||
throws SAXException
|
||||
{
|
||||
clearWhitespace();
|
||||
stateStack.push(SEEN_ELEMENT);
|
||||
state = SEEN_NOTHING;
|
||||
super.startElement(uri, localName, qName, atts);
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter an end element event.
|
||||
*
|
||||
* @param uri The element's Namespace URI.
|
||||
* @param localName The element's local name.
|
||||
* @param qName The element's qualified (prefixed) name.
|
||||
* @exception org.xml.sax.SAXException If a filter
|
||||
* further down the chain raises an exception.
|
||||
* @see org.xml.sax.ContentHandler#endElement
|
||||
*/
|
||||
public void endElement (String uri, String localName, String qName)
|
||||
throws SAXException
|
||||
{
|
||||
if (state == SEEN_ELEMENT) {
|
||||
clearWhitespace();
|
||||
} else {
|
||||
emitWhitespace();
|
||||
}
|
||||
state = stateStack.pop();
|
||||
super.endElement(uri, localName, qName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter a character data event.
|
||||
*
|
||||
* @param ch The characters to write.
|
||||
* @param start The starting position in the array.
|
||||
* @param length The number of characters to use.
|
||||
* @exception org.xml.sax.SAXException If a filter
|
||||
* further down the chain raises an exception.
|
||||
* @see org.xml.sax.ContentHandler#characters
|
||||
*/
|
||||
public void characters (char ch[], int start, int length)
|
||||
throws SAXException
|
||||
{
|
||||
if (state != SEEN_DATA) {
|
||||
|
||||
/* Look for non-whitespace. */
|
||||
int end = start + length;
|
||||
while (end-- > start) {
|
||||
if (!isXMLWhitespace(ch[end]))
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If all the characters are whitespace, save them for later.
|
||||
* If we've got some data, emit any saved whitespace and update
|
||||
* our state to show we've seen data.
|
||||
*/
|
||||
if (end < start) {
|
||||
saveWhitespace(ch, start, length);
|
||||
} else {
|
||||
state = SEEN_DATA;
|
||||
emitWhitespace();
|
||||
}
|
||||
}
|
||||
|
||||
/* Pass on everything inside a data field. */
|
||||
if (state == SEEN_DATA) {
|
||||
super.characters(ch, start, length);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter an ignorable whitespace event.
|
||||
*
|
||||
* @param ch The array of characters to write.
|
||||
* @param start The starting position in the array.
|
||||
* @param length The number of characters to write.
|
||||
* @exception org.xml.sax.SAXException If a filter
|
||||
* further down the chain raises an exception.
|
||||
* @see org.xml.sax.ContentHandler#ignorableWhitespace
|
||||
*/
|
||||
public void ignorableWhitespace (char ch[], int start, int length)
|
||||
throws SAXException
|
||||
{
|
||||
emitWhitespace();
|
||||
// ignore
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter a processing instruction event.
|
||||
*
|
||||
* @param target The PI target.
|
||||
* @param data The PI data.
|
||||
* @exception org.xml.sax.SAXException If a filter
|
||||
* further down the chain raises an exception.
|
||||
* @see org.xml.sax.ContentHandler#processingInstruction
|
||||
*/
|
||||
public void processingInstruction (String target, String data)
|
||||
throws SAXException
|
||||
{
|
||||
emitWhitespace();
|
||||
super.processingInstruction(target, data);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Internal methods.
|
||||
////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
|
||||
* Saves trailing whitespace.
|
||||
*/
|
||||
protected void saveWhitespace (char[] ch, int start, int length) {
|
||||
whitespace.append(ch, start, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Passes saved whitespace down the filter chain.
|
||||
*/
|
||||
protected void emitWhitespace ()
|
||||
throws SAXException
|
||||
{
|
||||
char[] data = new char[whitespace.length()];
|
||||
if (whitespace.length() > 0) {
|
||||
whitespace.getChars(0, data.length, data, 0);
|
||||
whitespace.setLength(0);
|
||||
super.characters(data, 0, data.length);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Discards saved whitespace.
|
||||
*/
|
||||
protected void clearWhitespace () {
|
||||
whitespace.setLength(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns <var>true</var> if character is XML whitespace.
|
||||
*/
|
||||
private boolean isXMLWhitespace (char c)
|
||||
{
|
||||
return c == ' ' || c == '\t' || c == '\r' || c == '\n';
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////
|
||||
// Constants.
|
||||
////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Parse-state markers. Each one is a distinct sentinel object that is
// compared by identity (for example: state == SEEN_DATA), never by value.
private static final Object SEEN_NOTHING = new Object();
private static final Object SEEN_ELEMENT = new Object();
private static final Object SEEN_DATA = new Object();


////////////////////////////////////////////////////////////////////
// Internal state.
////////////////////////////////////////////////////////////////////

// Current parse state; a freshly constructed filter starts at SEEN_NOTHING.
private Object state = SEEN_NOTHING;
// NOTE(review): presumably holds the saved states of enclosing elements;
// its use is not visible in this part of the file - confirm.
private Stack stateStack = new Stack();

// Whitespace collected by saveWhitespace() until emitWhitespace()
// forwards it or clearWhitespace() discards it.
private StringBuffer whitespace = new StringBuffer();
|
||||
}
|
||||
|
||||
// end of DataUnformatFilter.java
|
|
@ -0,0 +1,274 @@
|
|||
package com.relevanz.indyo.util;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import com.ice.tar.TarArchive;
|
||||
import org.apache.log4j.Category;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
/**
|
||||
* Utility IO-related methods.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public final class IOUtils
|
||||
{
|
||||
/**
|
||||
* Log4j category.
|
||||
*/
|
||||
private static Category cat = Category.getInstance(IOUtils.class.getName());
|
||||
|
||||
/**
|
||||
* Writes data from the inputstream to the outputstream.
|
||||
*
|
||||
* @param in InputStream to read from.
|
||||
* @param out OutputStream to write to.
|
||||
* @throws IOException I/O error.
|
||||
*/
|
||||
public static void transferData(InputStream in, OutputStream out)
|
||||
throws IOException
|
||||
{
|
||||
byte[] data = new byte[10000];
|
||||
int len;
|
||||
while ((len = in.read(data)) != -1)
|
||||
{
|
||||
out.write(data, 0, len);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively deletes a directory.
|
||||
* @param File Directory to delete.
|
||||
*/
|
||||
public static void deleteDirectory(File directory)
|
||||
{
|
||||
File[] fArray = directory.listFiles();
|
||||
for (int i = 0; i < fArray.length; i++)
|
||||
{
|
||||
if (fArray[i].isDirectory())
|
||||
{
|
||||
deleteDirectory(fArray[i]);
|
||||
}
|
||||
fArray[i].delete();
|
||||
}
|
||||
directory.delete();
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes an input stream to a temporary file which is set
|
||||
* to delete when the VM exits.
|
||||
* @param Inputstream to read data from
|
||||
* @param Temporary file to write to
|
||||
*/
|
||||
public static void writeToTempFile(InputStream in, String tempfile)
|
||||
throws IOException
|
||||
{
|
||||
OutputStream out = null;
|
||||
try
|
||||
{
|
||||
File f = new File(tempfile);
|
||||
f.deleteOnExit();
|
||||
char lastChar = tempfile.charAt(tempfile.length() - 1);
|
||||
// make no assumptions that java.io.File detects directories
|
||||
// in a cross-platform manner
|
||||
if (f.isDirectory() || lastChar == '\\' || lastChar == '/')
|
||||
f.mkdirs();
|
||||
else
|
||||
{
|
||||
// ensure that all necessary directories are created
|
||||
File parent = f.getParentFile();
|
||||
parent.deleteOnExit();
|
||||
parent.mkdirs();
|
||||
out = new FileOutputStream(tempfile);
|
||||
transferData(in, out);
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (out != null)
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes an file to a ZipOutputStream.
|
||||
* @param File to read data from
|
||||
* @param Path of the ZipEntry
|
||||
* @param ZipOutputStream to write to
|
||||
*/
|
||||
public static void addToZipOutputStream(String file,
|
||||
String zipPath,
|
||||
ZipOutputStream out)
|
||||
throws FileNotFoundException, IOException
|
||||
{
|
||||
File f = new File(file);
|
||||
byte[] buffer = new byte[8192]; // Create a buffer for copying
|
||||
int bytes_read;
|
||||
FileInputStream in = null;
|
||||
try
|
||||
{
|
||||
in = new FileInputStream(f); // Stream to read file
|
||||
ZipEntry entry = new ZipEntry(zipPath); // Make a ZipEntry
|
||||
out.putNextEntry(entry); // Store entry in zipfile
|
||||
while ((bytes_read = in.read(buffer)) != -1) // Copy bytes to zipfile
|
||||
out.write(buffer, 0, bytes_read);
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (in != null)
|
||||
in.close(); // Close input stream
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts a tar file to a directory.
|
||||
* @param Tar file to read data from
|
||||
* @param Directory to write to
|
||||
*/
|
||||
public static void extractTar(File tarFile, File destDir)
|
||||
throws IOException
|
||||
{
|
||||
FileInputStream fis = null;
|
||||
try
|
||||
{
|
||||
fis = new FileInputStream(tarFile);
|
||||
TarArchive ta = new TarArchive(fis);
|
||||
ta.extractContents(destDir);
|
||||
ta.closeArchive();
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (fis != null)
|
||||
fis.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts a GZip file to a file.
|
||||
* @param GZip file to read data from
|
||||
* @param File to write to
|
||||
*/
|
||||
public static void extractGZip(File f, File destFile) throws IOException
|
||||
{
|
||||
FileOutputStream out = null;
|
||||
FileInputStream fis = null;
|
||||
GZIPInputStream gzin = null;
|
||||
try
|
||||
{
|
||||
out = new FileOutputStream(destFile);
|
||||
fis = new FileInputStream(f);
|
||||
gzin = new GZIPInputStream(fis);
|
||||
byte[] data = new byte[10000];
|
||||
int len;
|
||||
while ((len = gzin.read(data)) != -1)
|
||||
{
|
||||
out.write(data, 0, len);
|
||||
}
|
||||
out.flush();
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (gzin != null)
|
||||
gzin.close();
|
||||
if (out != null)
|
||||
out.close();
|
||||
if (fis != null)
|
||||
fis.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* reads all bytes from the given stream
|
||||
* @param is the stream to read from
|
||||
*/
|
||||
public static final byte[] loadBytes(InputStream is) throws IOException
|
||||
{
|
||||
// read in the entry data
|
||||
int count = 0;
|
||||
byte[] buffer = new byte[0];
|
||||
byte[] chunk = new byte[4096];
|
||||
while ((count = is.read(chunk)) >= 0)
|
||||
{
|
||||
byte[] t = new byte[buffer.length + count];
|
||||
System.arraycopy(buffer, 0, t, 0, buffer.length);
|
||||
System.arraycopy(chunk, 0, t, buffer.length, count);
|
||||
buffer = t;
|
||||
}
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/** Returns the file extension of a file.
|
||||
* @param filename Filename to obtain the file extension.
|
||||
* @return File extension (without the ".").
|
||||
*/
|
||||
public static String getFileExtension(String filename)
|
||||
{
|
||||
return filename.substring(filename.lastIndexOf(".") + 1); // + 1 to remove the "."
|
||||
}
|
||||
|
||||
/** Returns the file extension of a file.
|
||||
* @param f File object to obtain the file extension.
|
||||
* @return File extension (without the ".").
|
||||
*/
|
||||
public static String getFileExtension(File f)
|
||||
{
|
||||
return getFileExtension(f.getName());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,93 @@
|
|||
package com.relevanz.indyo.util;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.oro.text.perl.Perl5Util;
|
||||
|
||||
/**
|
||||
* Utility String-related methods.
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Id$
|
||||
*/
|
||||
public final class StringUtils
|
||||
{
|
||||
public static final String EMPTY_STRING = "";
|
||||
private static final char[] QUOTE_ENCODE = """.toCharArray();
|
||||
private static final char[] AMP_ENCODE = "&".toCharArray();
|
||||
private static final char[] LT_ENCODE = "<".toCharArray();
|
||||
private static final char[] GT_ENCODE = ">".toCharArray();
|
||||
private static final char[] APOS_ENCODE = "'".toCharArray();
|
||||
// Create a regular expression engine
|
||||
private static Perl5Util perl5Util = new Perl5Util();
|
||||
|
||||
public static final String removeUnreadableCharacters(String s)
|
||||
{
|
||||
if (perl5Util.match("/\\W+/", s))
|
||||
{
|
||||
// replace unreadable characters with a space
|
||||
s = perl5Util.substitute("s#[^a-zA-Z0-9_@]+# #gm", s);
|
||||
// remove any single/double word characters
|
||||
s = perl5Util.substitute("s#\\b[a-zA-Z0-9_]{1,2}\\b##gm", s);
|
||||
}
|
||||
return trimWhitespace(s);
|
||||
}
|
||||
|
||||
public static final String trimWhitespace(String s)
|
||||
{
|
||||
s = perl5Util.substitute("s#[\\s]{3,}# #m", s);
|
||||
return s;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,404 @@
|
|||
/*--
|
||||
|
||||
Copyright (C) 2000 Brett McLaughlin & Jason Hunter.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions, and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer that follows
|
||||
these conditions in the documentation and/or other materials
|
||||
provided with the distribution.
|
||||
|
||||
3. The name "JDOM" must not be used to endorse or promote products
|
||||
derived from this software without prior written permission. For
|
||||
written permission, please contact license@jdom.org.
|
||||
|
||||
4. Products derived from this software may not be called "JDOM", nor
|
||||
may "JDOM" appear in their name, without prior written permission
|
||||
from the JDOM Project Management (pm@jdom.org).
|
||||
|
||||
In addition, we request (but do not require) that you include in the
|
||||
end-user documentation provided with the redistribution and/or in the
|
||||
software itself an acknowledgement equivalent to the following:
|
||||
"This product includes software developed by the
|
||||
JDOM Project (http://www.jdom.org/)."
|
||||
Alternatively, the acknowledgment may be graphical using the logos
|
||||
available at http://www.jdom.org/images/logos.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGE.
|
||||
|
||||
This software consists of voluntary contributions made by many
|
||||
individuals on behalf of the JDOM Project and was originally
|
||||
created by Brett McLaughlin <brett@jdom.org> and
|
||||
Jason Hunter <jhunter@jdom.org>. For more information on the
|
||||
JDOM Project, please see <http://www.jdom.org/>.
|
||||
|
||||
*/
|
||||
package com.relevanz.indyo.util;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.SAXNotRecognizedException;
|
||||
import org.xml.sax.SAXNotSupportedException;
|
||||
import org.xml.sax.XMLReader;
|
||||
import org.xml.sax.ext.LexicalHandler;
|
||||
import org.xml.sax.helpers.AttributesImpl;
|
||||
import org.xml.sax.helpers.XMLFilterImpl;
|
||||
|
||||
/**
 * Adds convenience methods to base SAX2 Filter implementation.
 *
 * <i>Code and comments adapted from XMLWriter-0.2, written
 * by David Megginson and released into the public domain,
 * without warranty.</i>
 *
 * <p>The convenience methods are provided so that clients do not have to
 * create empty attribute lists or provide empty strings as parameters;
 * for example, the method invocation</p>
 *
 * <pre>
 * w.startElement("foo");
 * </pre>
 *
 * <p>is equivalent to the regular SAX2 ContentHandler method</p>
 *
 * <pre>
 * w.startElement("", "foo", "", new AttributesImpl());
 * </pre>
 *
 * <p>Except that it is more efficient because it does not allocate
 * a new empty attribute list each time.</p>
 *
 * <p>In fact, there is an even simpler convenience method,
 * <var>dataElement</var>, designed for writing elements that
 * contain only character data.</p>
 *
 * <pre>
 * w.dataElement("greeting", "Hello, world!");
 * </pre>
 *
 * <p>is equivalent to</p>
 *
 * <pre>
 * w.startElement("greeting");
 * w.characters("Hello, world!");
 * w.endElement("greeting");
 * </pre>
 *
 * @see org.xml.sax.helpers.XMLFilterImpl
 */
class XMLFilterBase extends XMLFilterImpl
{

    ////////////////////////////////////////////////////////////////////
    // Constructors.
    ////////////////////////////////////////////////////////////////////

    /**
     * Construct an XML filter with no parent.
     *
     * <p>This filter will have no parent: you must assign a parent
     * before you start a parse or do any configuration with
     * setFeature or setProperty.</p>
     *
     * @see org.xml.sax.XMLReader#setFeature
     * @see org.xml.sax.XMLReader#setProperty
     */
    public XMLFilterBase()
    {
    }

    /**
     * Create an XML filter with the specified parent.
     *
     * <p>Use the XMLReader provided as the source of events.</p>
     *
     * @param parent The parent in the filter chain.
     */
    public XMLFilterBase(XMLReader parent)
    {
        super(parent);
    }

    ////////////////////////////////////////////////////////////////////
    // Convenience methods.
    ////////////////////////////////////////////////////////////////////

    /**
     * Start a new element without a qname or attributes.
     *
     * <p>This method will provide a default empty attribute
     * list and an empty string for the qualified name.
     * It invokes {@link
     * #startElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     */
    public void startElement (String uri, String localName) throws SAXException
    {
        startElement(uri, localName, "", EMPTY_ATTS);
    }

    /**
     * Start a new element without a qname, attributes or a Namespace URI.
     *
     * <p>This method will provide an empty string for the
     * Namespace URI, an empty string for the qualified name,
     * and a default empty attribute list. It invokes
     * {@link #startElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     */
    public void startElement (String localName) throws SAXException
    {
        startElement("", localName, "", EMPTY_ATTS);
    }

    /**
     * End an element without a qname.
     *
     * <p>This method will supply an empty string for the qName.
     * It invokes {@link #endElement(String, String, String)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void endElement (String uri, String localName) throws SAXException
    {
        endElement(uri, localName, "");
    }

    /**
     * End an element without a Namespace URI or qname.
     *
     * <p>This method will supply an empty string for the qName
     * and an empty string for the Namespace URI.
     * It invokes {@link #endElement(String, String, String)}
     * directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void endElement (String localName) throws SAXException
    {
        endElement("", localName, "");
    }

    /**
     * Add an empty element.
     *
     * Both a {@link #startElement startElement} and an
     * {@link #endElement endElement} event will be passed on down
     * the filter chain.
     *
     * @param uri The element's Namespace URI, or the empty string
     *        if the element has no Namespace or if Namespace
     *        processing is not being performed.
     * @param localName The element's local name (without prefix).  This
     *        parameter must be provided.
     * @param qName The element's qualified name (with prefix), or
     *        the empty string if none is available.  This parameter
     *        is strictly advisory: the writer may or may not use
     *        the prefix attached.
     * @param atts The element's attribute list.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void emptyElement (String uri, String localName, String qName,
                              Attributes atts) throws SAXException
    {
        startElement(uri, localName, qName, atts);
        endElement(uri, localName, qName);
    }

    /**
     * Add an empty element without a qname or attributes.
     *
     * <p>This method will supply an empty string for the qname
     * and an empty attribute list.  It invokes
     * {@link #emptyElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see #emptyElement(String, String, String, Attributes)
     */
    public void emptyElement (String uri, String localName) throws SAXException
    {
        emptyElement(uri, localName, "", EMPTY_ATTS);
    }

    /**
     * Add an empty element without a Namespace URI, qname or attributes.
     *
     * <p>This method will supply an empty string for the qname,
     * an empty string for the Namespace URI, and an empty
     * attribute list.  It invokes
     * {@link #emptyElement(String, String, String, Attributes)}
     * directly.</p>
     *
     * @param localName The element's local name.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see #emptyElement(String, String, String, Attributes)
     */
    public void emptyElement (String localName) throws SAXException
    {
        emptyElement("", localName, "", EMPTY_ATTS);
    }

    /**
     * Add an element with character data content.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag.</p>
     *
     * <p>This method invokes
     * {@link org.xml.sax.ContentHandler#startElement},
     * followed by
     * {@link #characters(String)}, followed by
     * {@link org.xml.sax.ContentHandler#endElement}.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @param qName The element's default qualified name.
     * @param atts The element's attributes.
     * @param content The character data content; <code>null</code>
     *        produces an element without character data.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement (String uri, String localName, String qName,
                             Attributes atts, String content) throws SAXException
    {
        startElement(uri, localName, qName, atts);
        characters(content);
        endElement(uri, localName, qName);
    }

    /**
     * Add an element with character data content but no attributes.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag.  This method provides an empty string
     * for the qname and an empty attribute list.</p>
     *
     * <p>This method invokes
     * {@link org.xml.sax.ContentHandler#startElement},
     * followed by
     * {@link #characters(String)}, followed by
     * {@link org.xml.sax.ContentHandler#endElement}.</p>
     *
     * @param uri The element's Namespace URI.
     * @param localName The element's local name.
     * @param content The character data content; <code>null</code>
     *        produces an element without character data.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement (String uri, String localName, String content)
        throws SAXException
    {
        dataElement(uri, localName, "", EMPTY_ATTS, content);
    }

    /**
     * Add an element with character data content but no attributes or
     * Namespace URI.
     *
     * <p>This is a convenience method to add a complete element
     * with character data content, including the start tag
     * and end tag.  The method provides an empty string for the
     * Namespace URI, an empty string for the qualified name,
     * and an empty attribute list.</p>
     *
     * <p>This method invokes
     * {@link org.xml.sax.ContentHandler#startElement},
     * followed by
     * {@link #characters(String)}, followed by
     * {@link org.xml.sax.ContentHandler#endElement}.</p>
     *
     * @param localName The element's local name.
     * @param content The character data content; <code>null</code>
     *        produces an element without character data.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#startElement
     * @see #characters(String)
     * @see org.xml.sax.ContentHandler#endElement
     */
    public void dataElement (String localName, String content)
        throws SAXException
    {
        dataElement("", localName, "", EMPTY_ATTS, content);
    }

    /**
     * Add a string of character data.
     *
     * <p>This is a convenience method that takes a String,
     * converts it to a character array, then invokes
     * {@link org.xml.sax.ContentHandler#characters}. No escaping
     * is performed here. A <code>null</code> string is ignored
     * and produces no event.</p>
     *
     * @param data The character data, or <code>null</code> for none.
     * @exception org.xml.sax.SAXException If a filter
     *            further down the chain raises an exception.
     * @see org.xml.sax.ContentHandler#characters
     */
    public void characters (String data) throws SAXException
    {
        if (data == null)
        {
            // nothing to report; avoids a NullPointerException below
            return;
        }
        char ch[] = data.toCharArray();
        characters(ch, 0, ch.length);
    }

    ////////////////////////////////////////////////////////////////////
    // Constants.
    ////////////////////////////////////////////////////////////////////

    /** Shared empty attribute list handed to the attribute-less overloads. */
    protected static final Attributes EMPTY_ATTS = new AttributesImpl();
}
|
||||
|
||||
// end of XMLFilterBase.java
|
Loading…
Reference in New Issue