mirror of https://github.com/apache/lucene.git
Remove outdated sandbox code
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@165365 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f848854278
commit
acf2b4c60c
|
@ -1,10 +0,0 @@
|
|||
<customerInfo>
|
||||
<name><![CDATA[Aruna A. Raghavan]]></name>
|
||||
<profession><![CDATA[Software Developer]]></profession>
|
||||
<addressLine1><![CDATA[6801 West 106th Street]]></addressLine1>
|
||||
<addressLine2><![CDATA[#205]]></addressLine2>
|
||||
<city><![CDATA[Eagan]]></city>
|
||||
<state><![CDATA[MN]]></state>
|
||||
<zip><![CDATA[55121]]></zip>
|
||||
<country><![CDATA[USA]]></country>
|
||||
</customerInfo>
|
|
@ -1,6 +0,0 @@
|
|||
This is the README file for XML Indexing Demo contributed by Aruna Raghavan.
|
||||
|
||||
$Id$
|
||||
|
||||
Lucene Indexing Demo illustrates how one can parse and index XML documents
|
||||
using a SAX2 or DOM parser with Lucene.
|
Binary file not shown.
|
@ -1,10 +0,0 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<project name="xml" default="default">
|
||||
|
||||
<description>
|
||||
Example of Lucene XML indexing
|
||||
</description>
|
||||
|
||||
<import file="../common.xml"/>
|
||||
</project>
|
|
@ -1,111 +0,0 @@
|
|||
package org.apache.lucenesandbox.xmlindexingdemo;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Date;
|
||||
|
||||
class IndexFiles
|
||||
{
|
||||
public static void main(String[] args)
|
||||
throws Exception
|
||||
{
|
||||
try
|
||||
{
|
||||
Date start = new Date();
|
||||
|
||||
IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true);
|
||||
indexDocs(writer, new File(args[0]));
|
||||
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
Date end = new Date();
|
||||
|
||||
System.out.print(end.getTime() - start.getTime());
|
||||
System.out.println(" total milliseconds");
|
||||
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
System.out.println(" caught a " + e.getClass() +
|
||||
"\n with message: " + e.getMessage());
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
public static void indexDocs(IndexWriter writer, File file)
|
||||
throws Exception
|
||||
{
|
||||
if (file.isDirectory())
|
||||
{
|
||||
String[] files = file.list();
|
||||
for (int i = 0; i < files.length; i++)
|
||||
indexDocs(writer, new File(file, files[i]));
|
||||
}
|
||||
else
|
||||
{
|
||||
System.out.println("adding " + file);
|
||||
XMLDocumentHandlerSAX hdlr = new XMLDocumentHandlerSAX(file);
|
||||
writer.addDocument(hdlr.getDocument());
|
||||
// For DOM, use
|
||||
// XMLDocumentHandlerDOM hdlr = new XMLDocumentHandlerDOM();
|
||||
// writer.addDocument(hdlr.createXMLDocument(file));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,126 +0,0 @@
|
|||
package org.apache.lucenesandbox.xmlindexingdemo;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.InputStreamReader;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.queryParser.QueryParser;
|
||||
|
||||
class SearchFiles {
|
||||
public static void main(String[] args) {
|
||||
try {
|
||||
Searcher searcher = new IndexSearcher("index");
|
||||
Analyzer analyzer = new StandardAnalyzer();
|
||||
|
||||
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
|
||||
while (true) {
|
||||
System.out.print("Query: ");
|
||||
String line = in.readLine();
|
||||
|
||||
if (line.length() == -1)
|
||||
break;
|
||||
|
||||
Query query = QueryParser.parse(line, "name", analyzer);
|
||||
System.out.println("Searching for: " + query.toString("name"));
|
||||
|
||||
Hits hits = searcher.search(query);
|
||||
System.out.println(hits.length() + " total matching documents");
|
||||
|
||||
final int HITS_PER_PAGE = 10;
|
||||
for (int start = 0; start < hits.length(); start += HITS_PER_PAGE)
|
||||
{
|
||||
int end = Math.min(hits.length(), start + HITS_PER_PAGE);
|
||||
for (int i = start; i < end; i++)
|
||||
{
|
||||
Document doc = hits.doc(i);
|
||||
String name = doc.get("name");
|
||||
System.out.println(name);
|
||||
System.out.println(doc.get("profession"));
|
||||
System.out.println(doc.get("addressLine1"));
|
||||
System.out.println(doc.get("addressLine2"));
|
||||
System.out.print(doc.get("city"));
|
||||
System.out.print(" ");
|
||||
System.out.print(doc.get("state"));
|
||||
System.out.print(" ");
|
||||
System.out.print(doc.get("zip"));
|
||||
System.out.println(doc.get("country"));
|
||||
|
||||
}
|
||||
|
||||
if (hits.length() > end) {
|
||||
System.out.print("more (y/n) ? ");
|
||||
line = in.readLine();
|
||||
if (line.length() == 0 || line.charAt(0) == 'n')
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
searcher.close();
|
||||
|
||||
} catch (Exception e) {
|
||||
System.out.println(" caught a " + e.getClass() +
|
||||
"\n with message: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,131 +0,0 @@
|
|||
package org.apache.lucenesandbox.xmlindexingdemo;
|
||||
|
||||
import org.w3c.dom.*;
|
||||
import org.w3c.dom.Node;
|
||||
import javax.xml.parsers.*;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class XMLDocumentHandlerDOM {
|
||||
public org.apache.lucene.document.Document createXMLDocument(File f) {
|
||||
org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
|
||||
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
|
||||
try {
|
||||
DocumentBuilder df = dbf.newDocumentBuilder();
|
||||
org.w3c.dom.Document d = df.parse(f);
|
||||
Node root = d.getDocumentElement();
|
||||
traverseTree(root, document);
|
||||
} catch (Exception e) {
|
||||
System.out.println("error: " + e);
|
||||
e.printStackTrace();
|
||||
}
|
||||
return document;
|
||||
}
|
||||
|
||||
static private void traverseTree(Node node, org.apache.lucene.document.Document document) {
|
||||
NodeList nl = node.getChildNodes();
|
||||
if (nl.getLength() == 0) {
|
||||
if (node.getNodeType() == Node.TEXT_NODE) {
|
||||
Node parentNode = node.getParentNode();
|
||||
if (parentNode.getNodeType() == Node.ELEMENT_NODE) {
|
||||
// String parentNodeName = parentNode.getNodeName();
|
||||
// String nodeValue = node.getNodeValue();
|
||||
// if (parentNodeName.equals("name"))
|
||||
// {
|
||||
Node siblingNode = node.getNextSibling();
|
||||
if (siblingNode != null) {
|
||||
if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) {
|
||||
document.add(Field.Text("name", siblingNode.getNodeValue()));
|
||||
}
|
||||
}
|
||||
// }
|
||||
// else if (parentNodeName.equals("profession"))
|
||||
// {
|
||||
// Node siblingNode = node.getNextSibling();
|
||||
// if (siblingNode != null)
|
||||
// {
|
||||
// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE)
|
||||
// {
|
||||
// document.add(Field.Text([arentNodeName, siblingNode.getNodeValue()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// else if (parentNodeName == "addressLine1")
|
||||
// {
|
||||
// Node siblingNode = node.getNextSibling();
|
||||
// if(siblingNode != null)
|
||||
// {
|
||||
// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE)
|
||||
// {
|
||||
// document.add(Field.Text("addressLine1", siblingNode.getNodeValue()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// else if (parentNodeName.equals("addressLine2"))
|
||||
// {
|
||||
// Node siblingNode = node.getNextSibling();
|
||||
// if (siblingNode != null)
|
||||
// {
|
||||
// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE)
|
||||
// {
|
||||
// document.add(Field.Text("addressLine2", siblingNode.getNodeValue()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// if (parentNodeName.equals("city"))
|
||||
// {
|
||||
// Node siblingNode = node.getNextSibling();
|
||||
// if (siblingNode != null)
|
||||
// {
|
||||
// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE)
|
||||
// {
|
||||
// document.add(Field.Text("city", siblingNode.getNodeValue()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// else if (parentNodeName.equals("zip"))
|
||||
// {
|
||||
// Node siblingNode = node.getNextSibling();
|
||||
// if (siblingNode != null)
|
||||
// {
|
||||
// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE)
|
||||
// {
|
||||
// document.add(Field.Text("zip", siblingNode.getNodeValue()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// else if (parentNodeName.equals("state"))
|
||||
// {
|
||||
// Node siblingNode = node.getNextSibling();
|
||||
// if (siblingNode != null)
|
||||
// {
|
||||
// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE)
|
||||
// {
|
||||
// document.add(Field.Text("state", siblingNode.getNodeValue()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// else if (parentNodeName.equals("country"))
|
||||
// {
|
||||
// Node siblingNode = node.getNextSibling();
|
||||
// if (siblingNode != null)
|
||||
// {
|
||||
// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE)
|
||||
// {
|
||||
// document.add(Field.Text("country", siblingNode.getNodeValue()));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < nl.getLength(); i++) {
|
||||
traverseTree(nl.item(i), document);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,106 +0,0 @@
|
|||
package org.apache.lucenesandbox.xmlindexingdemo;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
public class XMLDocumentHandlerSAX extends DefaultHandler {
|
||||
/** A buffer for each XML element */
|
||||
private StringBuffer elementBuffer = new StringBuffer();
|
||||
|
||||
private Document mDocument;
|
||||
|
||||
// constructor
|
||||
public XMLDocumentHandlerSAX(File xmlFile)
|
||||
throws ParserConfigurationException, SAXException, IOException {
|
||||
SAXParserFactory spf = SAXParserFactory.newInstance();
|
||||
|
||||
// use validating parser?
|
||||
//spf.setValidating(false);
|
||||
// make parser name space aware?
|
||||
//spf.setNamespaceAware(true);
|
||||
|
||||
SAXParser parser = spf.newSAXParser();
|
||||
//System.out.println("parser is validating: " + parser.isValidating());
|
||||
try {
|
||||
parser.parse(xmlFile, this);
|
||||
} catch (org.xml.sax.SAXParseException spe) {
|
||||
System.out.println("SAXParser caught SAXParseException at line: " +
|
||||
spe.getLineNumber() + " column " +
|
||||
spe.getColumnNumber());
|
||||
}
|
||||
}
|
||||
|
||||
// call at document start
|
||||
public void startDocument() throws SAXException {
|
||||
mDocument = new Document();
|
||||
}
|
||||
|
||||
// call at element start
|
||||
public void startElement(String namespaceURI, String localName,
|
||||
String qualifiedName, Attributes attrs) throws SAXException {
|
||||
|
||||
String eName = localName;
|
||||
if ("".equals(eName)) {
|
||||
eName = qualifiedName; // namespaceAware = false
|
||||
}
|
||||
// list the attribute(s)
|
||||
if (attrs != null) {
|
||||
for (int i = 0; i < attrs.getLength(); i++) {
|
||||
String aName = attrs.getLocalName(i); // Attr name
|
||||
if ("".equals(aName)) { aName = attrs.getQName(i); }
|
||||
// perform application specific action on attribute(s)
|
||||
// for now just dump out attribute name and value
|
||||
System.out.println("attr " + aName+"="+attrs.getValue(i));
|
||||
}
|
||||
}
|
||||
elementBuffer.setLength(0);
|
||||
}
|
||||
|
||||
// call when cdata found
|
||||
public void characters(char[] text, int start, int length)
|
||||
throws SAXException {
|
||||
elementBuffer.append(text, start, length);
|
||||
}
|
||||
|
||||
// call at element end
|
||||
public void endElement(String namespaceURI, String simpleName,
|
||||
String qualifiedName) throws SAXException {
|
||||
|
||||
String eName = simpleName;
|
||||
if ("".equals(eName)) {
|
||||
eName = qualifiedName; // namespaceAware = false
|
||||
}
|
||||
|
||||
mDocument.add(Field.Text(eName, elementBuffer.toString()));
|
||||
}
|
||||
|
||||
public Document getDocument() {
|
||||
return mDocument;
|
||||
}
|
||||
}
|
|
@ -1,17 +0,0 @@
|
|||
<?xml version="1.0"?>
|
||||
<document>
|
||||
<properties>
|
||||
<author>Aruna Raghavan</author>
|
||||
<author>Otis Gospodnetic</author>
|
||||
<title>Lucene Indexing Demo</title>
|
||||
</properties>
|
||||
|
||||
<body>
|
||||
|
||||
<section name="Description">
|
||||
<p>Lucene Indexing Demo illustrates how one can parse XML documents
|
||||
using a SAX2 or DOM and index them with Lucene.</p>
|
||||
</section>
|
||||
|
||||
</body>
|
||||
</document>
|
|
@ -1,26 +0,0 @@
|
|||
<project name="sandbox" default="build-tree">
|
||||
|
||||
<property name="dist.dir" location="dist"/>
|
||||
|
||||
<macrodef name="crawl">
|
||||
<attribute name="target" default=""/>
|
||||
<sequential>
|
||||
<subant target="@{target}" failonerror="false">
|
||||
<property name="dist.dir" location="${dist.dir}"/>
|
||||
|
||||
<fileset dir="."
|
||||
includes="*/build.xml"
|
||||
excludes="taglib/build.xml"
|
||||
/>
|
||||
</subant>
|
||||
</sequential>
|
||||
</macrodef>
|
||||
|
||||
<target name="clean">
|
||||
<crawl target="clean"/>
|
||||
</target>
|
||||
|
||||
<target name="build-tree">
|
||||
<crawl/>
|
||||
</target>
|
||||
</project>
|
|
@ -1,241 +0,0 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<project name="common">
|
||||
|
||||
<!-- default values, intended to be overridden-->
|
||||
<property name="version" value="dev"/>
|
||||
<property name="Name" value="${ant.project.name}"/>
|
||||
|
||||
<!-- not intended to be overridden-->
|
||||
<property name="src.dir" location="src/java"/>
|
||||
<property name="build.dir" location="build"/>
|
||||
<property name="build.classes.dir" location="${build.dir}/classes"/>
|
||||
<property name="build.javadoc" value="${build.dir}/docs/api"/>
|
||||
<property name="build.encoding" value="utf-8"/>
|
||||
|
||||
<property name="release.host" value="www.apache.org"/>
|
||||
<property name="release.path" value="/www/cvs.apache.org/dist/jakarta/lucene/sandbox/${ant.project.name}"/>
|
||||
|
||||
<property name="web.host" value="www.apache.org"/>
|
||||
<property name="web.path" value="/www/jakarta.apache.org/lucene/docs/lucene-sandbox/${ant.project.name}"/>
|
||||
|
||||
<property name="javadoc.link.java" value="http://java.sun.com/j2se/1.4.1/docs/api/"/>
|
||||
<property name="javadoc.link.lucene" value="http://jakarta.apache.org/lucene/docs/api/"/>
|
||||
|
||||
<property name="test.src.dir" location="src/test"/>
|
||||
<property name="test.output.dir" location="${build.dir}/test"/>
|
||||
<property name="test.classes.dir" location="${test.output.dir}/classes"/>
|
||||
|
||||
<property name="dist.dir" location="dist"/>
|
||||
<property name="dist.name" value="${ant.project.name}-${version}"/>
|
||||
<property name="package.dir" location="dist/${dist.name}"/>
|
||||
|
||||
<property name="junit.jar" location="${ant.home}/lib/junit.jar"/>
|
||||
<dirname file="${ant.file.common}" property="common.dir"/>
|
||||
<property name="lucene.dir" location="${common.dir}/../../jakarta-lucene"/>
|
||||
|
||||
<property name="build.debug" value="true"/>
|
||||
<property name="junit.fork" value="true"/>
|
||||
|
||||
<property name="jakarta.site2.home" location="../../../jakarta-site2"/>
|
||||
<property name="project.name" value="site"/>
|
||||
<property name="docs.src" location="xdocs"/>
|
||||
<property name="docs.dest" location="docs"/>
|
||||
|
||||
<path id="anakia.classpath">
|
||||
<fileset dir="${jakarta.site2.home}/lib">
|
||||
<include name="*.jar"/>
|
||||
</fileset>
|
||||
</path>
|
||||
|
||||
<!-- ========================================================== -->
|
||||
<!-- Datatype declarations -->
|
||||
<!-- ========================================================== -->
|
||||
<!-- TODO: define ${lucene.jar} for easier overriding -->
|
||||
<path id="compile.classpath">
|
||||
<fileset dir="${lucene.dir}" includes="build/lucene*.jar"/>
|
||||
<pathelement path="${project.classpath}"/>
|
||||
</path>
|
||||
|
||||
<path id="test.classpath">
|
||||
<path refid="compile.classpath"/>
|
||||
<pathelement location="${junit.jar}"/>
|
||||
<pathelement location="${build.classes.dir}"/>
|
||||
<pathelement location="${test.classes.dir}"/>
|
||||
</path>
|
||||
|
||||
|
||||
<target name="init">
|
||||
<echo message="Building ${ant.project.name}"/>
|
||||
<tstamp/>
|
||||
|
||||
<mkdir dir="${build.dir}"/>
|
||||
<mkdir dir="${build.classes.dir}"/>
|
||||
<mkdir dir="${dist.dir}"/>
|
||||
|
||||
<mkdir dir="${test.output.dir}"/>
|
||||
<mkdir dir="${test.classes.dir}"/>
|
||||
|
||||
<available property="has.tests" file="${test.src.dir}" type="dir"/>
|
||||
</target>
|
||||
|
||||
<target name="clean"
|
||||
description="Deletes all previous build artifacts">
|
||||
<delete dir="${build.dir}"/>
|
||||
<delete dir="${build.classes.dir}"/>
|
||||
<delete dir="${dist.dir}"/>
|
||||
<delete dir="${package.dir}"/>
|
||||
|
||||
<delete dir="${test.output.dir}"/>
|
||||
<delete dir="${test.classes.dir}"/>
|
||||
</target>
|
||||
|
||||
<target name="dist" depends="compile" description="Create JAR">
|
||||
<jar jarfile="${dist.dir}/${dist.name}.jar"
|
||||
basedir="${build.classes.dir}"
|
||||
/>
|
||||
</target>
|
||||
|
||||
<target name="compile" depends="init">
|
||||
<javac destdir="${build.classes.dir}"
|
||||
debug="${build.debug}"
|
||||
includeAntRuntime="yes"
|
||||
deprecation="true"
|
||||
srcdir="${src.dir}"
|
||||
classpathref="compile.classpath"
|
||||
encoding="${build.encoding}"
|
||||
/>
|
||||
<copy todir="${build.classes.dir}">
|
||||
<fileset dir="${src.dir}" excludes="**/*.java"/>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
<target name="test-compile" depends="compile" if="has.tests">
|
||||
<javac destdir="${test.classes.dir}"
|
||||
debug="${build.debug}"
|
||||
includeAntRuntime="yes"
|
||||
srcdir="src/test"
|
||||
classpathref="test.classpath"
|
||||
encoding="${build.encoding}"
|
||||
/>
|
||||
|
||||
<copy todir="${test.classes.dir}">
|
||||
<fileset dir="src/test" excludes="**/*.java"/>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
<target name="test" depends="test-compile" if="has.tests">
|
||||
<junit printsummary="no"
|
||||
errorProperty="test.failed"
|
||||
failureProperty="test.failed"
|
||||
fork="${junit.fork}">
|
||||
<classpath refid="test.classpath"/>
|
||||
<sysproperty key="docs.dir" file="${test.classes.dir}"/>
|
||||
<sysproperty key="index.dir" file="${test.output.dir}/index"/>
|
||||
<sysproperty key="dataDir" file="${test.src.dir}"/>
|
||||
<formatter type="brief" usefile="false"/>
|
||||
<test name="${testcase}" if="testcase"/>
|
||||
<batchtest todir="${test.data.dir}" unless="testcase">
|
||||
<fileset dir="${test.classes.dir}"
|
||||
includes="**/*Test.class,**/Test*.class"
|
||||
/>
|
||||
</batchtest>
|
||||
</junit>
|
||||
|
||||
<fail if="test.failed">
|
||||
Unit tests failed. Check log or reports for details
|
||||
</fail>
|
||||
|
||||
</target>
|
||||
|
||||
<target name="default" depends="test,dist"/>
|
||||
|
||||
<!-- ================================================================== -->
|
||||
<!-- Documentation -->
|
||||
<!-- ================================================================== -->
|
||||
<target name="javadoc" depends="compile">
|
||||
<mkdir dir="${build.javadoc}"/>
|
||||
<javadoc
|
||||
sourcepath="${src.dir}"
|
||||
overview="${src.dir}/overview.html"
|
||||
packagenames="*"
|
||||
destdir="${build.javadoc}"
|
||||
author="true"
|
||||
version="true"
|
||||
use="true"
|
||||
windowtitle="${Name} ${version} API"
|
||||
doctitle="${Name} ${version} API"
|
||||
encoding="${build.encoding}"
|
||||
>
|
||||
<link href="${javadoc.link.java}"/>
|
||||
<link href="${javadoc.link.lucene}"/>
|
||||
<tag name="todo" description="To Do:"/>
|
||||
<classpath refid="compile.classpath"/>
|
||||
</javadoc>
|
||||
</target>
|
||||
|
||||
<!-- ================================================================== -->
|
||||
<!-- D I S T R I B U T I O N -->
|
||||
<!-- ================================================================== -->
|
||||
<!-- -->
|
||||
<!-- ================================================================== -->
|
||||
<target name="package" depends="dist, javadoc">
|
||||
<mkdir dir="${package.dir}"/>
|
||||
<mkdir dir="${package.dir}/docs"/>
|
||||
<mkdir dir="${package.dir}/docs/api"/>
|
||||
<mkdir dir="${docs.dest}"/>
|
||||
<copy todir="${package.dir}/docs/api">
|
||||
<fileset dir="${build.javadoc}"/>
|
||||
</copy>
|
||||
|
||||
<copy todir="${package.dir}/docs">
|
||||
<fileset dir="${docs.dest}/"/>
|
||||
</copy>
|
||||
|
||||
<copy todir="${package.dir}">
|
||||
<fileset dir=".">
|
||||
<include name="*.txt"/>
|
||||
</fileset>
|
||||
</copy>
|
||||
|
||||
<copy todir="${package.dir}/src">
|
||||
<fileset dir="src"/>
|
||||
</copy>
|
||||
<copy todir="${package.dir}/" file="build.xml"/>
|
||||
<copy todir="${dist.dir}/" file="${common.dir}/common.xml"/>
|
||||
|
||||
<copy file="${dist.dir}/${dist.name}.jar" todir="${package.dir}"/>
|
||||
|
||||
<tar tarfile="${dist.dir}/${dist.name}.tar.gz" basedir="${dist.dir}/"
|
||||
compression="gzip" includes="${dist.name}/**,common.xml"/>
|
||||
|
||||
</target>
|
||||
|
||||
<!-- ================================================================== -->
|
||||
<!-- Copy release to server -->
|
||||
<!-- ================================================================== -->
|
||||
<target name="release" depends="package">
|
||||
<exec executable="ssh">
|
||||
<arg value="${release.host}"/>
|
||||
<arg value="mkdir"/>
|
||||
<arg value="${release.path}/${dist.name}"/>
|
||||
</exec>
|
||||
<exec executable="scp">
|
||||
<arg value="${dist.dir}/${dist.name}.jar"/>
|
||||
<arg value="${dist.dir}/${dist.name}.tar.gz"/>
|
||||
<arg value="${release.host}:${release.path}/${dist.name}"/>
|
||||
</exec>
|
||||
<exec executable="ssh">
|
||||
<arg value="${web.host}"/>
|
||||
<arg value="rm"/>
|
||||
<arg value="-rf"/>
|
||||
<arg value="${web.path}/api"/>
|
||||
</exec>
|
||||
<exec executable="scp">
|
||||
<arg value="-r"/>
|
||||
<arg value="${build.javadoc}"/>
|
||||
<arg value="${web.host}:${web.path}/api"/>
|
||||
</exec>
|
||||
</target>
|
||||
|
||||
</project>
|
|
@ -1,19 +0,0 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<project name="parsers" default="default">
|
||||
|
||||
<description>
|
||||
Document parsers
|
||||
</description>
|
||||
|
||||
<path id="additional.dependencies">
|
||||
<fileset dir="lib"/>
|
||||
</path>
|
||||
|
||||
<pathconvert property="project.classpath"
|
||||
targetos="unix"
|
||||
refid="additional.dependencies"
|
||||
/>
|
||||
|
||||
<import file="../common.xml"/>
|
||||
</project>
|
|
@ -1 +0,0 @@
|
|||
Place pj.jar here (from http://www.etymon.com/pub/software/pj/) and log4j JAR.
|
|
@ -1,172 +0,0 @@
|
|||
package org.apache.lucene.parsers.pdf;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
*
|
||||
* Copyright (c) 2001 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation"
|
||||
* must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
*/
|
||||
|
||||
import com.etymon.pj.Pdf;
|
||||
import com.etymon.pj.exception.InvalidPdfObjectException;
|
||||
import com.etymon.pj.exception.PjException;
|
||||
import com.etymon.pj.object.PjArray;
|
||||
import com.etymon.pj.object.PjObject;
|
||||
import com.etymon.pj.object.PjPage;
|
||||
import com.etymon.pj.object.PjStream;
|
||||
import org.apache.log4j.Category;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Vector;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Attempts to extract text from a PDF file.
|
||||
* </p>
|
||||
* <p>
|
||||
* <a href="http://www.mail-archive.com/lucene-user@jakarta.apache.org/msg00280.html">
|
||||
* Known limitations</a>
|
||||
* </p>
|
||||
*
|
||||
* @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
|
||||
* @version $Revision$
|
||||
*/
|
||||
public class PdfTextExtractor
|
||||
{
|
||||
private static Category cat = Category.getInstance(PdfTextExtractor.class);
|
||||
|
||||
public static void main(String[] args)
|
||||
{
|
||||
File f = new File("/usr/local/test.pdf");
|
||||
try
|
||||
{
|
||||
Pdf pdf = new Pdf(f.toString());
|
||||
int pagecount = pdf.getPageCount();
|
||||
cat.debug(f.toString() + "has " + pagecount + " pages.");
|
||||
for (int i = 1; i <= pagecount; i++)
|
||||
{
|
||||
System.out.println(getContent(pdf, i));
|
||||
}
|
||||
}
|
||||
catch (IOException ioe)
|
||||
{
|
||||
cat.error("IOException parsing PDF file:" + f.toString(), ioe);
|
||||
}
|
||||
catch (PjException pje)
|
||||
{
|
||||
cat.error("PjException parsing PDF file:" + f.toString(), pje);
|
||||
}
|
||||
}
|
||||
|
||||
private static String getContent(Pdf pdf, int pageNo)
|
||||
{
|
||||
String content = null;
|
||||
PjStream stream = null;
|
||||
StringBuffer strbf = new StringBuffer();
|
||||
try
|
||||
{
|
||||
PjPage page = (PjPage) pdf.getObject(pdf.getPage(pageNo));
|
||||
PjObject pobj = (PjObject) pdf.resolve(page.getContents());
|
||||
if (pobj instanceof PjArray)
|
||||
{
|
||||
PjArray array = (PjArray) pobj;
|
||||
Vector vArray = array.getVector();
|
||||
int size = vArray.size();
|
||||
for (int j = 0; j < size; j++)
|
||||
{
|
||||
stream = (PjStream) pdf.resolve((PjObject) vArray.get(j));
|
||||
strbf.append(getStringFromPjStream(stream));
|
||||
}
|
||||
content = strbf.toString();
|
||||
}
|
||||
else
|
||||
{
|
||||
stream = (PjStream) pobj;
|
||||
content = getStringFromPjStream(stream);
|
||||
}
|
||||
}
|
||||
catch (InvalidPdfObjectException pdfe)
|
||||
{
|
||||
cat.error("Invalid PDF Object:" + pdfe, pdfe);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
cat.error("Exception in getContent() " + e, e);
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
private static String getStringFromPjStream(PjStream stream)
|
||||
{
|
||||
StringBuffer strbf = new StringBuffer();
|
||||
try
|
||||
{
|
||||
int start,end = 0;
|
||||
stream = stream.flateDecompress();
|
||||
String longString = stream.toString();
|
||||
int strlen = longString.length();
|
||||
int lastIndex = longString.lastIndexOf(')');
|
||||
while (lastIndex != -1 && end != lastIndex)
|
||||
{
|
||||
start = longString.indexOf('(', end);
|
||||
end = longString.indexOf(')', start);
|
||||
String text = longString.substring(start + 1, end);
|
||||
strbf.append(text);
|
||||
}
|
||||
}
|
||||
catch (InvalidPdfObjectException pdfe)
|
||||
{
|
||||
cat.error("InvalidObjectException:" + pdfe.getMessage(), pdfe);
|
||||
}
|
||||
return strbf.toString();
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue