Remove outdated sandbox code

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@165365 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Erik Hatcher 2005-04-30 00:07:27 +00:00
parent f848854278
commit acf2b4c60c
14 changed files with 0 additions and 976 deletions

View File

@ -1,10 +0,0 @@
<customerInfo>
<name><![CDATA[Aruna A. Raghavan]]></name>
<profession><![CDATA[Software Developer]]></profession>
<addressLine1><![CDATA[6801 West 106th Street]]></addressLine1>
<addressLine2><![CDATA[#205]]></addressLine2>
<city><![CDATA[Eagan]]></city>
<state><![CDATA[MN]]></state>
<zip><![CDATA[55121]]></zip>
<country><![CDATA[USA]]></country>
</customerInfo>

View File

@ -1,6 +0,0 @@
This is the README file for the XML Indexing Demo, contributed by Aruna Raghavan.
$Id$
The Lucene Indexing Demo illustrates how to parse XML documents with a
SAX2 or DOM parser and index them with Lucene.

View File

@ -1,10 +0,0 @@
<?xml version="1.0"?>
<project name="xml" default="default">
<description>
Example of Lucene XML indexing
</description>
<!-- This project declares no targets of its own; its default target
"default" is expected to come from the imported ../common.xml. -->
<import file="../common.xml"/>
</project>

View File

@ -1,111 +0,0 @@
package org.apache.lucenesandbox.xmlindexingdemo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import java.io.File;
import java.util.Date;
/**
 * Command-line demo that indexes XML files with Lucene.
 *
 * <p>Usage: <code>IndexFiles &lt;file-or-directory&gt;</code>. The index is
 * written to the directory "index" (the <code>true</code> passed to the
 * IndexWriter constructor presumably requests a freshly created index).
 */
class IndexFiles
{
    /**
     * Indexes the file or directory tree named by <code>args[0]</code> and
     * prints the elapsed time in milliseconds.
     *
     * @param args args[0] is the file or directory to index
     * @throws Exception any failure, after a short summary has been printed
     */
    public static void main(String[] args)
        throws Exception
    {
        try
        {
            if (args == null || args.length == 0)
            {
                // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
                throw new IllegalArgumentException(
                    "Usage: IndexFiles <file-or-directory>");
            }

            Date start = new Date();

            IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true);
            try
            {
                indexDocs(writer, new File(args[0]));
                writer.optimize();
            }
            finally
            {
                // Always close the writer, even when indexing fails part-way.
                writer.close();
            }

            Date end = new Date();

            System.out.print(end.getTime() - start.getTime());
            System.out.println(" total milliseconds");
        }
        catch (Exception e)
        {
            System.out.println(" caught a " + e.getClass() +
                               "\n with message: " + e.getMessage());
            throw e;
        }
    }

    /**
     * Adds <code>file</code> to the index; directories are walked recursively
     * and every regular file is parsed with {@link XMLDocumentHandlerSAX}.
     *
     * @param writer the open index writer that receives the documents
     * @param file   a file or directory to index
     * @throws Exception if a directory cannot be listed, or if parsing or
     *                   indexing fails
     */
    public static void indexDocs(IndexWriter writer, File file)
        throws Exception
    {
        if (file.isDirectory())
        {
            String[] files = file.list();
            if (files == null)
            {
                // File.list() returns null on an I/O error or missing permission.
                throw new java.io.IOException("cannot list directory " + file);
            }
            for (int i = 0; i < files.length; i++)
            {
                indexDocs(writer, new File(file, files[i]));
            }
        }
        else
        {
            System.out.println("adding " + file);
            XMLDocumentHandlerSAX hdlr = new XMLDocumentHandlerSAX(file);
            writer.addDocument(hdlr.getDocument());
            // For DOM, use
            // XMLDocumentHandlerDOM hdlr = new XMLDocumentHandlerDOM();
            // writer.addDocument(hdlr.createXMLDocument(file));
        }
    }
}

View File

@ -1,126 +0,0 @@
package org.apache.lucenesandbox.xmlindexingdemo;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.queryParser.QueryParser;
/**
 * Interactive command-line search over the index in the directory "index".
 *
 * <p>Reads queries from standard input, parses them against the "name" field
 * and prints the customer fields of each hit, ten hits per page. An empty
 * line or end of input ends the session.
 */
class SearchFiles {

    /** Number of hits printed before the user is asked whether to continue. */
    private static final int HITS_PER_PAGE = 10;

    /**
     * Runs the query loop. Failures are reported on standard output and end
     * the program; they are not rethrown.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            Searcher searcher = new IndexSearcher("index");
            try {
                Analyzer analyzer = new StandardAnalyzer();
                BufferedReader in = new BufferedReader(new InputStreamReader(System.in));

                while (true) {
                    System.out.print("Query: ");
                    String line = in.readLine();

                    // readLine() returns null at end of input. The old test
                    // (length() == -1) could never be true, so EOF ended in an NPE.
                    if (line == null || line.length() == 0) {
                        break;
                    }

                    Query query = QueryParser.parse(line, "name", analyzer);
                    System.out.println("Searching for: " + query.toString("name"));

                    Hits hits = searcher.search(query);
                    System.out.println(hits.length() + " total matching documents");

                    for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
                        int end = Math.min(hits.length(), start + HITS_PER_PAGE);
                        for (int i = start; i < end; i++) {
                            Document doc = hits.doc(i);
                            String name = doc.get("name");
                            System.out.println(name);
                            System.out.println(doc.get("profession"));
                            System.out.println(doc.get("addressLine1"));
                            System.out.println(doc.get("addressLine2"));
                            System.out.print(doc.get("city"));
                            System.out.print(" ");
                            System.out.print(doc.get("state"));
                            System.out.print(" ");
                            System.out.print(doc.get("zip"));
                            // Separator was missing, so zip and country ran together.
                            System.out.print(" ");
                            System.out.println(doc.get("country"));
                        }

                        if (hits.length() > end) {
                            System.out.print("more (y/n) ? ");
                            line = in.readLine();
                            // Stops paging only; the outer loop then asks for a new query.
                            if (line == null || line.length() == 0 || line.charAt(0) == 'n') {
                                break;
                            }
                        }
                    }
                }
            } finally {
                // Release the index even when a query fails.
                searcher.close();
            }
        } catch (Exception e) {
            System.out.println(" caught a " + e.getClass() +
                               "\n with message: " + e.getMessage());
        }
    }
}

View File

@ -1,131 +0,0 @@
package org.apache.lucenesandbox.xmlindexingdemo;
import org.w3c.dom.*;
import org.w3c.dom.Node;
import javax.xml.parsers.*;
import org.apache.lucene.document.Field;
import java.io.File;
/**
 * Builds a Lucene document from an XML file using a DOM parser.
 *
 * <p>Only CDATA sections that directly follow a text node inside an element
 * are indexed. Each one becomes a text field named after the enclosing
 * element, the same naming that XMLDocumentHandlerSAX uses.
 */
public class XMLDocumentHandlerDOM {

    /**
     * Parses <code>f</code> and collects its CDATA content into a new Lucene
     * document.
     *
     * @param f the XML file to parse
     * @return the populated document; if parsing fails the error is printed
     *         and whatever was collected so far (possibly nothing) is returned
     */
    public org.apache.lucene.document.Document createXMLDocument(File f) {
        org.apache.lucene.document.Document document = new org.apache.lucene.document.Document();
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        try {
            DocumentBuilder df = dbf.newDocumentBuilder();
            org.w3c.dom.Document d = df.parse(f);
            Node root = d.getDocumentElement();
            traverseTree(root, document);
        } catch (Exception e) {
            // Best effort: report the problem and return the partial document.
            System.out.println("error: " + e);
            e.printStackTrace();
        }
        return document;
    }

    /**
     * Walks the DOM tree depth-first. For every childless text node whose
     * parent is an element and whose next sibling is a CDATA section, the
     * CDATA value is added as a field named after the parent element.
     *
     * @param node     the current node
     * @param document the Lucene document receiving the fields
     */
    static private void traverseTree(Node node, org.apache.lucene.document.Document document) {
        NodeList nl = node.getChildNodes();
        if (nl.getLength() > 0) {
            for (int i = 0; i < nl.getLength(); i++) {
                traverseTree(nl.item(i), document);
            }
            return;
        }

        if (node.getNodeType() != Node.TEXT_NODE) {
            return;
        }
        Node parentNode = node.getParentNode();
        if (parentNode == null || parentNode.getNodeType() != Node.ELEMENT_NODE) {
            return;
        }

        Node siblingNode = node.getNextSibling();
        if (siblingNode != null && siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) {
            // The field used to be hard-coded as "name" for every element, so
            // profession, city, etc. could never be read back by field name.
            document.add(Field.Text(parentNode.getNodeName(), siblingNode.getNodeValue()));
        }
    }
}

View File

@ -1,106 +0,0 @@
package org.apache.lucenesandbox.xmlindexingdemo;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.File;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that turns one XML file into one Lucene document.
 *
 * <p>Each closing tag adds a text field named after the element and holding
 * the character data seen since the most recent start or end tag. Text that
 * precedes a child element inside the same parent is therefore not kept.
 */
public class XMLDocumentHandlerSAX extends DefaultHandler {
    /** A buffer for each XML element */
    private StringBuffer elementBuffer = new StringBuffer();

    /** The document being built; created in {@link #startDocument()}. */
    private Document mDocument;

    /**
     * Parses <code>xmlFile</code> immediately, using this object as handler.
     * A SAXParseException is reported on standard output and not rethrown, so
     * {@link #getDocument()} may then return a partially filled document.
     *
     * @param xmlFile the XML file to parse
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws SAXException on SAX errors other than SAXParseException
     * @throws IOException if the file cannot be read
     */
    public XMLDocumentHandlerSAX(File xmlFile)
        throws ParserConfigurationException, SAXException, IOException {
        SAXParserFactory spf = SAXParserFactory.newInstance();

        // use validating parser?
        //spf.setValidating(false);
        // make parser name space aware?
        //spf.setNamespaceAware(true);

        SAXParser parser = spf.newSAXParser();
        //System.out.println("parser is validating: " + parser.isValidating());
        try {
            parser.parse(xmlFile, this);
        } catch (org.xml.sax.SAXParseException spe) {
            System.out.println("SAXParser caught SAXParseException at line: " +
                spe.getLineNumber() + " column " +
                spe.getColumnNumber());
        }
    }

    /** Called at document start: begins a fresh Lucene document. */
    public void startDocument() throws SAXException {
        mDocument = new Document();
    }

    /**
     * Called at element start: prints the element's attributes and clears
     * the text buffer.
     */
    public void startElement(String namespaceURI, String localName,
        String qualifiedName, Attributes attrs) throws SAXException {

        // list the attribute(s)
        if (attrs != null) {
            for (int i = 0; i < attrs.getLength(); i++) {
                String aName = attrs.getLocalName(i); // Attr name
                if ("".equals(aName)) {
                    aName = attrs.getQName(i); // namespaceAware = false
                }
                // perform application specific action on attribute(s)
                // for now just dump out attribute name and value
                System.out.println("attr " + aName+"="+attrs.getValue(i));
            }
        }
        elementBuffer.setLength(0);
    }

    /** Called for character data (including CDATA): appends it to the buffer. */
    public void characters(char[] text, int start, int length)
        throws SAXException {
        elementBuffer.append(text, start, length);
    }

    /**
     * Called at element end: stores the buffered text as a field named after
     * the element.
     */
    public void endElement(String namespaceURI, String simpleName,
        String qualifiedName) throws SAXException {

        String eName = simpleName;
        if ("".equals(eName)) {
            eName = qualifiedName; // namespaceAware = false
        }

        mDocument.add(Field.Text(eName, elementBuffer.toString()));

        // Clear the buffer so the enclosing element's field does not also
        // receive this element's text (e.g. customerInfo used to get "USA").
        elementBuffer.setLength(0);
    }

    /**
     * @return the document built while parsing
     */
    public Document getDocument() {
        return mDocument;
    }
}

View File

@ -1,17 +0,0 @@
<?xml version="1.0"?>
<document>
<properties>
<author>Aruna Raghavan</author>
<author>Otis Gospodnetic</author>
<title>Lucene Indexing Demo</title>
</properties>
<body>
<section name="Description">
<p>The Lucene Indexing Demo illustrates how one can parse XML documents
using a SAX2 or DOM parser and index them with Lucene.</p>
</section>
</body>
</document>

View File

@ -1,26 +0,0 @@
<project name="sandbox" default="build-tree">
<!-- Shared output directory; it is handed down to every sub-build below. -->
<property name="dist.dir" location="dist"/>
<!-- crawl: runs the given target in each immediate subdirectory's build.xml,
except taglib. failonerror="false" lets the crawl continue when one
sub-build fails. With the default empty target, each sub-build presumably
runs its own default target (confirm against the Ant subant docs). -->
<macrodef name="crawl">
<attribute name="target" default=""/>
<sequential>
<subant target="@{target}" failonerror="false">
<property name="dist.dir" location="${dist.dir}"/>
<fileset dir="."
includes="*/build.xml"
excludes="taglib/build.xml"
/>
</subant>
</sequential>
</macrodef>
<!-- Runs "clean" in every sub-build. -->
<target name="clean">
<crawl target="clean"/>
</target>
<!-- Crawls every sub-build without naming a target. -->
<target name="build-tree">
<crawl/>
</target>
</project>

View File

@ -1,241 +0,0 @@
<?xml version="1.0"?>
<project name="common">
<!-- default values, intended to be overridden -->
<property name="version" value="dev"/>
<property name="Name" value="${ant.project.name}"/>
<!-- not intended to be overridden -->
<!-- Source and build output locations. -->
<property name="src.dir" location="src/java"/>
<property name="build.dir" location="build"/>
<property name="build.classes.dir" location="${build.dir}/classes"/>
<property name="build.javadoc" value="${build.dir}/docs/api"/>
<property name="build.encoding" value="utf-8"/>
<!-- Hosts and remote paths used by the "release" target. -->
<property name="release.host" value="www.apache.org"/>
<property name="release.path" value="/www/cvs.apache.org/dist/jakarta/lucene/sandbox/${ant.project.name}"/>
<property name="web.host" value="www.apache.org"/>
<property name="web.path" value="/www/jakarta.apache.org/lucene/docs/lucene-sandbox/${ant.project.name}"/>
<!-- External API docs that the "javadoc" target links to. -->
<property name="javadoc.link.java" value="http://java.sun.com/j2se/1.4.1/docs/api/"/>
<property name="javadoc.link.lucene" value="http://jakarta.apache.org/lucene/docs/api/"/>
<!-- Test sources and test output. -->
<property name="test.src.dir" location="src/test"/>
<property name="test.output.dir" location="${build.dir}/test"/>
<property name="test.classes.dir" location="${test.output.dir}/classes"/>
<!-- Distribution artifacts. -->
<property name="dist.dir" location="dist"/>
<property name="dist.name" value="${ant.project.name}-${version}"/>
<property name="package.dir" location="dist/${dist.name}"/>
<property name="junit.jar" location="${ant.home}/lib/junit.jar"/>
<!-- common.dir is the directory holding this file; lucene.dir is resolved
relative to it rather than to the importing project's basedir. -->
<dirname file="${ant.file.common}" property="common.dir"/>
<property name="lucene.dir" location="${common.dir}/../../jakarta-lucene"/>
<property name="build.debug" value="true"/>
<property name="junit.fork" value="true"/>
<!-- Site documentation settings. Of these, only jakarta.site2.home and
docs.dest are referenced in the part of this file reviewed here. -->
<property name="jakarta.site2.home" location="../../../jakarta-site2"/>
<property name="project.name" value="site"/>
<property name="docs.src" location="xdocs"/>
<property name="docs.dest" location="docs"/>
<!-- Every JAR in the jakarta-site2 lib directory. No target in the part of
this file reviewed here references this path. -->
<path id="anakia.classpath">
<fileset dir="${jakarta.site2.home}/lib">
<include name="*.jar"/>
</fileset>
</path>
<!-- ========================================================== -->
<!-- Datatype declarations -->
<!-- ========================================================== -->
<!-- TODO: define ${lucene.jar} for easier overriding -->
<!-- Lucene JAR(s) found under ${lucene.dir}/build, plus whatever the
importing project supplies through the project.classpath property. -->
<path id="compile.classpath">
<fileset dir="${lucene.dir}" includes="build/lucene*.jar"/>
<pathelement path="${project.classpath}"/>
</path>
<!-- Compile classpath plus JUnit, the compiled classes and the compiled tests. -->
<path id="test.classpath">
<path refid="compile.classpath"/>
<pathelement location="${junit.jar}"/>
<pathelement location="${build.classes.dir}"/>
<pathelement location="${test.classes.dir}"/>
</path>
<!-- Creates the output directories and sets has.tests when the test source
directory exists; has.tests gates the test-compile and test targets. -->
<target name="init">
<echo message="Building ${ant.project.name}"/>
<tstamp/>
<mkdir dir="${build.dir}"/>
<mkdir dir="${build.classes.dir}"/>
<mkdir dir="${dist.dir}"/>
<mkdir dir="${test.output.dir}"/>
<mkdir dir="${test.classes.dir}"/>
<available property="has.tests" file="${test.src.dir}" type="dir"/>
</target>
<!-- With the default property values the classes, test and package
directories all sit under build.dir or dist.dir, so their separate deletes
only matter when those locations have been overridden. -->
<target name="clean"
description="Deletes all previous build artifacts">
<delete dir="${build.dir}"/>
<delete dir="${build.classes.dir}"/>
<delete dir="${dist.dir}"/>
<delete dir="${package.dir}"/>
<delete dir="${test.output.dir}"/>
<delete dir="${test.classes.dir}"/>
</target>
<!-- Packs everything under build.classes.dir into ${dist.dir}/${dist.name}.jar. -->
<target name="dist" depends="compile" description="Create JAR">
<jar jarfile="${dist.dir}/${dist.name}.jar"
basedir="${build.classes.dir}"
/>
</target>
<!-- Compiles src.dir into build.classes.dir, then copies every non-Java
file from src.dir next to the compiled classes. -->
<target name="compile" depends="init">
<javac destdir="${build.classes.dir}"
debug="${build.debug}"
includeAntRuntime="yes"
deprecation="true"
srcdir="${src.dir}"
classpathref="compile.classpath"
encoding="${build.encoding}"
/>
<copy todir="${build.classes.dir}">
<fileset dir="${src.dir}" excludes="**/*.java"/>
</copy>
</target>
<!-- Compiles the tests into test.classes.dir and copies their non-Java
resources alongside. Runs only when has.tests is set. The sources are read
from ${test.src.dir}, the same property the has.tests check uses, so
overriding it no longer leaves this target pointing at src/test. -->
<target name="test-compile" depends="compile" if="has.tests">
  <javac destdir="${test.classes.dir}"
         debug="${build.debug}"
         includeAntRuntime="yes"
         srcdir="${test.src.dir}"
         classpathref="test.classpath"
         encoding="${build.encoding}"
  />
  <copy todir="${test.classes.dir}">
    <fileset dir="${test.src.dir}" excludes="**/*.java"/>
  </copy>
</target>
<!-- Runs the JUnit tests and fails the build if any test errors or fails.
Setting the "testcase" property runs that single class; otherwise every
compiled class matching *Test or Test* is run. -->
<target name="test" depends="test-compile" if="has.tests">
  <junit printsummary="no"
         errorProperty="test.failed"
         failureProperty="test.failed"
         fork="${junit.fork}">
    <classpath refid="test.classpath"/>
    <sysproperty key="docs.dir" file="${test.classes.dir}"/>
    <sysproperty key="index.dir" file="${test.output.dir}/index"/>
    <sysproperty key="dataDir" file="${test.src.dir}"/>
    <formatter type="brief" usefile="false"/>
    <test name="${testcase}" if="testcase"/>
    <!-- todir used to reference test.data.dir, which is never defined. -->
    <batchtest todir="${test.output.dir}" unless="testcase">
      <fileset dir="${test.classes.dir}"
               includes="**/*Test.class,**/Test*.class"
      />
    </batchtest>
  </junit>
  <fail if="test.failed">
Unit tests failed. Check log or reports for details
  </fail>
</target>
<target name="default" depends="test,dist"/>
<!-- ================================================================== -->
<!-- Documentation -->
<!-- ================================================================== -->
<!-- Generates API docs for all packages under src.dir into build.javadoc,
linking to the JDK and Lucene API docs. The overview attribute points at
${src.dir}/overview.html. -->
<target name="javadoc" depends="compile">
<mkdir dir="${build.javadoc}"/>
<javadoc
sourcepath="${src.dir}"
overview="${src.dir}/overview.html"
packagenames="*"
destdir="${build.javadoc}"
author="true"
version="true"
use="true"
windowtitle="${Name} ${version} API"
doctitle="${Name} ${version} API"
encoding="${build.encoding}"
>
<link href="${javadoc.link.java}"/>
<link href="${javadoc.link.lucene}"/>
<tag name="todo" description="To Do:"/>
<classpath refid="compile.classpath"/>
</javadoc>
</target>
<!-- ================================================================== -->
<!-- D I S T R I B U T I O N -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<!-- Assembles the release tree under package.dir (API docs, site docs,
top-level *.txt files, sources, build.xml and the JAR), then tars it
together with common.xml into ${dist.dir}/${dist.name}.tar.gz. -->
<target name="package" depends="dist, javadoc">
<mkdir dir="${package.dir}"/>
<mkdir dir="${package.dir}/docs"/>
<mkdir dir="${package.dir}/docs/api"/>
<!-- Created here so the docs copy below has a directory to read. -->
<mkdir dir="${docs.dest}"/>
<copy todir="${package.dir}/docs/api">
<fileset dir="${build.javadoc}"/>
</copy>
<copy todir="${package.dir}/docs">
<fileset dir="${docs.dest}/"/>
</copy>
<copy todir="${package.dir}">
<fileset dir=".">
<include name="*.txt"/>
</fileset>
</copy>
<copy todir="${package.dir}/src">
<fileset dir="src"/>
</copy>
<copy todir="${package.dir}/" file="build.xml"/>
<!-- common.xml goes beside the package directory (not inside it) and is
added to the tarball through the includes pattern below. -->
<copy todir="${dist.dir}/" file="${common.dir}/common.xml"/>
<copy file="${dist.dir}/${dist.name}.jar" todir="${package.dir}"/>
<tar tarfile="${dist.dir}/${dist.name}.tar.gz" basedir="${dist.dir}/"
compression="gzip" includes="${dist.name}/**,common.xml"/>
</target>
<!-- ================================================================== -->
<!-- Copy release to server -->
<!-- ================================================================== -->
<!-- Publishes a release using the external ssh and scp executables:
1. create the release directory on release.host,
2. upload the JAR and the tarball into it,
3. delete the old API docs on web.host,
4. upload the freshly generated API docs. -->
<target name="release" depends="package">
<exec executable="ssh">
<arg value="${release.host}"/>
<arg value="mkdir"/>
<arg value="${release.path}/${dist.name}"/>
</exec>
<exec executable="scp">
<arg value="${dist.dir}/${dist.name}.jar"/>
<arg value="${dist.dir}/${dist.name}.tar.gz"/>
<arg value="${release.host}:${release.path}/${dist.name}"/>
</exec>
<exec executable="ssh">
<arg value="${web.host}"/>
<arg value="rm"/>
<arg value="-rf"/>
<arg value="${web.path}/api"/>
</exec>
<exec executable="scp">
<arg value="-r"/>
<arg value="${build.javadoc}"/>
<arg value="${web.host}:${web.path}/api"/>
</exec>
</target>
</project>

View File

@ -1,19 +0,0 @@
<?xml version="1.0"?>
<project name="parsers" default="default">
<description>
Document parsers
</description>
<!-- Everything in lib/ is treated as an extra compile-time dependency. -->
<path id="additional.dependencies">
<fileset dir="lib"/>
</path>
<!-- Flattens that path into the project.classpath property. It is set
before the import below, and the imported build file is expected to read
it when building its compile classpath. -->
<pathconvert property="project.classpath"
targetos="unix"
refid="additional.dependencies"
/>
<import file="../common.xml"/>
</project>

View File

@ -1 +0,0 @@
Place pj.jar (from http://www.etymon.com/pub/software/pj/) and the log4j JAR here.

View File

@ -1,172 +0,0 @@
package org.apache.lucene.parsers.pdf;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation"
* must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import com.etymon.pj.Pdf;
import com.etymon.pj.exception.InvalidPdfObjectException;
import com.etymon.pj.exception.PjException;
import com.etymon.pj.object.PjArray;
import com.etymon.pj.object.PjObject;
import com.etymon.pj.object.PjPage;
import com.etymon.pj.object.PjStream;
import org.apache.log4j.Category;
import java.io.File;
import java.io.IOException;
import java.util.Vector;
/**
 * <p>
 * Attempts to extract text from a PDF file.
 * </p>
 * <p>
 * Text is taken to be whatever appears between '(' and ')' in each
 * decompressed page content stream; escaped or nested parentheses are not
 * interpreted.
 * </p>
 * <p>
 * <a href="http://www.mail-archive.com/lucene-user@jakarta.apache.org/msg00280.html">
 * Known limitations</a>
 * </p>
 *
 * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
 * @version $Revision$
 */
public class PdfTextExtractor
{
    private static Category cat = Category.getInstance(PdfTextExtractor.class);

    /** File read by {@link #main} when no path is given on the command line. */
    private static final String DEFAULT_PDF = "/usr/local/test.pdf";

    /**
     * Prints the extracted text of every page of a PDF file.
     *
     * @param args optional; args[0] is the PDF to read, defaulting to
     *             /usr/local/test.pdf
     */
    public static void main(String[] args)
    {
        String path = DEFAULT_PDF;
        if (args != null && args.length > 0)
        {
            path = args[0];
        }
        File f = new File(path);
        try
        {
            Pdf pdf = new Pdf(f.toString());
            int pagecount = pdf.getPageCount();
            cat.debug(f.toString() + " has " + pagecount + " pages.");
            // Pages are requested starting from 1.
            for (int i = 1; i <= pagecount; i++)
            {
                System.out.println(getContent(pdf, i));
            }
        }
        catch (IOException ioe)
        {
            cat.error("IOException parsing PDF file:" + f.toString(), ioe);
        }
        catch (PjException pje)
        {
            cat.error("PjException parsing PDF file:" + f.toString(), pje);
        }
    }

    /**
     * Extracts the text of one page.
     *
     * @param pdf    the opened PDF
     * @param pageNo the page number to read
     * @return the page text, or <code>null</code> if the page could not be
     *         read (the failure is logged)
     */
    private static String getContent(Pdf pdf, int pageNo)
    {
        String content = null;
        PjStream stream = null;
        StringBuffer strbf = new StringBuffer();
        try
        {
            PjPage page = (PjPage) pdf.getObject(pdf.getPage(pageNo));
            PjObject pobj = (PjObject) pdf.resolve(page.getContents());
            if (pobj instanceof PjArray)
            {
                // The page contents are split over several streams: concatenate them.
                PjArray array = (PjArray) pobj;
                Vector vArray = array.getVector();
                int size = vArray.size();
                for (int j = 0; j < size; j++)
                {
                    stream = (PjStream) pdf.resolve((PjObject) vArray.get(j));
                    strbf.append(getStringFromPjStream(stream));
                }
                content = strbf.toString();
            }
            else
            {
                stream = (PjStream) pobj;
                content = getStringFromPjStream(stream);
            }
        }
        catch (InvalidPdfObjectException pdfe)
        {
            cat.error("Invalid PDF Object:" + pdfe, pdfe);
        }
        catch (Exception e)
        {
            cat.error("Exception in getContent() " + e, e);
        }
        return content;
    }

    /**
     * Decompresses a content stream and concatenates every run of characters
     * found between a '(' and the next ')'.
     *
     * @param stream the content stream to read
     * @return the collected text; empty if the stream cannot be decompressed
     *         or contains no complete parenthesised run
     */
    private static String getStringFromPjStream(PjStream stream)
    {
        StringBuffer strbf = new StringBuffer();
        try
        {
            stream = stream.flateDecompress();
            String longString = stream.toString();

            // Stop as soon as either parenthesis is missing. The earlier loop
            // could restart from index -1 or call substring() with end = -1
            // when the parentheses were unbalanced.
            int pos = 0;
            while (true)
            {
                int open = longString.indexOf('(', pos);
                if (open == -1)
                {
                    break;
                }
                int close = longString.indexOf(')', open + 1);
                if (close == -1)
                {
                    break;
                }
                strbf.append(longString.substring(open + 1, close));
                pos = close;
            }
        }
        catch (InvalidPdfObjectException pdfe)
        {
            cat.error("InvalidObjectException:" + pdfe.getMessage(), pdfe);
        }
        return strbf.toString();
    }
}