diff --git a/sandbox/contributions/XML-Indexing-Demo/IndexingRequest.xml b/sandbox/contributions/XML-Indexing-Demo/IndexingRequest.xml deleted file mode 100644 index ef89135ff8d..00000000000 --- a/sandbox/contributions/XML-Indexing-Demo/IndexingRequest.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - - - diff --git a/sandbox/contributions/XML-Indexing-Demo/README.txt b/sandbox/contributions/XML-Indexing-Demo/README.txt deleted file mode 100644 index 26173a12609..00000000000 --- a/sandbox/contributions/XML-Indexing-Demo/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -This is the README file for XML Indexing Demo contributed by Aruna Raghavan. - -$Id$ - -Lucene Indexing Demo illustrates how one can parse and index XML documents -using a SAX2 or DOM parser with Lucene. diff --git a/sandbox/contributions/XML-Indexing-Demo/XMLIndexingDemo.zip b/sandbox/contributions/XML-Indexing-Demo/XMLIndexingDemo.zip deleted file mode 100644 index aa42058d175..00000000000 Binary files a/sandbox/contributions/XML-Indexing-Demo/XMLIndexingDemo.zip and /dev/null differ diff --git a/sandbox/contributions/XML-Indexing-Demo/build.xml b/sandbox/contributions/XML-Indexing-Demo/build.xml deleted file mode 100644 index 54e960167ec..00000000000 --- a/sandbox/contributions/XML-Indexing-Demo/build.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - Example of Lucene XML indexing - - - - diff --git a/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/IndexFiles.java b/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/IndexFiles.java deleted file mode 100644 index 1dace90ed79..00000000000 --- a/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/IndexFiles.java +++ /dev/null @@ -1,111 +0,0 @@ -package org.apache.lucenesandbox.xmlindexingdemo; - -/* ==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . - */ - -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.index.IndexWriter; - -import java.io.File; -import java.util.Date; - -class IndexFiles -{ - public static void main(String[] args) - throws Exception - { - try - { - Date start = new Date(); - - IndexWriter writer = new IndexWriter("index", new StandardAnalyzer(), true); - indexDocs(writer, new File(args[0])); - - writer.optimize(); - writer.close(); - - Date end = new Date(); - - System.out.print(end.getTime() - start.getTime()); - System.out.println(" total milliseconds"); - - } - catch (Exception e) - { - System.out.println(" caught a " + e.getClass() + - "\n with message: " + e.getMessage()); - throw e; - } - } - - public static void indexDocs(IndexWriter writer, File file) - throws Exception - { - if (file.isDirectory()) - { - String[] files = file.list(); - for (int i = 0; i < files.length; i++) - indexDocs(writer, new File(file, files[i])); - } - else - { - System.out.println("adding " + file); - XMLDocumentHandlerSAX hdlr = new XMLDocumentHandlerSAX(file); - writer.addDocument(hdlr.getDocument()); - // For DOM, use - // XMLDocumentHandlerDOM hdlr = new XMLDocumentHandlerDOM(); - // writer.addDocument(hdlr.createXMLDocument(file)); - } - } -} diff --git a/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/SearchFiles.java b/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/SearchFiles.java deleted file mode 100644 index 047cfefb73b..00000000000 --- a/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/SearchFiles.java +++ /dev/null @@ -1,126 +0,0 @@ -package org.apache.lucenesandbox.xmlindexingdemo; - -/* ==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" and - * "Apache Lucene" must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * "Apache Lucene", nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . - */ - -import java.io.IOException; -import java.io.BufferedReader; -import java.io.InputStreamReader; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.search.Searcher; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Hits; -import org.apache.lucene.queryParser.QueryParser; - -class SearchFiles { - public static void main(String[] args) { - try { - Searcher searcher = new IndexSearcher("index"); - Analyzer analyzer = new StandardAnalyzer(); - - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); - while (true) { - System.out.print("Query: "); - String line = in.readLine(); - - if (line.length() == -1) - break; - - Query query = QueryParser.parse(line, "name", analyzer); - System.out.println("Searching for: " + query.toString("name")); - - Hits hits = searcher.search(query); - System.out.println(hits.length() + " total matching documents"); - - final int HITS_PER_PAGE = 10; - for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) - { - int end = Math.min(hits.length(), start + HITS_PER_PAGE); - for (int i = start; i < end; i++) - { - Document doc = hits.doc(i); - String name = doc.get("name"); - System.out.println(name); - System.out.println(doc.get("profession")); - System.out.println(doc.get("addressLine1")); - System.out.println(doc.get("addressLine2")); - System.out.print(doc.get("city")); - System.out.print(" "); - System.out.print(doc.get("state")); - System.out.print(" "); - System.out.print(doc.get("zip")); - System.out.println(doc.get("country")); - - } - - if (hits.length() > end) { - System.out.print("more (y/n) ? "); - line = in.readLine(); - if (line.length() == 0 || line.charAt(0) == 'n') - break; - } - } - } - searcher.close(); - - } catch (Exception e) { - System.out.println(" caught a " + e.getClass() + - "\n with message: " + e.getMessage()); - } - } -} diff --git a/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerDOM.java b/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerDOM.java deleted file mode 100644 index f7c57b782ea..00000000000 --- a/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerDOM.java +++ /dev/null @@ -1,131 +0,0 @@ -package org.apache.lucenesandbox.xmlindexingdemo; - -import org.w3c.dom.*; -import org.w3c.dom.Node; -import javax.xml.parsers.*; -import org.apache.lucene.document.Field; - -import java.io.File; - -/** - * - */ -public class XMLDocumentHandlerDOM { - public org.apache.lucene.document.Document createXMLDocument(File f) { - org.apache.lucene.document.Document document = new org.apache.lucene.document.Document(); - DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); - try { - DocumentBuilder df = dbf.newDocumentBuilder(); - org.w3c.dom.Document d = df.parse(f); - Node root = d.getDocumentElement(); - traverseTree(root, document); - } catch (Exception e) { - System.out.println("error: " + e); - e.printStackTrace(); - } - return document; - } - - static private void traverseTree(Node node, org.apache.lucene.document.Document document) { - NodeList nl = node.getChildNodes(); - if (nl.getLength() == 0) { - if (node.getNodeType() == Node.TEXT_NODE) { - Node parentNode = node.getParentNode(); - if (parentNode.getNodeType() == Node.ELEMENT_NODE) { -// String parentNodeName = parentNode.getNodeName(); -// String nodeValue = node.getNodeValue(); -// if (parentNodeName.equals("name")) -// { - Node siblingNode = node.getNextSibling(); - if (siblingNode != null) { - if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) { - document.add(Field.Text("name", siblingNode.getNodeValue())); - } - } -// } -// else if (parentNodeName.equals("profession")) -// { -// Node siblingNode = node.getNextSibling(); -// if (siblingNode != null) -// { -// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) -// { -// document.add(Field.Text([arentNodeName, siblingNode.getNodeValue())); -// } -// } -// } -// else if (parentNodeName == "addressLine1") -// { -// Node siblingNode = node.getNextSibling(); -// if(siblingNode != null) -// { -// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) -// { -// document.add(Field.Text("addressLine1", siblingNode.getNodeValue())); -// } -// } -// } -// else if (parentNodeName.equals("addressLine2")) -// { -// Node siblingNode = node.getNextSibling(); -// if (siblingNode != null) -// { -// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) -// { -// document.add(Field.Text("addressLine2", siblingNode.getNodeValue())); -// } -// } -// } -// if (parentNodeName.equals("city")) -// { -// Node siblingNode = node.getNextSibling(); -// if (siblingNode != null) -// { -// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) -// { -// document.add(Field.Text("city", siblingNode.getNodeValue())); -// } -// } -// } -// else if (parentNodeName.equals("zip")) -// { -// Node siblingNode = node.getNextSibling(); -// if (siblingNode != null) -// { -// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) -// { -// document.add(Field.Text("zip", siblingNode.getNodeValue())); -// } -// } -// } -// else if (parentNodeName.equals("state")) -// { -// Node siblingNode = node.getNextSibling(); -// if (siblingNode != null) -// { -// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) -// { -// document.add(Field.Text("state", siblingNode.getNodeValue())); -// } -// } -// } -// else if (parentNodeName.equals("country")) -// { -// Node siblingNode = node.getNextSibling(); -// if (siblingNode != null) -// { -// if (siblingNode.getNodeType() == Node.CDATA_SECTION_NODE) -// { -// document.add(Field.Text("country", siblingNode.getNodeValue())); -// } -// } -// } - } - } - } else { - for (int i = 0; i < nl.getLength(); i++) { - traverseTree(nl.item(i), document); - } - } - } -} diff --git a/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerSAX.java b/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerSAX.java deleted file mode 100644 index 32170daa22e..00000000000 --- a/sandbox/contributions/XML-Indexing-Demo/src/java/org/apache/lucenesandbox/xmlindexingdemo/XMLDocumentHandlerSAX.java +++ /dev/null @@ -1,106 +0,0 @@ -package org.apache.lucenesandbox.xmlindexingdemo; - -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - -import java.io.File; -import java.io.IOException; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -public class XMLDocumentHandlerSAX extends DefaultHandler { - /** A buffer for each XML element */ - private StringBuffer elementBuffer = new StringBuffer(); - - private Document mDocument; - - // constructor - public XMLDocumentHandlerSAX(File xmlFile) - throws ParserConfigurationException, SAXException, IOException { - SAXParserFactory spf = SAXParserFactory.newInstance(); - - // use validating parser? - //spf.setValidating(false); - // make parser name space aware? - //spf.setNamespaceAware(true); - - SAXParser parser = spf.newSAXParser(); - //System.out.println("parser is validating: " + parser.isValidating()); - try { - parser.parse(xmlFile, this); - } catch (org.xml.sax.SAXParseException spe) { - System.out.println("SAXParser caught SAXParseException at line: " + - spe.getLineNumber() + " column " + - spe.getColumnNumber()); - } - } - - // call at document start - public void startDocument() throws SAXException { - mDocument = new Document(); - } - - // call at element start - public void startElement(String namespaceURI, String localName, - String qualifiedName, Attributes attrs) throws SAXException { - - String eName = localName; - if ("".equals(eName)) { - eName = qualifiedName; // namespaceAware = false - } - // list the attribute(s) - if (attrs != null) { - for (int i = 0; i < attrs.getLength(); i++) { - String aName = attrs.getLocalName(i); // Attr name - if ("".equals(aName)) { aName = attrs.getQName(i); } - // perform application specific action on attribute(s) - // for now just dump out attribute name and value - System.out.println("attr " + aName+"="+attrs.getValue(i)); - } - } - elementBuffer.setLength(0); - } - - // call when cdata found - public void characters(char[] text, int start, int length) - throws SAXException { - elementBuffer.append(text, start, length); - } - - // call at element end - public void endElement(String namespaceURI, String simpleName, - String qualifiedName) throws SAXException { - - String eName = simpleName; - if ("".equals(eName)) { - eName = qualifiedName; // namespaceAware = false - } - - mDocument.add(Field.Text(eName, elementBuffer.toString())); - } - - public Document getDocument() { - return mDocument; - } -} diff --git a/sandbox/contributions/XML-Indexing-Demo/xdocs/about-LuceneIndexingDemo.xml b/sandbox/contributions/XML-Indexing-Demo/xdocs/about-LuceneIndexingDemo.xml deleted file mode 100644 index 3538ebb8175..00000000000 --- a/sandbox/contributions/XML-Indexing-Demo/xdocs/about-LuceneIndexingDemo.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - -Aruna Raghavan -Otis Gospodnetic -Lucene Indexing Demo - - - - -
-

Lucene Indexing Demo illustrates how one can parse XML documents -using a SAX2 or DOM and index them with Lucene.

-
- - -
diff --git a/sandbox/contributions/build.xml b/sandbox/contributions/build.xml deleted file mode 100644 index c56a1dd1557..00000000000 --- a/sandbox/contributions/build.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - diff --git a/sandbox/contributions/common.xml b/sandbox/contributions/common.xml deleted file mode 100644 index 7e83e378195..00000000000 --- a/sandbox/contributions/common.xml +++ /dev/null @@ -1,241 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Unit tests failed. Check log or reports for details - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/sandbox/contributions/parsers/build.xml b/sandbox/contributions/parsers/build.xml deleted file mode 100644 index d47d11d4881..00000000000 --- a/sandbox/contributions/parsers/build.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - - - - Document parsers - - - - - - - - - - diff --git a/sandbox/contributions/parsers/lib/readme.txt b/sandbox/contributions/parsers/lib/readme.txt deleted file mode 100644 index 42c80cf3a2f..00000000000 --- a/sandbox/contributions/parsers/lib/readme.txt +++ /dev/null @@ -1 +0,0 @@ -Place pj.jar here (from http://www.etymon.com/pub/software/pj/) and log4j JAR. \ No newline at end of file diff --git a/sandbox/contributions/parsers/src/java/org/apache/lucene/parsers/pdf/PdfTextExtractor.java b/sandbox/contributions/parsers/src/java/org/apache/lucene/parsers/pdf/PdfTextExtractor.java deleted file mode 100644 index 6ef4a19ca20..00000000000 --- a/sandbox/contributions/parsers/src/java/org/apache/lucene/parsers/pdf/PdfTextExtractor.java +++ /dev/null @@ -1,172 +0,0 @@ -package org.apache.lucene.parsers.pdf; - -/* ==================================================================== - * The Apache Software License, Version 1.1 - * - * Copyright (c) 2001 The Apache Software Foundation. All rights - * reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. The end-user documentation included with the redistribution, - * if any, must include the following acknowledgment: - * "This product includes software developed by the - * Apache Software Foundation (http://www.apache.org/)." - * Alternately, this acknowledgment may appear in the software itself, - * if and wherever such third-party acknowledgments normally appear. - * - * 4. The names "Apache" and "Apache Software Foundation" - * must not be used to endorse or promote products - * derived from this software without prior written permission. For - * written permission, please contact apache@apache.org. - * - * 5. Products derived from this software may not be called "Apache", - * nor may "Apache" appear in their name, without - * prior written permission of the Apache Software Foundation. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * ==================================================================== - * - * This software consists of voluntary contributions made by many - * individuals on behalf of the Apache Software Foundation. For more - * information on the Apache Software Foundation, please see - * . - */ - -import com.etymon.pj.Pdf; -import com.etymon.pj.exception.InvalidPdfObjectException; -import com.etymon.pj.exception.PjException; -import com.etymon.pj.object.PjArray; -import com.etymon.pj.object.PjObject; -import com.etymon.pj.object.PjPage; -import com.etymon.pj.object.PjStream; -import org.apache.log4j.Category; - -import java.io.File; -import java.io.IOException; -import java.util.Vector; - -/** - *

- * Attempts to extract text from a PDF file. - *

- *

- * - * Known limitations - *

- * - * @author Kelvin Tan - * @version $Revision$ - */ -public class PdfTextExtractor -{ - private static Category cat = Category.getInstance(PdfTextExtractor.class); - - public static void main(String[] args) - { - File f = new File("/usr/local/test.pdf"); - try - { - Pdf pdf = new Pdf(f.toString()); - int pagecount = pdf.getPageCount(); - cat.debug(f.toString() + "has " + pagecount + " pages."); - for (int i = 1; i <= pagecount; i++) - { - System.out.println(getContent(pdf, i)); - } - } - catch (IOException ioe) - { - cat.error("IOException parsing PDF file:" + f.toString(), ioe); - } - catch (PjException pje) - { - cat.error("PjException parsing PDF file:" + f.toString(), pje); - } - } - - private static String getContent(Pdf pdf, int pageNo) - { - String content = null; - PjStream stream = null; - StringBuffer strbf = new StringBuffer(); - try - { - PjPage page = (PjPage) pdf.getObject(pdf.getPage(pageNo)); - PjObject pobj = (PjObject) pdf.resolve(page.getContents()); - if (pobj instanceof PjArray) - { - PjArray array = (PjArray) pobj; - Vector vArray = array.getVector(); - int size = vArray.size(); - for (int j = 0; j < size; j++) - { - stream = (PjStream) pdf.resolve((PjObject) vArray.get(j)); - strbf.append(getStringFromPjStream(stream)); - } - content = strbf.toString(); - } - else - { - stream = (PjStream) pobj; - content = getStringFromPjStream(stream); - } - } - catch (InvalidPdfObjectException pdfe) - { - cat.error("Invalid PDF Object:" + pdfe, pdfe); - } - catch (Exception e) - { - cat.error("Exception in getContent() " + e, e); - } - return content; - } - - private static String getStringFromPjStream(PjStream stream) - { - StringBuffer strbf = new StringBuffer(); - try - { - int start,end = 0; - stream = stream.flateDecompress(); - String longString = stream.toString(); - int strlen = longString.length(); - int lastIndex = longString.lastIndexOf(')'); - while (lastIndex != -1 && end != lastIndex) - { - start = longString.indexOf('(', end); - end = longString.indexOf(')', start); - String text = longString.substring(start + 1, end); - strbf.append(text); - } - } - catch (InvalidPdfObjectException pdfe) - { - cat.error("InvalidObjectException:" + pdfe.getMessage(), pdfe); - } - return strbf.toString(); - } -} -