SOLR-1274: added extract only output options

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@802282 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2009-08-08 01:39:16 +00:00
parent d880e90b1b
commit af828c575d
4 changed files with 46 additions and 4 deletions

View File

@ -34,3 +34,5 @@ $Id:$
See http://www.lucidimagination.com/search/document/d6f1899a85b2a45c/vote_apache_tika_0_4_release_candidate_2#d6f1899a85b2a45c
for discussion on language detection.
See http://www.apache.org/dist/lucene/tika/CHANGES-0.4.txt. (gsingers)
6. SOLR-1274: Added text serialization output for extractOnly (Peter Wolanin, gsingers)

View File

@ -38,7 +38,9 @@ import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.exception.TikaException;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.BaseMarkupSerializer;
import org.apache.xml.serialize.XMLSerializer;
import org.apache.xml.serialize.TextSerializer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@ -52,7 +54,14 @@ import java.io.StringWriter;
*
**/
public class ExtractingDocumentLoader extends ContentStreamLoader {
/**
* Value of the extractFormat parameter that selects plain-text output for extractOnly requests.
*/
public static final String TEXT_FORMAT = "text";
/**
* Value of the extractFormat parameter that selects XML output for extractOnly requests. This is the default.
*/
public static final String XML_FORMAT = "xml";
/**
* XHTML XPath parser.
*/
@ -152,10 +161,17 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
ContentHandler parsingHandler = handler;
StringWriter writer = null;
XMLSerializer serializer = null;
BaseMarkupSerializer serializer = null;
if (extractOnly == true) {
String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
writer = new StringWriter();
serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
if (extractFormat.equals(TEXT_FORMAT)) {
serializer = new TextSerializer();
serializer.setOutputCharStream(writer);
serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
} else {
serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
}
if (xpathExpr != null) {
Matcher matcher =
PARSER.parse(xpathExpr);

View File

@ -81,6 +81,11 @@ public interface ExtractingParams {
*/
public static final String EXTRACT_ONLY = "extractOnly";
/**
* Content output format if extractOnly is true. Default is "xml", alternative is "text".
*/
public static final String EXTRACT_FORMAT = "extractFormat";
/**
* Capture attributes separately according to the name of the element, instead of just adding them to the string buffer
*/

View File

@ -25,6 +25,7 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.SolrException;
import org.apache.solr.handler.extraction.ExtractingParams;
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
import java.util.List;
import java.util.ArrayList;
@ -243,6 +244,24 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
assertTrue("nl is null and it shouldn't be", nl != null);
Object title = nl.get("title");
assertTrue("title is null and it shouldn't be", title != null);
assertTrue(extraction.indexOf("<?xml") != -1);
rsp = loadLocal("solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
assertTrue("rsp is null and it shouldn't be", rsp != null);
list = rsp.getValues();
extraction = (String) list.get("solr-word.pdf");
assertTrue("extraction is null and it shouldn't be", extraction != null);
assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
assertTrue(extraction.indexOf("<?xml") == -1);
nl = (NamedList) list.get("solr-word.pdf_metadata");
assertTrue("nl is null and it shouldn't be", nl != null);
title = nl.get("title");
assertTrue("title is null and it shouldn't be", title != null);
}