mirror of https://github.com/apache/lucene.git
SOLR-1274: added extract only output options
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@802282 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d880e90b1b
commit
af828c575d
|
@ -34,3 +34,5 @@ $Id:$
|
|||
See http://www.lucidimagination.com/search/document/d6f1899a85b2a45c/vote_apache_tika_0_4_release_candidate_2#d6f1899a85b2a45c
|
||||
for discussion on language detection.
|
||||
See http://www.apache.org/dist/lucene/tika/CHANGES-0.4.txt. (gsingers)
|
||||
|
||||
6. SOLR-1274: Added text serialization output for extractOnly (Peter Wolanin, gsingers)
|
|
@ -38,7 +38,9 @@ import org.apache.tika.sax.xpath.MatchingContentHandler;
|
|||
import org.apache.tika.sax.xpath.XPathParser;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.xml.serialize.OutputFormat;
|
||||
import org.apache.xml.serialize.BaseMarkupSerializer;
|
||||
import org.apache.xml.serialize.XMLSerializer;
|
||||
import org.apache.xml.serialize.TextSerializer;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
|
@ -52,7 +54,14 @@ import java.io.StringWriter;
|
|||
*
|
||||
**/
|
||||
public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||
|
||||
/**
|
||||
* Plain-text output format supported when extractOnly is true.
|
||||
*/
|
||||
public static final String TEXT_FORMAT = "text";
|
||||
/**
|
||||
* XML output format supported when extractOnly is true. This is the default.
|
||||
*/
|
||||
public static final String XML_FORMAT = "xml";
|
||||
/**
|
||||
* XHTML XPath parser.
|
||||
*/
|
||||
|
@ -152,10 +161,17 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
ContentHandler parsingHandler = handler;
|
||||
|
||||
StringWriter writer = null;
|
||||
XMLSerializer serializer = null;
|
||||
BaseMarkupSerializer serializer = null;
|
||||
if (extractOnly == true) {
|
||||
String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
|
||||
writer = new StringWriter();
|
||||
serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
|
||||
if (extractFormat.equals(TEXT_FORMAT)) {
|
||||
serializer = new TextSerializer();
|
||||
serializer.setOutputCharStream(writer);
|
||||
serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
|
||||
} else {
|
||||
serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
|
||||
}
|
||||
if (xpathExpr != null) {
|
||||
Matcher matcher =
|
||||
PARSER.parse(xpathExpr);
|
||||
|
|
|
@ -81,6 +81,11 @@ public interface ExtractingParams {
|
|||
*/
|
||||
public static final String EXTRACT_ONLY = "extractOnly";
|
||||
|
||||
/**
|
||||
* Content output format if extractOnly is true. Default is "xml", alternative is "text".
|
||||
*/
|
||||
public static final String EXTRACT_FORMAT = "extractFormat";
|
||||
|
||||
/**
|
||||
* Capture attributes separately according to the name of the element, instead of just adding them to the string buffer
|
||||
*/
|
||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.solr.common.util.NamedList;
|
|||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.handler.extraction.ExtractingParams;
|
||||
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
|
||||
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
@ -243,6 +244,24 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
|
|||
assertTrue("nl is null and it shouldn't be", nl != null);
|
||||
Object title = nl.get("title");
|
||||
assertTrue("title is null and it shouldn't be", title != null);
|
||||
assertTrue(extraction.indexOf("<?xml") != -1);
|
||||
|
||||
rsp = loadLocal("solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
|
||||
ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
|
||||
assertTrue("rsp is null and it shouldn't be", rsp != null);
|
||||
list = rsp.getValues();
|
||||
|
||||
extraction = (String) list.get("solr-word.pdf");
|
||||
assertTrue("extraction is null and it shouldn't be", extraction != null);
|
||||
assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
|
||||
assertTrue(extraction.indexOf("<?xml") == -1);
|
||||
|
||||
nl = (NamedList) list.get("solr-word.pdf_metadata");
|
||||
assertTrue("nl is null and it shouldn't be", nl != null);
|
||||
title = nl.get("title");
|
||||
assertTrue("title is null and it shouldn't be", title != null);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue