mirror of https://github.com/apache/lucene.git
SOLR-1274: added extract only output options
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@802282 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d880e90b1b
commit
af828c575d
|
@ -34,3 +34,5 @@ $Id:$
|
||||||
See http://www.lucidimagination.com/search/document/d6f1899a85b2a45c/vote_apache_tika_0_4_release_candidate_2#d6f1899a85b2a45c
|
See http://www.lucidimagination.com/search/document/d6f1899a85b2a45c/vote_apache_tika_0_4_release_candidate_2#d6f1899a85b2a45c
|
||||||
for discussion on language detection.
|
for discussion on language detection.
|
||||||
See http://www.apache.org/dist/lucene/tika/CHANGES-0.4.txt. (gsingers)
|
See http://www.apache.org/dist/lucene/tika/CHANGES-0.4.txt. (gsingers)
|
||||||
|
|
||||||
|
6. SOLR-1274: Added text serialization output for extractOnly (Peter Wolanin, gsingers)
|
|
@ -38,7 +38,9 @@ import org.apache.tika.sax.xpath.MatchingContentHandler;
|
||||||
import org.apache.tika.sax.xpath.XPathParser;
|
import org.apache.tika.sax.xpath.XPathParser;
|
||||||
import org.apache.tika.exception.TikaException;
|
import org.apache.tika.exception.TikaException;
|
||||||
import org.apache.xml.serialize.OutputFormat;
|
import org.apache.xml.serialize.OutputFormat;
|
||||||
|
import org.apache.xml.serialize.BaseMarkupSerializer;
|
||||||
import org.apache.xml.serialize.XMLSerializer;
|
import org.apache.xml.serialize.XMLSerializer;
|
||||||
|
import org.apache.xml.serialize.TextSerializer;
|
||||||
import org.xml.sax.ContentHandler;
|
import org.xml.sax.ContentHandler;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
|
@ -52,7 +54,14 @@ import java.io.StringWriter;
|
||||||
*
|
*
|
||||||
**/
|
**/
|
||||||
public class ExtractingDocumentLoader extends ContentStreamLoader {
|
public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
|
/**
|
||||||
|
* Extract Only supported format
|
||||||
|
*/
|
||||||
|
public static final String TEXT_FORMAT = "text";
|
||||||
|
/**
|
||||||
|
* Extract Only supported format. Default
|
||||||
|
*/
|
||||||
|
public static final String XML_FORMAT = "xml";
|
||||||
/**
|
/**
|
||||||
* XHTML XPath parser.
|
* XHTML XPath parser.
|
||||||
*/
|
*/
|
||||||
|
@ -152,10 +161,17 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
||||||
ContentHandler parsingHandler = handler;
|
ContentHandler parsingHandler = handler;
|
||||||
|
|
||||||
StringWriter writer = null;
|
StringWriter writer = null;
|
||||||
XMLSerializer serializer = null;
|
BaseMarkupSerializer serializer = null;
|
||||||
if (extractOnly == true) {
|
if (extractOnly == true) {
|
||||||
|
String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
|
||||||
writer = new StringWriter();
|
writer = new StringWriter();
|
||||||
|
if (extractFormat.equals(TEXT_FORMAT)) {
|
||||||
|
serializer = new TextSerializer();
|
||||||
|
serializer.setOutputCharStream(writer);
|
||||||
|
serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
|
||||||
|
} else {
|
||||||
serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
|
serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
|
||||||
|
}
|
||||||
if (xpathExpr != null) {
|
if (xpathExpr != null) {
|
||||||
Matcher matcher =
|
Matcher matcher =
|
||||||
PARSER.parse(xpathExpr);
|
PARSER.parse(xpathExpr);
|
||||||
|
|
|
@ -81,6 +81,11 @@ public interface ExtractingParams {
|
||||||
*/
|
*/
|
||||||
public static final String EXTRACT_ONLY = "extractOnly";
|
public static final String EXTRACT_ONLY = "extractOnly";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Content output format if extractOnly is true. Default is "xml", alternative is "text".
|
||||||
|
*/
|
||||||
|
public static final String EXTRACT_FORMAT = "extractFormat";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Capture attributes separately according to the name of the element, instead of just adding them to the string buffer
|
* Capture attributes separately according to the name of the element, instead of just adding them to the string buffer
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.solr.common.util.NamedList;
|
||||||
import org.apache.solr.common.SolrException;
|
import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.handler.extraction.ExtractingParams;
|
import org.apache.solr.handler.extraction.ExtractingParams;
|
||||||
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
|
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
|
||||||
|
import org.apache.solr.handler.extraction.ExtractingDocumentLoader;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -243,6 +244,24 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
|
||||||
assertTrue("nl is null and it shouldn't be", nl != null);
|
assertTrue("nl is null and it shouldn't be", nl != null);
|
||||||
Object title = nl.get("title");
|
Object title = nl.get("title");
|
||||||
assertTrue("title is null and it shouldn't be", title != null);
|
assertTrue("title is null and it shouldn't be", title != null);
|
||||||
|
assertTrue(extraction.indexOf("<?xml") != -1);
|
||||||
|
|
||||||
|
rsp = loadLocal("solr-word.pdf", ExtractingParams.EXTRACT_ONLY, "true",
|
||||||
|
ExtractingParams.EXTRACT_FORMAT, ExtractingDocumentLoader.TEXT_FORMAT);
|
||||||
|
assertTrue("rsp is null and it shouldn't be", rsp != null);
|
||||||
|
list = rsp.getValues();
|
||||||
|
|
||||||
|
extraction = (String) list.get("solr-word.pdf");
|
||||||
|
assertTrue("extraction is null and it shouldn't be", extraction != null);
|
||||||
|
assertTrue(extraction + " does not contain " + "solr-word", extraction.indexOf("solr-word") != -1);
|
||||||
|
assertTrue(extraction.indexOf("<?xml") == -1);
|
||||||
|
|
||||||
|
nl = (NamedList) list.get("solr-word.pdf_metadata");
|
||||||
|
assertTrue("nl is null and it shouldn't be", nl != null);
|
||||||
|
title = nl.get("title");
|
||||||
|
assertTrue("title is null and it shouldn't be", title != null);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue