mirror of https://github.com/apache/lucene.git
SOLR-2241: upgrade to Tika 0.8
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1040815 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0511306b53
commit
ef762e6046
|
@ -20,13 +20,13 @@ to your Solr Home lib directory. See http://wiki.apache.org/solr/ExtractingRequ
|
|||
Tika Dependency
|
||||
---------------
|
||||
|
||||
Current Version: Tika 0.8-SNAPSHOT (rev 942725)
|
||||
Current Version: Tika 0.8 (released 11/07/2010)
|
||||
|
||||
$Id:$
|
||||
|
||||
================== Release 1.5-dev ==================
|
||||
|
||||
================== Release 3.1-dev ==================
|
||||
|
||||
* Upgraded to Tika 0.8 and changed deprecated parse call
|
||||
|
||||
* SOLR-1756: The date.format setting causes ClassCastException when enabled and the config code that
|
||||
parses this setting does not properly use the same iterator instance. (Christoph Brill, Mark Miller)
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[b64b033af70609338c07e2a88a5f7efcd1a84ddb] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[78d832c11c42023d4bc12077a1d9b7b5025217bc] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[51baf91a2df10184a8cca5cb43f11418576743a1] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[3bc5a7691d234751986dbeeca353f9ee390f1ffb] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[194e1f0c6e458db0b840b1530534a199306c07d2] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[d2c9a0514c1c4123c815851a5643eccd3ca884c8] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[371c2537fc26548ca8187f426900b34d9ab8b435] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[d93af7f1688eba78bb8580e010adf1ee66ac1d40] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[ed19b45098b326c42f625db2613c21a03a3ff79e] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[c3cce64a366865316dd1e579a53e6db858166619] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[9972d973277def35e3749d39cf39dfa37d61f75c] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[a08d953500f508864bb22ff1306f396d8b634c22] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[c986646e69bef4e3cd9086eabfc67f6a200fa3d9] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[5f36eb4e9b23409c8b266b196140975de6da3a80] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[5b79f0246f6b9b599767586fc426b26cf28c960a] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[82282b542613378e3bd46c6850c6ac1e715b5f11] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[1a01b2b895b560d94dd12b3fd5e46a39724e16d1] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[6fd02d419c0653c0127773ad3f22e186c03764cb] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[ba482aecb0d9b5a1b74d038c37a8cdde821b0258] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[934d3a7a0c87fc25ffe6bdfa2774fc7ae8e5cbd8] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[809e47cc4fb901a2fd67c99b70952c36243a0cd2] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[6aba6dca7d96e30dd3c411cd0a2e28033b219767] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[25d23ac5cf511587131a9e9ee58ad384ccf6f57c] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -31,6 +31,7 @@ import org.apache.solr.handler.ContentStreamLoader;
|
|||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.sax.XHTMLContentHandler;
|
||||
import org.apache.tika.sax.xpath.Matcher;
|
||||
|
@ -190,7 +191,8 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
} //else leave it as is
|
||||
|
||||
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
||||
parser.parse(inputStream, parsingHandler, metadata);
|
||||
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
||||
parser.parse(inputStream, parsingHandler, metadata, context);
|
||||
if (extractOnly == false) {
|
||||
addDoc(handler);
|
||||
} else {
|
||||
|
|
|
@ -58,13 +58,15 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
|
||||
@Test
|
||||
public void testExtraction() throws Exception {
|
||||
// broken for turkish: https://issues.apache.org/jira/browse/SOLR-2088
|
||||
String defLang = Locale.getDefault().getLanguage();
|
||||
assumeFalse("Known bugs under Turkish locale: https://issues.apache.org/jira/browse/SOLR-2088", defLang.equals("tr") || defLang.equals("az"));
|
||||
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
|
||||
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||
loadLocal("solr-word.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
|
||||
loadLocal("solr-word.pdf",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"fmap.AAPL:Keywords", "ignored_a",
|
||||
"fmap.xmpTPg:NPages", "ignored_a",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"fmap.content", "extractedContent",
|
||||
"literal.id", "one",
|
||||
|
@ -146,6 +148,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testDefaultField() throws Exception {
|
||||
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
|
||||
|
@ -349,6 +352,9 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
|
||||
loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"fmap.AAPL:Keywords", "ignored_a",
|
||||
"fmap.xmpTPg:NPages", "ignored_a",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"fmap.content", "wdf_nocase",
|
||||
"literal.id", "one",
|
||||
|
|
Loading…
Reference in New Issue