From 0a6017d7da5689f0be04150afa2be4e2e03902ae Mon Sep 17 00:00:00 2001 From: Mark Robert Miller Date: Thu, 19 Dec 2013 19:13:47 +0000 Subject: [PATCH] SOLR-1301: Merge Morphlines modules up to Kite 0.10 and CDK 0.9 git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1552398 13f79535-47bb-0310-9956-ffa450edef68 --- solr/contrib/map-reduce/build.xml | 1 + .../src/test-files/test-documents/cars.csv | 6 + .../src/test-files/test-documents/cars.csv.gz | Bin 0 -> 167 bytes .../src/test-files/test-documents/cars.tar.gz | Bin 0 -> 298 bytes .../src/test-files/test-documents/email.eml | 40 +++++++ .../src/test-files/test-documents/testRFC822 | 41 +++++++ .../solrCellDocumentTypes.conf | 28 +++++ .../org/apache/solr/hadoop/MRUnitBase.java | 10 +- .../solr/hadoop/MorphlineBasicMiniMRTest.java | 3 +- .../hadoop/MorphlineGoLiveMiniMRTest.java | 3 +- .../solr/morphlines/cell/SolrCellBuilder.java | 4 - .../cell/SolrCellMorphlineTest.java | 109 +++++++++++++++--- .../solr/morphlines/solr/SolrLocator.java | 32 ++--- .../solr/AbstractSolrMorphlineTestBase.java | 62 +++++++--- .../morphlines/solr/SolrMorphlineTest.java | 27 +++-- 15 files changed, 283 insertions(+), 83 deletions(-) create mode 100644 solr/contrib/map-reduce/src/test-files/test-documents/cars.csv create mode 100644 solr/contrib/map-reduce/src/test-files/test-documents/cars.csv.gz create mode 100644 solr/contrib/map-reduce/src/test-files/test-documents/cars.tar.gz create mode 100644 solr/contrib/map-reduce/src/test-files/test-documents/email.eml create mode 100644 solr/contrib/map-reduce/src/test-files/test-documents/testRFC822 diff --git a/solr/contrib/map-reduce/build.xml b/solr/contrib/map-reduce/build.xml index b1076bb4327..ea2646b15fe 100644 --- a/solr/contrib/map-reduce/build.xml +++ b/solr/contrib/map-reduce/build.xml @@ -92,6 +92,7 @@ + diff --git a/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv b/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv new file mode 100644 index 00000000000..8f1f9e1ae6a --- /dev/null +++ b/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv @@ -0,0 +1,6 @@ +Age,Color,Extras,Type,Used +2,blue,GPS,"Gas, with electric","" +10,green,"Labeled ""Vintage, 1913""",,yes +100,red,"Labeled ""Vintage 1913""",yes +5,orange,none,"This is a +multi, line text",no \ No newline at end of file diff --git a/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv.gz b/solr/contrib/map-reduce/src/test-files/test-documents/cars.csv.gz new file mode 100644 index 0000000000000000000000000000000000000000..ee2a951eddc077ee2c382113b9b9c62141fabdbb GIT binary patch literal 167 zcmV;Y09gMYiwFpht))=_17l%wb1q|Zb^vY6u?oUK5CqUZU$JaGMWR?(iinMcV6?v^ zOLB1D$=wE%zjtk{gz4tZ97e%0g_LnxG`oU!eF^SGhH8T@%!2dv20mLuZ?6+ckY2Oz z8O&8vW6UBEd~uyM12I2*RA;Z$?3*n!1FIC1HL?tU;Lm;84k^26>zJZ|+sPNQxav6v V^`PLRkjkQL;sY8H&w{)F001z!Ok)55 literal 0 HcmV?d00001 diff --git a/solr/contrib/map-reduce/src/test-files/test-documents/cars.tar.gz b/solr/contrib/map-reduce/src/test-files/test-documents/cars.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..5ca3cf1cdcbaf7d468424a7c2a4afd402f3858e6 GIT binary patch literal 298 zcmV+_0oDE=iwFP#+m=!Q1MSp7ON2la2XN2(DTX;e?2V2Np380voq~=w&a$$lF}h2A z`^Lng!$w5w68(P&Pjpb^%SRM8&p0{Qf(}{m8XGI)9~9m_KXI zv)qi=RmMg|Vv$Y!=kvdFm8W&JbF}Vi_dr*!x4wBAny#fA-&nU&MPcZ=^EH)e#w06C zqPp)%Ja%0xDGklW%&T)Z0zdL8|H+y1pO}yQ>vYV&;sp6$!ng8Y(B;-0UP#`$cG!C- wq%5BY*Ve8UW!;R6rIZv7UEps3000000000000000006MKS5bhNHvlLA0A22urT_o{ literal 0 HcmV?d00001 diff --git a/solr/contrib/map-reduce/src/test-files/test-documents/email.eml b/solr/contrib/map-reduce/src/test-files/test-documents/email.eml new file mode 100644 index 00000000000..d45f430869c --- /dev/null +++ b/solr/contrib/map-reduce/src/test-files/test-documents/email.eml @@ -0,0 +1,40 @@ +MIME-Version: 1.0 +Received: by 10.216.199.5 with HTTP; Wed, 27 Nov 2013 12:01:23 -0800 +(PST) +Date: Wed, 27 Nov 2013 13:01:23 -0700 +Delivered-To: foo@cloudera.com +Message-ID: + +Subject: Test EML +From: Patrick Foo +To: Patrick Foo +Content-Type: multipart/alternative; +boundary=001a11c3815cb55dda04ec2e0f3b + +--001a11c3815cb55dda04ec2e0f3b +Content-Type: text/plain; charset=ISO-8859-1 + +This is a test + +-- +Patrick Foo +Customer Operations Engineer + + + +--001a11c3815cb55dda04ec2e0f3b +Content-Type: text/html; charset=ISO-8859-1 +Content-Transfer-Encoding: quoted-printable + +
This is a test

-- +
Patrick Foo
Customer Operations +Engineer

= +

+
+ + +--001a11c3815cb55dda04ec2e0f3b-- diff --git a/solr/contrib/map-reduce/src/test-files/test-documents/testRFC822 b/solr/contrib/map-reduce/src/test-files/test-documents/testRFC822 new file mode 100644 index 00000000000..9ce423a182a --- /dev/null +++ b/solr/contrib/map-reduce/src/test-files/test-documents/testRFC822 @@ -0,0 +1,41 @@ +From: "Julien Nioche (JIRA)" +To: dev@tika.apache.org +Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed +Reply-To: dev@tika.apache.org +Delivered-To: mailing list dev@tika.apache.org +Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT) +In-Reply-To: <6089099.260231278600349994.JavaMail.jira@thor> +MIME-Version: 1.0 +Content-Type: text/plain; charset=utf-8 +Content-Transfer-Encoding: 7bit +X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394 +X-Virus-Checked: Checked by ClamAV on apache.org + + + [ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ] + +Julien Nioche commented on TIKA-461: +------------------------------------ + +I'll have a look at mime4j and try to use it in Tika + +> RFC822 messages not parsed +> -------------------------- +> +> Key: TIKA-461 +> URL: https://issues.apache.org/jira/browse/TIKA-461 +> Project: Tika +> Issue Type: Bug +> Components: parser +> Affects Versions: 0.7 +> Reporter: Joshua Turner +> Assignee: Julien Nioche +> +> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox". +> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet. + +-- +This message is automatically generated by JIRA. +- +You can reply to this email to add a comment to the issue online. + diff --git a/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf b/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf index cfc3760c87d..7d232dd27f5 100644 --- a/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf +++ b/solr/contrib/map-reduce/src/test-files/test-morphlines/solrCellDocumentTypes.conf @@ -78,6 +78,32 @@ morphlines : [ ] } + { + commands : [ + { + readCSV { + supportedMimeTypes : [text/csv] + charset : UTF-8 + ignoreFirstLine : false + columns : [ user_screen_name, text ] + } + } + + { + generateUUID { + field : id + preserveExisting : false + } + } + + { + sanitizeUnknownSolrFields { + solrLocator : ${SOLR_LOCATOR} + } + } + ] + } + { commands : [ { @@ -180,6 +206,7 @@ morphlines : [ # the parser is chosen that is closest to the bottom in this list: parsers : [ { parser : org.apache.tika.parser.asm.ClassParser } + # { parser : org.apache.tika.parser.AutoDetectParser } # { parser : org.gagravarr.tika.OggParser, additionalSupportedMimeTypes : [audio/ogg] } { parser : org.gagravarr.tika.FlacParser } { parser : org.apache.tika.parser.audio.AudioParser } @@ -218,6 +245,7 @@ morphlines : [ { parser : org.apache.tika.parser.xml.DcXMLParser } { parser : org.apache.tika.parser.xml.FictionBookParser } { parser : org.apache.tika.parser.chm.ChmParser } + #{ parser : org.apache.tika.parser.AutoDetectParser } ] } } diff --git a/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java b/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java index 93f620f85a5..3897ecec30a 100644 --- a/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java +++ b/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MRUnitBase.java @@ -23,6 +23,7 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.hadoop.morphline.MorphlineMapRunner; +import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase; import org.apache.solr.util.ExternalPaths; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -50,17 +51,10 @@ public abstract class MRUnitBase extends SolrTestCaseJ4 { new File(tempDir).mkdirs(); FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); - setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes"); + AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true); config.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, tempDir + "/test-morphlines/solrCellDocumentTypes.conf"); config.set(SolrOutputFormat.ZIP_NAME, solrHomeZip.getName()); } - public static void setupMorphline(String tempDir, String file) throws IOException { - String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8"); - morphlineText = morphlineText.replaceAll("RESOURCES_DIR", new File(tempDir).getAbsolutePath()); - morphlineText = morphlineText.replaceAll("\\$\\{SOLR_LOCATOR\\}", "{ collection : collection1 }"); - - FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8"); - } } diff --git a/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java b/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java index 9f53a0333c5..ba6fc3c86c7 100644 --- a/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java +++ b/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineBasicMiniMRTest.java @@ -42,6 +42,7 @@ import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.cloud.AbstractZkTestCase; import org.apache.solr.hadoop.hack.MiniMRCluster; +import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase; import org.apache.solr.util.ExternalPaths; import org.junit.After; import org.junit.AfterClass; @@ -125,7 +126,7 @@ public class MorphlineBasicMiniMRTest extends SolrTestCaseJ4 { new File(tempDir).mkdirs(); FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); - MRUnitBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes"); + AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true); System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath()); diff --git a/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java b/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java index bc6b1634f3c..03068e33a39 100644 --- a/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java +++ b/solr/contrib/map-reduce/src/test/org/apache/solr/hadoop/MorphlineGoLiveMiniMRTest.java @@ -67,6 +67,7 @@ import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.hadoop.hack.MiniMRClientCluster; import org.apache.solr.hadoop.hack.MiniMRClientClusterFactory; +import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase; import org.apache.solr.util.ExternalPaths; import org.junit.After; import org.junit.AfterClass; @@ -142,7 +143,7 @@ public class MorphlineGoLiveMiniMRTest extends AbstractFullDistribZkTestBase { new File(tempDir).mkdirs(); FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); - MRUnitBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes"); + AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true); System.setProperty("hadoop.log.dir", new File(dataDir, "logs").getAbsolutePath()); diff --git a/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java b/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java index acee3f4582a..333d372a045 100644 --- a/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java +++ b/solr/contrib/morphlines-cell/src/java/org/apache/solr/morphlines/cell/SolrCellBuilder.java @@ -223,16 +223,12 @@ public final class SolrCellBuilder implements CommandBuilder { ParseContext parseContext = new ParseContext(); - // necessary for gzipped files or tar files, etc! copied from TikaCLI - parseContext.set(Parser.class, parser); - Metadata metadata = new Metadata(); for (Entry entry : record.getFields().entries()) { metadata.add(entry.getKey(), entry.getValue().toString()); } SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema); - try { inputStream = TikaInputStream.get(inputStream); diff --git a/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java b/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java index 912febceef6..111aef521aa 100644 --- a/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java +++ b/solr/contrib/morphlines-cell/src/test/org/apache/solr/morphlines/cell/SolrCellMorphlineTest.java @@ -18,6 +18,7 @@ package org.apache.solr.morphlines.cell; import java.io.File; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.Map; import org.apache.commons.io.FileUtils; @@ -37,7 +38,7 @@ import org.junit.Test; public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { private Map expectedRecords = new HashMap(); - + private Map> expectedRecordContents = new HashMap>(); @BeforeClass public static void beforeClass2() { assumeFalse("FIXME: This test fails under Java 8 due to the Saxon dependency - see SOLR-1301", Constants.JRE_IS_MINIMUM_JAVA8); @@ -47,16 +48,17 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { @Before public void setUp() throws Exception { super.setUp(); + String path = RESOURCES_DIR + "/test-documents"; expectedRecords.put(path + "/sample-statuses-20120906-141433.avro", 2); expectedRecords.put(path + "/sample-statuses-20120906-141433", 2); expectedRecords.put(path + "/sample-statuses-20120906-141433.gz", 2); expectedRecords.put(path + "/sample-statuses-20120906-141433.bz2", 2); - expectedRecords.put(path + "/cars.csv", 5); - expectedRecords.put(path + "/cars.csv.gz", 5); + expectedRecords.put(path + "/cars.csv", 6); + expectedRecords.put(path + "/cars.csv.gz", 6); expectedRecords.put(path + "/cars.tar.gz", 4); - expectedRecords.put(path + "/cars.tsv", 5); - expectedRecords.put(path + "/cars.ssv", 5); + expectedRecords.put(path + "/cars.tsv", 6); + expectedRecords.put(path + "/cars.ssv", 6); expectedRecords.put(path + "/test-documents.7z", 9); expectedRecords.put(path + "/test-documents.cpio", 9); expectedRecords.put(path + "/test-documents.tar", 9); @@ -65,12 +67,80 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { expectedRecords.put(path + "/test-documents.zip", 9); expectedRecords.put(path + "/multiline-stacktrace.log", 4); + { + Map record = new LinkedHashMap(); + record.put("ignored__attachment_mimetype", "image/jpeg"); + record.put("ignored_exif_isospeedratings", "400"); + record.put("ignored_meta_creation_date", "2009-08-11T09:09:45"); + record.put("ignored_tiff_model", "Canon EOS 40D"); + record.put("text", NON_EMPTY_FIELD); + expectedRecordContents.put("/testJPEG_EXIF.jpg", record); + expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record); + expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record); + } + + { + String file = path + "/testWORD_various.doc"; + Map record = new LinkedHashMap(); + record.put("ignored__attachment_mimetype", "application/msword"); + record.put("ignored_author", "Michael McCandless"); + record.put("ignored_creation_date", "2011-09-02T10:11:00Z"); + record.put("ignored_title", ""); + record.put("ignored_keywords", "Keyword1 Keyword2"); + record.put("ignored_subject", "Subject is here"); + record.put("text", NON_EMPTY_FIELD); + expectedRecordContents.put(file, record); + } + + { + String file = path + "/testPDF.pdf"; + Map record = new LinkedHashMap(); + record.put("ignored__attachment_mimetype", "application/pdf"); + record.put("ignored_author", "Bertrand Delacrétaz"); + record.put("ignored_creation_date", "2007-09-15T09:02:31Z"); + record.put("ignored_title", "Apache Tika - Apache Tika"); + record.put("ignored_xmp_creatortool", "Firefox"); + record.put("text", NON_EMPTY_FIELD); + expectedRecordContents.put(file, record); + } + + { + String file = path + "/email.eml"; + Map record = new LinkedHashMap(); + String name = "Patrick Foo "; + record.put("ignored__attachment_mimetype", "message/rfc822"); + record.put("ignored_author", name); + //record.put("ignored_content_length", "1068"); + record.put("ignored_creation_date", "2013-11-27T20:01:23Z"); + record.put("ignored_message_from", name); + record.put("ignored_message_to", name); + record.put("ignored_creator", name); + record.put("ignored_dc_creator", name); + record.put("ignored_dc_title", "Test EML"); + record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z"); + record.put("ignored_meta_author", name); + record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z"); + record.put("ignored_subject", "Test EML"); + record.put("text", NON_EMPTY_FIELD); + expectedRecordContents.put(file, record); + } + + { + String file = path + "/testEXCEL.xlsx"; + Map record = new LinkedHashMap(); + record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + record.put("ignored_author", "Keith Bennett"); + record.put("ignored_creation_date", "2007-10-01T16:13:56Z"); + record.put("ignored_title", "Simple Excel document"); + record.put("text", NON_EMPTY_FIELD); + expectedRecordContents.put(file, record); + } + FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); } @Test public void testSolrCellJPGCompressed() throws Exception { - morphline = createMorphline("test-morphlines/solrCellJPGCompressed"); String path = RESOURCES_DIR + "/test-documents"; String[] files = new String[] { @@ -79,7 +149,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { path + "/testJPEG_EXIF.jpg.tar.gz", //path + "/jpeg2000.jp2", }; - testDocumentTypesInternal(files, expectedRecords); + testDocumentTypesInternal(files, expectedRecords, expectedRecordContents); } @Test @@ -89,13 +159,14 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { String[] files = new String[] { path + "/testXML2.xml", }; - testDocumentTypesInternal(files, expectedRecords); + testDocumentTypesInternal(files, expectedRecords, expectedRecordContents); } @Test public void testSolrCellDocumentTypes() throws Exception { - - morphline = createMorphline("test-morphlines/solrCellDocumentTypes"); + AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false); + + morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes"); String path = RESOURCES_DIR + "/test-documents"; String[] files = new String[] { path + "/testBMPfp.txt", @@ -107,22 +178,26 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { path + "/testJPEG_EXIF.jpg.gz", path + "/testJPEG_EXIF.jpg.tar.gz", path + "/testXML.xml", -// path + "/cars.csv", + path + "/cars.csv", // path + "/cars.tsv", // path + "/cars.ssv", -// path + "/cars.csv.gz", -// path + "/cars.tar.gz", + path + "/cars.csv.gz", + path + "/cars.tar.gz", path + "/sample-statuses-20120906-141433.avro", path + "/sample-statuses-20120906-141433", path + "/sample-statuses-20120906-141433.gz", path + "/sample-statuses-20120906-141433.bz2", + path + "/email.eml", }; - testDocumentTypesInternal(files, expectedRecords); + testDocumentTypesInternal(files, expectedRecords, expectedRecordContents); } @Test public void testSolrCellDocumentTypes2() throws Exception { - morphline = createMorphline("test-morphlines/solrCellDocumentTypes"); + + AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false); + + morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes"); String path = RESOURCES_DIR + "/test-documents"; String[] files = new String[] { path + "/testPPT_various.ppt", @@ -137,7 +212,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { path + "/complex.mbox", path + "/test-outlook.msg", path + "/testEMLX.emlx", -// path + "/testRFC822", + path + "/testRFC822", path + "/rsstest.rss", // path + "/testDITA.dita", @@ -176,7 +251,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { // path + "/testWINMAIL.dat", // path + "/testWMF.wmf", }; - testDocumentTypesInternal(files, expectedRecords); + testDocumentTypesInternal(files, expectedRecords, expectedRecordContents); } /** diff --git a/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java b/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java index ab29c7810a9..15622053f7b 100644 --- a/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java +++ b/solr/contrib/morphlines-core/src/java/org/apache/solr/morphlines/solr/SolrLocator.java @@ -20,11 +20,13 @@ import org.kitesdk.morphline.api.MorphlineCompilationException; import org.kitesdk.morphline.api.MorphlineContext; import org.kitesdk.morphline.api.MorphlineRuntimeException; import org.kitesdk.morphline.base.Configs; + import com.google.common.base.Preconditions; import com.typesafe.config.Config; import com.typesafe.config.ConfigFactory; import com.typesafe.config.ConfigRenderOptions; import com.typesafe.config.ConfigUtil; + import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.impl.CloudSolrServer; import org.apache.solr.common.cloud.SolrZkClient; @@ -39,8 +41,10 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; + import java.io.File; import java.io.IOException; +import java.net.MalformedURLException; /** * Set of configuration parameters that identify the location and schema of a Solr server or @@ -57,8 +61,6 @@ public class SolrLocator { private String solrHomeDir; private int batchSize = 1000; - private static final String SOLR_HOME_PROPERTY_NAME = "solr.solr.home"; - private static final Logger LOG = LoggerFactory.getLogger(SolrLocator.class); protected SolrLocator(MorphlineContext context) { @@ -120,7 +122,6 @@ public class SolrLocator { // If solrHomeDir isn't defined and zkHost and collectionName are defined // then download schema.xml and solrconfig.xml, etc from zk and use that as solrHomeDir - String oldSolrHomeDir = null; String mySolrHomeDir = solrHomeDir; if (solrHomeDir == null || solrHomeDir.length() == 0) { if (zkHost == null || zkHost.length() == 0) { @@ -150,20 +151,13 @@ public class SolrLocator { } } - oldSolrHomeDir = System.setProperty(SOLR_HOME_PROPERTY_NAME, mySolrHomeDir); + LOG.debug("SolrLocator loading IndexSchema from dir {}", mySolrHomeDir); try { - SolrConfig solrConfig = new SolrConfig(); // TODO use SolrResourceLoader ala TikaMapper? - // SolrConfig solrConfig = new SolrConfig("solrconfig.xml"); - // SolrConfig solrConfig = new - // SolrConfig("/cloud/apache-solr-4.0.0-BETA/example/solr/collection1", - // "solrconfig.xml", null); - // SolrConfig solrConfig = new - // SolrConfig("/cloud/apache-solr-4.0.0-BETA/example/solr/collection1/conf/solrconfig.xml"); - SolrResourceLoader loader = solrConfig.getResourceLoader(); - + SolrResourceLoader loader = new SolrResourceLoader(mySolrHomeDir); + SolrConfig solrConfig = new SolrConfig(loader, "solrconfig.xml", null); InputSource is = new InputSource(loader.openSchema("schema.xml")); - is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml")); - + is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml")); + IndexSchema schema = new IndexSchema(solrConfig, "schema.xml", is); validateSchema(schema); return schema; @@ -173,14 +167,6 @@ public class SolrLocator { throw new MorphlineRuntimeException(e); } catch (SAXException e) { throw new MorphlineRuntimeException(e); - } finally { // restore old global state - if (solrHomeDir != null) { - if (oldSolrHomeDir == null) { - System.clearProperty(SOLR_HOME_PROPERTY_NAME); - } else { - System.setProperty(SOLR_HOME_PROPERTY_NAME, oldSolrHomeDir); - } - } } } diff --git a/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java b/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java index 100c9e992b7..78c209ae9de 100644 --- a/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java +++ b/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/AbstractSolrMorphlineTestBase.java @@ -19,6 +19,7 @@ package org.apache.solr.morphlines.solr; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; +import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; @@ -28,7 +29,6 @@ import java.util.Map.Entry; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang.StringEscapeUtils; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServer; @@ -41,9 +41,6 @@ import org.apache.solr.util.ExternalPaths; import org.junit.After; import org.junit.Before; import org.junit.BeforeClass; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import org.kitesdk.morphline.api.Collector; import org.kitesdk.morphline.api.Command; import org.kitesdk.morphline.api.MorphlineContext; @@ -53,6 +50,9 @@ import org.kitesdk.morphline.base.FaultTolerance; import org.kitesdk.morphline.base.Fields; import org.kitesdk.morphline.base.Notifications; import org.kitesdk.morphline.stdlib.PipeBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.codahale.metrics.MetricRegistry; import com.google.common.io.Files; import com.typesafe.config.Config; @@ -73,6 +73,8 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 { protected static final AtomicInteger SEQ_NUM = new AtomicInteger(); protected static final AtomicInteger SEQ_NUM2 = new AtomicInteger(); + protected static final Object NON_EMPTY_FIELD = new Object(); + private static final Logger LOGGER = LoggerFactory.getLogger(AbstractSolrMorphlineTestBase.class); protected String tempDir; @@ -113,7 +115,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 { testServer = new SolrServerDocumentLoader(solrServer, batchSize); deleteAllDocuments(); - tempDir = TEMP_DIR + "/test-morphlines-" + System.currentTimeMillis(); + tempDir = new File(TEMP_DIR + "/test-morphlines-" + System.currentTimeMillis()).getAbsolutePath(); new File(tempDir).mkdirs(); } @@ -124,7 +126,11 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 { super.tearDown(); } - protected void testDocumentTypesInternal(String[] files, Map expectedRecords) throws Exception { + protected void testDocumentTypesInternal( + String[] files, + Map expectedRecords, + Map> expectedRecordContents) throws Exception { + deleteAllDocuments(); int numDocs = 0; for (int i = 0; i < 1; i++) { @@ -137,6 +143,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 { event.getFields().put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(body)); event.getFields().put(Fields.ATTACHMENT_NAME, f.getName()); event.getFields().put(Fields.BASE_ID, f.getName()); + collector.reset(); load(event); Integer count = expectedRecords.get(file); if (count != null) { @@ -145,6 +152,20 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 { numDocs++; } assertEquals("unexpected results in " + file, numDocs, queryResultSetSize("*:*")); + Map expectedContents = expectedRecordContents.get(file); + if (expectedContents != null) { + Record actual = collector.getFirstRecord(); + for (Map.Entry entry : expectedContents.entrySet()) { + if (entry.getValue() == NON_EMPTY_FIELD) { + assertNotNull(entry.getKey()); + assertTrue(actual.getFirstValue(entry.getKey()).toString().length() > 0); + } else if (entry.getValue() == null) { + assertEquals("key:" + entry.getKey(), 0, actual.get(entry.getKey()).size()); + } else { + assertEquals("key:" + entry.getKey(), Arrays.asList(entry.getValue()), actual.get(entry.getKey())); + } + } + } } } assertEquals(numDocs, queryResultSetSize("*:*")); @@ -180,17 +201,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 { s.commit(); } - - public static void setupMorphline(String tempDir, String file) throws IOException { - String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8"); - morphlineText = morphlineText.replace("RESOURCES_DIR", StringEscapeUtils.escapeJavaScript(new File(tempDir).getAbsolutePath())); - - FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8"); - } - protected Command createMorphline(String file) throws IOException { - setupMorphline(tempDir, file); - return new PipeBuilder().build(parse(file), null, collector, createMorphlineContext()); } @@ -206,7 +217,13 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 { private Config parse(String file) throws IOException { SolrLocator locator = new SolrLocator(createMorphlineContext()); locator.setSolrHomeDir(testSolrHome + "/collection1"); - Config config = new Compiler().parse(new File(tempDir + "/" + file + ".conf"), locator.toConfig("SOLR_LOCATOR")); + File morphlineFile; + if (new File(file).isAbsolute()) { + morphlineFile = new File(file + ".conf"); + } else { + morphlineFile = new File(RESOURCES_DIR + "/" + file + ".conf"); + } + Config config = new Compiler().parse(morphlineFile, locator.toConfig("SOLR_LOCATOR")); config = config.getConfigList("morphlines").get(0); return config; } @@ -266,4 +283,15 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 { public HashSet getFieldValues() { return fieldValues; } public CompareType getCompareType() { return compareType; } } + + public static void setupMorphline(String tempDir, String file, boolean replaceSolrLocator) throws IOException { + String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8"); + morphlineText = morphlineText.replaceAll("RESOURCES_DIR", new File(tempDir).getAbsolutePath()); + if (replaceSolrLocator) { + morphlineText = morphlineText.replaceAll("\\$\\{SOLR_LOCATOR\\}", + "{ collection : collection1 }"); + } + new File(tempDir + "/" + file + ".conf").getParentFile().mkdirs(); + FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8"); + } } diff --git a/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java b/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java index 7ded41a9494..972e6483550 100644 --- a/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java +++ b/solr/contrib/morphlines-core/src/test/org/apache/solr/morphlines/solr/SolrMorphlineTest.java @@ -57,18 +57,21 @@ public class SolrMorphlineTest extends AbstractSolrMorphlineTestBase { @Test public void testTokenizeText() throws Exception { - morphline = createMorphline("test-morphlines/tokenizeText"); - Record record = new Record(); - record.put(Fields.MESSAGE, "Hello World!"); - record.put(Fields.MESSAGE, "\nFoo@Bar.com #%()123"); - Record expected = record.copy(); - expected.getFields().putAll("tokens", Arrays.asList("hello", "world", "foo", "bar.com", "123")); - startSession(); - Notifications.notifyBeginTransaction(morphline); - assertTrue(morphline.process(record)); - assertEquals(1, collector.getNumStartEvents()); - Notifications.notifyCommitTransaction(morphline); - assertEquals(expected, collector.getFirstRecord()); + morphline = createMorphline("test-morphlines/tokenizeText"); + for (int i = 0; i < 3; i++) { + Record record = new Record(); + record.put(Fields.MESSAGE, "Hello World!"); + record.put(Fields.MESSAGE, "\nFoo@Bar.com #%()123"); + Record expected = record.copy(); + expected.getFields().putAll("tokens", Arrays.asList("hello", "world", "foo", "bar.com", "123")); + collector.reset(); + startSession(); + Notifications.notifyBeginTransaction(morphline); + assertTrue(morphline.process(record)); + assertEquals(1, collector.getNumStartEvents()); + Notifications.notifyCommitTransaction(morphline); + assertEquals(expected, collector.getFirstRecord()); + } } }