SOLR-1301: Merge Morphlines modules up to Kite 0.10 and CDK 0.9

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1552398 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2013-12-19 19:13:47 +00:00
parent e7d605a2d5
commit 0a6017d7da
15 changed files with 283 additions and 83 deletions

View File

@ -92,6 +92,7 @@
<path id="test.classpath"> <path id="test.classpath">
<path refid="solr.test.base.classpath"/> <path refid="solr.test.base.classpath"/>
<path refid="classpath.additions"/> <path refid="classpath.additions"/>
<pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/test"/>
<fileset dir="${common-solr.dir}/contrib/morphlines-core/test-lib" excludes="${common.classpath.excludes}"/> <fileset dir="${common-solr.dir}/contrib/morphlines-core/test-lib" excludes="${common.classpath.excludes}"/>
</path> </path>

View File

@ -0,0 +1,6 @@
Age,Color,Extras,Type,Used
2,blue,GPS,"Gas, with electric",""
10,green,"Labeled ""Vintage, 1913""",,yes
100,red,"Labeled ""Vintage 1913""",yes
5,orange,none,"This is a
multi, line text",no
Can't render this file because it has a wrong number of fields in line 4.

View File

@ -0,0 +1,40 @@
MIME-Version: 1.0
Received: by 10.216.199.5 with HTTP; Wed, 27 Nov 2013 12:01:23 -0800
(PST)
Date: Wed, 27 Nov 2013 13:01:23 -0700
Delivered-To: foo@cloudera.com
Message-ID:
<CAOi5V169EW4GCfde_aNKSBgqAD=KSPVO6Batw_Oko-8cmAgK6w@mail.gmail.com>
Subject: Test EML
From: Patrick Foo <foo@cloudera.com>
To: Patrick Foo <foo@cloudera.com>
Content-Type: multipart/alternative;
boundary=001a11c3815cb55dda04ec2e0f3b
--001a11c3815cb55dda04ec2e0f3b
Content-Type: text/plain; charset=ISO-8859-1
This is a test
--
Patrick Foo
Customer Operations Engineer
<http://www.cloudera.com>
--001a11c3815cb55dda04ec2e0f3b
Content-Type: text/html; charset=ISO-8859-1
Content-Transfer-Encoding: quoted-printable
<div dir=3D"ltr">This is a test<br clear=3D"all"><div><br></div>--
<br><div=
dir=3D"ltr">Patrick Foo<div>Customer Operations
Engineer</div><div><br>=
</div><div><a href=3D"http://www.cloudera.com" target=3D"_blank"><img
src=
=3D"http://files.cloudera.com.s3.amazonaws.com/New%20Branding/cloudera-smal=
l.png"></a><br>
</div></div>
</div>
--001a11c3815cb55dda04ec2e0f3b--

View File

@ -0,0 +1,41 @@
From: "Julien Nioche (JIRA)" <jira@apache.org>
To: dev@tika.apache.org
Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
Reply-To: dev@tika.apache.org
Delivered-To: mailing list dev@tika.apache.org
Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
In-Reply-To: <6089099.260231278600349994.JavaMail.jira@thor>
MIME-Version: 1.0
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: 7bit
X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
X-Virus-Checked: Checked by ClamAV on apache.org
[ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ]
Julien Nioche commented on TIKA-461:
------------------------------------
I'll have a look at mime4j and try to use it in Tika
> RFC822 messages not parsed
> --------------------------
>
> Key: TIKA-461
> URL: https://issues.apache.org/jira/browse/TIKA-461
> Project: Tika
> Issue Type: Bug
> Components: parser
> Affects Versions: 0.7
> Reporter: Joshua Turner
> Assignee: Julien Nioche
>
> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

View File

@ -78,6 +78,32 @@ morphlines : [
] ]
} }
# Morphline entry whose command list holds three commands, in this order:
# readCSV, generateUUID, sanitizeUnknownSolrFields.
{
commands : [
{
# readCSV is restricted to the text/csv MIME type and configured for UTF-8 input.
# ignoreFirstLine is false, so the first line is presumably treated as data
# rather than skipped as a header -- TODO confirm against the Kite readCSV docs.
# The two declared columns are named user_screen_name and text.
readCSV {
supportedMimeTypes : [text/csv]
charset : UTF-8
ignoreFirstLine : false
columns : [ user_screen_name, text ]
}
}
{
# generateUUID targets the "id" field; with preserveExisting false an existing
# id value is presumably overwritten -- TODO confirm against the Kite docs.
generateUUID {
field : id
preserveExisting : false
}
}
{
# ${SOLR_LOCATOR} is a substitution that must be resolved from outside this
# entry (e.g. supplied when the config is parsed, or textually replaced first).
sanitizeUnknownSolrFields {
solrLocator : ${SOLR_LOCATOR}
}
}
]
}
{ {
commands : [ commands : [
{ {
@ -180,6 +206,7 @@ morphlines : [
# the parser is chosen that is closest to the bottom in this list: # the parser is chosen that is closest to the bottom in this list:
parsers : [ parsers : [
{ parser : org.apache.tika.parser.asm.ClassParser } { parser : org.apache.tika.parser.asm.ClassParser }
# { parser : org.apache.tika.parser.AutoDetectParser }
# { parser : org.gagravarr.tika.OggParser, additionalSupportedMimeTypes : [audio/ogg] } # { parser : org.gagravarr.tika.OggParser, additionalSupportedMimeTypes : [audio/ogg] }
{ parser : org.gagravarr.tika.FlacParser } { parser : org.gagravarr.tika.FlacParser }
{ parser : org.apache.tika.parser.audio.AudioParser } { parser : org.apache.tika.parser.audio.AudioParser }
@ -218,6 +245,7 @@ morphlines : [
{ parser : org.apache.tika.parser.xml.DcXMLParser } { parser : org.apache.tika.parser.xml.DcXMLParser }
{ parser : org.apache.tika.parser.xml.FictionBookParser } { parser : org.apache.tika.parser.xml.FictionBookParser }
{ parser : org.apache.tika.parser.chm.ChmParser } { parser : org.apache.tika.parser.chm.ChmParser }
#{ parser : org.apache.tika.parser.AutoDetectParser }
] ]
} }
} }

View File

@ -23,6 +23,7 @@ import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.hadoop.morphline.MorphlineMapRunner; import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.util.ExternalPaths; import org.apache.solr.util.ExternalPaths;
import org.junit.AfterClass; import org.junit.AfterClass;
import org.junit.BeforeClass; import org.junit.BeforeClass;
@ -50,17 +51,10 @@ public abstract class MRUnitBase extends SolrTestCaseJ4 {
new File(tempDir).mkdirs(); new File(tempDir).mkdirs();
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes"); AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
config.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, tempDir + "/test-morphlines/solrCellDocumentTypes.conf"); config.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, tempDir + "/test-morphlines/solrCellDocumentTypes.conf");
config.set(SolrOutputFormat.ZIP_NAME, solrHomeZip.getName()); config.set(SolrOutputFormat.ZIP_NAME, solrHomeZip.getName());
} }
public static void setupMorphline(String tempDir, String file) throws IOException {
String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");
morphlineText = morphlineText.replaceAll("RESOURCES_DIR", new File(tempDir).getAbsolutePath());
morphlineText = morphlineText.replaceAll("\\$\\{SOLR_LOCATOR\\}", "{ collection : collection1 }");
FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8");
}
} }

View File

@ -42,6 +42,7 @@ import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.cloud.AbstractZkTestCase; import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.hadoop.hack.MiniMRCluster; import org.apache.solr.hadoop.hack.MiniMRCluster;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.util.ExternalPaths; import org.apache.solr.util.ExternalPaths;
import org.junit.After; import org.junit.After;
import org.junit.AfterClass; import org.junit.AfterClass;
@ -125,7 +126,7 @@ public class MorphlineBasicMiniMRTest extends SolrTestCaseJ4 {
new File(tempDir).mkdirs(); new File(tempDir).mkdirs();
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
MRUnitBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes"); AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath()); System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath());

View File

@ -67,6 +67,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.hadoop.hack.MiniMRClientCluster; import org.apache.solr.hadoop.hack.MiniMRClientCluster;
import org.apache.solr.hadoop.hack.MiniMRClientClusterFactory; import org.apache.solr.hadoop.hack.MiniMRClientClusterFactory;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.util.ExternalPaths; import org.apache.solr.util.ExternalPaths;
import org.junit.After; import org.junit.After;
import org.junit.AfterClass; import org.junit.AfterClass;
@ -142,7 +143,7 @@ public class MorphlineGoLiveMiniMRTest extends AbstractFullDistribZkTestBase {
new File(tempDir).mkdirs(); new File(tempDir).mkdirs();
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
MRUnitBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes"); AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
System.setProperty("hadoop.log.dir", new File(dataDir, "logs").getAbsolutePath()); System.setProperty("hadoop.log.dir", new File(dataDir, "logs").getAbsolutePath());

View File

@ -223,16 +223,12 @@ public final class SolrCellBuilder implements CommandBuilder {
ParseContext parseContext = new ParseContext(); ParseContext parseContext = new ParseContext();
// necessary for gzipped files or tar files, etc! copied from TikaCLI
parseContext.set(Parser.class, parser);
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
for (Entry<String, Object> entry : record.getFields().entries()) { for (Entry<String, Object> entry : record.getFields().entries()) {
metadata.add(entry.getKey(), entry.getValue().toString()); metadata.add(entry.getKey(), entry.getValue().toString());
} }
SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema); SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
try { try {
inputStream = TikaInputStream.get(inputStream); inputStream = TikaInputStream.get(inputStream);

View File

@ -18,6 +18,7 @@ package org.apache.solr.morphlines.cell;
import java.io.File; import java.io.File;
import java.util.HashMap; import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
@ -37,7 +38,7 @@ import org.junit.Test;
public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase { public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
private Map<String,Integer> expectedRecords = new HashMap<String,Integer>(); private Map<String,Integer> expectedRecords = new HashMap<String,Integer>();
private Map<String, Map<String, Object>> expectedRecordContents = new HashMap<String, Map<String, Object>>();
@BeforeClass @BeforeClass
public static void beforeClass2() { public static void beforeClass2() {
assumeFalse("FIXME: This test fails under Java 8 due to the Saxon dependency - see SOLR-1301", Constants.JRE_IS_MINIMUM_JAVA8); assumeFalse("FIXME: This test fails under Java 8 due to the Saxon dependency - see SOLR-1301", Constants.JRE_IS_MINIMUM_JAVA8);
@ -47,16 +48,17 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
@Before @Before
public void setUp() throws Exception { public void setUp() throws Exception {
super.setUp(); super.setUp();
String path = RESOURCES_DIR + "/test-documents"; String path = RESOURCES_DIR + "/test-documents";
expectedRecords.put(path + "/sample-statuses-20120906-141433.avro", 2); expectedRecords.put(path + "/sample-statuses-20120906-141433.avro", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433", 2); expectedRecords.put(path + "/sample-statuses-20120906-141433", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433.gz", 2); expectedRecords.put(path + "/sample-statuses-20120906-141433.gz", 2);
expectedRecords.put(path + "/sample-statuses-20120906-141433.bz2", 2); expectedRecords.put(path + "/sample-statuses-20120906-141433.bz2", 2);
expectedRecords.put(path + "/cars.csv", 5); expectedRecords.put(path + "/cars.csv", 6);
expectedRecords.put(path + "/cars.csv.gz", 5); expectedRecords.put(path + "/cars.csv.gz", 6);
expectedRecords.put(path + "/cars.tar.gz", 4); expectedRecords.put(path + "/cars.tar.gz", 4);
expectedRecords.put(path + "/cars.tsv", 5); expectedRecords.put(path + "/cars.tsv", 6);
expectedRecords.put(path + "/cars.ssv", 5); expectedRecords.put(path + "/cars.ssv", 6);
expectedRecords.put(path + "/test-documents.7z", 9); expectedRecords.put(path + "/test-documents.7z", 9);
expectedRecords.put(path + "/test-documents.cpio", 9); expectedRecords.put(path + "/test-documents.cpio", 9);
expectedRecords.put(path + "/test-documents.tar", 9); expectedRecords.put(path + "/test-documents.tar", 9);
@ -65,12 +67,80 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
expectedRecords.put(path + "/test-documents.zip", 9); expectedRecords.put(path + "/test-documents.zip", 9);
expectedRecords.put(path + "/multiline-stacktrace.log", 4); expectedRecords.put(path + "/multiline-stacktrace.log", 4);
{
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "image/jpeg");
record.put("ignored_exif_isospeedratings", "400");
record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
record.put("ignored_tiff_model", "Canon EOS 40D");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
}
{
String file = path + "/testWORD_various.doc";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/msword");
record.put("ignored_author", "Michael McCandless");
record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
record.put("ignored_title", "");
record.put("ignored_keywords", "Keyword1 Keyword2");
record.put("ignored_subject", "Subject is here");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "/testPDF.pdf";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/pdf");
record.put("ignored_author", "Bertrand Delacrétaz");
record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
record.put("ignored_title", "Apache Tika - Apache Tika");
record.put("ignored_xmp_creatortool", "Firefox");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "/email.eml";
Map<String, Object> record = new LinkedHashMap();
String name = "Patrick Foo <foo@cloudera.com>";
record.put("ignored__attachment_mimetype", "message/rfc822");
record.put("ignored_author", name);
//record.put("ignored_content_length", "1068");
record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
record.put("ignored_message_from", name);
record.put("ignored_message_to", name);
record.put("ignored_creator", name);
record.put("ignored_dc_creator", name);
record.put("ignored_dc_title", "Test EML");
record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
record.put("ignored_meta_author", name);
record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
record.put("ignored_subject", "Test EML");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "/testEXCEL.xlsx";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
record.put("ignored_author", "Keith Bennett");
record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
record.put("ignored_title", "Simple Excel document");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml")); FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
} }
@Test @Test
public void testSolrCellJPGCompressed() throws Exception { public void testSolrCellJPGCompressed() throws Exception {
morphline = createMorphline("test-morphlines/solrCellJPGCompressed"); morphline = createMorphline("test-morphlines/solrCellJPGCompressed");
String path = RESOURCES_DIR + "/test-documents"; String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] { String[] files = new String[] {
@ -79,7 +149,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
path + "/testJPEG_EXIF.jpg.tar.gz", path + "/testJPEG_EXIF.jpg.tar.gz",
//path + "/jpeg2000.jp2", //path + "/jpeg2000.jp2",
}; };
testDocumentTypesInternal(files, expectedRecords); testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
} }
@Test @Test
@ -89,13 +159,14 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
String[] files = new String[] { String[] files = new String[] {
path + "/testXML2.xml", path + "/testXML2.xml",
}; };
testDocumentTypesInternal(files, expectedRecords); testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
} }
@Test @Test
public void testSolrCellDocumentTypes() throws Exception { public void testSolrCellDocumentTypes() throws Exception {
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
morphline = createMorphline("test-morphlines/solrCellDocumentTypes");
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
String path = RESOURCES_DIR + "/test-documents"; String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] { String[] files = new String[] {
path + "/testBMPfp.txt", path + "/testBMPfp.txt",
@ -107,22 +178,26 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
path + "/testJPEG_EXIF.jpg.gz", path + "/testJPEG_EXIF.jpg.gz",
path + "/testJPEG_EXIF.jpg.tar.gz", path + "/testJPEG_EXIF.jpg.tar.gz",
path + "/testXML.xml", path + "/testXML.xml",
// path + "/cars.csv", path + "/cars.csv",
// path + "/cars.tsv", // path + "/cars.tsv",
// path + "/cars.ssv", // path + "/cars.ssv",
// path + "/cars.csv.gz", path + "/cars.csv.gz",
// path + "/cars.tar.gz", path + "/cars.tar.gz",
path + "/sample-statuses-20120906-141433.avro", path + "/sample-statuses-20120906-141433.avro",
path + "/sample-statuses-20120906-141433", path + "/sample-statuses-20120906-141433",
path + "/sample-statuses-20120906-141433.gz", path + "/sample-statuses-20120906-141433.gz",
path + "/sample-statuses-20120906-141433.bz2", path + "/sample-statuses-20120906-141433.bz2",
path + "/email.eml",
}; };
testDocumentTypesInternal(files, expectedRecords); testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
} }
@Test @Test
public void testSolrCellDocumentTypes2() throws Exception { public void testSolrCellDocumentTypes2() throws Exception {
morphline = createMorphline("test-morphlines/solrCellDocumentTypes");
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
String path = RESOURCES_DIR + "/test-documents"; String path = RESOURCES_DIR + "/test-documents";
String[] files = new String[] { String[] files = new String[] {
path + "/testPPT_various.ppt", path + "/testPPT_various.ppt",
@ -137,7 +212,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
path + "/complex.mbox", path + "/complex.mbox",
path + "/test-outlook.msg", path + "/test-outlook.msg",
path + "/testEMLX.emlx", path + "/testEMLX.emlx",
// path + "/testRFC822", path + "/testRFC822",
path + "/rsstest.rss", path + "/rsstest.rss",
// path + "/testDITA.dita", // path + "/testDITA.dita",
@ -176,7 +251,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
// path + "/testWINMAIL.dat", // path + "/testWINMAIL.dat",
// path + "/testWMF.wmf", // path + "/testWMF.wmf",
}; };
testDocumentTypesInternal(files, expectedRecords); testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
} }
/** /**

View File

@ -20,11 +20,13 @@ import org.kitesdk.morphline.api.MorphlineCompilationException;
import org.kitesdk.morphline.api.MorphlineContext; import org.kitesdk.morphline.api.MorphlineContext;
import org.kitesdk.morphline.api.MorphlineRuntimeException; import org.kitesdk.morphline.api.MorphlineRuntimeException;
import org.kitesdk.morphline.base.Configs; import org.kitesdk.morphline.base.Configs;
import com.google.common.base.Preconditions; import com.google.common.base.Preconditions;
import com.typesafe.config.Config; import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory; import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions; import com.typesafe.config.ConfigRenderOptions;
import com.typesafe.config.ConfigUtil; import com.typesafe.config.ConfigUtil;
import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CloudSolrServer; import org.apache.solr.client.solrj.impl.CloudSolrServer;
import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.SolrZkClient;
@ -39,8 +41,10 @@ import org.xml.sax.InputSource;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.ParserConfigurationException;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
/** /**
* Set of configuration parameters that identify the location and schema of a Solr server or * Set of configuration parameters that identify the location and schema of a Solr server or
@ -57,8 +61,6 @@ public class SolrLocator {
private String solrHomeDir; private String solrHomeDir;
private int batchSize = 1000; private int batchSize = 1000;
private static final String SOLR_HOME_PROPERTY_NAME = "solr.solr.home";
private static final Logger LOG = LoggerFactory.getLogger(SolrLocator.class); private static final Logger LOG = LoggerFactory.getLogger(SolrLocator.class);
protected SolrLocator(MorphlineContext context) { protected SolrLocator(MorphlineContext context) {
@ -120,7 +122,6 @@ public class SolrLocator {
// If solrHomeDir isn't defined and zkHost and collectionName are defined // If solrHomeDir isn't defined and zkHost and collectionName are defined
// then download schema.xml and solrconfig.xml, etc from zk and use that as solrHomeDir // then download schema.xml and solrconfig.xml, etc from zk and use that as solrHomeDir
String oldSolrHomeDir = null;
String mySolrHomeDir = solrHomeDir; String mySolrHomeDir = solrHomeDir;
if (solrHomeDir == null || solrHomeDir.length() == 0) { if (solrHomeDir == null || solrHomeDir.length() == 0) {
if (zkHost == null || zkHost.length() == 0) { if (zkHost == null || zkHost.length() == 0) {
@ -150,20 +151,13 @@ public class SolrLocator {
} }
} }
oldSolrHomeDir = System.setProperty(SOLR_HOME_PROPERTY_NAME, mySolrHomeDir); LOG.debug("SolrLocator loading IndexSchema from dir {}", mySolrHomeDir);
try { try {
SolrConfig solrConfig = new SolrConfig(); // TODO use SolrResourceLoader ala TikaMapper? SolrResourceLoader loader = new SolrResourceLoader(mySolrHomeDir);
// SolrConfig solrConfig = new SolrConfig("solrconfig.xml"); SolrConfig solrConfig = new SolrConfig(loader, "solrconfig.xml", null);
// SolrConfig solrConfig = new
// SolrConfig("/cloud/apache-solr-4.0.0-BETA/example/solr/collection1",
// "solrconfig.xml", null);
// SolrConfig solrConfig = new
// SolrConfig("/cloud/apache-solr-4.0.0-BETA/example/solr/collection1/conf/solrconfig.xml");
SolrResourceLoader loader = solrConfig.getResourceLoader();
InputSource is = new InputSource(loader.openSchema("schema.xml")); InputSource is = new InputSource(loader.openSchema("schema.xml"));
is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml")); is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml"));
IndexSchema schema = new IndexSchema(solrConfig, "schema.xml", is); IndexSchema schema = new IndexSchema(solrConfig, "schema.xml", is);
validateSchema(schema); validateSchema(schema);
return schema; return schema;
@ -173,14 +167,6 @@ public class SolrLocator {
throw new MorphlineRuntimeException(e); throw new MorphlineRuntimeException(e);
} catch (SAXException e) { } catch (SAXException e) {
throw new MorphlineRuntimeException(e); throw new MorphlineRuntimeException(e);
} finally { // restore old global state
if (solrHomeDir != null) {
if (oldSolrHomeDir == null) {
System.clearProperty(SOLR_HOME_PROPERTY_NAME);
} else {
System.setProperty(SOLR_HOME_PROPERTY_NAME, oldSolrHomeDir);
}
}
} }
} }

View File

@ -19,6 +19,7 @@ package org.apache.solr.morphlines.solr;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
@ -28,7 +29,6 @@ import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServer;
@ -41,9 +41,6 @@ import org.apache.solr.util.ExternalPaths;
import org.junit.After; import org.junit.After;
import org.junit.Before; import org.junit.Before;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.kitesdk.morphline.api.Collector; import org.kitesdk.morphline.api.Collector;
import org.kitesdk.morphline.api.Command; import org.kitesdk.morphline.api.Command;
import org.kitesdk.morphline.api.MorphlineContext; import org.kitesdk.morphline.api.MorphlineContext;
@ -53,6 +50,9 @@ import org.kitesdk.morphline.base.FaultTolerance;
import org.kitesdk.morphline.base.Fields; import org.kitesdk.morphline.base.Fields;
import org.kitesdk.morphline.base.Notifications; import org.kitesdk.morphline.base.Notifications;
import org.kitesdk.morphline.stdlib.PipeBuilder; import org.kitesdk.morphline.stdlib.PipeBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.MetricRegistry;
import com.google.common.io.Files; import com.google.common.io.Files;
import com.typesafe.config.Config; import com.typesafe.config.Config;
@ -73,6 +73,8 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
protected static final AtomicInteger SEQ_NUM = new AtomicInteger(); protected static final AtomicInteger SEQ_NUM = new AtomicInteger();
protected static final AtomicInteger SEQ_NUM2 = new AtomicInteger(); protected static final AtomicInteger SEQ_NUM2 = new AtomicInteger();
protected static final Object NON_EMPTY_FIELD = new Object();
private static final Logger LOGGER = LoggerFactory.getLogger(AbstractSolrMorphlineTestBase.class); private static final Logger LOGGER = LoggerFactory.getLogger(AbstractSolrMorphlineTestBase.class);
protected String tempDir; protected String tempDir;
@ -113,7 +115,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
testServer = new SolrServerDocumentLoader(solrServer, batchSize); testServer = new SolrServerDocumentLoader(solrServer, batchSize);
deleteAllDocuments(); deleteAllDocuments();
tempDir = TEMP_DIR + "/test-morphlines-" + System.currentTimeMillis(); tempDir = new File(TEMP_DIR + "/test-morphlines-" + System.currentTimeMillis()).getAbsolutePath();
new File(tempDir).mkdirs(); new File(tempDir).mkdirs();
} }
@ -124,7 +126,11 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
super.tearDown(); super.tearDown();
} }
protected void testDocumentTypesInternal(String[] files, Map<String,Integer> expectedRecords) throws Exception { protected void testDocumentTypesInternal(
String[] files,
Map<String,Integer> expectedRecords,
Map<String, Map<String, Object>> expectedRecordContents) throws Exception {
deleteAllDocuments(); deleteAllDocuments();
int numDocs = 0; int numDocs = 0;
for (int i = 0; i < 1; i++) { for (int i = 0; i < 1; i++) {
@ -137,6 +143,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
event.getFields().put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(body)); event.getFields().put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(body));
event.getFields().put(Fields.ATTACHMENT_NAME, f.getName()); event.getFields().put(Fields.ATTACHMENT_NAME, f.getName());
event.getFields().put(Fields.BASE_ID, f.getName()); event.getFields().put(Fields.BASE_ID, f.getName());
collector.reset();
load(event); load(event);
Integer count = expectedRecords.get(file); Integer count = expectedRecords.get(file);
if (count != null) { if (count != null) {
@ -145,6 +152,20 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
numDocs++; numDocs++;
} }
assertEquals("unexpected results in " + file, numDocs, queryResultSetSize("*:*")); assertEquals("unexpected results in " + file, numDocs, queryResultSetSize("*:*"));
Map<String, Object> expectedContents = expectedRecordContents.get(file);
if (expectedContents != null) {
Record actual = collector.getFirstRecord();
for (Map.Entry<String, Object> entry : expectedContents.entrySet()) {
if (entry.getValue() == NON_EMPTY_FIELD) {
assertNotNull(entry.getKey());
assertTrue(actual.getFirstValue(entry.getKey()).toString().length() > 0);
} else if (entry.getValue() == null) {
assertEquals("key:" + entry.getKey(), 0, actual.get(entry.getKey()).size());
} else {
assertEquals("key:" + entry.getKey(), Arrays.asList(entry.getValue()), actual.get(entry.getKey()));
}
}
}
} }
} }
assertEquals(numDocs, queryResultSetSize("*:*")); assertEquals(numDocs, queryResultSetSize("*:*"));
@ -180,17 +201,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
s.commit(); s.commit();
} }
public static void setupMorphline(String tempDir, String file) throws IOException {
String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");
morphlineText = morphlineText.replace("RESOURCES_DIR", StringEscapeUtils.escapeJavaScript(new File(tempDir).getAbsolutePath()));
FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8");
}
protected Command createMorphline(String file) throws IOException { protected Command createMorphline(String file) throws IOException {
setupMorphline(tempDir, file);
return new PipeBuilder().build(parse(file), null, collector, createMorphlineContext()); return new PipeBuilder().build(parse(file), null, collector, createMorphlineContext());
} }
@ -206,7 +217,13 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
private Config parse(String file) throws IOException { private Config parse(String file) throws IOException {
SolrLocator locator = new SolrLocator(createMorphlineContext()); SolrLocator locator = new SolrLocator(createMorphlineContext());
locator.setSolrHomeDir(testSolrHome + "/collection1"); locator.setSolrHomeDir(testSolrHome + "/collection1");
Config config = new Compiler().parse(new File(tempDir + "/" + file + ".conf"), locator.toConfig("SOLR_LOCATOR")); File morphlineFile;
if (new File(file).isAbsolute()) {
morphlineFile = new File(file + ".conf");
} else {
morphlineFile = new File(RESOURCES_DIR + "/" + file + ".conf");
}
Config config = new Compiler().parse(morphlineFile, locator.toConfig("SOLR_LOCATOR"));
config = config.getConfigList("morphlines").get(0); config = config.getConfigList("morphlines").get(0);
return config; return config;
} }
@ -266,4 +283,15 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
public HashSet<String> getFieldValues() { return fieldValues; } public HashSet<String> getFieldValues() { return fieldValues; }
public CompareType getCompareType() { return compareType; } public CompareType getCompareType() { return compareType; }
} }
/**
 * Materializes a morphline config template into {@code tempDir}.
 *
 * <p>Reads {@code RESOURCES_DIR/<file>.conf} as UTF-8, replaces every occurrence of the
 * literal token {@code RESOURCES_DIR} with the absolute path of {@code tempDir}, optionally
 * replaces every {@code ${SOLR_LOCATOR}} reference with a fixed inline locator
 * ({@code { collection : collection1 }}), and writes the result as UTF-8 to
 * {@code tempDir/<file>.conf}, creating parent directories as needed.
 *
 * @param tempDir directory that receives the rewritten config
 * @param file config path relative to {@code RESOURCES_DIR}, without the {@code .conf} suffix
 * @param replaceSolrLocator if {@code true}, inline the fixed locator; if {@code false},
 *        leave {@code ${SOLR_LOCATOR}} in place for the caller to resolve at parse time
 * @throws IOException if the template cannot be read or the output cannot be written
 */
public static void setupMorphline(String tempDir, String file, boolean replaceSolrLocator) throws IOException {
  String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");

  // Use the literal String.replace rather than replaceAll: in a regex replacement string
  // '\' and '$' are special, so an absolute path such as C:\tmp\x would lose its
  // backslashes (or throw) when passed to replaceAll.
  // NOTE(review): if the token sits inside a quoted HOCON string, backslashes may
  // additionally need escaping for the config parser -- verify on Windows.
  morphlineText = morphlineText.replace("RESOURCES_DIR", new File(tempDir).getAbsolutePath());
  if (replaceSolrLocator) {
    morphlineText = morphlineText.replace("${SOLR_LOCATOR}", "{ collection : collection1 }");
  }

  File target = new File(tempDir + "/" + file + ".conf");
  // 'file' may contain sub-directories (e.g. "test-morphlines/..."), so make sure they exist.
  target.getParentFile().mkdirs();
  FileUtils.writeStringToFile(target, morphlineText, "UTF-8");
}
} }

View File

@ -57,18 +57,21 @@ public class SolrMorphlineTest extends AbstractSolrMorphlineTestBase {
@Test @Test
public void testTokenizeText() throws Exception { public void testTokenizeText() throws Exception {
morphline = createMorphline("test-morphlines/tokenizeText"); morphline = createMorphline("test-morphlines/tokenizeText");
Record record = new Record(); for (int i = 0; i < 3; i++) {
record.put(Fields.MESSAGE, "Hello World!"); Record record = new Record();
record.put(Fields.MESSAGE, "\nFoo@Bar.com #%()123"); record.put(Fields.MESSAGE, "Hello World!");
Record expected = record.copy(); record.put(Fields.MESSAGE, "\nFoo@Bar.com #%()123");
expected.getFields().putAll("tokens", Arrays.asList("hello", "world", "foo", "bar.com", "123")); Record expected = record.copy();
startSession(); expected.getFields().putAll("tokens", Arrays.asList("hello", "world", "foo", "bar.com", "123"));
Notifications.notifyBeginTransaction(morphline); collector.reset();
assertTrue(morphline.process(record)); startSession();
assertEquals(1, collector.getNumStartEvents()); Notifications.notifyBeginTransaction(morphline);
Notifications.notifyCommitTransaction(morphline); assertTrue(morphline.process(record));
assertEquals(expected, collector.getFirstRecord()); assertEquals(1, collector.getNumStartEvents());
Notifications.notifyCommitTransaction(morphline);
assertEquals(expected, collector.getFirstRecord());
}
} }
} }