mirror of https://github.com/apache/lucene.git
SOLR-1301: Merge Morphlines modules up to Kite 0.10 and CDK 0.9
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1552398 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
e7d605a2d5
commit
0a6017d7da
|
@ -92,6 +92,7 @@
|
|||
<path id="test.classpath">
|
||||
<path refid="solr.test.base.classpath"/>
|
||||
<path refid="classpath.additions"/>
|
||||
<pathelement location="${common-solr.dir}/build/contrib/solr-morphlines-core/classes/test"/>
|
||||
<fileset dir="${common-solr.dir}/contrib/morphlines-core/test-lib" excludes="${common.classpath.excludes}"/>
|
||||
</path>
|
||||
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
Age,Color,Extras,Type,Used
|
||||
2,blue,GPS,"Gas, with electric",""
|
||||
10,green,"Labeled ""Vintage, 1913""",,yes
|
||||
100,red,"Labeled ""Vintage 1913""",yes
|
||||
5,orange,none,"This is a
|
||||
multi, line text",no
|
Can't render this file because it has a wrong number of fields in line 4.
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,40 @@
|
|||
MIME-Version: 1.0
|
||||
Received: by 10.216.199.5 with HTTP; Wed, 27 Nov 2013 12:01:23 -0800
|
||||
(PST)
|
||||
Date: Wed, 27 Nov 2013 13:01:23 -0700
|
||||
Delivered-To: foo@cloudera.com
|
||||
Message-ID:
|
||||
<CAOi5V169EW4GCfde_aNKSBgqAD=KSPVO6Batw_Oko-8cmAgK6w@mail.gmail.com>
|
||||
Subject: Test EML
|
||||
From: Patrick Foo <foo@cloudera.com>
|
||||
To: Patrick Foo <foo@cloudera.com>
|
||||
Content-Type: multipart/alternative;
|
||||
boundary=001a11c3815cb55dda04ec2e0f3b
|
||||
|
||||
--001a11c3815cb55dda04ec2e0f3b
|
||||
Content-Type: text/plain; charset=ISO-8859-1
|
||||
|
||||
This is a test
|
||||
|
||||
--
|
||||
Patrick Foo
|
||||
Customer Operations Engineer
|
||||
|
||||
<http://www.cloudera.com>
|
||||
|
||||
--001a11c3815cb55dda04ec2e0f3b
|
||||
Content-Type: text/html; charset=ISO-8859-1
|
||||
Content-Transfer-Encoding: quoted-printable
|
||||
|
||||
<div dir=3D"ltr">This is a test<br clear=3D"all"><div><br></div>--
|
||||
<br><div=
|
||||
dir=3D"ltr">Patrick Foo<div>Customer Operations
|
||||
Engineer</div><div><br>=
|
||||
</div><div><a href=3D"http://www.cloudera.com" target=3D"_blank"><img
|
||||
src=
|
||||
=3D"http://files.cloudera.com.s3.amazonaws.com/New%20Branding/cloudera-smal=
|
||||
l.png"></a><br>
|
||||
</div></div>
|
||||
</div>
|
||||
|
||||
--001a11c3815cb55dda04ec2e0f3b--
|
|
@ -0,0 +1,41 @@
|
|||
From: "Julien Nioche (JIRA)" <jira@apache.org>
|
||||
To: dev@tika.apache.org
|
||||
Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
|
||||
Reply-To: dev@tika.apache.org
|
||||
Delivered-To: mailing list dev@tika.apache.org
|
||||
Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
|
||||
In-Reply-To: <6089099.260231278600349994.JavaMail.jira@thor>
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=utf-8
|
||||
Content-Transfer-Encoding: 7bit
|
||||
X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
|
||||
X-Virus-Checked: Checked by ClamAV on apache.org
|
||||
|
||||
|
||||
[ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ]
|
||||
|
||||
Julien Nioche commented on TIKA-461:
|
||||
------------------------------------
|
||||
|
||||
I'll have a look at mime4j and try to use it in Tika
|
||||
|
||||
> RFC822 messages not parsed
|
||||
> --------------------------
|
||||
>
|
||||
> Key: TIKA-461
|
||||
> URL: https://issues.apache.org/jira/browse/TIKA-461
|
||||
> Project: Tika
|
||||
> Issue Type: Bug
|
||||
> Components: parser
|
||||
> Affects Versions: 0.7
|
||||
> Reporter: Joshua Turner
|
||||
> Assignee: Julien Nioche
|
||||
>
|
||||
> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
|
||||
> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
|
||||
|
||||
--
|
||||
This message is automatically generated by JIRA.
|
||||
-
|
||||
You can reply to this email to add a comment to the issue online.
|
||||
|
|
@ -78,6 +78,32 @@ morphlines : [
|
|||
]
|
||||
}
|
||||
|
||||
{
|
||||
commands : [
|
||||
{
|
||||
readCSV {
|
||||
supportedMimeTypes : [text/csv]
|
||||
charset : UTF-8
|
||||
ignoreFirstLine : false
|
||||
columns : [ user_screen_name, text ]
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
generateUUID {
|
||||
field : id
|
||||
preserveExisting : false
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
sanitizeUnknownSolrFields {
|
||||
solrLocator : ${SOLR_LOCATOR}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
{
|
||||
commands : [
|
||||
{
|
||||
|
@ -180,6 +206,7 @@ morphlines : [
|
|||
# the parser is chosen that is closest to the bottom in this list:
|
||||
parsers : [
|
||||
{ parser : org.apache.tika.parser.asm.ClassParser }
|
||||
# { parser : org.apache.tika.parser.AutoDetectParser }
|
||||
# { parser : org.gagravarr.tika.OggParser, additionalSupportedMimeTypes : [audio/ogg] }
|
||||
{ parser : org.gagravarr.tika.FlacParser }
|
||||
{ parser : org.apache.tika.parser.audio.AudioParser }
|
||||
|
@ -218,6 +245,7 @@ morphlines : [
|
|||
{ parser : org.apache.tika.parser.xml.DcXMLParser }
|
||||
{ parser : org.apache.tika.parser.xml.FictionBookParser }
|
||||
{ parser : org.apache.tika.parser.chm.ChmParser }
|
||||
#{ parser : org.apache.tika.parser.AutoDetectParser }
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.commons.io.FileUtils;
|
|||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.hadoop.morphline.MorphlineMapRunner;
|
||||
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
|
||||
import org.apache.solr.util.ExternalPaths;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
@ -50,17 +51,10 @@ public abstract class MRUnitBase extends SolrTestCaseJ4 {
|
|||
new File(tempDir).mkdirs();
|
||||
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
|
||||
|
||||
setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes");
|
||||
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
|
||||
|
||||
config.set(MorphlineMapRunner.MORPHLINE_FILE_PARAM, tempDir + "/test-morphlines/solrCellDocumentTypes.conf");
|
||||
config.set(SolrOutputFormat.ZIP_NAME, solrHomeZip.getName());
|
||||
}
|
||||
|
||||
public static void setupMorphline(String tempDir, String file) throws IOException {
|
||||
String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");
|
||||
morphlineText = morphlineText.replaceAll("RESOURCES_DIR", new File(tempDir).getAbsolutePath());
|
||||
morphlineText = morphlineText.replaceAll("\\$\\{SOLR_LOCATOR\\}", "{ collection : collection1 }");
|
||||
|
||||
FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,6 +42,7 @@ import org.apache.lucene.util.LuceneTestCase.Slow;
|
|||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.cloud.AbstractZkTestCase;
|
||||
import org.apache.solr.hadoop.hack.MiniMRCluster;
|
||||
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
|
||||
import org.apache.solr.util.ExternalPaths;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
|
@ -125,7 +126,7 @@ public class MorphlineBasicMiniMRTest extends SolrTestCaseJ4 {
|
|||
new File(tempDir).mkdirs();
|
||||
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
|
||||
|
||||
MRUnitBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes");
|
||||
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
|
||||
|
||||
System.setProperty("hadoop.log.dir", new File(solrHomeDirectory, "logs").getAbsolutePath());
|
||||
|
||||
|
|
|
@ -67,6 +67,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
|
|||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.hadoop.hack.MiniMRClientCluster;
|
||||
import org.apache.solr.hadoop.hack.MiniMRClientClusterFactory;
|
||||
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
|
||||
import org.apache.solr.util.ExternalPaths;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
|
@ -142,7 +143,7 @@ public class MorphlineGoLiveMiniMRTest extends AbstractFullDistribZkTestBase {
|
|||
new File(tempDir).mkdirs();
|
||||
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
|
||||
|
||||
MRUnitBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes");
|
||||
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", true);
|
||||
|
||||
|
||||
System.setProperty("hadoop.log.dir", new File(dataDir, "logs").getAbsolutePath());
|
||||
|
|
|
@ -223,16 +223,12 @@ public final class SolrCellBuilder implements CommandBuilder {
|
|||
|
||||
ParseContext parseContext = new ParseContext();
|
||||
|
||||
// necessary for gzipped files or tar files, etc! copied from TikaCLI
|
||||
parseContext.set(Parser.class, parser);
|
||||
|
||||
Metadata metadata = new Metadata();
|
||||
for (Entry<String, Object> entry : record.getFields().entries()) {
|
||||
metadata.add(entry.getKey(), entry.getValue().toString());
|
||||
}
|
||||
|
||||
SolrContentHandler handler = solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
|
||||
|
||||
try {
|
||||
inputStream = TikaInputStream.get(inputStream);
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.solr.morphlines.cell;
|
|||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
|
@ -37,7 +38,7 @@ import org.junit.Test;
|
|||
public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
|
||||
|
||||
private Map<String,Integer> expectedRecords = new HashMap<String,Integer>();
|
||||
|
||||
private Map<String, Map<String, Object>> expectedRecordContents = new HashMap<String, Map<String, Object>>();
|
||||
@BeforeClass
|
||||
public static void beforeClass2() {
|
||||
assumeFalse("FIXME: This test fails under Java 8 due to the Saxon dependency - see SOLR-1301", Constants.JRE_IS_MINIMUM_JAVA8);
|
||||
|
@ -47,16 +48,17 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
@Before
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
|
||||
String path = RESOURCES_DIR + "/test-documents";
|
||||
expectedRecords.put(path + "/sample-statuses-20120906-141433.avro", 2);
|
||||
expectedRecords.put(path + "/sample-statuses-20120906-141433", 2);
|
||||
expectedRecords.put(path + "/sample-statuses-20120906-141433.gz", 2);
|
||||
expectedRecords.put(path + "/sample-statuses-20120906-141433.bz2", 2);
|
||||
expectedRecords.put(path + "/cars.csv", 5);
|
||||
expectedRecords.put(path + "/cars.csv.gz", 5);
|
||||
expectedRecords.put(path + "/cars.csv", 6);
|
||||
expectedRecords.put(path + "/cars.csv.gz", 6);
|
||||
expectedRecords.put(path + "/cars.tar.gz", 4);
|
||||
expectedRecords.put(path + "/cars.tsv", 5);
|
||||
expectedRecords.put(path + "/cars.ssv", 5);
|
||||
expectedRecords.put(path + "/cars.tsv", 6);
|
||||
expectedRecords.put(path + "/cars.ssv", 6);
|
||||
expectedRecords.put(path + "/test-documents.7z", 9);
|
||||
expectedRecords.put(path + "/test-documents.cpio", 9);
|
||||
expectedRecords.put(path + "/test-documents.tar", 9);
|
||||
|
@ -65,12 +67,80 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
expectedRecords.put(path + "/test-documents.zip", 9);
|
||||
expectedRecords.put(path + "/multiline-stacktrace.log", 4);
|
||||
|
||||
{
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
record.put("ignored__attachment_mimetype", "image/jpeg");
|
||||
record.put("ignored_exif_isospeedratings", "400");
|
||||
record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
|
||||
record.put("ignored_tiff_model", "Canon EOS 40D");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
|
||||
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
|
||||
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
|
||||
}
|
||||
|
||||
{
|
||||
String file = path + "/testWORD_various.doc";
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
record.put("ignored__attachment_mimetype", "application/msword");
|
||||
record.put("ignored_author", "Michael McCandless");
|
||||
record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
|
||||
record.put("ignored_title", "");
|
||||
record.put("ignored_keywords", "Keyword1 Keyword2");
|
||||
record.put("ignored_subject", "Subject is here");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put(file, record);
|
||||
}
|
||||
|
||||
{
|
||||
String file = path + "/testPDF.pdf";
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
record.put("ignored__attachment_mimetype", "application/pdf");
|
||||
record.put("ignored_author", "Bertrand Delacrétaz");
|
||||
record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
|
||||
record.put("ignored_title", "Apache Tika - Apache Tika");
|
||||
record.put("ignored_xmp_creatortool", "Firefox");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put(file, record);
|
||||
}
|
||||
|
||||
{
|
||||
String file = path + "/email.eml";
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
String name = "Patrick Foo <foo@cloudera.com>";
|
||||
record.put("ignored__attachment_mimetype", "message/rfc822");
|
||||
record.put("ignored_author", name);
|
||||
//record.put("ignored_content_length", "1068");
|
||||
record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
|
||||
record.put("ignored_message_from", name);
|
||||
record.put("ignored_message_to", name);
|
||||
record.put("ignored_creator", name);
|
||||
record.put("ignored_dc_creator", name);
|
||||
record.put("ignored_dc_title", "Test EML");
|
||||
record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
|
||||
record.put("ignored_meta_author", name);
|
||||
record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
|
||||
record.put("ignored_subject", "Test EML");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put(file, record);
|
||||
}
|
||||
|
||||
{
|
||||
String file = path + "/testEXCEL.xlsx";
|
||||
Map<String, Object> record = new LinkedHashMap();
|
||||
record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
|
||||
record.put("ignored_author", "Keith Bennett");
|
||||
record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
|
||||
record.put("ignored_title", "Simple Excel document");
|
||||
record.put("text", NON_EMPTY_FIELD);
|
||||
expectedRecordContents.put(file, record);
|
||||
}
|
||||
|
||||
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSolrCellJPGCompressed() throws Exception {
|
||||
|
||||
morphline = createMorphline("test-morphlines/solrCellJPGCompressed");
|
||||
String path = RESOURCES_DIR + "/test-documents";
|
||||
String[] files = new String[] {
|
||||
|
@ -79,7 +149,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
path + "/testJPEG_EXIF.jpg.tar.gz",
|
||||
//path + "/jpeg2000.jp2",
|
||||
};
|
||||
testDocumentTypesInternal(files, expectedRecords);
|
||||
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -89,13 +159,14 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
String[] files = new String[] {
|
||||
path + "/testXML2.xml",
|
||||
};
|
||||
testDocumentTypesInternal(files, expectedRecords);
|
||||
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSolrCellDocumentTypes() throws Exception {
|
||||
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
|
||||
|
||||
morphline = createMorphline("test-morphlines/solrCellDocumentTypes");
|
||||
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
|
||||
String path = RESOURCES_DIR + "/test-documents";
|
||||
String[] files = new String[] {
|
||||
path + "/testBMPfp.txt",
|
||||
|
@ -107,22 +178,26 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
path + "/testJPEG_EXIF.jpg.gz",
|
||||
path + "/testJPEG_EXIF.jpg.tar.gz",
|
||||
path + "/testXML.xml",
|
||||
// path + "/cars.csv",
|
||||
path + "/cars.csv",
|
||||
// path + "/cars.tsv",
|
||||
// path + "/cars.ssv",
|
||||
// path + "/cars.csv.gz",
|
||||
// path + "/cars.tar.gz",
|
||||
path + "/cars.csv.gz",
|
||||
path + "/cars.tar.gz",
|
||||
path + "/sample-statuses-20120906-141433.avro",
|
||||
path + "/sample-statuses-20120906-141433",
|
||||
path + "/sample-statuses-20120906-141433.gz",
|
||||
path + "/sample-statuses-20120906-141433.bz2",
|
||||
path + "/email.eml",
|
||||
};
|
||||
testDocumentTypesInternal(files, expectedRecords);
|
||||
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSolrCellDocumentTypes2() throws Exception {
|
||||
morphline = createMorphline("test-morphlines/solrCellDocumentTypes");
|
||||
|
||||
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
|
||||
|
||||
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
|
||||
String path = RESOURCES_DIR + "/test-documents";
|
||||
String[] files = new String[] {
|
||||
path + "/testPPT_various.ppt",
|
||||
|
@ -137,7 +212,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
path + "/complex.mbox",
|
||||
path + "/test-outlook.msg",
|
||||
path + "/testEMLX.emlx",
|
||||
// path + "/testRFC822",
|
||||
path + "/testRFC822",
|
||||
path + "/rsstest.rss",
|
||||
// path + "/testDITA.dita",
|
||||
|
||||
|
@ -176,7 +251,7 @@ public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
// path + "/testWINMAIL.dat",
|
||||
// path + "/testWMF.wmf",
|
||||
};
|
||||
testDocumentTypesInternal(files, expectedRecords);
|
||||
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -20,11 +20,13 @@ import org.kitesdk.morphline.api.MorphlineCompilationException;
|
|||
import org.kitesdk.morphline.api.MorphlineContext;
|
||||
import org.kitesdk.morphline.api.MorphlineRuntimeException;
|
||||
import org.kitesdk.morphline.base.Configs;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.typesafe.config.Config;
|
||||
import com.typesafe.config.ConfigFactory;
|
||||
import com.typesafe.config.ConfigRenderOptions;
|
||||
import com.typesafe.config.ConfigUtil;
|
||||
|
||||
import org.apache.solr.client.solrj.SolrServer;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrServer;
|
||||
import org.apache.solr.common.cloud.SolrZkClient;
|
||||
|
@ -39,8 +41,10 @@ import org.xml.sax.InputSource;
|
|||
import org.xml.sax.SAXException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
/**
|
||||
* Set of configuration parameters that identify the location and schema of a Solr server or
|
||||
|
@ -57,8 +61,6 @@ public class SolrLocator {
|
|||
private String solrHomeDir;
|
||||
private int batchSize = 1000;
|
||||
|
||||
private static final String SOLR_HOME_PROPERTY_NAME = "solr.solr.home";
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SolrLocator.class);
|
||||
|
||||
protected SolrLocator(MorphlineContext context) {
|
||||
|
@ -120,7 +122,6 @@ public class SolrLocator {
|
|||
|
||||
// If solrHomeDir isn't defined and zkHost and collectionName are defined
|
||||
// then download schema.xml and solrconfig.xml, etc from zk and use that as solrHomeDir
|
||||
String oldSolrHomeDir = null;
|
||||
String mySolrHomeDir = solrHomeDir;
|
||||
if (solrHomeDir == null || solrHomeDir.length() == 0) {
|
||||
if (zkHost == null || zkHost.length() == 0) {
|
||||
|
@ -150,17 +151,10 @@ public class SolrLocator {
|
|||
}
|
||||
}
|
||||
|
||||
oldSolrHomeDir = System.setProperty(SOLR_HOME_PROPERTY_NAME, mySolrHomeDir);
|
||||
LOG.debug("SolrLocator loading IndexSchema from dir {}", mySolrHomeDir);
|
||||
try {
|
||||
SolrConfig solrConfig = new SolrConfig(); // TODO use SolrResourceLoader ala TikaMapper?
|
||||
// SolrConfig solrConfig = new SolrConfig("solrconfig.xml");
|
||||
// SolrConfig solrConfig = new
|
||||
// SolrConfig("/cloud/apache-solr-4.0.0-BETA/example/solr/collection1",
|
||||
// "solrconfig.xml", null);
|
||||
// SolrConfig solrConfig = new
|
||||
// SolrConfig("/cloud/apache-solr-4.0.0-BETA/example/solr/collection1/conf/solrconfig.xml");
|
||||
SolrResourceLoader loader = solrConfig.getResourceLoader();
|
||||
|
||||
SolrResourceLoader loader = new SolrResourceLoader(mySolrHomeDir);
|
||||
SolrConfig solrConfig = new SolrConfig(loader, "solrconfig.xml", null);
|
||||
InputSource is = new InputSource(loader.openSchema("schema.xml"));
|
||||
is.setSystemId(SystemIdResolver.createSystemIdFromResourceName("schema.xml"));
|
||||
|
||||
|
@ -173,14 +167,6 @@ public class SolrLocator {
|
|||
throw new MorphlineRuntimeException(e);
|
||||
} catch (SAXException e) {
|
||||
throw new MorphlineRuntimeException(e);
|
||||
} finally { // restore old global state
|
||||
if (solrHomeDir != null) {
|
||||
if (oldSolrHomeDir == null) {
|
||||
System.clearProperty(SOLR_HOME_PROPERTY_NAME);
|
||||
} else {
|
||||
System.setProperty(SOLR_HOME_PROPERTY_NAME, oldSolrHomeDir);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.solr.morphlines.solr;
|
|||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -28,7 +29,6 @@ import java.util.Map.Entry;
|
|||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.lang.StringEscapeUtils;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrServer;
|
||||
|
@ -41,9 +41,6 @@ import org.apache.solr.util.ExternalPaths;
|
|||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.kitesdk.morphline.api.Collector;
|
||||
import org.kitesdk.morphline.api.Command;
|
||||
import org.kitesdk.morphline.api.MorphlineContext;
|
||||
|
@ -53,6 +50,9 @@ import org.kitesdk.morphline.base.FaultTolerance;
|
|||
import org.kitesdk.morphline.base.Fields;
|
||||
import org.kitesdk.morphline.base.Notifications;
|
||||
import org.kitesdk.morphline.stdlib.PipeBuilder;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
import com.google.common.io.Files;
|
||||
import com.typesafe.config.Config;
|
||||
|
@ -73,6 +73,8 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
|
|||
protected static final AtomicInteger SEQ_NUM = new AtomicInteger();
|
||||
protected static final AtomicInteger SEQ_NUM2 = new AtomicInteger();
|
||||
|
||||
protected static final Object NON_EMPTY_FIELD = new Object();
|
||||
|
||||
private static final Logger LOGGER = LoggerFactory.getLogger(AbstractSolrMorphlineTestBase.class);
|
||||
|
||||
protected String tempDir;
|
||||
|
@ -113,7 +115,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
|
|||
testServer = new SolrServerDocumentLoader(solrServer, batchSize);
|
||||
deleteAllDocuments();
|
||||
|
||||
tempDir = TEMP_DIR + "/test-morphlines-" + System.currentTimeMillis();
|
||||
tempDir = new File(TEMP_DIR + "/test-morphlines-" + System.currentTimeMillis()).getAbsolutePath();
|
||||
new File(tempDir).mkdirs();
|
||||
}
|
||||
|
||||
|
@ -124,7 +126,11 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
|
|||
super.tearDown();
|
||||
}
|
||||
|
||||
protected void testDocumentTypesInternal(String[] files, Map<String,Integer> expectedRecords) throws Exception {
|
||||
protected void testDocumentTypesInternal(
|
||||
String[] files,
|
||||
Map<String,Integer> expectedRecords,
|
||||
Map<String, Map<String, Object>> expectedRecordContents) throws Exception {
|
||||
|
||||
deleteAllDocuments();
|
||||
int numDocs = 0;
|
||||
for (int i = 0; i < 1; i++) {
|
||||
|
@ -137,6 +143,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
|
|||
event.getFields().put(Fields.ATTACHMENT_BODY, new ByteArrayInputStream(body));
|
||||
event.getFields().put(Fields.ATTACHMENT_NAME, f.getName());
|
||||
event.getFields().put(Fields.BASE_ID, f.getName());
|
||||
collector.reset();
|
||||
load(event);
|
||||
Integer count = expectedRecords.get(file);
|
||||
if (count != null) {
|
||||
|
@ -145,6 +152,20 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
|
|||
numDocs++;
|
||||
}
|
||||
assertEquals("unexpected results in " + file, numDocs, queryResultSetSize("*:*"));
|
||||
Map<String, Object> expectedContents = expectedRecordContents.get(file);
|
||||
if (expectedContents != null) {
|
||||
Record actual = collector.getFirstRecord();
|
||||
for (Map.Entry<String, Object> entry : expectedContents.entrySet()) {
|
||||
if (entry.getValue() == NON_EMPTY_FIELD) {
|
||||
assertNotNull(entry.getKey());
|
||||
assertTrue(actual.getFirstValue(entry.getKey()).toString().length() > 0);
|
||||
} else if (entry.getValue() == null) {
|
||||
assertEquals("key:" + entry.getKey(), 0, actual.get(entry.getKey()).size());
|
||||
} else {
|
||||
assertEquals("key:" + entry.getKey(), Arrays.asList(entry.getValue()), actual.get(entry.getKey()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
assertEquals(numDocs, queryResultSetSize("*:*"));
|
||||
|
@ -180,17 +201,7 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
|
|||
s.commit();
|
||||
}
|
||||
|
||||
|
||||
public static void setupMorphline(String tempDir, String file) throws IOException {
|
||||
String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");
|
||||
morphlineText = morphlineText.replace("RESOURCES_DIR", StringEscapeUtils.escapeJavaScript(new File(tempDir).getAbsolutePath()));
|
||||
|
||||
FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8");
|
||||
}
|
||||
|
||||
protected Command createMorphline(String file) throws IOException {
|
||||
setupMorphline(tempDir, file);
|
||||
|
||||
return new PipeBuilder().build(parse(file), null, collector, createMorphlineContext());
|
||||
}
|
||||
|
||||
|
@ -206,7 +217,13 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
|
|||
private Config parse(String file) throws IOException {
|
||||
SolrLocator locator = new SolrLocator(createMorphlineContext());
|
||||
locator.setSolrHomeDir(testSolrHome + "/collection1");
|
||||
Config config = new Compiler().parse(new File(tempDir + "/" + file + ".conf"), locator.toConfig("SOLR_LOCATOR"));
|
||||
File morphlineFile;
|
||||
if (new File(file).isAbsolute()) {
|
||||
morphlineFile = new File(file + ".conf");
|
||||
} else {
|
||||
morphlineFile = new File(RESOURCES_DIR + "/" + file + ".conf");
|
||||
}
|
||||
Config config = new Compiler().parse(morphlineFile, locator.toConfig("SOLR_LOCATOR"));
|
||||
config = config.getConfigList("morphlines").get(0);
|
||||
return config;
|
||||
}
|
||||
|
@ -266,4 +283,15 @@ public class AbstractSolrMorphlineTestBase extends SolrTestCaseJ4 {
|
|||
public HashSet<String> getFieldValues() { return fieldValues; }
|
||||
public CompareType getCompareType() { return compareType; }
|
||||
}
|
||||
|
||||
public static void setupMorphline(String tempDir, String file, boolean replaceSolrLocator) throws IOException {
|
||||
String morphlineText = FileUtils.readFileToString(new File(RESOURCES_DIR + "/" + file + ".conf"), "UTF-8");
|
||||
morphlineText = morphlineText.replaceAll("RESOURCES_DIR", new File(tempDir).getAbsolutePath());
|
||||
if (replaceSolrLocator) {
|
||||
morphlineText = morphlineText.replaceAll("\\$\\{SOLR_LOCATOR\\}",
|
||||
"{ collection : collection1 }");
|
||||
}
|
||||
new File(tempDir + "/" + file + ".conf").getParentFile().mkdirs();
|
||||
FileUtils.writeStringToFile(new File(tempDir + "/" + file + ".conf"), morphlineText, "UTF-8");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,11 +58,13 @@ public class SolrMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
@Test
|
||||
public void testTokenizeText() throws Exception {
|
||||
morphline = createMorphline("test-morphlines/tokenizeText");
|
||||
for (int i = 0; i < 3; i++) {
|
||||
Record record = new Record();
|
||||
record.put(Fields.MESSAGE, "Hello World!");
|
||||
record.put(Fields.MESSAGE, "\nFoo@Bar.com #%()123");
|
||||
Record expected = record.copy();
|
||||
expected.getFields().putAll("tokens", Arrays.asList("hello", "world", "foo", "bar.com", "123"));
|
||||
collector.reset();
|
||||
startSession();
|
||||
Notifications.notifyBeginTransaction(morphline);
|
||||
assertTrue(morphline.process(record));
|
||||
|
@ -70,5 +72,6 @@ public class SolrMorphlineTest extends AbstractSolrMorphlineTestBase {
|
|||
Notifications.notifyCommitTransaction(morphline);
|
||||
assertEquals(expected, collector.getFirstRecord());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue