diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 41e0111ff3d..b036fe1b587 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -160,6 +160,9 @@ New Features & json.facet={ colors:{type:terms, field:color, excludeTags=COLOR} } (yonik) +* SOLR-7231: DIH-TikaEntityProcessor, create lat-lon field from Metadata + (Tim Allison via Noble Paul) + Bug Fixes ---------------------- diff --git a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java index fc16113b2c7..fb8cdb81287 100644 --- a/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java +++ b/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java @@ -45,6 +45,7 @@ import java.io.InputStream; import java.io.StringWriter; import java.io.Writer; import java.util.HashMap; +import java.util.Locale; import java.util.Map; import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE; @@ -55,6 +56,10 @@ import static org.apache.solr.handler.dataimport.XPathEntityProcessor.URL; *

An implementation of {@link EntityProcessor} which reads data from rich docs * using Apache Tika * + *

To index latitude/longitude data that might + * be extracted from a file's metadata, identify + * the geo field for this information with this attribute: + * spatialMetadataField * * @since solr 3.1 */ @@ -67,6 +72,7 @@ public class TikaEntityProcessor extends EntityProcessorBase { private String parser; static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser"; private String htmlMapper; + /** Name of the row field that receives the "lat,lon" value; set from the entity attribute "spatialMetadataField". When null, no lat/lon value is added. */ private String spatialMetadataField; @Override public void init(Context context) { @@ -113,6 +119,8 @@ public class TikaEntityProcessor extends EntityProcessorBase { if(parser == null) { parser = AUTO_PARSER; } + + /* presumably null when the attribute is not configured; tryToAddLatLon treats null as "disabled" - TODO confirm */ spatialMetadataField = context.getResolvedEntityAttribute("spatialMetadataField"); } @Override @@ -167,10 +175,20 @@ public class TikaEntityProcessor extends EntityProcessorBase { if (s != null) row.put(col, s); } if(!"none".equals(format) ) row.put("text", sw.toString()); + tryToAddLatLon(metadata, row); done = true; return row; } + /** Puts a "latitude,longitude" string into the row under the field named by spatialMetadataField. Does nothing when spatialMetadataField is null, or when either Metadata.LATITUDE or Metadata.LONGITUDE is absent from the metadata. The two metadata strings are copied as-is, joined by a comma. NOTE(review): row is a raw Map in this view - confirm whether it should be parameterized to match the caller's declaration. @param metadata metadata to read the latitude and longitude from @param row the output row that receives the combined value */ private void tryToAddLatLon(Metadata metadata, Map row) { + if (spatialMetadataField == null) return; + String latString = metadata.get(Metadata.LATITUDE); + String lonString = metadata.get(Metadata.LONGITUDE); + /* only emit a point when both coordinates are present */ if (latString != null && lonString != null) { + row.put(spatialMetadataField, String.format(Locale.ROOT, "%s,%s", latString, lonString)); + } + } + private static ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException { SAXTransformerFactory factory = (SAXTransformerFactory) diff --git a/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml b/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml index 1ebb9d9d953..2d9fa64d618 100644 --- a/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml +++ 
b/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/solr/collection1/conf/dataimport-schema-no-unique-key.xml @@ -77,6 +77,8 @@ + + text diff --git a/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_jpeg.jpg b/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_jpeg.jpg new file mode 100644 index 00000000000..10d1ebb2d32 Binary files /dev/null and b/solr/contrib/dataimporthandler-extras/src/test-files/dihextras/test_jpeg.jpg differ diff --git a/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java b/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java index 3d4b1ab2d86..fe769bc8c87 100644 --- a/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java +++ b/solr/contrib/dataimporthandler-extras/src/test/org/apache/solr/handler/dataimport/TestTikaEntityProcessor.java @@ -51,6 +51,18 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase { " " + ""; + private String spatialConf = + "" + + " " + + " " + + " " + + " " + + " " + + ""; + + private String[] tests = { "//*[@numFound='1']" ,"//str[@name='author'][.='Grant Ingersoll']" @@ -74,6 +86,10 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase { , "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased }; + private String[] testsSpatial = { + "//*[@numFound='1']" + }; + private String[] testsEmbedded = { "//*[@numFound='1']", "//str[@name='text'][contains(.,'When in the Course')]" @@ -121,6 +137,16 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase { assertQ(req("*:*"), testsHTMLIdentity); } + /** Runs a full import with spatialConf, then checks that a geofilt filter on the "home" field, centred on pt=38.97,-77.018 with d=5.0, matches exactly one document (testsSpatial). The filter uses the plain local-params form with no query body. */ @Test + public void testTikaGeoMetadata() throws Exception { + runFullImport(spatialConf); + String pt = "38.97,-77.018"; + double distance = 5.0d; + assertQ(req("q", "*:* OR foo_i:" + 
random().nextInt(100), "fq", + "{!geofilt sfield=\"home\"}", + "pt", pt, "d", String.valueOf(distance)), testsSpatial); + } + private String getConfigHTML(String htmlMapper) { return "" +