SOLR-7231: DIH-TikaEntityProcessor, create lat-lon field from Metadata

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1677001 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Noble Paul 2015-04-30 14:20:51 +00:00
parent c56f522a8f
commit bd8b77f9bb
5 changed files with 51 additions and 1 deletions

View File

@ -160,6 +160,9 @@ New Features
& json.facet={ colors:{type:terms, field:color, excludeTags=COLOR} }
(yonik)
* SOLR-7231: DIH-TikaEntityProcessor, create lat-lon field from Metadata
(Tim Allison via Noble Paul)
Bug Fixes
----------------------

View File

@ -45,6 +45,7 @@ import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
@ -55,6 +56,10 @@ import static org.apache.solr.handler.dataimport.XPathEntityProcessor.URL;
* <p>An implementation of {@link EntityProcessor} which reads data from rich docs
* using <a href="http://tika.apache.org/">Apache Tika</a>
*
* <p>To index latitude/longitude data that might
* be extracted from a file's metadata, identify
* the geo field for this information with this attribute:
* <code>spatialMetadataField</code>
*
* @since solr 3.1
*/
@ -67,6 +72,7 @@ public class TikaEntityProcessor extends EntityProcessorBase {
private String parser;
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
private String htmlMapper;
private String spatialMetadataField;
@Override
public void init(Context context) {
@ -113,6 +119,8 @@ public class TikaEntityProcessor extends EntityProcessorBase {
if(parser == null) {
parser = AUTO_PARSER;
}
spatialMetadataField = context.getResolvedEntityAttribute("spatialMetadataField");
}
@Override
@ -167,10 +175,20 @@ public class TikaEntityProcessor extends EntityProcessorBase {
if (s != null) row.put(col, s);
}
if(!"none".equals(format) ) row.put("text", sw.toString());
tryToAddLatLon(metadata, row);
done = true;
return row;
}
/**
 * Stores a combined {@code "lat,lon"} value in the row under the configured
 * spatial field.
 *
 * <p>Nothing is written when {@code spatialMetadataField} is {@code null}, or
 * when either the latitude or the longitude entry is missing from the metadata.
 *
 * @param metadata the metadata to read {@code Metadata.LATITUDE} and
 *                 {@code Metadata.LONGITUDE} from
 * @param row      the row that receives the combined value
 */
private void tryToAddLatLon(Metadata metadata, Map<String, Object> row) {
  if (spatialMetadataField != null) {
    final String latitude = metadata.get(Metadata.LATITUDE);
    final String longitude = metadata.get(Metadata.LONGITUDE);
    final boolean hasBothCoordinates = latitude != null && longitude != null;
    if (hasBothCoordinates) {
      // Latitude first, then longitude, separated by a single comma.
      final String latLon = String.format(Locale.ROOT, "%s,%s", latitude, longitude);
      row.put(spatialMetadataField, latLon);
    }
  }
}
private static ContentHandler getHtmlHandler(Writer writer)
throws TransformerConfigurationException {
SAXTransformerFactory factory = (SAXTransformerFactory)

View File

@ -77,6 +77,8 @@
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="latLon" class="solr.LatLonType" subFieldType="double"/>
<!--
Numeric field types that index each value at various levels of precision
@ -199,7 +201,8 @@
<field name="title" type="string" indexed="true" stored="true"/>
<field name="author" type="string" indexed="true" stored="true" />
<field name="text" type="text" indexed="true" stored="true" />
<field name="foo_i" type="int" indexed="true" stored="false" />
<field name="home" type="latLon" indexed="true" stored="true" />
</fields>
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<defaultSearchField>text</defaultSearchField>

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

View File

@ -51,6 +51,18 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
" </document>" +
"</dataConfig>";
/**
 * Data-import configuration used by the geo-metadata test: a single
 * {@code TikaEntityProcessor} entity backed by a {@code BinFileDataSource}
 * that reads {@code dihextras/test_jpeg.jpg} and sets
 * {@code spatialMetadataField} to {@code home}.
 */
private String spatialConf =
"<dataConfig>" +
" <dataSource type=\"BinFileDataSource\"/>" +
" <document>" +
" <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" +
getFile("dihextras/test_jpeg.jpg").getAbsolutePath() + "\" spatialMetadataField=\"home\">" +
" <field column=\"text\"/>" +
" </entity>" +
" </document>" +
"</dataConfig>";
private String[] tests = {
"//*[@numFound='1']"
,"//str[@name='author'][.='Grant Ingersoll']"
@ -74,6 +86,10 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
, "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
};
/** XPath checks for the spatial query: exactly one document must be found. */
private String[] testsSpatial = {
"//*[@numFound='1']"
};
private String[] testsEmbedded = {
"//*[@numFound='1']",
"//str[@name='text'][contains(.,'When in the Course')]"
@ -121,6 +137,16 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
assertQ(req("*:*"), testsHTMLIdentity);
}
/**
 * Imports the test JPEG with {@code spatialMetadataField="home"} and verifies
 * that a {@code geofilt} filter on the {@code home} field, centred on
 * {@code pt} with distance {@code d}, matches the imported document.
 */
@Test
public void testTikaGeoMetadata() throws Exception {
  runFullImport(spatialConf);
  String pt = "38.97,-77.018";
  double distance = 5.0d;
  // NOTE(review): the random foo_i clause presumably keeps the main query
  // from being served out of a cache between runs -- confirm.
  String mainQuery = "*:* OR foo_i:" + random().nextInt(100);
  // geofilt takes everything from local params plus pt/d; no query body
  // follows the closing brace.
  String geoFilter = "{!geofilt sfield=\"home\"}";
  assertQ(req("q", mainQuery,
      "fq", geoFilter,
      "pt", pt,
      "d", String.valueOf(distance)), testsSpatial);
}
private String getConfigHTML(String htmlMapper) {
return
"<dataConfig>" +