mirror of https://github.com/apache/lucene.git
SOLR-7231: DIH-TikaEntityprocessor, create lat-lon field from Metadata
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1677001 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c56f522a8f
commit
bd8b77f9bb
|
@ -160,6 +160,9 @@ New Features
|
|||
& json.facet={ colors:{type:terms, field:color, excludeTags=COLOR} }
|
||||
(yonik)
|
||||
|
||||
* SOLR-7231: DIH-TikaEntityprocessor, create lat-lon field from Metadata
|
||||
(Tim Allison via Noble Paul)
|
||||
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
|
|
@ -45,6 +45,7 @@ import java.io.InputStream;
|
|||
import java.io.StringWriter;
|
||||
import java.io.Writer;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
|
||||
|
@ -55,6 +56,10 @@ import static org.apache.solr.handler.dataimport.XPathEntityProcessor.URL;
|
|||
* <p>An implementation of {@link EntityProcessor} which reads data from rich docs
|
||||
* using <a href="http://tika.apache.org/">Apache Tika</a>
|
||||
*
|
||||
* <p>To index latitude/longitude data that might
|
||||
* be extracted from a file's metadata, identify
|
||||
* the geo field for this information with this attribute:
|
||||
* <code>spatialMetadataField</code>
|
||||
*
|
||||
* @since solr 3.1
|
||||
*/
|
||||
|
@ -67,6 +72,7 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
|||
private String parser;
|
||||
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
|
||||
private String htmlMapper;
|
||||
private String spatialMetadataField;
|
||||
|
||||
@Override
|
||||
public void init(Context context) {
|
||||
|
@ -113,6 +119,8 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
|||
if(parser == null) {
|
||||
parser = AUTO_PARSER;
|
||||
}
|
||||
|
||||
spatialMetadataField = context.getResolvedEntityAttribute("spatialMetadataField");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -167,10 +175,20 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
|||
if (s != null) row.put(col, s);
|
||||
}
|
||||
if(!"none".equals(format) ) row.put("text", sw.toString());
|
||||
tryToAddLatLon(metadata, row);
|
||||
done = true;
|
||||
return row;
|
||||
}
|
||||
|
||||
private void tryToAddLatLon(Metadata metadata, Map<String, Object> row) {
|
||||
if (spatialMetadataField == null) return;
|
||||
String latString = metadata.get(Metadata.LATITUDE);
|
||||
String lonString = metadata.get(Metadata.LONGITUDE);
|
||||
if (latString != null && lonString != null) {
|
||||
row.put(spatialMetadataField, String.format(Locale.ROOT, "%s,%s", latString, lonString));
|
||||
}
|
||||
}
|
||||
|
||||
private static ContentHandler getHtmlHandler(Writer writer)
|
||||
throws TransformerConfigurationException {
|
||||
SAXTransformerFactory factory = (SAXTransformerFactory)
|
||||
|
|
|
@ -77,6 +77,8 @@
|
|||
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
|
||||
<fieldType name="latLon" class="solr.LatLonType" subFieldType="double"/>
|
||||
|
||||
|
||||
<!--
|
||||
Numeric field types that index each value at various levels of precision
|
||||
|
@ -199,7 +201,8 @@
|
|||
<field name="title" type="string" indexed="true" stored="true"/>
|
||||
<field name="author" type="string" indexed="true" stored="true" />
|
||||
<field name="text" type="text" indexed="true" stored="true" />
|
||||
|
||||
<field name="foo_i" type="int" indexed="true" stored="false" />
|
||||
<field name="home" type="latLon" indexed="true" stored="true" />
|
||||
</fields>
|
||||
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
|
|
Binary file not shown.
After Width: | Height: | Size: 13 KiB |
|
@ -51,6 +51,18 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
|||
" </document>" +
|
||||
"</dataConfig>";
|
||||
|
||||
private String spatialConf =
|
||||
"<dataConfig>" +
|
||||
" <dataSource type=\"BinFileDataSource\"/>" +
|
||||
" <document>" +
|
||||
" <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" +
|
||||
getFile("dihextras/test_jpeg.jpg").getAbsolutePath() + "\" spatialMetadataField=\"home\">" +
|
||||
" <field column=\"text\"/>" +
|
||||
" </entity>" +
|
||||
" </document>" +
|
||||
"</dataConfig>";
|
||||
|
||||
|
||||
private String[] tests = {
|
||||
"//*[@numFound='1']"
|
||||
,"//str[@name='author'][.='Grant Ingersoll']"
|
||||
|
@ -74,6 +86,10 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
|||
, "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
|
||||
};
|
||||
|
||||
private String[] testsSpatial = {
|
||||
"//*[@numFound='1']"
|
||||
};
|
||||
|
||||
private String[] testsEmbedded = {
|
||||
"//*[@numFound='1']",
|
||||
"//str[@name='text'][contains(.,'When in the Course')]"
|
||||
|
@ -121,6 +137,16 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
|||
assertQ(req("*:*"), testsHTMLIdentity);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTikaGeoMetadata() throws Exception {
|
||||
runFullImport(spatialConf);
|
||||
String pt = "38.97,-77.018";
|
||||
Double distance = 5.0d;
|
||||
assertQ(req("q", "*:* OR foo_i:" + random().nextInt(100), "fq",
|
||||
"{!geofilt sfield=\"home\"}\"",
|
||||
"pt", pt, "d", String.valueOf(distance)), testsSpatial);
|
||||
}
|
||||
|
||||
private String getConfigHTML(String htmlMapper) {
|
||||
return
|
||||
"<dataConfig>" +
|
||||
|
|
Loading…
Reference in New Issue