mirror of https://github.com/apache/lucene.git
SOLR-7231: DIH-TikaEntityprocessor, create lat-lon field from Metadata
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1677001 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c56f522a8f
commit
bd8b77f9bb
|
@ -160,6 +160,9 @@ New Features
|
||||||
& json.facet={ colors:{type:terms, field:color, excludeTags=COLOR} }
|
& json.facet={ colors:{type:terms, field:color, excludeTags=COLOR} }
|
||||||
(yonik)
|
(yonik)
|
||||||
|
|
||||||
|
* SOLR-7231: DIH-TikaEntityprocessor, create lat-lon field from Metadata
|
||||||
|
(Tim Allison via Noble Paul)
|
||||||
|
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
|
@ -45,6 +45,7 @@ import java.io.InputStream;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
|
import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
|
||||||
|
@ -55,6 +56,10 @@ import static org.apache.solr.handler.dataimport.XPathEntityProcessor.URL;
|
||||||
* <p>An implementation of {@link EntityProcessor} which reads data from rich docs
|
* <p>An implementation of {@link EntityProcessor} which reads data from rich docs
|
||||||
* using <a href="http://tika.apache.org/">Apache Tika</a>
|
* using <a href="http://tika.apache.org/">Apache Tika</a>
|
||||||
*
|
*
|
||||||
|
* <p>To index latitude/longitude data that might
|
||||||
|
* be extracted from a file's metadata, identify
|
||||||
|
* the geo field for this information with this attribute:
|
||||||
|
* <code>spatialMetadataField</code>
|
||||||
*
|
*
|
||||||
* @since solr 3.1
|
* @since solr 3.1
|
||||||
*/
|
*/
|
||||||
|
@ -67,6 +72,7 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
||||||
private String parser;
|
private String parser;
|
||||||
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
|
static final String AUTO_PARSER = "org.apache.tika.parser.AutoDetectParser";
|
||||||
private String htmlMapper;
|
private String htmlMapper;
|
||||||
|
private String spatialMetadataField;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void init(Context context) {
|
public void init(Context context) {
|
||||||
|
@ -113,6 +119,8 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
||||||
if(parser == null) {
|
if(parser == null) {
|
||||||
parser = AUTO_PARSER;
|
parser = AUTO_PARSER;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
spatialMetadataField = context.getResolvedEntityAttribute("spatialMetadataField");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -167,10 +175,20 @@ public class TikaEntityProcessor extends EntityProcessorBase {
|
||||||
if (s != null) row.put(col, s);
|
if (s != null) row.put(col, s);
|
||||||
}
|
}
|
||||||
if(!"none".equals(format) ) row.put("text", sw.toString());
|
if(!"none".equals(format) ) row.put("text", sw.toString());
|
||||||
|
tryToAddLatLon(metadata, row);
|
||||||
done = true;
|
done = true;
|
||||||
return row;
|
return row;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void tryToAddLatLon(Metadata metadata, Map<String, Object> row) {
|
||||||
|
if (spatialMetadataField == null) return;
|
||||||
|
String latString = metadata.get(Metadata.LATITUDE);
|
||||||
|
String lonString = metadata.get(Metadata.LONGITUDE);
|
||||||
|
if (latString != null && lonString != null) {
|
||||||
|
row.put(spatialMetadataField, String.format(Locale.ROOT, "%s,%s", latString, lonString));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static ContentHandler getHtmlHandler(Writer writer)
|
private static ContentHandler getHtmlHandler(Writer writer)
|
||||||
throws TransformerConfigurationException {
|
throws TransformerConfigurationException {
|
||||||
SAXTransformerFactory factory = (SAXTransformerFactory)
|
SAXTransformerFactory factory = (SAXTransformerFactory)
|
||||||
|
|
|
@ -77,6 +77,8 @@
|
||||||
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
|
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
|
||||||
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
|
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
|
||||||
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
|
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
|
||||||
|
<fieldType name="latLon" class="solr.LatLonType" subFieldType="double"/>
|
||||||
|
|
||||||
|
|
||||||
<!--
|
<!--
|
||||||
Numeric field types that index each value at various levels of precision
|
Numeric field types that index each value at various levels of precision
|
||||||
|
@ -199,7 +201,8 @@
|
||||||
<field name="title" type="string" indexed="true" stored="true"/>
|
<field name="title" type="string" indexed="true" stored="true"/>
|
||||||
<field name="author" type="string" indexed="true" stored="true" />
|
<field name="author" type="string" indexed="true" stored="true" />
|
||||||
<field name="text" type="text" indexed="true" stored="true" />
|
<field name="text" type="text" indexed="true" stored="true" />
|
||||||
|
<field name="foo_i" type="int" indexed="true" stored="false" />
|
||||||
|
<field name="home" type="latLon" indexed="true" stored="true" />
|
||||||
</fields>
|
</fields>
|
||||||
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
|
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
|
||||||
<defaultSearchField>text</defaultSearchField>
|
<defaultSearchField>text</defaultSearchField>
|
||||||
|
|
Binary file not shown.
After Width: | Height: | Size: 13 KiB |
|
@ -51,6 +51,18 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
||||||
" </document>" +
|
" </document>" +
|
||||||
"</dataConfig>";
|
"</dataConfig>";
|
||||||
|
|
||||||
|
private String spatialConf =
|
||||||
|
"<dataConfig>" +
|
||||||
|
" <dataSource type=\"BinFileDataSource\"/>" +
|
||||||
|
" <document>" +
|
||||||
|
" <entity name=\"Tika\" processor=\"TikaEntityProcessor\" url=\"" +
|
||||||
|
getFile("dihextras/test_jpeg.jpg").getAbsolutePath() + "\" spatialMetadataField=\"home\">" +
|
||||||
|
" <field column=\"text\"/>" +
|
||||||
|
" </entity>" +
|
||||||
|
" </document>" +
|
||||||
|
"</dataConfig>";
|
||||||
|
|
||||||
|
|
||||||
private String[] tests = {
|
private String[] tests = {
|
||||||
"//*[@numFound='1']"
|
"//*[@numFound='1']"
|
||||||
,"//str[@name='author'][.='Grant Ingersoll']"
|
,"//str[@name='author'][.='Grant Ingersoll']"
|
||||||
|
@ -74,6 +86,10 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
||||||
, "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
|
, "//str[@name='text'][contains(.,'class=\"classAttribute\"')]" //attributes are lower-cased
|
||||||
};
|
};
|
||||||
|
|
||||||
|
private String[] testsSpatial = {
|
||||||
|
"//*[@numFound='1']"
|
||||||
|
};
|
||||||
|
|
||||||
private String[] testsEmbedded = {
|
private String[] testsEmbedded = {
|
||||||
"//*[@numFound='1']",
|
"//*[@numFound='1']",
|
||||||
"//str[@name='text'][contains(.,'When in the Course')]"
|
"//str[@name='text'][contains(.,'When in the Course')]"
|
||||||
|
@ -121,6 +137,16 @@ public class TestTikaEntityProcessor extends AbstractDataImportHandlerTestCase {
|
||||||
assertQ(req("*:*"), testsHTMLIdentity);
|
assertQ(req("*:*"), testsHTMLIdentity);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTikaGeoMetadata() throws Exception {
|
||||||
|
runFullImport(spatialConf);
|
||||||
|
String pt = "38.97,-77.018";
|
||||||
|
Double distance = 5.0d;
|
||||||
|
assertQ(req("q", "*:* OR foo_i:" + random().nextInt(100), "fq",
|
||||||
|
"{!geofilt sfield=\"home\"}\"",
|
||||||
|
"pt", pt, "d", String.valueOf(distance)), testsSpatial);
|
||||||
|
}
|
||||||
|
|
||||||
private String getConfigHTML(String htmlMapper) {
|
private String getConfigHTML(String htmlMapper) {
|
||||||
return
|
return
|
||||||
"<dataConfig>" +
|
"<dataConfig>" +
|
||||||
|
|
Loading…
Reference in New Issue