mirror of
https://github.com/apache/lucene.git
synced 2025-02-21 01:18:45 +00:00
SOLR-1856: In Solr Cell, literals should override Tika-parsed values
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1354455 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f348f0bb99
commit
52516ebb21
@ -371,6 +371,10 @@ New Features
|
||||
* SOLR-3542: Add WeightedFragListBuilder for FVH and set it to default fragListBuilder
|
||||
in example solrconfig.xml. (Sebastian Lutze, koji)
|
||||
|
||||
* SOLR-1856: In Solr Cell, literals should override Tika-parsed values.
|
||||
Patch adds a param "literalsOverride" which defaults to true, but can be set
|
||||
to "false" to let Tika-parsed values be appended to literal values (Chris Harris, janhoy)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
@ -96,6 +96,10 @@ public interface ExtractingParams {
|
||||
*/
|
||||
public static final String CAPTURE_ATTRIBUTES = "captureAttr";
|
||||
|
||||
/**
|
||||
* Literal field values will by default override other values such as metadata and content. Set this to false to revert to pre-4.0 behaviour
|
||||
*/
|
||||
public static final String LITERALS_OVERRIDE = "literalsOverride";
|
||||
|
||||
/**
|
||||
* Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default. This is different
|
||||
|
@ -66,6 +66,9 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||
protected String unknownFieldPrefix = "";
|
||||
protected String defaultField = "";
|
||||
|
||||
private boolean literalsOverride;
|
||||
private Set<String> literalFieldNames;
|
||||
|
||||
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
|
||||
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
|
||||
}
|
||||
@ -81,6 +84,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||
|
||||
this.lowerNames = params.getBool(LOWERNAMES, false);
|
||||
this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
|
||||
this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
|
||||
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
|
||||
this.defaultField = params.get(DEFAULT_FIELD, "");
|
||||
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
|
||||
@ -107,13 +111,11 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||
* @see #addLiterals()
|
||||
*/
|
||||
public SolrInputDocument newDocument() {
|
||||
float boost = 1.0f;
|
||||
//handle the metadata extracted from the document
|
||||
addMetadata();
|
||||
|
||||
//handle the literals from the params
|
||||
//handle the literals from the params. NOTE: This MUST be called before the others in order for literals to override other values
|
||||
addLiterals();
|
||||
|
||||
//handle the metadata extracted from the document
|
||||
addMetadata();
|
||||
|
||||
//add in the content
|
||||
addContent();
|
||||
@ -134,8 +136,10 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||
protected void addCapturedContent() {
|
||||
for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
|
||||
if (entry.getValue().length() > 0) {
|
||||
addField(entry.getKey(), entry.getValue().toString(), null);
|
||||
}
|
||||
String fieldName = entry.getKey();
|
||||
if (literalsOverride && literalFieldNames.contains(fieldName))
|
||||
continue;
|
||||
addField(fieldName, entry.getValue().toString(), null); }
|
||||
}
|
||||
}
|
||||
|
||||
@ -144,6 +148,8 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||
* and the {@link #catchAllBuilder}
|
||||
*/
|
||||
protected void addContent() {
|
||||
if (literalsOverride && literalFieldNames.contains(contentFieldName))
|
||||
return;
|
||||
addField(contentFieldName, catchAllBuilder.toString(), null);
|
||||
}
|
||||
|
||||
@ -152,12 +158,14 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||
*/
|
||||
protected void addLiterals() {
|
||||
Iterator<String> paramNames = params.getParameterNamesIterator();
|
||||
literalFieldNames = new HashSet<String>();
|
||||
while (paramNames.hasNext()) {
|
||||
String pname = paramNames.next();
|
||||
if (!pname.startsWith(LITERALS_PREFIX)) continue;
|
||||
|
||||
String name = pname.substring(LITERALS_PREFIX.length());
|
||||
addField(name, null, params.getParams(pname));
|
||||
literalFieldNames.add(name);
|
||||
}
|
||||
}
|
||||
|
||||
@ -166,6 +174,8 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
||||
*/
|
||||
protected void addMetadata() {
|
||||
for (String name : metadata.names()) {
|
||||
if (literalsOverride && literalFieldNames.contains(name))
|
||||
continue;
|
||||
String[] vals = metadata.getValues(name);
|
||||
addField(name, null, vals);
|
||||
}
|
||||
|
@ -444,7 +444,71 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
||||
}
|
||||
catch(Exception expected){}
|
||||
}
|
||||
|
||||
|
||||
public void testLiteralsOverride() throws Exception {
|
||||
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
|
||||
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||
|
||||
assertQ(req("*:*"), "//*[@numFound='0']");
|
||||
|
||||
// Here Tika should parse out a title for this document:
|
||||
loadLocal("extraction/solr-word.pdf",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"literal.id", "three",
|
||||
"fmap.content", "extractedContent",
|
||||
"fmap.language", "extractedLanguage",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"fmap.AAPL:Keywords", "ignored_a",
|
||||
"fmap.xmpTPg:NPages", "ignored_a",
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
|
||||
// Here the literal value should override the Tika-parsed title:
|
||||
loadLocal("extraction/solr-word.pdf",
|
||||
"literal.title", "wolf-man",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"literal.id", "four",
|
||||
"fmap.content", "extractedContent",
|
||||
"fmap.language", "extractedLanguage",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"fmap.AAPL:Keywords", "ignored_a",
|
||||
"fmap.xmpTPg:NPages", "ignored_a",
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
|
||||
// Here we mimic the old behaviour where literals are added, not overridden
|
||||
loadLocal("extraction/solr-word.pdf",
|
||||
"literalsOverride", "false",
|
||||
// Trick - we first map the metadata-title to an ignored field before we replace with literal title
|
||||
"fmap.title", "ignored_a",
|
||||
"literal.title", "old-behaviour",
|
||||
"literal.extractedKeywords", "literalkeyword",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"literal.id", "five",
|
||||
"fmap.content", "extractedContent",
|
||||
"fmap.language", "extractedLanguage",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"fmap.AAPL:Keywords", "ignored_a",
|
||||
"fmap.xmpTPg:NPages", "ignored_a",
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
|
||||
assertU(commit());
|
||||
|
||||
assertQ(req("title:solr-word"), "//*[@numFound='1']");
|
||||
assertQ(req("title:wolf-man"), "//*[@numFound='1']");
|
||||
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
|
||||
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
|
||||
try {
|
||||
|
Loading…
x
Reference in New Issue
Block a user