SOLR-1856: In Solr Cell, literals should override Tika-parsed values

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1354455 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jan Høydahl 2012-06-27 12:05:55 +00:00
parent f348f0bb99
commit 52516ebb21
4 changed files with 90 additions and 8 deletions

View File

@ -371,6 +371,10 @@ New Features
* SOLR-3542: Add WeightedFragListBuilder for FVH and set it to default fragListBuilder
in example solrconfig.xml. (Sebastian Lutze, koji)
* SOLR-1856: In Solr Cell, literals should override Tika-parsed values.
Patch adds a param "literalsOverride" which defaults to true, but can be set
to "false" to let Tika-parsed values be appended to literal values (Chris Harris, janhoy)
Optimizations
----------------------

View File

@ -96,6 +96,10 @@ public interface ExtractingParams {
*/
public static final String CAPTURE_ATTRIBUTES = "captureAttr";
/**
* Literal field values will by default override other values such as metadata and content. Set this to false to revert to pre-4.0 behaviour
*/
public static final String LITERALS_OVERRIDE = "literalsOverride";
/**
* Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default. This is different

View File

@ -66,6 +66,9 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
protected String unknownFieldPrefix = "";
protected String defaultField = "";
private boolean literalsOverride;
private Set<String> literalFieldNames;
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
this(metadata, params, schema, DateUtil.DEFAULT_DATE_FORMATS);
}
@ -81,6 +84,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
this.lowerNames = params.getBool(LOWERNAMES, false);
this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
this.literalsOverride = params.getBool(LITERALS_OVERRIDE, true);
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
this.defaultField = params.get(DEFAULT_FIELD, "");
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
@ -107,13 +111,11 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
* @see #addLiterals()
*/
public SolrInputDocument newDocument() {
float boost = 1.0f;
//handle the metadata extracted from the document
addMetadata();
//handle the literals from the params
//handle the literals from the params. NOTE: This MUST be called before the others in order for literals to override other values
addLiterals();
//handle the metadata extracted from the document
addMetadata();
//add in the content
addContent();
@ -134,8 +136,10 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
protected void addCapturedContent() {
for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
if (entry.getValue().length() > 0) {
addField(entry.getKey(), entry.getValue().toString(), null);
}
String fieldName = entry.getKey();
if (literalsOverride && literalFieldNames.contains(fieldName))
continue;
addField(fieldName, entry.getValue().toString(), null); }
}
}
@ -144,6 +148,8 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
* and the {@link #catchAllBuilder}
*/
protected void addContent() {
if (literalsOverride && literalFieldNames.contains(contentFieldName))
return;
addField(contentFieldName, catchAllBuilder.toString(), null);
}
@ -152,12 +158,14 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
*/
protected void addLiterals() {
Iterator<String> paramNames = params.getParameterNamesIterator();
literalFieldNames = new HashSet<String>();
while (paramNames.hasNext()) {
String pname = paramNames.next();
if (!pname.startsWith(LITERALS_PREFIX)) continue;
String name = pname.substring(LITERALS_PREFIX.length());
addField(name, null, params.getParams(pname));
literalFieldNames.add(name);
}
}
@ -166,6 +174,8 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
*/
protected void addMetadata() {
for (String name : metadata.names()) {
if (literalsOverride && literalFieldNames.contains(name))
continue;
String[] vals = metadata.getValues(name);
addField(name, null, vals);
}

View File

@ -444,7 +444,71 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
}
catch(Exception expected){}
}
public void testLiteralsOverride() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
assertQ(req("*:*"), "//*[@numFound='0']");
// Here Tika should parse out a title for this document:
loadLocal("extraction/solr-word.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "three",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
"fmap.AAPL:Keywords", "ignored_a",
"fmap.xmpTPg:NPages", "ignored_a",
"fmap.Last-Modified", "extractedDate");
// Here the literal value should override the Tika-parsed title:
loadLocal("extraction/solr-word.pdf",
"literal.title", "wolf-man",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "four",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
"fmap.AAPL:Keywords", "ignored_a",
"fmap.xmpTPg:NPages", "ignored_a",
"fmap.Last-Modified", "extractedDate");
// Here we mimic the old behaviour where literals are added, not overridden
loadLocal("extraction/solr-word.pdf",
"literalsOverride", "false",
// Trick - we first map the metadata-title to an ignored field before we replace with literal title
"fmap.title", "ignored_a",
"literal.title", "old-behaviour",
"literal.extractedKeywords", "literalkeyword",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Author", "extractedAuthor",
"literal.id", "five",
"fmap.content", "extractedContent",
"fmap.language", "extractedLanguage",
"fmap.Creation-Date", "extractedDate",
"fmap.AAPL:Keywords", "ignored_a",
"fmap.xmpTPg:NPages", "ignored_a",
"fmap.Last-Modified", "extractedDate");
assertU(commit());
assertQ(req("title:solr-word"), "//*[@numFound='1']");
assertQ(req("title:wolf-man"), "//*[@numFound='1']");
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
}
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try {