SOLR-1929: Index encrypted files in SolrCell

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1354887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jan Høydahl 2012-06-28 09:46:49 +00:00
parent 01dd823f23
commit 032cad944a
8 changed files with 263 additions and 2 deletions

View File

@ -368,6 +368,11 @@ New Features
* SOLR-3351: eDismax: ps2 and ps3 params (janhoy)
* SOLR-1929: Index encrypted documents with ExtractingUpdateRequestHandler.
By supplying resource.password=<mypw> or specifying an external file with regular
expressions matching file names, Solr will decrypt and index PDFs and DOCX formats.
(janhoy, Yiannis Pericleous)
* SOLR-3542: Add WeightedFragListBuilder for FVH and set it to default fragListBuilder
in example solrconfig.xml. (Sebastian Lutze, koji)

View File

@ -44,6 +44,7 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
@ -90,7 +91,6 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
protected TikaConfig config;
protected SolrContentHandlerFactory factory;
//protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
TikaConfig config, SolrContentHandlerFactory factory) {
@ -206,6 +206,23 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
try{
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
// Password handling
RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
if(pwMapFile != null && pwMapFile.length() > 0) {
InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
if(is != null) {
log.debug("Password file supplied: "+pwMapFile);
epp.parse(is);
}
}
context.set(PasswordProvider.class, epp);
String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
if(resourcePassword != null) {
epp.setExplicitPassword(resourcePassword);
log.debug("Literal password supplied for file "+resourceName);
}
parser.parse(inputStream, parsingHandler, metadata, context);
} catch (TikaException e) {
if(ignoreTikaException)

View File

@ -136,6 +136,10 @@ public interface ExtractingParams {
*/
public static final String RESOURCE_NAME = "resource.name";
/**
* Optional. The password for this resource. Will be used instead of the rule based password lookup mechanisms
*/
public static final String RESOURCE_PASSWORD = "resource.password";
/**
* Optional. If specified, the prefix will be prepended to all Metadata, such that it would be possible
@ -148,4 +152,14 @@ public interface ExtractingParams {
* will be used instead.
*/
public static final String DEFAULT_FIELD = "defaultField";
/**
* Optional. If specified, loads the file as a source for password lookups for Tika encrypted documents.
* <p>
* File format is Java properties format with one key=value per line.
* The key is evaluated as a regex against the file name, and the value is the password
* The rules are evaluated top-bottom, i.e. the first match will be used
* If you want a fallback password to be always used, supply a .*=<defaultmypassword> at the end
*/
public static final String PASSWORD_MAP_FILE = "passwordsFile";
}

View File

@ -0,0 +1,150 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.extraction;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.LinkedHashMap;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.PasswordProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Password provider for Extracting request handler which finds correct
* password based on file name matching against a list of regular expressions.
* The list of passwords is supplied in an optional Map.
* If an explicit password is set, it will be used.
*/
public class RegexRulesPasswordProvider implements PasswordProvider {
private static final Logger log = LoggerFactory.getLogger(RegexRulesPasswordProvider.class);
private LinkedHashMap<Pattern,String> passwordMap = new LinkedHashMap<Pattern,String>();
private String explicitPassword;
@Override
public String getPassword(Metadata meta) {
if(getExplicitPassword() != null) {
return getExplicitPassword();
}
if(passwordMap.size() > 0)
return lookupPasswordFromMap(meta.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
return null;
}
private String lookupPasswordFromMap(String fileName) {
if(fileName != null && fileName.length() > 0) {
for(Entry<Pattern,String> e : passwordMap.entrySet()) {
if(e.getKey().matcher(fileName).matches()) {
return e.getValue();
}
}
}
return null;
}
/**
* Parses rule file from stream and returns a Map of all rules found
* @param is input stream for the file
*/
public static LinkedHashMap<Pattern,String> parseRulesFile(InputStream is) {
LinkedHashMap<Pattern,String> rules = new LinkedHashMap<Pattern,String>();
BufferedReader br = new BufferedReader(new InputStreamReader(is));
String line;
try {
int linenum = 0;
while ((line = br.readLine()) != null) {
linenum++;
// Remove comments
String[] arr = line.split("#");
if(arr.length > 0)
line = arr[0].trim();
if(line.length() == 0)
continue;
int sep = line.indexOf("=");
if(sep <= 0) {
log.warn("Wrong format of password line "+linenum);
continue;
}
String pass = line.substring(sep+1).trim();
String regex = line.substring(0, sep).trim();
try {
Pattern pattern = Pattern.compile(regex);
rules.put(pattern, pass);
} catch(PatternSyntaxException pse) {
log.warn("Key of line "+linenum+" was not a valid regex pattern", pse);
continue;
}
}
is.close();
} catch (IOException e) {
throw new RuntimeException();
}
return rules;
}
/**
* Initialize rules through file input stream. This is a convenience for first calling
* setPasswordMap(parseRulesFile(is)).
* @param is the input stream with rules file, one line per rule on format regex=password
*/
public void parse(InputStream is) {
setPasswordMap(parseRulesFile(is));
}
public LinkedHashMap<Pattern,String> getPasswordMap() {
return passwordMap;
}
public void setPasswordMap(LinkedHashMap<Pattern,String> linkedHashMap) {
this.passwordMap = linkedHashMap;
}
/**
* Gets the explicit password, if set
* @return the password, or null if not set
*/
public String getExplicitPassword() {
return explicitPassword;
}
/**
* Sets an explicit password which will be used instead of password map
* @param explicitPassword the password to use
*/
public void setExplicitPassword(String explicitPassword) {
this.explicitPassword = explicitPassword;
}
/**
* Resets explicit password, so that map will be used for lookups
*/
public void resetExplicitPassword() {
this.explicitPassword = null;
}
}

View File

@ -0,0 +1,7 @@
# Filename regex -> password map
# Example any file ending in .doc should use password foobar:
# .*\.doc = fooBar
#
# Note: Apache Tika 1.1 supports password for .pdf and .docx only, not .doc or other formats
.*\.pdf$ = solrRules
.*\.docx$ = Word2010

View File

@ -394,7 +394,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
try{
loadLocal("extraction/password-is-solrcell.docx",
"literal.id", "one");
fail("TikaException is expected because of trying to extract text from password protected word file.");
fail("TikaException is expected because of trying to extract text from password protected word file without supplying a password.");
}
catch(Exception expected){}
assertU(commit());
@ -509,6 +509,74 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
}
@Test
public void testPasswordProtected() throws Exception {
// PDF, Passwords from resource.password
loadLocal("extraction/enctypted-password-is-solrRules.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "pdfpwliteral",
"resource.name", "enctypted-password-is-solrRules.pdf",
"resource.password", "solrRules",
"fmap.Last-Modified", "extractedDate");
// PDF, Passwords from passwords property file
loadLocal("extraction/enctypted-password-is-solrRules.pdf",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "pdfpwfile",
"resource.name", "enctypted-password-is-solrRules.pdf",
"passwordsFile", "passwordRegex.properties", // Passwords-file
"fmap.Last-Modified", "extractedDate");
// DOCX, Explicit password
loadLocal("extraction/password-is-Word2010.docx",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"uprefix", "ignored_",
"literal.id", "docxpwliteral",
"resource.name", "password-is-Word2010.docx",
"resource.password", "Word2010", // Explicit password
"fmap.Last-Modified", "extractedDate");
// DOCX, Passwords from file
loadLocal("extraction/password-is-Word2010.docx",
"fmap.created", "extractedDate",
"fmap.producer", "extractedProducer",
"fmap.creator", "extractedCreator",
"fmap.Keywords", "extractedKeywords",
"fmap.Creation-Date", "extractedDate",
"uprefix", "ignored_",
"fmap.Author", "extractedAuthor",
"fmap.content", "wdf_nocase",
"literal.id", "docxpwfile",
"resource.name", "password-is-Word2010.docx",
"passwordsFile", "passwordRegex.properties", // Passwords-file
"fmap.Last-Modified", "extractedDate");
assertU(commit());
Thread.sleep(100);
assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
}
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try {