mirror of https://github.com/apache/lucene.git
SOLR-1929: Index encrypted files in SolrCell
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1354887 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
01dd823f23
commit
032cad944a
|
@ -368,6 +368,11 @@ New Features
|
|||
|
||||
* SOLR-3351: eDismax: ps2 and ps3 params (janhoy)
|
||||
|
||||
* SOLR-1929: Index encrypted documents with ExtractingUpdateRequestHandler.
|
||||
By supplying resource.password=<mypw> or specifying an external file with regular
|
||||
expressions matching file names, Solr will decrypt and index PDFs and DOCX formats.
|
||||
(janhoy, Yiannis Pericleous)
|
||||
|
||||
* SOLR-3542: Add WeightedFragListBuilder for FVH and set it to default fragListBuilder
|
||||
in example solrconfig.xml. (Sebastian Lutze, koji)
|
||||
|
||||
|
|
|
@ -44,6 +44,7 @@ import org.apache.tika.parser.AutoDetectParser;
|
|||
import org.apache.tika.parser.DefaultParser;
|
||||
import org.apache.tika.parser.ParseContext;
|
||||
import org.apache.tika.parser.Parser;
|
||||
import org.apache.tika.parser.PasswordProvider;
|
||||
import org.apache.tika.sax.XHTMLContentHandler;
|
||||
import org.apache.tika.sax.xpath.Matcher;
|
||||
import org.apache.tika.sax.xpath.MatchingContentHandler;
|
||||
|
@ -90,7 +91,6 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
|
||||
protected TikaConfig config;
|
||||
protected SolrContentHandlerFactory factory;
|
||||
//protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
|
||||
|
||||
public ExtractingDocumentLoader(SolrQueryRequest req, UpdateRequestProcessor processor,
|
||||
TikaConfig config, SolrContentHandlerFactory factory) {
|
||||
|
@ -206,6 +206,23 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
try{
|
||||
//potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
|
||||
ParseContext context = new ParseContext();//TODO: should we design a way to pass in parse context?
|
||||
|
||||
// Password handling
|
||||
RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
|
||||
String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
|
||||
if(pwMapFile != null && pwMapFile.length() > 0) {
|
||||
InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
|
||||
if(is != null) {
|
||||
log.debug("Password file supplied: "+pwMapFile);
|
||||
epp.parse(is);
|
||||
}
|
||||
}
|
||||
context.set(PasswordProvider.class, epp);
|
||||
String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
|
||||
if(resourcePassword != null) {
|
||||
epp.setExplicitPassword(resourcePassword);
|
||||
log.debug("Literal password supplied for file "+resourceName);
|
||||
}
|
||||
parser.parse(inputStream, parsingHandler, metadata, context);
|
||||
} catch (TikaException e) {
|
||||
if(ignoreTikaException)
|
||||
|
|
|
@ -136,6 +136,10 @@ public interface ExtractingParams {
|
|||
*/
|
||||
public static final String RESOURCE_NAME = "resource.name";
|
||||
|
||||
/**
|
||||
* Optional. The password for this resource. Will be used instead of the rule based password lookup mechanisms
|
||||
*/
|
||||
public static final String RESOURCE_PASSWORD = "resource.password";
|
||||
|
||||
/**
|
||||
* Optional. If specified, the prefix will be prepended to all Metadata, such that it would be possible
|
||||
|
@ -148,4 +152,14 @@ public interface ExtractingParams {
|
|||
* will be used instead.
|
||||
*/
|
||||
public static final String DEFAULT_FIELD = "defaultField";
|
||||
|
||||
/**
|
||||
* Optional. If specified, loads the file as a source for password lookups for Tika encrypted documents.
|
||||
* <p>
|
||||
* File format is Java properties format with one key=value per line.
|
||||
* The key is evaluated as a regex against the file name, and the value is the password
|
||||
* The rules are evaluated top-bottom, i.e. the first match will be used
|
||||
* If you want a fallback password to be always used, supply a .*=<defaultmypassword> at the end
|
||||
*/
|
||||
public static final String PASSWORD_MAP_FILE = "passwordsFile";
|
||||
}
|
||||
|
|
|
@ -0,0 +1,150 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.PatternSyntaxException;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.metadata.TikaMetadataKeys;
|
||||
import org.apache.tika.parser.PasswordProvider;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Password provider for Extracting request handler which finds correct
|
||||
* password based on file name matching against a list of regular expressions.
|
||||
* The list of passwords is supplied in an optional Map.
|
||||
* If an explicit password is set, it will be used.
|
||||
*/
|
||||
public class RegexRulesPasswordProvider implements PasswordProvider {
|
||||
private static final Logger log = LoggerFactory.getLogger(RegexRulesPasswordProvider.class);
|
||||
|
||||
private LinkedHashMap<Pattern,String> passwordMap = new LinkedHashMap<Pattern,String>();
|
||||
private String explicitPassword;
|
||||
|
||||
@Override
|
||||
public String getPassword(Metadata meta) {
|
||||
if(getExplicitPassword() != null) {
|
||||
return getExplicitPassword();
|
||||
}
|
||||
|
||||
if(passwordMap.size() > 0)
|
||||
return lookupPasswordFromMap(meta.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private String lookupPasswordFromMap(String fileName) {
|
||||
if(fileName != null && fileName.length() > 0) {
|
||||
for(Entry<Pattern,String> e : passwordMap.entrySet()) {
|
||||
if(e.getKey().matcher(fileName).matches()) {
|
||||
return e.getValue();
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses rule file from stream and returns a Map of all rules found
|
||||
* @param is input stream for the file
|
||||
*/
|
||||
public static LinkedHashMap<Pattern,String> parseRulesFile(InputStream is) {
|
||||
LinkedHashMap<Pattern,String> rules = new LinkedHashMap<Pattern,String>();
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(is));
|
||||
String line;
|
||||
try {
|
||||
int linenum = 0;
|
||||
while ((line = br.readLine()) != null) {
|
||||
linenum++;
|
||||
// Remove comments
|
||||
String[] arr = line.split("#");
|
||||
if(arr.length > 0)
|
||||
line = arr[0].trim();
|
||||
if(line.length() == 0)
|
||||
continue;
|
||||
int sep = line.indexOf("=");
|
||||
if(sep <= 0) {
|
||||
log.warn("Wrong format of password line "+linenum);
|
||||
continue;
|
||||
}
|
||||
String pass = line.substring(sep+1).trim();
|
||||
String regex = line.substring(0, sep).trim();
|
||||
try {
|
||||
Pattern pattern = Pattern.compile(regex);
|
||||
rules.put(pattern, pass);
|
||||
} catch(PatternSyntaxException pse) {
|
||||
log.warn("Key of line "+linenum+" was not a valid regex pattern", pse);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
is.close();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException();
|
||||
}
|
||||
return rules;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize rules through file input stream. This is a convenience for first calling
|
||||
* setPasswordMap(parseRulesFile(is)).
|
||||
* @param is the input stream with rules file, one line per rule on format regex=password
|
||||
*/
|
||||
public void parse(InputStream is) {
|
||||
setPasswordMap(parseRulesFile(is));
|
||||
}
|
||||
|
||||
public LinkedHashMap<Pattern,String> getPasswordMap() {
|
||||
return passwordMap;
|
||||
}
|
||||
|
||||
public void setPasswordMap(LinkedHashMap<Pattern,String> linkedHashMap) {
|
||||
this.passwordMap = linkedHashMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the explicit password, if set
|
||||
* @return the password, or null if not set
|
||||
*/
|
||||
public String getExplicitPassword() {
|
||||
return explicitPassword;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets an explicit password which will be used instead of password map
|
||||
* @param explicitPassword the password to use
|
||||
*/
|
||||
public void setExplicitPassword(String explicitPassword) {
|
||||
this.explicitPassword = explicitPassword;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets explicit password, so that map will be used for lookups
|
||||
*/
|
||||
public void resetExplicitPassword() {
|
||||
this.explicitPassword = null;
|
||||
}
|
||||
|
||||
}
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,7 @@
|
|||
# Filename regex -> password map
|
||||
# Example any file ending in .doc should use password foobar:
|
||||
# .*\.doc = fooBar
|
||||
#
|
||||
# Note: Apache Tika 1.1 supports password for .pdf and .docx only, not .doc or other formats
|
||||
.*\.pdf$ = solrRules
|
||||
.*\.docx$ = Word2010
|
|
@ -394,7 +394,7 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
try{
|
||||
loadLocal("extraction/password-is-solrcell.docx",
|
||||
"literal.id", "one");
|
||||
fail("TikaException is expected because of trying to extract text from password protected word file.");
|
||||
fail("TikaException is expected because of trying to extract text from password protected word file without supplying a password.");
|
||||
}
|
||||
catch(Exception expected){}
|
||||
assertU(commit());
|
||||
|
@ -509,6 +509,74 @@ public class ExtractingRequestHandlerTest extends SolrTestCaseJ4 {
|
|||
assertQ(req("extractedKeywords:(solr AND word AND pdf AND literalkeyword)"), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPasswordProtected() throws Exception {
|
||||
// PDF, Passwords from resource.password
|
||||
loadLocal("extraction/enctypted-password-is-solrRules.pdf",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"uprefix", "ignored_",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"fmap.content", "wdf_nocase",
|
||||
"literal.id", "pdfpwliteral",
|
||||
"resource.name", "enctypted-password-is-solrRules.pdf",
|
||||
"resource.password", "solrRules",
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
|
||||
// PDF, Passwords from passwords property file
|
||||
loadLocal("extraction/enctypted-password-is-solrRules.pdf",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"uprefix", "ignored_",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"fmap.content", "wdf_nocase",
|
||||
"literal.id", "pdfpwfile",
|
||||
"resource.name", "enctypted-password-is-solrRules.pdf",
|
||||
"passwordsFile", "passwordRegex.properties", // Passwords-file
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
|
||||
// DOCX, Explicit password
|
||||
loadLocal("extraction/password-is-Word2010.docx",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"fmap.content", "wdf_nocase",
|
||||
"uprefix", "ignored_",
|
||||
"literal.id", "docxpwliteral",
|
||||
"resource.name", "password-is-Word2010.docx",
|
||||
"resource.password", "Word2010", // Explicit password
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
|
||||
// DOCX, Passwords from file
|
||||
loadLocal("extraction/password-is-Word2010.docx",
|
||||
"fmap.created", "extractedDate",
|
||||
"fmap.producer", "extractedProducer",
|
||||
"fmap.creator", "extractedCreator",
|
||||
"fmap.Keywords", "extractedKeywords",
|
||||
"fmap.Creation-Date", "extractedDate",
|
||||
"uprefix", "ignored_",
|
||||
"fmap.Author", "extractedAuthor",
|
||||
"fmap.content", "wdf_nocase",
|
||||
"literal.id", "docxpwfile",
|
||||
"resource.name", "password-is-Word2010.docx",
|
||||
"passwordsFile", "passwordRegex.properties", // Passwords-file
|
||||
"fmap.Last-Modified", "extractedDate");
|
||||
|
||||
assertU(commit());
|
||||
Thread.sleep(100);
|
||||
assertQ(req("wdf_nocase:\"This is a test of PDF\""), "//*[@numFound='2']");
|
||||
assertQ(req("wdf_nocase:\"Test password protected word doc\""), "//*[@numFound='2']");
|
||||
}
|
||||
|
||||
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
|
||||
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
|
||||
try {
|
||||
|
|
Loading…
Reference in New Issue