mirror of https://github.com/apache/lucene.git
SOLR-1406 -- Make FileDataSource and FileListEntityProcessor to be more extensible, fix variable resolving bug in FileListEntityProcessor and add tests
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@812122 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c7696af098
commit
7deb769861
|
@ -155,6 +155,8 @@ New Features
|
|||
|
||||
37.SOLR-1348: Support binary field type in convertType logic in JdbcDataSource (shalin)
|
||||
|
||||
38.SOLR-1406: Make FileDataSource and FileListEntityProcessor to be more extensible (Luke Forehand, shalin)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
1. SOLR-846: Reduce memory consumption during delta import by removing keys when used
|
||||
|
|
|
@ -44,9 +44,15 @@ import org.slf4j.LoggerFactory;
|
|||
public class FileDataSource extends DataSource<Reader> {
|
||||
public static final String BASE_PATH = "basePath";
|
||||
|
||||
private String basePath;
|
||||
/**
|
||||
* The basePath for this data source
|
||||
*/
|
||||
protected String basePath;
|
||||
|
||||
private String encoding = null;
|
||||
/**
|
||||
* The encoding using which the given file should be read
|
||||
*/
|
||||
protected String encoding = null;
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(FileDataSource.class);
|
||||
|
||||
|
@ -95,7 +101,16 @@ public class FileDataSource extends DataSource<Reader> {
|
|||
}
|
||||
}
|
||||
|
||||
private InputStreamReader openStream(File file) throws FileNotFoundException,
|
||||
/**
|
||||
* Open a {@link java.io.Reader} for the given file name
|
||||
*
|
||||
* @param file a {@link java.io.File} instance
|
||||
* @return a Reader on the given file
|
||||
* @throws FileNotFoundException if the File does not exist
|
||||
* @throws UnsupportedEncodingException if the encoding is unsupported
|
||||
* @since solr 1.4
|
||||
*/
|
||||
protected Reader openStream(File file) throws FileNotFoundException,
|
||||
UnsupportedEncodingException {
|
||||
if (encoding == null) {
|
||||
return new InputStreamReader(new FileInputStream(file));
|
||||
|
|
|
@ -54,13 +54,54 @@ import java.util.regex.Pattern;
|
|||
* @since solr 1.3
|
||||
*/
|
||||
public class FileListEntityProcessor extends EntityProcessorBase {
|
||||
private String fileName, baseDir, excludes;
|
||||
/**
|
||||
* A regex pattern to identify files given in data-config.xml after resolving any variables
|
||||
*/
|
||||
protected String fileName;
|
||||
|
||||
private Date newerThan, olderThan;
|
||||
/**
|
||||
* The baseDir given in data-config.xml after resolving any variables
|
||||
*/
|
||||
protected String baseDir;
|
||||
|
||||
private long biggerThan = -1, smallerThan = -1;
|
||||
/**
|
||||
* A Regex pattern of excluded file names as given in data-config.xml after resolving any variables
|
||||
*/
|
||||
protected String excludes;
|
||||
|
||||
private boolean recursive = false;
|
||||
/**
|
||||
* The newerThan given in data-config as a {@link java.util.Date}
|
||||
* <p>
|
||||
* <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
|
||||
* </p>
|
||||
*/
|
||||
protected Date newerThan;
|
||||
|
||||
/**
|
||||
* The newerThan given in data-config as a {@link java.util.Date}
|
||||
*/
|
||||
protected Date olderThan;
|
||||
|
||||
/**
|
||||
* The biggerThan given in data-config as a long value
|
||||
* <p>
|
||||
* <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
|
||||
* </p>
|
||||
*/
|
||||
protected long biggerThan = -1;
|
||||
|
||||
/**
|
||||
* The smallerThan given in data-config as a long value
|
||||
* <p>
|
||||
* <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
|
||||
* </p>
|
||||
*/
|
||||
protected long smallerThan = -1;
|
||||
|
||||
/**
|
||||
* The recursive given in data-config. Default value is false.
|
||||
*/
|
||||
protected boolean recursive = false;
|
||||
|
||||
private Pattern fileNamePattern, excludesPattern;
|
||||
|
||||
|
@ -91,13 +132,23 @@ public class FileListEntityProcessor extends EntityProcessorBase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the Date object corresponding to the given string.
|
||||
*
|
||||
* @param dateStr the date string. It can be a DateMath string or it may have a evaluator function
|
||||
* @return a Date instance corresponding to the input string
|
||||
*/
|
||||
private Date getDate(String dateStr) {
|
||||
if (dateStr == null)
|
||||
return null;
|
||||
|
||||
Matcher m = PLACE_HOLDER_PATTERN.matcher(dateStr);
|
||||
if (m.find()) {
|
||||
return (Date) resolver.resolve(dateStr);
|
||||
Object o = resolver.resolve(m.group(1));
|
||||
if (o instanceof Date) return (Date)o;
|
||||
dateStr = (String) o;
|
||||
} else {
|
||||
dateStr = resolver.replaceTokens(dateStr);
|
||||
}
|
||||
m = EvaluatorBag.IN_SINGLE_QUOTES.matcher(dateStr);
|
||||
if (m.find()) {
|
||||
|
@ -118,9 +169,34 @@ public class FileListEntityProcessor extends EntityProcessorBase {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the Long value for the given string after resolving any evaluator or variable.
|
||||
*
|
||||
* @param sizeStr the size as a string
|
||||
* @return the Long value corresponding to the given string
|
||||
*/
|
||||
private Long getSize(String sizeStr) {
|
||||
if (sizeStr == null)
|
||||
return null;
|
||||
|
||||
Matcher m = PLACE_HOLDER_PATTERN.matcher(sizeStr);
|
||||
if (m.find()) {
|
||||
Object o = resolver.resolve(m.group(1));
|
||||
if (o instanceof Number) {
|
||||
Number number = (Number) o;
|
||||
return number.longValue();
|
||||
}
|
||||
sizeStr = (String) o;
|
||||
} else {
|
||||
sizeStr = resolver.replaceTokens(sizeStr);
|
||||
}
|
||||
|
||||
return Long.parseLong(sizeStr);
|
||||
}
|
||||
|
||||
public Map<String, Object> nextRow() {
|
||||
if (rowIterator != null)
|
||||
return getAndApplyTrans();
|
||||
return getNext();
|
||||
List<Map<String, Object>> fileDetails = new ArrayList<Map<String, Object>>();
|
||||
File dir = new File(baseDir);
|
||||
|
||||
|
@ -128,17 +204,16 @@ public class FileListEntityProcessor extends EntityProcessorBase {
|
|||
newerThan = getDate(dateStr);
|
||||
dateStr = context.getEntityAttribute(OLDER_THAN);
|
||||
olderThan = getDate(dateStr);
|
||||
String biggerThanStr = context.getEntityAttribute(BIGGER_THAN);
|
||||
if (biggerThanStr != null)
|
||||
biggerThan = getSize(biggerThanStr);
|
||||
String smallerThanStr = context.getEntityAttribute(SMALLER_THAN);
|
||||
if (smallerThanStr != null)
|
||||
smallerThan = getSize(smallerThanStr);
|
||||
|
||||
getFolderFiles(dir, fileDetails);
|
||||
rowIterator = fileDetails.iterator();
|
||||
return getAndApplyTrans();
|
||||
}
|
||||
|
||||
private Map<String, Object> getAndApplyTrans() {
|
||||
while (true) {
|
||||
Map<String, Object> r = getNext();
|
||||
return r;
|
||||
}
|
||||
return getNext();
|
||||
}
|
||||
|
||||
private void getFolderFiles(File dir, final List<Map<String, Object>> fileDetails) {
|
||||
|
@ -186,7 +261,7 @@ public class FileListEntityProcessor extends EntityProcessorBase {
|
|||
}
|
||||
|
||||
public static final Pattern PLACE_HOLDER_PATTERN = Pattern
|
||||
.compile("\\$\\{.*?\\}");
|
||||
.compile("\\$\\{(.*?)\\}");
|
||||
|
||||
public static final String DIR = "fileDir";
|
||||
|
||||
|
|
|
@ -22,10 +22,7 @@ import org.junit.Test;
|
|||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
|
@ -63,6 +60,76 @@ public class TestFileListEntityProcessor {
|
|||
}
|
||||
Assert.assertEquals(2, fList.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBiggerSmallerFiles() throws IOException {
|
||||
long time = System.currentTimeMillis();
|
||||
File tmpdir = new File("." + time);
|
||||
tmpdir.mkdir();
|
||||
tmpdir.deleteOnExit();
|
||||
long minLength = Long.MAX_VALUE;
|
||||
String smallestFile = "";
|
||||
byte[] content = "abcdefgij".getBytes("UTF-8");
|
||||
createFile(tmpdir, "a.xml", content, false);
|
||||
if (minLength > content.length) {
|
||||
minLength = content.length;
|
||||
smallestFile = "a.xml";
|
||||
}
|
||||
content = "abcdefgij".getBytes("UTF-8");
|
||||
createFile(tmpdir, "b.xml", content, false);
|
||||
if (minLength > content.length) {
|
||||
minLength = content.length;
|
||||
smallestFile = "b.xml";
|
||||
}
|
||||
content = "abc".getBytes("UTF-8");
|
||||
createFile(tmpdir, "c.props", content, false);
|
||||
if (minLength > content.length) {
|
||||
minLength = content.length;
|
||||
smallestFile = "c.props";
|
||||
}
|
||||
Map attrs = AbstractDataImportHandlerTest.createMap(
|
||||
FileListEntityProcessor.FILE_NAME, ".*",
|
||||
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
|
||||
FileListEntityProcessor.BIGGER_THAN, String.valueOf(minLength));
|
||||
List<String> fList = getFiles(null, attrs);
|
||||
Assert.assertEquals(2, fList.size());
|
||||
Set<String> l = new HashSet<String>();
|
||||
l.add(new File(tmpdir, "a.xml").getAbsolutePath());
|
||||
l.add(new File(tmpdir, "b.xml").getAbsolutePath());
|
||||
Assert.assertEquals(l, new HashSet<String>(fList));
|
||||
attrs = AbstractDataImportHandlerTest.createMap(
|
||||
FileListEntityProcessor.FILE_NAME, ".*",
|
||||
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
|
||||
FileListEntityProcessor.SMALLER_THAN, String.valueOf(minLength+1));
|
||||
fList = getFiles(null, attrs);
|
||||
l.clear();
|
||||
l.add(new File(tmpdir, smallestFile).getAbsolutePath());
|
||||
Assert.assertEquals(l, new HashSet<String>(fList));
|
||||
attrs = AbstractDataImportHandlerTest.createMap(
|
||||
FileListEntityProcessor.FILE_NAME, ".*",
|
||||
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
|
||||
FileListEntityProcessor.SMALLER_THAN, "${a.x}");
|
||||
VariableResolverImpl resolver = new VariableResolverImpl();
|
||||
resolver.addNamespace("a", AbstractDataImportHandlerTest.createMap("x", "4"));
|
||||
fList = getFiles(resolver, attrs);
|
||||
Assert.assertEquals(l, new HashSet<String>(fList));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private List<String> getFiles(VariableResolverImpl resolver, Map attrs) {
|
||||
Context c = AbstractDataImportHandlerTest.getContext(null,
|
||||
resolver, null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
|
||||
FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
|
||||
fileListEntityProcessor.init(c);
|
||||
List<String> fList = new ArrayList<String>();
|
||||
while (true) {
|
||||
Map<String, Object> f = fileListEntityProcessor.nextRow();
|
||||
if (f == null)
|
||||
break;
|
||||
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
|
||||
}
|
||||
return fList;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNTOT() throws IOException {
|
||||
|
@ -77,34 +144,13 @@ public class TestFileListEntityProcessor {
|
|||
FileListEntityProcessor.FILE_NAME, "xml$",
|
||||
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
|
||||
FileListEntityProcessor.OLDER_THAN, "'NOW'");
|
||||
Context c = AbstractDataImportHandlerTest.getContext(null,
|
||||
new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
|
||||
FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
|
||||
fileListEntityProcessor.init(c);
|
||||
List<String> fList = new ArrayList<String>();
|
||||
while (true) {
|
||||
Map<String, Object> f = fileListEntityProcessor.nextRow();
|
||||
if (f == null)
|
||||
break;
|
||||
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
|
||||
}
|
||||
System.out.println("List of files when given OLDER_THAN -- " + fList);
|
||||
List<String> fList = getFiles(null, attrs);
|
||||
Assert.assertEquals(2, fList.size());
|
||||
attrs = AbstractDataImportHandlerTest.createMap(
|
||||
FileListEntityProcessor.FILE_NAME, ".xml$",
|
||||
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
|
||||
FileListEntityProcessor.NEWER_THAN, "'NOW-2HOURS'");
|
||||
c = AbstractDataImportHandlerTest.getContext(null,
|
||||
new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
|
||||
fileListEntityProcessor.init(c);
|
||||
fList.clear();
|
||||
while (true) {
|
||||
Map<String, Object> f = fileListEntityProcessor.nextRow();
|
||||
if (f == null)
|
||||
break;
|
||||
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
|
||||
}
|
||||
System.out.println("List of files when given NEWER_THAN -- " + fList);
|
||||
fList = getFiles(null, attrs);
|
||||
Assert.assertEquals(2, fList.size());
|
||||
}
|
||||
|
||||
|
@ -124,20 +170,7 @@ public class TestFileListEntityProcessor {
|
|||
FileListEntityProcessor.FILE_NAME, "^.*\\.xml$",
|
||||
FileListEntityProcessor.BASE_DIR, childdir.getAbsolutePath(),
|
||||
FileListEntityProcessor.RECURSIVE, "true");
|
||||
Context c = AbstractDataImportHandlerTest.getContext(null,
|
||||
new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
|
||||
FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
|
||||
fileListEntityProcessor.init(c);
|
||||
List<String> fList = new ArrayList<String>();
|
||||
while (true) {
|
||||
// Add the documents to the index. NextRow() should only
|
||||
// find two filesnames that match the pattern in fileName
|
||||
Map<String, Object> f = fileListEntityProcessor.nextRow();
|
||||
if (f == null)
|
||||
break;
|
||||
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
|
||||
}
|
||||
System.out.println("List of files indexed -- " + fList);
|
||||
List<String> fList = getFiles(null, attrs);
|
||||
Assert.assertEquals(2, fList.size());
|
||||
}
|
||||
|
||||
|
@ -148,10 +181,8 @@ public class TestFileListEntityProcessor {
|
|||
FileOutputStream f = new FileOutputStream(file);
|
||||
f.write(content);
|
||||
f.close();
|
||||
// System.out.println("before "+file.lastModified());
|
||||
if (changeModifiedTime)
|
||||
file.setLastModified(System.currentTimeMillis() - 3600000);
|
||||
// System.out.println("after "+file.lastModified());
|
||||
return file;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue