SOLR-1406 -- Make FileDataSource and FileListEntityProcessor to be more extensible, fix variable resolving bug in FileListEntityProcessor and add tests

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@812122 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2009-09-07 13:12:01 +00:00
parent c7696af098
commit 7deb769861
4 changed files with 184 additions and 61 deletions

View File

@ -155,6 +155,8 @@ New Features
37.SOLR-1348: Support binary field type in convertType logic in JdbcDataSource (shalin)
38.SOLR-1406: Make FileDataSource and FileListEntityProcessor to be more extensible (Luke Forehand, shalin)
Optimizations
----------------------
1. SOLR-846: Reduce memory consumption during delta import by removing keys when used

View File

@ -44,9 +44,15 @@ import org.slf4j.LoggerFactory;
public class FileDataSource extends DataSource<Reader> {
public static final String BASE_PATH = "basePath";
private String basePath;
/**
* The basePath for this data source
*/
protected String basePath;
private String encoding = null;
/**
* The encoding using which the given file should be read
*/
protected String encoding = null;
private static final Logger LOG = LoggerFactory.getLogger(FileDataSource.class);
@ -95,7 +101,16 @@ public class FileDataSource extends DataSource<Reader> {
}
}
private InputStreamReader openStream(File file) throws FileNotFoundException,
/**
* Open a {@link java.io.Reader} for the given file name
*
* @param file a {@link java.io.File} instance
* @return a Reader on the given file
* @throws FileNotFoundException if the File does not exist
* @throws UnsupportedEncodingException if the encoding is unsupported
* @since solr 1.4
*/
protected Reader openStream(File file) throws FileNotFoundException,
UnsupportedEncodingException {
if (encoding == null) {
return new InputStreamReader(new FileInputStream(file));

View File

@ -54,13 +54,54 @@ import java.util.regex.Pattern;
* @since solr 1.3
*/
public class FileListEntityProcessor extends EntityProcessorBase {
private String fileName, baseDir, excludes;
/**
* A regex pattern to identify files given in data-config.xml after resolving any variables
*/
protected String fileName;
private Date newerThan, olderThan;
/**
* The baseDir given in data-config.xml after resolving any variables
*/
protected String baseDir;
private long biggerThan = -1, smallerThan = -1;
/**
* A Regex pattern of excluded file names as given in data-config.xml after resolving any variables
*/
protected String excludes;
private boolean recursive = false;
/**
* The newerThan given in data-config as a {@link java.util.Date}
* <p>
* <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
* </p>
*/
protected Date newerThan;
/**
* The newerThan given in data-config as a {@link java.util.Date}
*/
protected Date olderThan;
/**
* The biggerThan given in data-config as a long value
* <p>
* <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
* </p>
*/
protected long biggerThan = -1;
/**
* The smallerThan given in data-config as a long value
* <p>
* <b>Note: </b> This variable is resolved just-in-time in the {@link #nextRow()} method.
* </p>
*/
protected long smallerThan = -1;
/**
* The recursive given in data-config. Default value is false.
*/
protected boolean recursive = false;
private Pattern fileNamePattern, excludesPattern;
@ -91,13 +132,23 @@ public class FileListEntityProcessor extends EntityProcessorBase {
}
}
/**
* Get the Date object corresponding to the given string.
*
* @param dateStr the date string. It can be a DateMath string or it may have a evaluator function
* @return a Date instance corresponding to the input string
*/
private Date getDate(String dateStr) {
if (dateStr == null)
return null;
Matcher m = PLACE_HOLDER_PATTERN.matcher(dateStr);
if (m.find()) {
return (Date) resolver.resolve(dateStr);
Object o = resolver.resolve(m.group(1));
if (o instanceof Date) return (Date)o;
dateStr = (String) o;
} else {
dateStr = resolver.replaceTokens(dateStr);
}
m = EvaluatorBag.IN_SINGLE_QUOTES.matcher(dateStr);
if (m.find()) {
@ -118,9 +169,34 @@ public class FileListEntityProcessor extends EntityProcessorBase {
}
}
/**
* Get the Long value for the given string after resolving any evaluator or variable.
*
* @param sizeStr the size as a string
* @return the Long value corresponding to the given string
*/
private Long getSize(String sizeStr) {
if (sizeStr == null)
return null;
Matcher m = PLACE_HOLDER_PATTERN.matcher(sizeStr);
if (m.find()) {
Object o = resolver.resolve(m.group(1));
if (o instanceof Number) {
Number number = (Number) o;
return number.longValue();
}
sizeStr = (String) o;
} else {
sizeStr = resolver.replaceTokens(sizeStr);
}
return Long.parseLong(sizeStr);
}
public Map<String, Object> nextRow() {
if (rowIterator != null)
return getAndApplyTrans();
return getNext();
List<Map<String, Object>> fileDetails = new ArrayList<Map<String, Object>>();
File dir = new File(baseDir);
@ -128,17 +204,16 @@ public class FileListEntityProcessor extends EntityProcessorBase {
newerThan = getDate(dateStr);
dateStr = context.getEntityAttribute(OLDER_THAN);
olderThan = getDate(dateStr);
String biggerThanStr = context.getEntityAttribute(BIGGER_THAN);
if (biggerThanStr != null)
biggerThan = getSize(biggerThanStr);
String smallerThanStr = context.getEntityAttribute(SMALLER_THAN);
if (smallerThanStr != null)
smallerThan = getSize(smallerThanStr);
getFolderFiles(dir, fileDetails);
rowIterator = fileDetails.iterator();
return getAndApplyTrans();
}
private Map<String, Object> getAndApplyTrans() {
while (true) {
Map<String, Object> r = getNext();
return r;
}
return getNext();
}
private void getFolderFiles(File dir, final List<Map<String, Object>> fileDetails) {
@ -186,7 +261,7 @@ public class FileListEntityProcessor extends EntityProcessorBase {
}
public static final Pattern PLACE_HOLDER_PATTERN = Pattern
.compile("\\$\\{.*?\\}");
.compile("\\$\\{(.*?)\\}");
public static final String DIR = "fileDir";

View File

@ -22,10 +22,7 @@ import org.junit.Test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.*;
/**
* <p>
@ -63,6 +60,76 @@ public class TestFileListEntityProcessor {
}
Assert.assertEquals(2, fList.size());
}
@Test
public void testBiggerSmallerFiles() throws IOException {
long time = System.currentTimeMillis();
File tmpdir = new File("." + time);
tmpdir.mkdir();
tmpdir.deleteOnExit();
long minLength = Long.MAX_VALUE;
String smallestFile = "";
byte[] content = "abcdefgij".getBytes("UTF-8");
createFile(tmpdir, "a.xml", content, false);
if (minLength > content.length) {
minLength = content.length;
smallestFile = "a.xml";
}
content = "abcdefgij".getBytes("UTF-8");
createFile(tmpdir, "b.xml", content, false);
if (minLength > content.length) {
minLength = content.length;
smallestFile = "b.xml";
}
content = "abc".getBytes("UTF-8");
createFile(tmpdir, "c.props", content, false);
if (minLength > content.length) {
minLength = content.length;
smallestFile = "c.props";
}
Map attrs = AbstractDataImportHandlerTest.createMap(
FileListEntityProcessor.FILE_NAME, ".*",
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
FileListEntityProcessor.BIGGER_THAN, String.valueOf(minLength));
List<String> fList = getFiles(null, attrs);
Assert.assertEquals(2, fList.size());
Set<String> l = new HashSet<String>();
l.add(new File(tmpdir, "a.xml").getAbsolutePath());
l.add(new File(tmpdir, "b.xml").getAbsolutePath());
Assert.assertEquals(l, new HashSet<String>(fList));
attrs = AbstractDataImportHandlerTest.createMap(
FileListEntityProcessor.FILE_NAME, ".*",
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
FileListEntityProcessor.SMALLER_THAN, String.valueOf(minLength+1));
fList = getFiles(null, attrs);
l.clear();
l.add(new File(tmpdir, smallestFile).getAbsolutePath());
Assert.assertEquals(l, new HashSet<String>(fList));
attrs = AbstractDataImportHandlerTest.createMap(
FileListEntityProcessor.FILE_NAME, ".*",
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
FileListEntityProcessor.SMALLER_THAN, "${a.x}");
VariableResolverImpl resolver = new VariableResolverImpl();
resolver.addNamespace("a", AbstractDataImportHandlerTest.createMap("x", "4"));
fList = getFiles(resolver, attrs);
Assert.assertEquals(l, new HashSet<String>(fList));
}
@SuppressWarnings("unchecked")
private List<String> getFiles(VariableResolverImpl resolver, Map attrs) {
Context c = AbstractDataImportHandlerTest.getContext(null,
resolver, null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
fileListEntityProcessor.init(c);
List<String> fList = new ArrayList<String>();
while (true) {
Map<String, Object> f = fileListEntityProcessor.nextRow();
if (f == null)
break;
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
}
return fList;
}
@Test
public void testNTOT() throws IOException {
@ -77,34 +144,13 @@ public class TestFileListEntityProcessor {
FileListEntityProcessor.FILE_NAME, "xml$",
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
FileListEntityProcessor.OLDER_THAN, "'NOW'");
Context c = AbstractDataImportHandlerTest.getContext(null,
new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
fileListEntityProcessor.init(c);
List<String> fList = new ArrayList<String>();
while (true) {
Map<String, Object> f = fileListEntityProcessor.nextRow();
if (f == null)
break;
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
}
System.out.println("List of files when given OLDER_THAN -- " + fList);
List<String> fList = getFiles(null, attrs);
Assert.assertEquals(2, fList.size());
attrs = AbstractDataImportHandlerTest.createMap(
FileListEntityProcessor.FILE_NAME, ".xml$",
FileListEntityProcessor.BASE_DIR, tmpdir.getAbsolutePath(),
FileListEntityProcessor.NEWER_THAN, "'NOW-2HOURS'");
c = AbstractDataImportHandlerTest.getContext(null,
new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
fileListEntityProcessor.init(c);
fList.clear();
while (true) {
Map<String, Object> f = fileListEntityProcessor.nextRow();
if (f == null)
break;
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
}
System.out.println("List of files when given NEWER_THAN -- " + fList);
fList = getFiles(null, attrs);
Assert.assertEquals(2, fList.size());
}
@ -124,20 +170,7 @@ public class TestFileListEntityProcessor {
FileListEntityProcessor.FILE_NAME, "^.*\\.xml$",
FileListEntityProcessor.BASE_DIR, childdir.getAbsolutePath(),
FileListEntityProcessor.RECURSIVE, "true");
Context c = AbstractDataImportHandlerTest.getContext(null,
new VariableResolverImpl(), null, Context.FULL_DUMP, Collections.EMPTY_LIST, attrs);
FileListEntityProcessor fileListEntityProcessor = new FileListEntityProcessor();
fileListEntityProcessor.init(c);
List<String> fList = new ArrayList<String>();
while (true) {
// Add the documents to the index. NextRow() should only
// find two filesnames that match the pattern in fileName
Map<String, Object> f = fileListEntityProcessor.nextRow();
if (f == null)
break;
fList.add((String) f.get(FileListEntityProcessor.ABSOLUTE_FILE));
}
System.out.println("List of files indexed -- " + fList);
List<String> fList = getFiles(null, attrs);
Assert.assertEquals(2, fList.size());
}
@ -148,10 +181,8 @@ public class TestFileListEntityProcessor {
FileOutputStream f = new FileOutputStream(file);
f.write(content);
f.close();
// System.out.println("before "+file.lastModified());
if (changeModifiedTime)
file.setLastModified(System.currentTimeMillis() - 3600000);
// System.out.println("after "+file.lastModified());
return file;
}
}