SOLR-5003: add rowid (line number) option to CSV Loader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1500046 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2013-07-05 15:30:42 +00:00
parent 18cc36851f
commit 218856ac6d
3 changed files with 24 additions and 3 deletions

View File

@ -156,6 +156,9 @@ New Features
which are INACTIVE or have no range (created for custom sharding).
(Anshum Gupta, shalin)
* SOLR-5003: CSV Update Handler supports optionally adding the line number/row id to
a document (gsingers)
Bug Fixes
----------------------

View File

@ -55,6 +55,7 @@ abstract class CSVLoaderBase extends ContentStreamLoader {
public static final String ESCAPE="escape";
public static final String OVERWRITE="overwrite";
public static final String LITERALS_PREFIX = "literal.";
public static final String ROW_ID = "rowid";
private static Pattern colonSplit = Pattern.compile(":");
private static Pattern commaSplit = Pattern.compile(",");
@ -65,13 +66,15 @@ abstract class CSVLoaderBase extends ContentStreamLoader {
final SolrParams params;
final CSVStrategy strategy;
final UpdateRequestProcessor processor;
// hashmap to save any literal fields and their values
HashMap <SchemaField, String> literals;
String[] fieldnames;
SchemaField[] fields;
CSVLoaderBase.FieldAdder[] adders;
String rowId = null;// if not null, add a special field by the name given with the line number/row id as the value
int skipLines; // number of lines to skip at start of file
final AddUpdateCommand templateAdd;
@ -186,6 +189,7 @@ abstract class CSVLoaderBase extends ContentStreamLoader {
if (escape!=null) {
if (escape.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid escape:'"+escape+"'");
}
rowId = params.get(ROW_ID);
// if only encapsulator or escape is set, disable the other escaping mechanism
if (encapsulator == null && escape != null) {
@ -290,6 +294,7 @@ abstract class CSVLoaderBase extends ContentStreamLoader {
if (!pname.startsWith(LITERALS_PREFIX)) continue;
String name = pname.substring(LITERALS_PREFIX.length());
//TODO: need to look at this in light of schemaless
SchemaField sf = schema.getFieldOrNull(name);
if(sf == null)
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid field name for literal:'"+ name +"'");
@ -378,7 +383,7 @@ abstract class CSVLoaderBase extends ContentStreamLoader {
/** this must be MT safe... may be called concurrently from multiple threads. */
void doAdd(int line, String[] vals, SolrInputDocument doc, AddUpdateCommand template) throws IOException {
// the line number is passed simply for error reporting in MT mode.
// the line number is passed for error reporting in MT mode as well as for optional rowId.
// first, create the lucene document
for (int i=0; i<vals.length; i++) {
if (fields[i]==null) continue; // ignore this field
@ -392,7 +397,9 @@ abstract class CSVLoaderBase extends ContentStreamLoader {
String val = literals.get(sf);
doc.addField(fn, val);
}
if (rowId != null){
doc.addField(rowId, line);
}
template.solrDoc = doc;
processor.processAdd(template);
}

View File

@ -107,6 +107,17 @@ public class TestCSVLoader extends SolrTestCaseJ4 {
assertQ(req("id:[100 TO 110]"),"//*[@numFound='3']");
}
@Test
public void testCSVRowId() throws Exception {
makeFile("id\n100\n101\n102");
loadLocal("rowid", "rowid_i");//add a special field
// check default commit of false
assertU(commit());
assertQ(req("rowid_i:1"),"//*[@numFound='1']");
assertQ(req("rowid_i:2"),"//*[@numFound='1']");
assertQ(req("rowid_i:100"),"//*[@numFound='0']");
}
@Test
public void testCommitFalse() throws Exception {
makeFile("id\n100\n101\n102");