new CSV lib, escape option for loader

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@609333 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2008-01-06 15:43:50 +00:00
parent 8dfa3262b0
commit 8e069c976c
5 changed files with 62 additions and 7 deletions

View File

@ -272,6 +272,10 @@ Other Changes
lucene-analyzers-2.2.0.jar -- includes support for German, Chinese,
Russan, Dutch, Greek, Brazilian, Thai, and French. (hossman)
7. Upgraded to commons-CSV r609327, which fixes escaping bugs and
introduces new escaping and whitespace handling options to
increase compatibility with different formats. (yonik)
Build
1. SOLR-411. Changed the names of the Solr JARs to use the defacto standard JAR names based on
project-name-version.jar. This yields, for example:

View File

@ -1,2 +0,0 @@
AnyObjectId[8e096258a36f86e9b956e52f55df0b5afbe8999f] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[f80348dfa0b59f0840c25d1b8c25d1490d1eaf51] was removed in git history.
Apache SVN contains full history.

View File

@ -99,6 +99,7 @@ abstract class CSVLoader {
static String EMPTY="keepEmpty";
static String SPLIT="split";
static String ENCAPSULATOR="encapsulator";
static String ESCAPE="escape";
static String OVERWRITE="overwrite";
private static Pattern colonSplit = Pattern.compile(":");
@ -216,7 +217,7 @@ abstract class CSVLoader {
templateAdd.overwritePending=false;
}
strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, true, false, true);
strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, CSVStrategy.ESCAPE_DISABLED, false, false, false, true);
String sep = params.get(SEPARATOR);
if (sep!=null) {
if (sep.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid separator:'"+sep+"'");
@ -225,8 +226,32 @@ abstract class CSVLoader {
String encapsulator = params.get(ENCAPSULATOR);
if (encapsulator!=null) {
if (encapsulator.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+sep+"'");
strategy.setEncapsulator(encapsulator.charAt(0));
if (encapsulator.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+encapsulator+"'");
}
String escape = params.get(ESCAPE);
if (escape!=null) {
if (escape.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid escape:'"+escape+"'");
}
// if only encapsulator or escape is set, disable the other escaping mechanism
if (encapsulator == null && escape != null) {
strategy.setEncapsulator((char)-2); // TODO: add CSVStrategy.ENCAPSULATOR_DISABLED
strategy.setEscape(escape.charAt(0));
} else {
if (encapsulator != null) {
strategy.setEncapsulator(encapsulator.charAt(0));
}
if (escape != null) {
char ch = escape.charAt(0);
strategy.setEscape(ch);
if (ch == '\\') {
// If the escape is the standard backslash, then also enable
// unicode escapes (it's harmless since 'u' would not otherwise
// be escaped.
strategy.setUnicodeEscapeInterpretation(true);
}
}
}
String fn = params.get(FIELDNAMES);

View File

@ -234,7 +234,9 @@ public class TestCSVLoader extends AbstractSolrTestCase {
+"100|^quoted^\n"
+"101|a;'b';c\n"
+"102|a;;b\n"
+"103|\n");
+"103|\n"
+"104|a\\\\b\n" // no backslash escaping should be done by default
);
loadLocal("stream.file",filename, "commit","true",
"separator","|",
@ -244,14 +246,38 @@ public class TestCSVLoader extends AbstractSolrTestCase {
"f.str_s.separator",";",
"f.str_s.encapsulator","'"
);
assertQ(req("id:[100 TO 110]"),"//*[@numFound='4']");
assertQ(req("id:[100 TO 110]"),"//*[@numFound='5']");
assertQ(req("id:100"),"//str[@name='str_s'][.='quoted']");
assertQ(req("id:101"),"//arr[@name='str_s']/str[1][.='a']");
assertQ(req("id:101"),"//arr[@name='str_s']/str[2][.='b']");
assertQ(req("id:101"),"//arr[@name='str_s']/str[3][.='c']");
assertQ(req("id:102"),"//arr[@name='str_s']/str[2][.='EMPTY']");
assertQ(req("id:103"),"//str[@name='str_s'][.='EMPTY']");
assertQ(req("id:104"),"//str[@name='str_s'][.='a\\\\b']");
// test no escaping + double encapsulator escaping by default
makeFile("id,str_s\n"
+"100,\"quoted \"\" \\ string\"\n"
+"101,unquoted \"\" \\ string\n" // double encap shouldn't be an escape outside encap
+"102,end quote \\\n"
);
loadLocal("stream.file",filename, "commit","true"
);
assertQ(req("id:100"),"//str[@name='str_s'][.='quoted \" \\ string']");
assertQ(req("id:101"),"//str[@name='str_s'][.='unquoted \"\" \\ string']");
assertQ(req("id:102"),"//str[@name='str_s'][.='end quote \\']");
// setting an escape should disable encapsulator
makeFile("id,str_s\n"
+"100,\"quoted \"\" \\\" \\\\ string\"\n" // quotes should be part of value
+"101,unquoted \"\" \\\" \\, \\\\ string\n"
);
loadLocal("stream.file",filename, "commit","true"
,"escape","\\"
);
assertQ(req("id:100"),"//str[@name='str_s'][.='\"quoted \"\" \" \\ string\"']");
assertQ(req("id:101"),"//str[@name='str_s'][.='unquoted \"\" \" , \\ string']");
}