mirror of https://github.com/apache/lucene.git
new CSV lib, escape option for loader
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@609333 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8dfa3262b0
commit
8e069c976c
|
@ -272,6 +272,10 @@ Other Changes
|
|||
lucene-analyzers-2.2.0.jar -- includes support for German, Chinese,
|
||||
Russan, Dutch, Greek, Brazilian, Thai, and French. (hossman)
|
||||
|
||||
7. Upgraded to commons-CSV r609327, which fixes escaping bugs and
|
||||
introduces new escaping and whitespace handling options to
|
||||
increase compatibility with different formats. (yonik)
|
||||
|
||||
Build
|
||||
1. SOLR-411. Changed the names of the Solr JARs to use the defacto standard JAR names based on
|
||||
project-name-version.jar. This yields, for example:
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[8e096258a36f86e9b956e52f55df0b5afbe8999f] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[f80348dfa0b59f0840c25d1b8c25d1490d1eaf51] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -99,6 +99,7 @@ abstract class CSVLoader {
|
|||
static String EMPTY="keepEmpty";
|
||||
static String SPLIT="split";
|
||||
static String ENCAPSULATOR="encapsulator";
|
||||
static String ESCAPE="escape";
|
||||
static String OVERWRITE="overwrite";
|
||||
|
||||
private static Pattern colonSplit = Pattern.compile(":");
|
||||
|
@ -216,7 +217,7 @@ abstract class CSVLoader {
|
|||
templateAdd.overwritePending=false;
|
||||
}
|
||||
|
||||
strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, true, false, true);
|
||||
strategy = new CSVStrategy(',', '"', CSVStrategy.COMMENTS_DISABLED, CSVStrategy.ESCAPE_DISABLED, false, false, false, true);
|
||||
String sep = params.get(SEPARATOR);
|
||||
if (sep!=null) {
|
||||
if (sep.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid separator:'"+sep+"'");
|
||||
|
@ -225,8 +226,32 @@ abstract class CSVLoader {
|
|||
|
||||
String encapsulator = params.get(ENCAPSULATOR);
|
||||
if (encapsulator!=null) {
|
||||
if (encapsulator.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+sep+"'");
|
||||
strategy.setEncapsulator(encapsulator.charAt(0));
|
||||
if (encapsulator.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid encapsulator:'"+encapsulator+"'");
|
||||
}
|
||||
|
||||
String escape = params.get(ESCAPE);
|
||||
if (escape!=null) {
|
||||
if (escape.length()!=1) throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,"Invalid escape:'"+escape+"'");
|
||||
}
|
||||
|
||||
// if only encapsulator or escape is set, disable the other escaping mechanism
|
||||
if (encapsulator == null && escape != null) {
|
||||
strategy.setEncapsulator((char)-2); // TODO: add CSVStrategy.ENCAPSULATOR_DISABLED
|
||||
strategy.setEscape(escape.charAt(0));
|
||||
} else {
|
||||
if (encapsulator != null) {
|
||||
strategy.setEncapsulator(encapsulator.charAt(0));
|
||||
}
|
||||
if (escape != null) {
|
||||
char ch = escape.charAt(0);
|
||||
strategy.setEscape(ch);
|
||||
if (ch == '\\') {
|
||||
// If the escape is the standard backslash, then also enable
|
||||
// unicode escapes (it's harmless since 'u' would not otherwise
|
||||
// be escaped.
|
||||
strategy.setUnicodeEscapeInterpretation(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
String fn = params.get(FIELDNAMES);
|
||||
|
|
|
@ -234,7 +234,9 @@ public class TestCSVLoader extends AbstractSolrTestCase {
|
|||
+"100|^quoted^\n"
|
||||
+"101|a;'b';c\n"
|
||||
+"102|a;;b\n"
|
||||
+"103|\n");
|
||||
+"103|\n"
|
||||
+"104|a\\\\b\n" // no backslash escaping should be done by default
|
||||
);
|
||||
|
||||
loadLocal("stream.file",filename, "commit","true",
|
||||
"separator","|",
|
||||
|
@ -244,14 +246,38 @@ public class TestCSVLoader extends AbstractSolrTestCase {
|
|||
"f.str_s.separator",";",
|
||||
"f.str_s.encapsulator","'"
|
||||
);
|
||||
assertQ(req("id:[100 TO 110]"),"//*[@numFound='4']");
|
||||
assertQ(req("id:[100 TO 110]"),"//*[@numFound='5']");
|
||||
assertQ(req("id:100"),"//str[@name='str_s'][.='quoted']");
|
||||
assertQ(req("id:101"),"//arr[@name='str_s']/str[1][.='a']");
|
||||
assertQ(req("id:101"),"//arr[@name='str_s']/str[2][.='b']");
|
||||
assertQ(req("id:101"),"//arr[@name='str_s']/str[3][.='c']");
|
||||
assertQ(req("id:102"),"//arr[@name='str_s']/str[2][.='EMPTY']");
|
||||
assertQ(req("id:103"),"//str[@name='str_s'][.='EMPTY']");
|
||||
assertQ(req("id:104"),"//str[@name='str_s'][.='a\\\\b']");
|
||||
|
||||
// test no escaping + double encapsulator escaping by default
|
||||
makeFile("id,str_s\n"
|
||||
+"100,\"quoted \"\" \\ string\"\n"
|
||||
+"101,unquoted \"\" \\ string\n" // double encap shouldn't be an escape outside encap
|
||||
+"102,end quote \\\n"
|
||||
);
|
||||
loadLocal("stream.file",filename, "commit","true"
|
||||
);
|
||||
assertQ(req("id:100"),"//str[@name='str_s'][.='quoted \" \\ string']");
|
||||
assertQ(req("id:101"),"//str[@name='str_s'][.='unquoted \"\" \\ string']");
|
||||
assertQ(req("id:102"),"//str[@name='str_s'][.='end quote \\']");
|
||||
|
||||
|
||||
// setting an escape should disable encapsulator
|
||||
makeFile("id,str_s\n"
|
||||
+"100,\"quoted \"\" \\\" \\\\ string\"\n" // quotes should be part of value
|
||||
+"101,unquoted \"\" \\\" \\, \\\\ string\n"
|
||||
);
|
||||
loadLocal("stream.file",filename, "commit","true"
|
||||
,"escape","\\"
|
||||
);
|
||||
assertQ(req("id:100"),"//str[@name='str_s'][.='\"quoted \"\" \" \\ string\"']");
|
||||
assertQ(req("id:101"),"//str[@name='str_s'][.='unquoted \"\" \" , \\ string']");
|
||||
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue