SOLR-1061 -- Improve RegexTransformer to create multiple columns from regex groups

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@755143 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2009-03-17 07:58:10 +00:00
parent 7de4dee17c
commit f92165443e
3 changed files with 105 additions and 13 deletions

View File

@ -97,6 +97,9 @@ New Features
push data to Solr instead of just pulling it from DB/Files/URLs.
(Noble Paul via shalin)
23.SOLR-1061: Improve RegexTransformer to create multiple columns from regex groups.
(Noble Paul via shalin)
Optimizations
----------------------
1. SOLR-846: Reduce memory consumption during delta import by removing keys when used

View File

@ -55,6 +55,7 @@ public class RegexTransformer extends Transformer {
splitBy = vr.replaceTokens(splitBy);
String replaceWith = field.get(REPLACE_WITH);
replaceWith = vr.replaceTokens(replaceWith);
String groupNames = vr.replaceTokens(field.get(GROUP_NAMES));
if (reStr != null || splitBy != null) {
String srcColName = field.get(SRC_COL_NAME);
if (srcColName == null) {
@ -67,17 +68,50 @@ public class RegexTransformer extends Transformer {
if (tmpVal instanceof List) {
List<String> inputs = (List<String>) tmpVal;
List results = new ArrayList();
Map<String,List> otherVars= null;
for (String input : inputs) {
Object o = process(col, reStr, splitBy, replaceWith, input);
if (o != null)
results.add(o);
Object o = process(col, reStr, splitBy, replaceWith, input, groupNames);
if (o != null){
if (o instanceof Map) {
Map map = (Map) o;
for (Object e : map.entrySet()) {
Map.Entry<String ,Object> entry = (Map.Entry<String, Object>) e;
List l = results;
if(!col.equals(entry.getKey())){
if(otherVars == null) otherVars = new HashMap<String, List>();
l = otherVars.get(entry.getKey());
if(l == null){
l = new ArrayList();
otherVars.put(entry.getKey(), l);
}
}
if (entry.getValue() instanceof Collection) {
l.addAll((Collection) entry.getValue());
} else {
l.add(entry.getValue());
}
}
} else {
if (o instanceof Collection) {
results.addAll((Collection) o);
} else {
results.add(o);
}
}
}
}
row.put(col, results);
for (Object result : results) row.put(col, result);
if(otherVars != null) row.putAll(otherVars);
} else {
String value = tmpVal.toString();
Object o = process(col, reStr, splitBy, replaceWith, value);
if (o != null)
row.put(col, o);
Object o = process(col, reStr, splitBy, replaceWith, value, groupNames);
if (o != null){
if (o instanceof Map) {
row.putAll((Map) o);
} else{
row.put(col, o);
}
}
}
}
}
@ -85,14 +119,14 @@ public class RegexTransformer extends Transformer {
}
private Object process(String col, String reStr, String splitBy,
String replaceWith, String value) {
String replaceWith, String value, String groupNames) {
if (splitBy != null) {
return readBySplit(splitBy, value);
} else if (replaceWith != null) {
Pattern p = getPattern(reStr);
return p.matcher(value).replaceAll(replaceWith);
} else {
return readfromRegExp(reStr, value, col);
return readfromRegExp(reStr, value, col, groupNames);
}
}
@ -105,20 +139,39 @@ public class RegexTransformer extends Transformer {
}
@SuppressWarnings("unchecked")
private Object readfromRegExp(String reStr, String value, String columnName) {
private Object readfromRegExp(String reStr, String value, String columnName, String gNames) {
String[] groupNames = null;
if(gNames != null && gNames.trim().length() >0){
groupNames = gNames.split(",");
}
Pattern regexp = getPattern(reStr);
Matcher m = regexp.matcher(value);
if (m.find() && m.groupCount() > 0) {
if (m.groupCount() > 1) {
List l = new ArrayList();
List l = null;
Map<String ,String > map = null;
if(groupNames == null){
l = new ArrayList();
} else {
map = new HashMap<String, String>();
}
for (int i = 1; i <= m.groupCount(); i++) {
try {
l.add(m.group(i));
if(l != null){
l.add(m.group(i));
} else if (map != null ){
if(i <= groupNames.length){
String nameOfGroup = groupNames[i-1];
if(nameOfGroup != null && nameOfGroup.trim().length() >0){
map.put(nameOfGroup, m.group(i));
}
}
}
} catch (Exception e) {
LOG.warn("Parsing failed for field : " + columnName, e);
}
}
return l;
return l == null ? map: l;
} else {
return m.group(1);
}
@ -145,4 +198,6 @@ public class RegexTransformer extends Transformer {
public static final String SRC_COL_NAME = "sourceColName";
public static final String GROUP_NAMES = "groupNames";
}

View File

@ -16,6 +16,9 @@
*/
package org.apache.solr.handler.dataimport;
import static org.apache.solr.handler.dataimport.RegexTransformer.REGEX;
import static org.apache.solr.handler.dataimport.RegexTransformer.GROUP_NAMES;
import static org.apache.solr.handler.dataimport.DataImporter.COLUMN;
import org.junit.Assert;
import org.junit.Test;
@ -46,6 +49,37 @@ public class TestRegexTransformer {
Assert.assertEquals(2, result.size());
Assert.assertEquals(4, ((List) result.get("col1")).size());
}
/**
 * Verifies that the <code>groupNames</code> attribute maps regex capture groups onto
 * separate columns, for both a single-valued and a multi-valued source column.
 *
 * The first entry in <code>groupNames</code> is deliberately left empty so that the
 * first capture group (the title, e.g. "Mr") is not written to any column.
 */
@Test
public void groupNames() {
  List<Map<String, String>> fields = new ArrayList<Map<String, String>>();
  // <field column="fullName" regex="(\w*) (\w*) (\w*)" groupNames=",firstName,lastName"/>
  Map<String, String> fieldAttrs = new HashMap<String, String>();
  fieldAttrs.put(COLUMN, "fullName");
  fieldAttrs.put(GROUP_NAMES, ",firstName,lastName");
  fieldAttrs.put(REGEX, "(\\w*) (\\w*) (\\w*)");
  fields.add(fieldAttrs);
  Context context = AbstractDataImportHandlerTest.getContext(null, null, null, 0, fields, null);

  // Single-valued source column: each named group becomes a plain String column.
  Map<String, Object> src = new HashMap<String, Object>();
  src.put("fullName", "Mr Noble Paul");
  Map<String, Object> result = new RegexTransformer().transformRow(src, context);
  Assert.assertEquals("Noble", result.get("firstName"));
  Assert.assertEquals("Paul", result.get("lastName"));

  // Multi-valued source column: each named group becomes a List with one entry per input value.
  src = new HashMap<String, Object>();
  List<String> fullNames = new ArrayList<String>();
  fullNames.add("Mr Noble Paul");
  fullNames.add("Mr Shalin Mangar");
  src.put("fullName", fullNames);
  result = new RegexTransformer().transformRow(src, context);
  List<?> firstNames = (List<?>) result.get("firstName");
  List<?> lastNames = (List<?>) result.get("lastName");
  Assert.assertEquals(2, firstNames.size());
  Assert.assertEquals(2, lastNames.size());
  Assert.assertEquals("Noble", firstNames.get(0));
  Assert.assertEquals("Shalin", firstNames.get(1));
  Assert.assertEquals("Paul", lastNames.get(0));
  Assert.assertEquals("Mangar", lastNames.get(1));
}
@Test
public void replaceWith() {