SOLR-980 -- A PlainTextEntityProcessor which can read from any DataSource<Reader> and output a String

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@738401 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Shalin Shekhar Mangar 2009-01-28 08:30:02 +00:00
parent 61f9aab471
commit 8220d4383e
3 changed files with 157 additions and 0 deletions

View File

@ -56,6 +56,9 @@ New Features
12.SOLR-988: Add a new scope for session data stored in Context to store objects across imports.
(Noble Paul via shalin)
13.SOLR-980: A PlainTextEntityProcessor which can read from any DataSource<Reader> and output a String.
(Nathan Adams, Noble Paul via shalin)
Optimizations
----------------------
1. SOLR-846: Reduce memory consumption during delta import by removing keys when used

View File

@ -0,0 +1,84 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.dataimport;
import static org.apache.solr.handler.dataimport.DataImportHandlerException.SEVERE;
import static org.apache.solr.handler.dataimport.DataImportHandlerException.wrapAndThrow;
import static org.apache.solr.handler.dataimport.XPathEntityProcessor.URL;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Map;
/**
* <p>An implementation of EntityProcessor which reads data from a url/file and give out a row which contains one String
* value. The name of the field is 'plainText'.
*
* @version $Id$
* @since solr 1.4
*/
public class PlainTextEntityProcessor extends EntityProcessorBase {
private static final Logger LOG = LoggerFactory.getLogger(PlainTextEntityProcessor.class);
private boolean ended = false;
public void init(Context context) {
super.init(context);
ended = false;
}
public Map<String, Object> nextRow() {
if (ended) return null;
DataSource<Reader> ds = context.getDataSource();
String url = context.getVariableResolver().replaceTokens(context.getEntityAttribute(URL));
Reader r = null;
try {
r = ds.getData(url);
} catch (Exception e) {
if (ABORT.equals(onError)) {
wrapAndThrow(SEVERE, e, "Exception reading url : " + url);
}
return null;
}
StringWriter sw = new StringWriter();
char[] buf = new char[1024];
while (true) {
int len = 0;
try {
len = r.read(buf);
} catch (IOException e) {
if (ABORT.equals(onError)) {
wrapAndThrow(SEVERE, e, "Exception reading url : " + url);
} else {
LOG.warn("IOException while reading from data source", e);
return null;
}
}
if (len <= 0) break;
sw.append(new String(buf, 0, len));
}
Map<String, Object> row = new HashMap<String, Object>();
row.put(PLAIN_TEXT, sw.toString());
ended = true;
return super.applyTransformer(row);
}
public static final String PLAIN_TEXT = "plainText";
}

View File

@ -0,0 +1,70 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.dataimport;
import junit.framework.Assert;
import static org.apache.solr.handler.dataimport.AbstractDataImportHandlerTest.createMap;
import org.junit.Test;
import java.io.StringReader;
import java.util.Properties;
/**
* Test for PlainTextEntityProcessor
*
* @version $Id$
* @see org.apache.solr.handler.dataimport.PlainTextEntityProcessor
* @since solr 1.4
*/
public class TestPlainTextEntityProcessor {
@Test
public void simple() {
DataImporter di = new DataImporter();
di.loadAndInit(DATA_CONFIG);
TestDocBuilder.SolrWriterImpl sw = new TestDocBuilder.SolrWriterImpl();
DataImporter.RequestParams rp = new DataImporter.RequestParams(createMap("command", "full-import"));
di.runCmd(rp, sw);
Assert.assertEquals(DS.s, sw.docs.get(0).getFieldValue("x"));
}
public static class DS extends DataSource {
static String s = "hello world";
public void init(Context context, Properties initProps) {
}
public Object getData(String query) {
return new StringReader(s);
}
public void close() {
}
}
static String DATA_CONFIG = "<dataConfig>\n" +
"\t<dataSource type=\"TestPlainTextEntityProcessor$DS\" />\n" +
"\t<document>\n" +
"\t\t<entity processor=\"PlainTextEntityProcessor\" name=\"x\" query=\"x\">\n" +
"\t\t\t<field column=\"plainText\" name=\"x\" />\n" +
"\t\t</entity>\n" +
"\t</document>\n" +
"</dataConfig>";
}