SOLR-2346: Add a chance to set content encoding explicitly via content type of stream.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1225120 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-12-28 07:17:55 +00:00
parent 7c7c7bd077
commit 21822811a9
2 changed files with 10 additions and 1 deletions

View File

@ -30,7 +30,9 @@ $Id$
================== Release 3.6.0 ==================
(No Changes)
* SOLR-2346: Allow the content encoding to be set explicitly via the content type of the stream.
This is convenient when Tika's auto detector cannot detect the encoding, especially
when the text file is too short for the encoding to be detected. (koji)
================== Release 3.5.0 ==================

View File

@ -26,6 +26,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.solr.request.SolrQueryRequest;
@ -158,6 +159,12 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
// HtmlParser and TXTParser honor Metadata.CONTENT_ENCODING when it is set in the metadata
String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
if(charset != null){
metadata.add(Metadata.CONTENT_ENCODING, charset);
}
String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, schema);