From 21822811a9d4fed78972183e366672b5ca014c22 Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Wed, 28 Dec 2011 07:17:55 +0000 Subject: [PATCH] SOLR-2346: Add a chance to set content encoding explicitly via content type of stream. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1225120 13f79535-47bb-0310-9956-ffa450edef68 --- solr/contrib/extraction/CHANGES.txt | 4 +++- .../solr/handler/extraction/ExtractingDocumentLoader.java | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/solr/contrib/extraction/CHANGES.txt b/solr/contrib/extraction/CHANGES.txt index ab2adf66cf4..daf04a76e56 100644 --- a/solr/contrib/extraction/CHANGES.txt +++ b/solr/contrib/extraction/CHANGES.txt @@ -30,7 +30,9 @@ $Id$ ================== Release 3.6.0 ================== -(No Changes) +* SOLR-2346: Add a chance to set content encoding explicitly via content type of stream. + This is convenient when Tika's auto detector cannot detect encoding, especially + the text file is too short to detect encoding. (koji) ================== Release 3.5.0 ================== diff --git a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index bbc3ba97fb3..a731fe1603a 100644 --- a/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -26,6 +26,7 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.params.UpdateParams; import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.ContentStreamBase; import org.apache.solr.common.util.NamedList; import org.apache.solr.handler.ContentStreamLoader; import org.apache.solr.request.SolrQueryRequest; @@ -158,6 +159,12 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo()); metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize())); metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType()); + // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata + String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType()); + if(charset != null){ + metadata.add(Metadata.CONTENT_ENCODING, charset); + } + String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION); boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false); SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, schema);