From c3df2607b0acdcce287da0c8f116decad4096d3e Mon Sep 17 00:00:00 2001
From: Ryan McKinley <ryan@apache.org>
Date: Mon, 7 May 2007 23:35:55 +0000
Subject: [PATCH] SOLR-214 - use the charset encoded in the contentType to
 decode the posted text.  Even though they are supposed to, some containers do
 not obey this specification.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@536019 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |  7 +++++++
 .../solr/servlet/SolrRequestParsers.java      | 21 ++++++++++++-------
 2 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index bcb0ffa26a6..d92ad600957 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -271,6 +271,13 @@ Bug Fixes
 13. Changed the SOLR-104 RequestDispatcher so that /select?qt=xxx can not 
     access handlers that start with "/".  This makes path based authentication
     possible for path based request handlers.  (ryan)
+
+14. SOLR-214: Some servlet containers (including Tomcat and Resin) do not
+    obey the specified charset.  Rather than letting the container handle
+    it, Solr now uses the charset from the header contentType to decode posted
+    content.  Using the contentType: "text/xml; charset=utf-8" will force
+    utf-8 encoding.  If you do not specify a contentType, it will use the 
+    platform default.  (Koji Sekiguchi via ryan)
  
 Other Changes
  1. Updated to Lucene 2.1
diff --git a/src/webapp/src/org/apache/solr/servlet/SolrRequestParsers.java b/src/webapp/src/org/apache/solr/servlet/SolrRequestParsers.java
index 25d7816c50f..3615aa43a8c 100644
--- a/src/webapp/src/org/apache/solr/servlet/SolrRequestParsers.java
+++ b/src/webapp/src/org/apache/solr/servlet/SolrRequestParsers.java
@@ -20,7 +20,6 @@ package org.apache.solr.servlet;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.net.URLDecoder;
@@ -225,16 +224,27 @@ class RawRequestParser implements SolrRequestParser
   public SolrParams parseParamsAndFillStreams( 
       final HttpServletRequest req, ArrayList<ContentStream> streams ) throws Exception
   {
-    streams.add( new ContentStream() {
+    // The javadocs for HttpServletRequest are clear that req.getReader() should take
+    // care of any character encoding issues.  BUT, there are problems while running on
+    // some servlet containers, including Tomcat 5 and Resin.
+    //
+    // Rather than return req.getReader(), this uses the default ContentStreamBase method
+    // that checks for charset definitions in the ContentType.
+    
+    streams.add( new ContentStreamBase() {
+      @Override
       public String getContentType() {
         return req.getContentType();
       }
+      @Override
       public String getName() {
-        return null; // Is there any meaningfull name?
+        return null; // Is there any meaningful name?
       }
+      @Override
       public String getSourceInfo() {
-        return null; // Is there any meaningfull name?
+        return null; // Is there any meaningful source?
       }
+      @Override
       public Long getSize() { 
         String v = req.getHeader( "Content-Length" );
         if( v != null ) {
@@ -245,9 +255,6 @@ class RawRequestParser implements SolrRequestParser
       public InputStream getStream() throws IOException {
         return req.getInputStream();
       }
-      public Reader getReader() throws IOException {
-        return req.getReader();
-      }
     });
     return SolrRequestParsers.parseQueryString( req.getQueryString() );
   }