SOLR-1370: Show the output of CharFilters in FieldAnalysisRequestHandler

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@805880 13f79535-47bb-0310-9956-ffa450edef68
2009-08-19 17:01:12 +00:00 · 2009-08-19 17:01:12 +00:00 · 00e1299bd6
parent a86d0c24c3
commit 00e1299bd6
5 changed files with 316 additions and 2 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -271,6 +271,8 @@ New Features
 69. SOLR-1372: Enhance FieldAnalysisRequestHandler to accept field value from content stream (ehatcher)
 70. SOLR-1370: Show the output of CharFilters in FieldAnalysisRequestHandler (koji)
 Optimizations
 ----------------------
--- a/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
+++ b/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java
@ -18,10 +18,14 @@
 package org.apache.solr.handler;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.analysis.CharFilterFactory;
 import org.apache.solr.analysis.TokenFilterFactory;
 import org.apache.solr.analysis.TokenizerChain;
 import org.apache.solr.analysis.TokenizerFactory;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.common.SolrException;
@ -83,17 +87,29 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
    }
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
    TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
    TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
    NamedList<List<NamedList>> namedList = new SimpleOrderedMap<List<NamedList>>();
-    TokenStream tokenStream = tokenizerChain.getTokenizerFactory().create(new StringReader(value));
+    if( cfiltfacs != null ){
      String source = value;
      for(CharFilterFactory cfiltfac : cfiltfacs ){
        CharStream reader = CharReader.get(new StringReader(source));
        reader = cfiltfac.create(reader);
        source = writeCharStream(namedList, reader);
      }
    }
    TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value)));
    List<Token> tokens = analyzeTokenStream(tokenStream);
    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
    ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens);
-    for (TokenFilterFactory tokenFilterFactory : tokenizerChain.getTokenFilterFactories()) {
+    for (TokenFilterFactory tokenFilterFactory : filtfacs) {
      tokenStream = tokenFilterFactory.create(listBasedTokenStream);
      List<Token> tokenList = analyzeTokenStream(tokenStream);
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokenList, context));
@ -189,6 +205,23 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase {
    return tokensNamedLists;
  }
  /**
   * Reads the given character stream until it is exhausted, records the text that was read in
   * <code>out</code> under the stream's class name, and returns that text.
   *
   * @param out   list that receives one entry: key is <code>input.getClass().getName()</code>,
   *              value is the complete text read from <code>input</code>
   * @param input the character stream to drain
   * @return every character read from <code>input</code>, as a single string (empty if the
   *         stream yields nothing)
   * @throws SolrException with {@link SolrException.ErrorCode#BAD_REQUEST} if reading fails
   */
  private String writeCharStream(NamedList out, CharStream input ){
    final int BUFFER_SIZE = 1024;
    char[] buf = new char[BUFFER_SIZE];
    StringBuilder sb = new StringBuilder();
    try {
      int len;
      // read() signals end of stream with -1 and may return fewer chars than requested
      // before that point, so keep reading until -1 instead of stopping at the first
      // short read (and never pass a negative length to append()).
      while( ( len = input.read( buf, 0, BUFFER_SIZE ) ) != -1 ){
        sb.append(buf, 0, len);
      }
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
    }
    String text = sb.toString();
    out.add( input.getClass().getName(), text);
    return text;
  }
  // ================================================= Inner classes =================================================
--- a/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
+++ b/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
@ -293,4 +293,30 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB
  }
  /**
   * Verifies that the analysis breakdown for a field type with char filters contains one
   * entry per char filter (keyed by the filter's class name, holding the filtered text)
   * followed by the tokenizer's token list.
   *
   * @throws Exception if the analysis request fails
   */
  public void testCharFilterAnalysis() throws Exception {

    FieldAnalysisRequest request = new FieldAnalysisRequest();
    request.addFieldType("charfilthtmlmap");
    request.setFieldValue("<html><body>whátëvêr</body></html>");
    request.setShowMatch(false);

    NamedList<NamedList> result = handler.handleAnalysisRequest(request, h.getCore().getSchema());
    assertNotNull("result is null and it shouldn't be", result);

    NamedList<NamedList> fieldTypes = result.get("field_types");
    assertNotNull("field_types should never be null", fieldTypes);
    NamedList<NamedList> textType = fieldTypes.get("charfilthtmlmap");
    assertNotNull("expecting result for field type 'charfilthtmlmap'", textType);

    NamedList indexPart = textType.get("index");
    assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);

    // Each char filter stage is reported as the full text it produced; the expected values
    // have the markup replaced by spaces, so the text keeps the length of the input.
    assertEquals("            whátëvêr              ", indexPart.get("org.apache.solr.analysis.HTMLStripCharFilter"));
    assertEquals("            whatever              ", indexPart.get("org.apache.lucene.analysis.MappingCharFilter"));

    List<NamedList> tokenList = (List<NamedList>)indexPart.get("org.apache.lucene.analysis.WhitespaceTokenizer");
    assertNotNull("Expecting WhitespaceTokenizer analysis breakdown", tokenList);
    // expected value first, so a failure reports "expected:<1> but was:<n>"
    assertEquals(1, tokenList.size());
    assertToken(tokenList.get(0), new TokenInfo("whatever", null, "word", 12, 20, 1, null, false));
  }
 }
--- a/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt
+++ b/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt
@ -0,0 +1,246 @@
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Syntax:
 #   "source" => "target"
 #     "source".length() > 0 (source cannot be empty.)
 #     "target".length() >= 0 (target can be empty.)
 # example:
 #   "À" => "A"
 #   "\u00C0" => "A"
 #   "\u00C0" => "\u0041"
 #   "ß" => "ss"
 #   "\t" => " "
 #   "\n" => ""
 # À => A
 "\u00C0" => "A"
 # Á => A
 "\u00C1" => "A"
 # Â => A
 "\u00C2" => "A"
 # Ã => A
 "\u00C3" => "A"
 # Ä => A
 "\u00C4" => "A"
 # Å => A
 "\u00C5" => "A"
 # Æ => AE
 "\u00C6" => "AE"
 # Ç => C
 "\u00C7" => "C"
 # È => E
 "\u00C8" => "E"
 # É => E
 "\u00C9" => "E"
 # Ê => E
 "\u00CA" => "E"
 # Ë => E
 "\u00CB" => "E"
 # Ì => I
 "\u00CC" => "I"
 # Í => I
 "\u00CD" => "I"
 # Î => I
 "\u00CE" => "I"
 # Ï => I
 "\u00CF" => "I"
 # Ĳ => IJ
 "\u0132" => "IJ"
 # Ð => D
 "\u00D0" => "D"
 # Ñ => N
 "\u00D1" => "N"
 # Ò => O
 "\u00D2" => "O"
 # Ó => O
 "\u00D3" => "O"
 # Ô => O
 "\u00D4" => "O"
 # Õ => O
 "\u00D5" => "O"
 # Ö => O
 "\u00D6" => "O"
 # Ø => O
 "\u00D8" => "O"
 # Œ => OE
 "\u0152" => "OE"
 # Þ => TH
 "\u00DE" => "TH"
 # Ù => U
 "\u00D9" => "U"
 # Ú => U
 "\u00DA" => "U"
 # Û => U
 "\u00DB" => "U"
 # Ü => U
 "\u00DC" => "U"
 # Ý => Y
 "\u00DD" => "Y"
 # Ÿ => Y
 "\u0178" => "Y"
 # à => a
 "\u00E0" => "a"
 # á => a
 "\u00E1" => "a"
 # â => a
 "\u00E2" => "a"
 # ã => a
 "\u00E3" => "a"
 # ä => a
 "\u00E4" => "a"
 # å => a
 "\u00E5" => "a"
 # æ => ae
 "\u00E6" => "ae"
 # ç => c
 "\u00E7" => "c"
 # è => e
 "\u00E8" => "e"
 # é => e
 "\u00E9" => "e"
 # ê => e
 "\u00EA" => "e"
 # ë => e
 "\u00EB" => "e"
 # ì => i
 "\u00EC" => "i"
 # í => i
 "\u00ED" => "i"
 # î => i
 "\u00EE" => "i"
 # ï => i
 "\u00EF" => "i"
 # ĳ => ij
 "\u0133" => "ij"
 # ð => d
 "\u00F0" => "d"
 # ñ => n
 "\u00F1" => "n"
 # ò => o
 "\u00F2" => "o"
 # ó => o
 "\u00F3" => "o"
 # ô => o
 "\u00F4" => "o"
 # õ => o
 "\u00F5" => "o"
 # ö => o
 "\u00F6" => "o"
 # ø => o
 "\u00F8" => "o"
 # œ => oe
 "\u0153" => "oe"
 # ß => ss
 "\u00DF" => "ss"
 # þ => th
 "\u00FE" => "th"
 # ù => u
 "\u00F9" => "u"
 # ú => u
 "\u00FA" => "u"
 # û => u
 "\u00FB" => "u"
 # ü => u
 "\u00FC" => "u"
 # ý => y
 "\u00FD" => "y"
 # ÿ => y
 "\u00FF" => "y"
 # ﬀ => ff
 "\uFB00" => "ff"
 # ﬁ => fi
 "\uFB01" => "fi"
 # ﬂ => fl
 "\uFB02" => "fl"
 # ﬃ => ffi
 "\uFB03" => "ffi"
 # ﬄ => ffl
 "\uFB04" => "ffl"
 # ﬅ => ft
 "\uFB05" => "ft"
 # ﬆ => st
 "\uFB06" => "st"
--- a/src/test/test-files/solr/conf/schema.xml
+++ b/src/test/test-files/solr/conf/schema.xml
@ -247,6 +247,13 @@
        <filter class="solr.LengthFilterFactory" min="2" max="5"/>
      </analyzer>
    </fieldtype>
    <fieldType name="charfilthtmlmap" class="solr.TextField">
      <analyzer>
        <charFilter class="solr.HTMLStripCharFilterFactory"/>
        <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>
    <fieldtype name="subword" class="solr.TextField" multiValued="true" positionIncrementGap="100">
      <analyzer type="index">