From 00e1299bd63091a4845ee1e5853b482828942581 Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Wed, 19 Aug 2009 17:01:12 +0000 Subject: [PATCH] SOLR-1370: Show the output of CharFilters in FieldAnalysisRequestHandler git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@805880 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 + .../handler/AnalysisRequestHandlerBase.java | 37 ++- .../FieldAnalysisRequestHandlerTest.java | 26 ++ .../solr/conf/mapping-ISOLatin1Accent.txt | 246 ++++++++++++++++++ src/test/test-files/solr/conf/schema.xml | 7 + 5 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt diff --git a/CHANGES.txt b/CHANGES.txt index 39a96f7f180..7cd0fdc0849 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -271,6 +271,8 @@ New Features 69. SOLR-1372: Enhance FieldAnalysisRequestHandler to accept field value from content stream (ehatcher) +70. SOLR-1370: Show the output of CharFilters in FieldAnalysisRequestHandler (koji) + Optimizations ---------------------- diff --git a/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java b/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java index 2f2a1fdd3a3..edc9dd24ee4 100644 --- a/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java +++ b/src/java/org/apache/solr/handler/AnalysisRequestHandlerBase.java @@ -18,10 +18,14 @@ package org.apache.solr.handler; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharReader; +import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.analysis.CharFilterFactory; import org.apache.solr.analysis.TokenFilterFactory; import org.apache.solr.analysis.TokenizerChain; +import org.apache.solr.analysis.TokenizerFactory; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; import 
org.apache.solr.common.SolrException; @@ -83,17 +87,29 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase { } TokenizerChain tokenizerChain = (TokenizerChain) analyzer; + CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories(); + TokenizerFactory tfac = tokenizerChain.getTokenizerFactory(); + TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories(); NamedList> namedList = new SimpleOrderedMap>(); - TokenStream tokenStream = tokenizerChain.getTokenizerFactory().create(new StringReader(value)); + if( cfiltfacs != null ){ + String source = value; + for(CharFilterFactory cfiltfac : cfiltfacs ){ + CharStream reader = CharReader.get(new StringReader(source)); + reader = cfiltfac.create(reader); + source = writeCharStream(namedList, reader); + } + } + + TokenStream tokenStream = tfac.create(tokenizerChain.charStream(new StringReader(value))); List tokens = analyzeTokenStream(tokenStream); namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context)); ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokens); - for (TokenFilterFactory tokenFilterFactory : tokenizerChain.getTokenFilterFactories()) { + for (TokenFilterFactory tokenFilterFactory : filtfacs) { tokenStream = tokenFilterFactory.create(listBasedTokenStream); List tokenList = analyzeTokenStream(tokenStream); namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokenList, context)); @@ -188,6 +204,32 @@ public abstract class AnalysisRequestHandlerBase extends RequestHandlerBase { return tokensNamedLists; } + + /** + * Drains the given char stream, adds the text it produced to {@code out} under the stream's class name, and returns that text. + * + * @param out the analysis breakdown that receives the char stream output + * @param input the char stream to read until end of stream + * @return all characters read from {@code input} + * @throws SolrException with BAD_REQUEST if reading from {@code input} fails + */ + private String writeCharStream(NamedList out, CharStream input ){ + final int BUFFER_SIZE = 1024; + char[] buf = new char[BUFFER_SIZE]; + int len = 0; + StringBuilder sb = new StringBuilder(); + do { + try { + len = input.read( buf, 0, BUFFER_SIZE ); + } catch (IOException e) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e); + } + /* read() reports end of stream as -1 and may return fewer than BUFFER_SIZE chars before that, so only -1 ends the loop */ + if( len > 0 ) sb.append(buf, 0, len); + }
while( len != -1 ); + out.add( input.getClass().getName(), sb.toString()); + return sb.toString(); + } // ================================================= Inner classes ================================================= diff --git a/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java b/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java index 056233f10d2..9087d1c65ea 100644 --- a/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java +++ b/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java @@ -293,4 +293,30 @@ public class FieldAnalysisRequestHandlerTest extends AnalysisRequestHandlerTestB } + public void testCharFilterAnalysis() throws Exception { + + FieldAnalysisRequest request = new FieldAnalysisRequest(); + request.addFieldType("charfilthtmlmap"); + request.setFieldValue("whátëvêr"); + request.setShowMatch(false); + + NamedList result = handler.handleAnalysisRequest(request, h.getCore().getSchema()); + assertTrue("result is null and it shouldn't be", result != null); + + NamedList fieldTypes = result.get("field_types"); + assertNotNull("field_types should never be null", fieldTypes); + NamedList textType = fieldTypes.get("charfilthtmlmap"); + assertNotNull("expecting result for field type 'charfilthtmlmap'", textType); + + NamedList indexPart = textType.get("index"); + assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart); + + assertEquals(" whátëvêr ", indexPart.get("org.apache.solr.analysis.HTMLStripCharFilter")); + assertEquals(" whatever ", indexPart.get("org.apache.lucene.analysis.MappingCharFilter")); + + List tokenList = (List)indexPart.get("org.apache.lucene.analysis.WhitespaceTokenizer"); + assertNotNull("Expcting WhitespaceTokenizer analysis breakdown", tokenList); + assertEquals(1, tokenList.size()); + assertToken(tokenList.get(0), new TokenInfo("whatever", null, "word", 12, 20, 1, null, false)); + } } diff --git 
a/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt b/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt new file mode 100644 index 00000000000..ede7742581b --- /dev/null +++ b/src/test/test-files/solr/conf/mapping-ISOLatin1Accent.txt @@ -0,0 +1,246 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Syntax: +# "source" => "target" +# "source".length() > 0 (source cannot be empty.) +# "target".length() >= 0 (target can be empty.) + +# example: +# "À" => "A" +# "\u00C0" => "A" +# "\u00C0" => "\u0041" +# "ß" => "ss" +# "\t" => " " +# "\n" => "" + +# À => A +"\u00C0" => "A" + +# Á => A +"\u00C1" => "A" + +#  => A +"\u00C2" => "A" + +# à => A +"\u00C3" => "A" + +# Ä => A +"\u00C4" => "A" + +# Å => A +"\u00C5" => "A" + +# Æ => AE +"\u00C6" => "AE" + +# Ç => C +"\u00C7" => "C" + +# È => E +"\u00C8" => "E" + +# É => E +"\u00C9" => "E" + +# Ê => E +"\u00CA" => "E" + +# Ë => E +"\u00CB" => "E" + +# Ì => I +"\u00CC" => "I" + +# Í => I +"\u00CD" => "I" + +# Î => I +"\u00CE" => "I" + +# Ï => I +"\u00CF" => "I" + +# Ĳ => IJ +"\u0132" => "IJ" + +# Ð => D +"\u00D0" => "D" + +# Ñ => N +"\u00D1" => "N" + +# Ò => O +"\u00D2" => "O" + +# Ó => O +"\u00D3" => "O" + +# Ô => O +"\u00D4" => "O" + +# Õ => O +"\u00D5" => "O" + +# Ö => O +"\u00D6" => "O" + +# Ø => O +"\u00D8" => "O" + +# Œ => OE +"\u0152" => "OE" + +# Þ => TH +"\u00DE" => "TH" + +# Ù => U +"\u00D9" => "U" + +# Ú => U +"\u00DA" => "U" + +# Û => U +"\u00DB" => "U" + +# Ü => U +"\u00DC" => "U" + 
+# Ý => Y +"\u00DD" => "Y" + +# Ÿ => Y +"\u0178" => "Y" + +# à => a +"\u00E0" => "a" + +# á => a +"\u00E1" => "a" + +# â => a +"\u00E2" => "a" + +# ã => a +"\u00E3" => "a" + +# ä => a +"\u00E4" => "a" + +# å => a +"\u00E5" => "a" + +# æ => ae +"\u00E6" => "ae" + +# ç => c +"\u00E7" => "c" + +# è => e +"\u00E8" => "e" + +# é => e +"\u00E9" => "e" + +# ê => e +"\u00EA" => "e" + +# ë => e +"\u00EB" => "e" + +# ì => i +"\u00EC" => "i" + +# í => i +"\u00ED" => "i" + +# î => i +"\u00EE" => "i" + +# ï => i +"\u00EF" => "i" + +# ĳ => ij +"\u0133" => "ij" + +# ð => d +"\u00F0" => "d" + +# ñ => n +"\u00F1" => "n" + +# ò => o +"\u00F2" => "o" + +# ó => o +"\u00F3" => "o" + +# ô => o +"\u00F4" => "o" + +# õ => o +"\u00F5" => "o" + +# ö => o +"\u00F6" => "o" + +# ø => o +"\u00F8" => "o" + +# œ => oe +"\u0153" => "oe" + +# ß => ss +"\u00DF" => "ss" + +# þ => th +"\u00FE" => "th" + +# ù => u +"\u00F9" => "u" + +# ú => u +"\u00FA" => "u" + +# û => u +"\u00FB" => "u" + +# ü => u +"\u00FC" => "u" + +# ý => y +"\u00FD" => "y" + +# ÿ => y +"\u00FF" => "y" + +# ﬀ => ff +"\uFB00" => "ff" + +# ﬁ => fi +"\uFB01" => "fi" + +# ﬂ => fl +"\uFB02" => "fl" + +# ﬃ => ffi +"\uFB03" => "ffi" + +# ﬄ => ffl +"\uFB04" => "ffl" + +# ﬅ => ft +"\uFB05" => "ft" + +# ﬆ => st +"\uFB06" => "st" diff --git a/src/test/test-files/solr/conf/schema.xml b/src/test/test-files/solr/conf/schema.xml index f32619d5b83..6db63261ea1 100644 --- a/src/test/test-files/solr/conf/schema.xml +++ b/src/test/test-files/solr/conf/schema.xml @@ -247,6 +247,13 @@ + + + + + + +