mirror of https://github.com/apache/lucene.git
SOLR-822: Add CharFilter so that characters can be filtered before Tokenizer/TokenFilters.
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@713902 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4d7731fc90
commit
eb0ec4a3e2
|
@ -82,6 +82,9 @@ New Features
|
|||
DirectoryProvider will use NIOFSDirectory for better concurrency
|
||||
on non Windows platforms. (Mark Miller, TJ Laurenzo via yonik)
|
||||
|
||||
15. SOLR-822: Add CharFilter so that characters can be filtered (e.g. character normalization)
|
||||
before Tokenizer/TokenFilters. (koji)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
|
||||
|
|
|
@ -0,0 +1,246 @@
|
|||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Syntax:
|
||||
# "source" => "target"
|
||||
# "source".length() > 0 (source cannot be empty.)
|
||||
# "target".length() >= 0 (target can be empty.)
|
||||
|
||||
# example:
|
||||
# "À" => "A"
|
||||
# "\u00C0" => "A"
|
||||
# "\u00C0" => "\u0041"
|
||||
# "ß" => "ss"
|
||||
# "\t" => " "
|
||||
# "\n" => ""
|
||||
|
||||
# À => A
|
||||
"\u00C0" => "A"
|
||||
|
||||
# Á => A
|
||||
"\u00C1" => "A"
|
||||
|
||||
# Â => A
|
||||
"\u00C2" => "A"
|
||||
|
||||
# Ã => A
|
||||
"\u00C3" => "A"
|
||||
|
||||
# Ä => A
|
||||
"\u00C4" => "A"
|
||||
|
||||
# Å => A
|
||||
"\u00C5" => "A"
|
||||
|
||||
# Æ => AE
|
||||
"\u00C6" => "AE"
|
||||
|
||||
# Ç => C
|
||||
"\u00C7" => "C"
|
||||
|
||||
# È => E
|
||||
"\u00C8" => "E"
|
||||
|
||||
# É => E
|
||||
"\u00C9" => "E"
|
||||
|
||||
# Ê => E
|
||||
"\u00CA" => "E"
|
||||
|
||||
# Ë => E
|
||||
"\u00CB" => "E"
|
||||
|
||||
# Ì => I
|
||||
"\u00CC" => "I"
|
||||
|
||||
# Í => I
|
||||
"\u00CD" => "I"
|
||||
|
||||
# Î => I
|
||||
"\u00CE" => "I"
|
||||
|
||||
# Ï => I
|
||||
"\u00CF" => "I"
|
||||
|
||||
# IJ => IJ
|
||||
"\u0132" => "IJ"
|
||||
|
||||
# Ð => D
|
||||
"\u00D0" => "D"
|
||||
|
||||
# Ñ => N
|
||||
"\u00D1" => "N"
|
||||
|
||||
# Ò => O
|
||||
"\u00D2" => "O"
|
||||
|
||||
# Ó => O
|
||||
"\u00D3" => "O"
|
||||
|
||||
# Ô => O
|
||||
"\u00D4" => "O"
|
||||
|
||||
# Õ => O
|
||||
"\u00D5" => "O"
|
||||
|
||||
# Ö => O
|
||||
"\u00D6" => "O"
|
||||
|
||||
# Ø => O
|
||||
"\u00D8" => "O"
|
||||
|
||||
# Œ => OE
|
||||
"\u0152" => "OE"
|
||||
|
||||
# Þ => TH
|
||||
"\u00DE" => "TH"
|
||||
|
||||
# Ù => U
|
||||
"\u00D9" => "U"
|
||||
|
||||
# Ú => U
|
||||
"\u00DA" => "U"
|
||||
|
||||
# Û => U
|
||||
"\u00DB" => "U"
|
||||
|
||||
# Ü => U
|
||||
"\u00DC" => "U"
|
||||
|
||||
# Ý => Y
|
||||
"\u00DD" => "Y"
|
||||
|
||||
# Ÿ => Y
|
||||
"\u0178" => "Y"
|
||||
|
||||
# à => a
|
||||
"\u00E0" => "a"
|
||||
|
||||
# á => a
|
||||
"\u00E1" => "a"
|
||||
|
||||
# â => a
|
||||
"\u00E2" => "a"
|
||||
|
||||
# ã => a
|
||||
"\u00E3" => "a"
|
||||
|
||||
# ä => a
|
||||
"\u00E4" => "a"
|
||||
|
||||
# å => a
|
||||
"\u00E5" => "a"
|
||||
|
||||
# æ => ae
|
||||
"\u00E6" => "ae"
|
||||
|
||||
# ç => c
|
||||
"\u00E7" => "c"
|
||||
|
||||
# è => e
|
||||
"\u00E8" => "e"
|
||||
|
||||
# é => e
|
||||
"\u00E9" => "e"
|
||||
|
||||
# ê => e
|
||||
"\u00EA" => "e"
|
||||
|
||||
# ë => e
|
||||
"\u00EB" => "e"
|
||||
|
||||
# ì => i
|
||||
"\u00EC" => "i"
|
||||
|
||||
# í => i
|
||||
"\u00ED" => "i"
|
||||
|
||||
# î => i
|
||||
"\u00EE" => "i"
|
||||
|
||||
# ï => i
|
||||
"\u00EF" => "i"
|
||||
|
||||
# ij => ij
|
||||
"\u0133" => "ij"
|
||||
|
||||
# ð => d
|
||||
"\u00F0" => "d"
|
||||
|
||||
# ñ => n
|
||||
"\u00F1" => "n"
|
||||
|
||||
# ò => o
|
||||
"\u00F2" => "o"
|
||||
|
||||
# ó => o
|
||||
"\u00F3" => "o"
|
||||
|
||||
# ô => o
|
||||
"\u00F4" => "o"
|
||||
|
||||
# õ => o
|
||||
"\u00F5" => "o"
|
||||
|
||||
# ö => o
|
||||
"\u00F6" => "o"
|
||||
|
||||
# ø => o
|
||||
"\u00F8" => "o"
|
||||
|
||||
# œ => oe
|
||||
"\u0153" => "oe"
|
||||
|
||||
# ß => ss
|
||||
"\u00DF" => "ss"
|
||||
|
||||
# þ => th
|
||||
"\u00FE" => "th"
|
||||
|
||||
# ù => u
|
||||
"\u00F9" => "u"
|
||||
|
||||
# ú => u
|
||||
"\u00FA" => "u"
|
||||
|
||||
# û => u
|
||||
"\u00FB" => "u"
|
||||
|
||||
# ü => u
|
||||
"\u00FC" => "u"
|
||||
|
||||
# ý => y
|
||||
"\u00FD" => "y"
|
||||
|
||||
# ÿ => y
|
||||
"\u00FF" => "y"
|
||||
|
||||
# ff => ff
|
||||
"\uFB00" => "ff"
|
||||
|
||||
# fi => fi
|
||||
"\uFB01" => "fi"
|
||||
|
||||
# fl => fl
|
||||
"\uFB02" => "fl"
|
||||
|
||||
# ffi => ffi
|
||||
"\uFB03" => "ffi"
|
||||
|
||||
# ffl => ffl
|
||||
"\uFB04" => "ffl"
|
||||
|
||||
# ſt => ft
|
||||
"\uFB05" => "ft"
|
||||
|
||||
# st => st
|
||||
"\uFB06" => "st"
|
|
@ -215,6 +215,16 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
|
||||
<!--
|
||||
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
|
||||
<analyzer>
|
||||
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
|
||||
<tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
-->
|
||||
|
||||
<!-- This is an example of using the KeywordTokenizer along
|
||||
with various TokenFilterFactories to produce a sortable field
|
||||
that does not include some properties of the source text
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public abstract class BaseCharFilter extends CharFilter {
|
||||
|
||||
protected List<PosCorrectMap> pcmList;
|
||||
|
||||
public BaseCharFilter( CharStream in ){
|
||||
super(in);
|
||||
pcmList = new ArrayList<PosCorrectMap>();
|
||||
}
|
||||
|
||||
protected int correctPosition( int currentPos ){
|
||||
if( pcmList.isEmpty() ) return currentPos;
|
||||
for( int i = pcmList.size() - 1; i >= 0; i-- ){
|
||||
if( currentPos >= pcmList.get( i ).pos )
|
||||
return currentPos + pcmList.get( i ).cumulativeDiff;
|
||||
}
|
||||
return currentPos;
|
||||
}
|
||||
|
||||
protected static class PosCorrectMap {
|
||||
|
||||
protected int pos;
|
||||
protected int cumulativeDiff;
|
||||
|
||||
public PosCorrectMap( int pos, int cumulativeDiff ){
|
||||
this.pos = pos;
|
||||
this.cumulativeDiff = cumulativeDiff;
|
||||
}
|
||||
|
||||
public String toString(){
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append('(');
|
||||
sb.append(pos);
|
||||
sb.append(',');
|
||||
sb.append(cumulativeDiff);
|
||||
sb.append(')');
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public abstract class BaseCharFilterFactory implements CharFilterFactory {
|
||||
|
||||
public static final Logger log = LoggerFactory.getLogger(BaseCharFilterFactory.class);
|
||||
|
||||
/** The init args */
|
||||
protected Map<String,String> args;
|
||||
|
||||
public Map<String, String> getArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
this.args = args;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
*
|
||||
* Subclasses of CharFilter can be chained to filter CharStream.
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public abstract class CharFilter extends CharStream {
|
||||
|
||||
protected CharStream input;
|
||||
|
||||
protected CharFilter( CharStream in ){
|
||||
input = in;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Subclass may want to override to correct the current position.
|
||||
*
|
||||
* @param pos current position
|
||||
* @return corrected position
|
||||
*/
|
||||
protected int correctPosition( int pos ){
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int correctOffset(int currentOff) {
|
||||
return input.correctOffset( correctPosition( currentOff ) );
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
return input.read(cbuf, off, len);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Factory contract for producing {@link CharStream} filters. Implementations
 * are initialized once with configuration arguments and then asked to wrap
 * an incoming CharStream with a filtering CharStream.
 *
 * @version $Id$
 * @since Solr 1.4
 *
 */
public interface CharFilterFactory {
  /** Initializes the factory with its configuration arguments. */
  public void init(Map<String,String> args);
  /** Returns the arguments passed to {@link #init(Map)}. */
  public Map<String,String> getArgs();
  /** Wraps {@code input} with a filtering CharStream. */
  public CharStream create(CharStream input);
}
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* CharReader is a Reader wrapper. It reads chars from Reader and outputs CharStream.
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public final class CharReader extends CharStream {
|
||||
|
||||
protected Reader input;
|
||||
|
||||
public CharReader( Reader in ){
|
||||
input = in;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int correctOffset(int currentOff) {
|
||||
return currentOff;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
return input.read(cbuf, off, len );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/**
 * CharStream adds <a href="#correctOffset(int)">correctOffset</a> functionality over Reader.
 * Tokenizers that are "CharStream aware" call correctOffset so that token
 * start/end offsets refer to positions in the original, unfiltered input.
 *
 * @version $Id$
 * @since Solr 1.4
 *
 */
public abstract class CharStream extends Reader {

  /**
   * called by CharFilter(s) and Tokenizer to correct token offset.
   *
   * @param currentOff current offset
   * @return corrected token offset
   */
  public abstract int correctOffset( int currentOff );
}
|
|
@ -0,0 +1,276 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
|
||||
/**
|
||||
* CJKTokenizer was modified from StopTokenizer which does a decent job for
|
||||
* most European languages. It performs other token methods for double-byte
|
||||
* Characters: the token will return at each two characters with overlap match.<br>
|
||||
* Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
|
||||
* also need filter filter zero length token ""<br>
|
||||
* for Digit: digit, '+', '#' will token as letter<br>
|
||||
* for more info on Asia language(Chinese Japanese Korean) text segmentation:
|
||||
* please search <a
|
||||
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* LUCENE-973 is applied
|
||||
*/
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public final class CharStreamAwareCJKTokenizer extends Tokenizer {
|
||||
//~ Static fields/initializers ---------------------------------------------
|
||||
/** Word token type */
|
||||
static final int WORD_TYPE = 0;
|
||||
|
||||
/** Single byte token type */
|
||||
static final int SINGLE_TOKEN_TYPE = 1;
|
||||
|
||||
/** Double byte token type */
|
||||
static final int DOUBLE_TOKEN_TYPE = 2;
|
||||
|
||||
/** Names for token types */
|
||||
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
|
||||
|
||||
/** Max word length */
|
||||
private static final int MAX_WORD_LEN = 255;
|
||||
|
||||
/** buffer size: */
|
||||
private static final int IO_BUFFER_SIZE = 256;
|
||||
|
||||
//~ Instance fields --------------------------------------------------------
|
||||
|
||||
/** word offset, used to imply which character(in ) is parsed */
|
||||
private int offset = 0;
|
||||
|
||||
/** the index used only for ioBuffer */
|
||||
private int bufferIndex = 0;
|
||||
|
||||
/** data length */
|
||||
private int dataLen = 0;
|
||||
|
||||
/**
|
||||
* character buffer, store the characters which are used to compose <br>
|
||||
* the returned Token
|
||||
*/
|
||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
||||
|
||||
/**
|
||||
* I/O buffer, used to store the content of the input(one of the <br>
|
||||
* members of Tokenizer)
|
||||
*/
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
/** word type: single=>ASCII double=>non-ASCII word=>default */
|
||||
private int tokenType = WORD_TYPE;
|
||||
|
||||
/**
|
||||
* tag: previous character is a cached double-byte character "C1C2C3C4"
|
||||
* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
|
||||
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
|
||||
*/
|
||||
private boolean preIsTokened = false;
|
||||
|
||||
//~ Constructors -----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Construct a token stream processing the given input.
|
||||
*
|
||||
* @param in I/O reader
|
||||
*/
|
||||
public CharStreamAwareCJKTokenizer(CharStream in) {
|
||||
input = in;
|
||||
}
|
||||
|
||||
//~ Methods ----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the next token in the stream, or null at EOS.
|
||||
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
|
||||
* for detail.
|
||||
*
|
||||
* @param reusableToken a reusable token
|
||||
* @return Token
|
||||
*
|
||||
* @throws java.io.IOException - throw IOException when read error <br>
|
||||
* happened in the InputStream
|
||||
*
|
||||
*/
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
/** how many character(s) has been stored in buffer */
|
||||
assert reusableToken != null;
|
||||
int length = 0;
|
||||
|
||||
/** the position used to create Token */
|
||||
int start = offset;
|
||||
|
||||
while (true) {
|
||||
/** current character */
|
||||
char c;
|
||||
|
||||
/** unicode block of current character for detail */
|
||||
Character.UnicodeBlock ub;
|
||||
|
||||
offset++;
|
||||
|
||||
if (bufferIndex >= dataLen) {
|
||||
dataLen = input.read(ioBuffer);
|
||||
bufferIndex = 0;
|
||||
}
|
||||
|
||||
if (dataLen == -1) {
|
||||
if (length > 0) {
|
||||
if (preIsTokened == true) {
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
}
|
||||
|
||||
break;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
//get current character
|
||||
c = ioBuffer[bufferIndex++];
|
||||
|
||||
//get the UnicodeBlock of the current character
|
||||
ub = Character.UnicodeBlock.of(c);
|
||||
}
|
||||
|
||||
//if the current character is ASCII or Extend ASCII
|
||||
if ((ub == Character.UnicodeBlock.BASIC_LATIN)
|
||||
|| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
|
||||
) {
|
||||
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
|
||||
// convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
|
||||
int i = (int) c;
|
||||
i = i - 65248;
|
||||
c = (char) i;
|
||||
}
|
||||
|
||||
// if the current character is a letter or "_" "+" "#"
|
||||
if (Character.isLetterOrDigit(c)
|
||||
|| ((c == '_') || (c == '+') || (c == '#'))
|
||||
) {
|
||||
if (length == 0) {
|
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the current character begin to token the ASCII
|
||||
// letter
|
||||
start = offset - 1;
|
||||
} else if (tokenType == DOUBLE_TOKEN_TYPE) {
|
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the previous non-ASCII
|
||||
// : the current character
|
||||
offset--;
|
||||
bufferIndex--;
|
||||
|
||||
if (preIsTokened == true) {
|
||||
// there is only one non-ASCII has been stored
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
break;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// store the LowerCase(c) in the buffer
|
||||
buffer[length++] = Character.toLowerCase(c);
|
||||
tokenType = SINGLE_TOKEN_TYPE;
|
||||
|
||||
// break the procedure if buffer overflowed!
|
||||
if (length == MAX_WORD_LEN) {
|
||||
break;
|
||||
}
|
||||
} else if (length > 0) {
|
||||
if (preIsTokened == true) {
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// non-ASCII letter, e.g."C1C2C3C4"
|
||||
if (Character.isLetter(c)) {
|
||||
if (length == 0) {
|
||||
start = offset - 1;
|
||||
buffer[length++] = c;
|
||||
tokenType = DOUBLE_TOKEN_TYPE;
|
||||
} else {
|
||||
if (tokenType == SINGLE_TOKEN_TYPE) {
|
||||
offset--;
|
||||
bufferIndex--;
|
||||
|
||||
//return the previous ASCII characters
|
||||
break;
|
||||
} else {
|
||||
buffer[length++] = c;
|
||||
tokenType = DOUBLE_TOKEN_TYPE;
|
||||
|
||||
if (length == 2) {
|
||||
offset--;
|
||||
bufferIndex--;
|
||||
preIsTokened = true;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (length > 0) {
|
||||
if (preIsTokened == true) {
|
||||
// empty the buffer
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (length > 0) {
|
||||
// Because of "CharStream aware" tokenizer, using correctOffset() to
|
||||
// correct start/end offsets
|
||||
return reusableToken.reinit
|
||||
(buffer, 0, length,
|
||||
((CharStream)input).correctOffset( start ),
|
||||
((CharStream)input).correctOffset( start+length ),
|
||||
TOKEN_TYPE_NAMES[tokenType]);
|
||||
} else if (dataLen != -1) {
|
||||
// Don't return an empty string - recurse to get the next token
|
||||
return next(reusableToken);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public class CharStreamAwareCJKTokenizerFactory extends BaseTokenizerFactory {
|
||||
|
||||
public TokenStream create(Reader input) {
|
||||
return new CharStreamAwareCJKTokenizer( (CharStream)input );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/** An abstract base class for simple, character-oriented tokenizers.
 * Unlike Lucene's CharTokenizer, this variant requires a {@link CharStream}
 * input and passes token start/end offsets through
 * {@link CharStream#correctOffset(int)} so they map back to the original,
 * unfiltered input. */
public abstract class CharStreamAwareCharTokenizer extends Tokenizer {
  public CharStreamAwareCharTokenizer(CharStream input) {
    super(input);
  }

  // offset: absolute position of the start of ioBuffer within the stream;
  // bufferIndex: next unread char in ioBuffer; dataLen: valid chars in ioBuffer
  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 4096;
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /** Returns true iff a character should be included in a token.  This
   * tokenizer generates as tokens adjacent sequences of characters which
   * satisfy this predicate.  Characters for which this is false are used to
   * define token boundaries and are not included in tokens. */
  protected abstract boolean isTokenChar(char c);

  /** Called on each token character to normalize it before it is added to the
   * token.  The default implementation does nothing.  Subclasses may use this
   * to, e.g., lowercase tokens. */
  protected char normalize(char c) {
    return c;
  }

  /**
   * Returns the next token, or null at end of stream. Offsets on the
   * returned token are corrected via the input CharStream.
   *
   * @param reusableToken a reusable token, must not be null
   * @throws IOException if reading from the input fails
   */
  public final Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    reusableToken.clear();
    int length = 0;
    int start = bufferIndex;
    char[] buffer = reusableToken.termBuffer();
    while (true) {

      if (bufferIndex >= dataLen) {
        // refill: account for the consumed buffer before reading more
        offset += dataLen;
        dataLen = input.read(ioBuffer);
        if (dataLen == -1) {
          if (length > 0)
            break;
          else
            return null;
        }
        bufferIndex = 0;
      }

      final char c = ioBuffer[bufferIndex++];

      if (isTokenChar(c)) {               // if it's a token char

        if (length == 0)                  // start of token
          start = offset + bufferIndex - 1;
        else if (length == buffer.length)
          buffer = reusableToken.resizeTermBuffer(1+length);

        buffer[length++] = normalize(c); // buffer it, normalized

        if (length == MAX_WORD_LEN)      // buffer overflow!
          break;

      } else if (length > 0)             // at non-Letter w/ chars
        break;                           // return 'em
    }

    reusableToken.setTermLength(length);
    // Because of "CharStream aware" tokenizer, using correctOffset() to
    // correct start/end offsets
    reusableToken.setStartOffset(((CharStream)input).correctOffset(start));
    reusableToken.setEndOffset(((CharStream)input).correctOffset(start+length));
    return reusableToken;
  }

  /** Resets the tokenizer to read from a new input, clearing all buffer state. */
  public void reset(Reader input) throws IOException {
    super.reset(input);
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
  }
}
|
|
@ -0,0 +1,33 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
|
||||
* Adjacent sequences of non-Whitespace characters form tokens. */
|
||||
public class CharStreamAwareWhitespaceTokenizer extends CharStreamAwareCharTokenizer {
|
||||
/** Construct a new WhitespaceTokenizer. */
|
||||
public CharStreamAwareWhitespaceTokenizer(CharStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/** Collects only characters which do not satisfy
|
||||
* {@link Character#isWhitespace(char)}.*/
|
||||
protected boolean isTokenChar(char c) {
|
||||
return !Character.isWhitespace(c);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public class CharStreamAwareWhitespaceTokenizerFactory extends BaseTokenizerFactory {
|
||||
|
||||
public TokenStream create(Reader input) {
|
||||
return new CharStreamAwareWhitespaceTokenizer( (CharStream)input );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,136 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
/**
 * CharFilter that rewrites the character stream using a {@link NormalizeMap}
 * trie: the longest matching input sequence is replaced by its mapped string,
 * and position-correction entries are recorded so downstream tokenizers can
 * map token offsets back to positions in the original text.
 */
public class MappingCharFilter extends BaseCharFilter {

  // Root of the match trie (built e.g. by MappingCharFilterFactory).
  private final NormalizeMap normMap;
  // Pushback buffer holding characters read ahead during a failed or partial
  // trie match; consumed again by nextChar() before the underlying stream.
  private LinkedList<Character> buffer;
  // Replacement string currently being emitted, or null when none is pending.
  private String replacement;
  // Next index to emit from {@link #replacement}.
  private int charPointer;
  // Net count of characters consumed from the input (incremented by
  // nextChar(), decremented by pushChar()); used to compute correction points.
  private int nextCharCounter;

  public MappingCharFilter( NormalizeMap normMap, CharStream in ){
    super( in );
    this.normMap = normMap;
  }

  /**
   * Returns the next normalized character, or -1 at end of stream.
   * Pending replacement characters are drained first; otherwise one input
   * character is read and the trie is consulted for the longest match.
   */
  public int read() throws IOException {
    while( true ){
      // Drain any replacement string produced by a previous match.
      if( replacement != null && charPointer < replacement.length() )
        return replacement.charAt( charPointer++ );

      int firstChar = nextChar();
      if( firstChar == -1 ) return -1;
      NormalizeMap nm = normMap.submap != null ?
        normMap.submap.get( (char)firstChar ) : null;
      // No trie edge for this character: pass it through unchanged.
      if( nm == null ) return firstChar;
      NormalizeMap result = match( nm );
      if( result == null ) return firstChar;
      replacement = result.normStr;
      charPointer = 0;
      // When source and replacement lengths differ, record offset-correction
      // entries. NOTE(review): pcmList and PosCorrectMap come from
      // BaseCharFilter (not visible in this excerpt); cumulativeDiff appears
      // to accumulate the running source-minus-output length difference.
      if( result.diff != 0 ){
        int prevCumulativeDiff = pcmList.isEmpty() ? 0 :
          pcmList.get( pcmList.size() - 1 ).cumulativeDiff;
        if( result.diff < 0 ){
          // Replacement is longer than the source: one entry per extra char.
          for( int i = 0; i < -result.diff ; i++ )
            pcmList.add( new PosCorrectMap( nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i ) );
        }
        else{
          // Replacement is shorter than the source: a single entry suffices.
          pcmList.add( new PosCorrectMap( nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff ) );
        }
      }
    }
  }

  // Reads one character, preferring the pushback buffer over the raw input,
  // and counts it as consumed.
  private int nextChar() throws IOException {
    nextCharCounter++;
    if( buffer != null && !buffer.isEmpty() )
      return buffer.removeFirst();
    return input.read();
  }

  // Returns a character to the front of the pushback buffer (un-consumes it).
  private void pushChar( int c ){
    nextCharCounter--;
    if( buffer == null )
      buffer = new LinkedList<Character>();
    buffer.addFirst( (char)c );
  }

  // Appends a character to the end of the pushback buffer WITHOUT touching
  // nextCharCounter; used by read(char[],int,int) to pre-load raw input.
  private void pushLastChar( int c ){
    if( buffer == null )
      buffer = new LinkedList<Character>();
    buffer.addLast( (char)c );
  }

  /**
   * Recursively follows the trie for the longest possible match starting at
   * {@code map}. Characters read past the match point are pushed back.
   * Returns the deepest node carrying a replacement, or null if none matched.
   */
  private NormalizeMap match( NormalizeMap map ) throws IOException {
    NormalizeMap result = null;
    if( map.submap != null ){
      int chr = nextChar();
      if( chr != -1 ){
        NormalizeMap subMap = map.submap.get( (char)chr );
        if( subMap != null ){
          result = match( subMap );
        }
        if( result == null )
          pushChar( chr );
      }
    }
    // Fall back to this node's own replacement if no longer match succeeded.
    if( result == null && map.normStr != null )
      result = map;
    return result;
  }

  /**
   * Bulk read: pulls up to {@code len} raw characters into the pushback
   * buffer, then emits normalized characters one at a time via read().
   * Returns the number of characters written to cbuf, or -1 at end of stream.
   */
  public int read( char[] cbuf, int off, int len ) throws IOException {
    char[] tmp = new char[len];
    int l = input.read( tmp, 0, len );
    if( l != -1 ){
      for( int i = 0; i < l; i++ )
        pushLastChar( tmp[i] );
    }
    l = 0;
    for( int i = off; i < off + len; i++ ){
      int c = read();
      if( c == -1 ) break;
      cbuf[i] = (char)c;
      l++;
    }
    return l == 0 ? -1 : l;
  }

  /** Mark/reset is not supported by this filter. */
  public boolean markSupported(){
    return false;
  }

  /** @throws IOException always; mark/reset is not supported. */
  public void mark( int readAheadLimit ) throws IOException {
    throw new IOException( "mark/reset not supported" );
  }

  /** @throws IOException always; mark/reset is not supported. */
  public void reset() throws IOException {
    throw new IOException( "mark/reset not supported" );
  }
}
|
|
@ -0,0 +1,118 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
||||
ResourceLoaderAware {
|
||||
|
||||
protected NormalizeMap normMap;
|
||||
private String mapping;
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
mapping = args.get( "mapping" );
|
||||
|
||||
if( mapping != null ){
|
||||
List<String> wlist = null;
|
||||
try{
|
||||
File mappingFile = new File( mapping );
|
||||
if( mappingFile.exists() ){
|
||||
wlist = loader.getLines( mapping );
|
||||
}
|
||||
else{
|
||||
List<String> files = StrUtils.splitFileNames( mapping );
|
||||
wlist = new ArrayList<String>();
|
||||
for( String file : files ){
|
||||
List<String> lines = loader.getLines( file.trim() );
|
||||
wlist.addAll( lines );
|
||||
}
|
||||
}
|
||||
}
|
||||
catch( IOException e ){
|
||||
throw new RuntimeException( e );
|
||||
}
|
||||
normMap = new NormalizeMap();
|
||||
parseRules( wlist, normMap );
|
||||
}
|
||||
}
|
||||
|
||||
public CharStream create(CharStream input) {
|
||||
return new MappingCharFilter(normMap,input);
|
||||
}
|
||||
|
||||
// "source" => "target"
|
||||
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
|
||||
|
||||
protected void parseRules( List<String> rules, NormalizeMap normMap ){
|
||||
for( String rule : rules ){
|
||||
Matcher m = p.matcher( rule );
|
||||
if( !m.find() )
|
||||
throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "], file = " + mapping );
|
||||
normMap.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) );
|
||||
}
|
||||
}
|
||||
|
||||
char[] out = new char[256];
|
||||
|
||||
protected String parseString( String s ){
|
||||
int readPos = 0;
|
||||
int len = s.length();
|
||||
int writePos = 0;
|
||||
while( readPos < len ){
|
||||
char c = s.charAt( readPos++ );
|
||||
if( c == '\\' ){
|
||||
if( readPos >= len )
|
||||
throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
|
||||
c = s.charAt( readPos++ );
|
||||
switch( c ) {
|
||||
case '\\' : c = '\\'; break;
|
||||
case '"' : c = '"'; break;
|
||||
case 'n' : c = '\n'; break;
|
||||
case 't' : c = '\t'; break;
|
||||
case 'r' : c = '\r'; break;
|
||||
case 'b' : c = '\b'; break;
|
||||
case 'f' : c = '\f'; break;
|
||||
case 'u' :
|
||||
if( readPos + 3 >= len )
|
||||
throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
|
||||
c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
|
||||
readPos += 4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
out[writePos++] = c;
|
||||
}
|
||||
return new String( out, 0, writePos );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
/**
 * Trie node mapping source character sequences to replacement strings for
 * {@link MappingCharFilter}. Each node optionally carries a replacement
 * ({@code normStr}) together with the length difference between the matched
 * source and that replacement ({@code diff}).
 *
 * @version $Id$
 * @since Solr 1.4
 */
public class NormalizeMap {

  Map<Character, NormalizeMap> submap;
  String normStr;
  int diff;

  /**
   * Registers a mapping from {@code singleMatch} to {@code replacement},
   * creating intermediate trie nodes on demand.
   *
   * @throws RuntimeException if {@code singleMatch} is already mapped
   */
  public void add( String singleMatch, String replacement ){
    NormalizeMap node = this;
    // Walk (and lazily build) the trie path, one source character per level.
    for( int idx = 0, n = singleMatch.length(); idx < n; idx++ ){
      char key = singleMatch.charAt( idx );
      if( node.submap == null ){
        node.submap = new HashMap<Character, NormalizeMap>( 1 );
      }
      NormalizeMap child = node.submap.get( key );
      if( child == null ){
        child = new NormalizeMap();
        node.submap.put( key, child );
      }
      node = child;
    }
    if( node.normStr != null ){
      throw new RuntimeException( "MappingCharFilter: there is already a mapping for " + singleMatch );
    }
    node.normStr = replacement;
    node.diff = singleMatch.length() - replacement.length();
  }
}
|
|
@ -31,19 +31,37 @@ import java.io.Reader;
|
|||
// create a TokenStream.
|
||||
//
|
||||
public class TokenizerChain extends SolrAnalyzer {
|
||||
final private CharFilterFactory[] charFilters;
|
||||
final private TokenizerFactory tokenizer;
|
||||
final private TokenFilterFactory[] filters;
|
||||
|
||||
public TokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
|
||||
this(null,tokenizer,filters);
|
||||
}
|
||||
|
||||
public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
|
||||
this.charFilters = charFilters;
|
||||
this.tokenizer = tokenizer;
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
public CharFilterFactory[] getCharFilterFactories() { return charFilters; }
|
||||
public TokenizerFactory getTokenizerFactory() { return tokenizer; }
|
||||
public TokenFilterFactory[] getTokenFilterFactories() { return filters; }
|
||||
|
||||
public Reader charStream(Reader reader){
|
||||
if( charFilters != null && charFilters.length > 0 ){
|
||||
CharStream cs = new CharReader( reader );
|
||||
for (int i=0; i<charFilters.length; i++) {
|
||||
cs = charFilters[i].create(cs);
|
||||
}
|
||||
reader = cs;
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream ts = tokenizer.create(reader);
|
||||
TokenStream ts = tokenizer.create(charStream(reader));
|
||||
for (int i=0; i<filters.length; i++) {
|
||||
ts = filters[i].create(ts);
|
||||
}
|
||||
|
@ -52,6 +70,10 @@ public class TokenizerChain extends SolrAnalyzer {
|
|||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder("TokenizerChain(");
|
||||
for (CharFilterFactory filter: charFilters) {
|
||||
sb.append(filter);
|
||||
sb.append(", ");
|
||||
}
|
||||
sb.append(tokenizer);
|
||||
for (TokenFilterFactory filter: filters) {
|
||||
sb.append(", ");
|
||||
|
|
|
@ -37,6 +37,7 @@ import javax.naming.InitialContext;
|
|||
import javax.naming.NamingException;
|
||||
import javax.naming.NoInitialContextException;
|
||||
|
||||
import org.apache.solr.analysis.CharFilterFactory;
|
||||
import org.apache.solr.analysis.TokenFilterFactory;
|
||||
import org.apache.solr.analysis.TokenizerFactory;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
@ -394,8 +395,9 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
}
|
||||
);
|
||||
|
||||
awareCompatibility.put(
|
||||
awareCompatibility.put(
|
||||
ResourceLoaderAware.class, new Class[] {
|
||||
CharFilterFactory.class,
|
||||
TokenFilterFactory.class,
|
||||
TokenizerFactory.class,
|
||||
FieldType.class
|
||||
|
@ -427,5 +429,5 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
}
|
||||
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, builder.toString() );
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -29,6 +29,7 @@ import org.apache.solr.common.util.DOMUtil;
|
|||
import org.apache.solr.core.SolrConfig;
|
||||
import org.apache.solr.core.Config;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.analysis.CharFilterFactory;
|
||||
import org.apache.solr.analysis.TokenFilterFactory;
|
||||
import org.apache.solr.analysis.TokenizerChain;
|
||||
import org.apache.solr.analysis.TokenizerFactory;
|
||||
|
@ -739,12 +740,33 @@ public final class IndexSchema {
|
|||
|
||||
XPath xpath = XPathFactory.newInstance().newXPath();
|
||||
|
||||
// Load the CharFilters
|
||||
// --------------------------------------------------------------------------------
|
||||
final ArrayList<CharFilterFactory> charFilters = new ArrayList<CharFilterFactory>();
|
||||
AbstractPluginLoader<CharFilterFactory> charFilterLoader =
|
||||
new AbstractPluginLoader<CharFilterFactory>( "[schema.xml] analyzer/charFilter", false, false )
|
||||
{
|
||||
@Override
|
||||
protected void init(CharFilterFactory plugin, Node node) throws Exception {
|
||||
if( plugin != null ) {
|
||||
plugin.init( DOMUtil.toMapExcept(node.getAttributes(),"class") );
|
||||
charFilters.add( plugin );
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CharFilterFactory register(String name, CharFilterFactory plugin) throws Exception {
|
||||
return null; // used for map registration
|
||||
}
|
||||
};
|
||||
charFilterLoader.load( solrConfig.getResourceLoader(), (NodeList)xpath.evaluate("./charFilter", node, XPathConstants.NODESET) );
|
||||
|
||||
// Load the Tokenizer
|
||||
// Although an analyzer only allows a single Tokenizer, we load a list to make sure
|
||||
// Although an analyzer only allows a single Tokenizer, we load a list to make sure
|
||||
// the configuration is ok
|
||||
// --------------------------------------------------------------------------------
|
||||
final ArrayList<TokenizerFactory> tokenizers = new ArrayList<TokenizerFactory>(1);
|
||||
AbstractPluginLoader<TokenizerFactory> tokenizerLoader =
|
||||
AbstractPluginLoader<TokenizerFactory> tokenizerLoader =
|
||||
new AbstractPluginLoader<TokenizerFactory>( "[schema.xml] analyzer/tokenizer", false, false )
|
||||
{
|
||||
@Override
|
||||
|
@ -790,8 +812,9 @@ public final class IndexSchema {
|
|||
}
|
||||
};
|
||||
filterLoader.load( loader, (NodeList)xpath.evaluate("./filter", node, XPathConstants.NODESET) );
|
||||
|
||||
return new TokenizerChain(tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
|
||||
|
||||
return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]),
|
||||
tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestCharFilter extends TestCase {
|
||||
|
||||
public void testCharFilter1() throws Exception {
|
||||
CharStream cs = new CharFilter1( new CharReader( new StringReader("") ) );
|
||||
assertEquals( "corrected position is invalid", 1, cs.correctOffset( 0 ) );
|
||||
}
|
||||
|
||||
public void testCharFilter2() throws Exception {
|
||||
CharStream cs = new CharFilter2( new CharReader( new StringReader("") ) );
|
||||
assertEquals( "corrected position is invalid", 2, cs.correctOffset( 0 ) );
|
||||
}
|
||||
|
||||
public void testCharFilter12() throws Exception {
|
||||
CharStream cs = new CharFilter2( new CharFilter1( new CharReader( new StringReader("") ) ) );
|
||||
assertEquals( "corrected position is invalid", 3, cs.correctOffset( 0 ) );
|
||||
}
|
||||
|
||||
public void testCharFilter11() throws Exception {
|
||||
CharStream cs = new CharFilter1( new CharFilter1( new CharReader( new StringReader("") ) ) );
|
||||
assertEquals( "corrected position is invalid", 2, cs.correctOffset( 0 ) );
|
||||
}
|
||||
|
||||
static class CharFilter1 extends CharFilter {
|
||||
|
||||
protected CharFilter1(CharStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int correctPosition(int currentPos) {
|
||||
return currentPos + 1;
|
||||
}
|
||||
}
|
||||
|
||||
static class CharFilter2 extends CharFilter {
|
||||
|
||||
protected CharFilter2(CharStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int correctPosition(int currentPos) {
|
||||
return currentPos + 2;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,160 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
 * Tests for {@link MappingCharFilter}: verifies replacement text and, via
 * the token-spec strings ("text,posInc,startOffset,endOffset"), that token
 * offsets still point at the ORIGINAL (pre-mapping) text.
 */
public class TestMappingCharFilter extends BaseTokenTestCase {

  NormalizeMap normMap;

  // NOTE(review): does not call super.setUp(); confirm BaseTokenTestCase
  // (not visible here) needs no per-test initialization of its own.
  public void setUp() throws Exception {
    normMap = new NormalizeMap();

    // Shrinking mappings (source longer than target).
    normMap.add( "aa", "a" );
    normMap.add( "bbb", "b" );
    normMap.add( "cccc", "cc" );

    // Same-length and expanding mappings.
    normMap.add( "h", "i" );
    normMap.add( "j", "jj" );
    normMap.add( "k", "kkk" );
    normMap.add( "ll", "llll" );

    // Mapping to the empty string removes the source entirely.
    normMap.add( "empty", "" );
  }

  /** Input with no matching mapping must pass through unchanged. */
  public void testNothingChange() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "x" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "x" );
    assertTokEqualOff( expect, real );
  }

  /** Same-length replacement: offsets are unaffected. */
  public void test1to1() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "h" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "i" );
    assertTokEqualOff( expect, real );
  }

  /** 1 char maps to 2; token still spans original offsets 0-1. */
  public void test1to2() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "j" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "jj,1,0,1" );
    assertTokEqualOff( expect, real );
  }

  /** 1 char maps to 3; token still spans original offsets 0-1. */
  public void test1to3() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "k" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "kkk,1,0,1" );
    assertTokEqualOff( expect, real );
  }

  /** 2 chars map to 4; token still spans original offsets 0-2. */
  public void test2to4() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "ll" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "llll,1,0,2" );
    assertTokEqualOff( expect, real );
  }

  /** 2 chars map to 1; token still spans original offsets 0-2. */
  public void test2to1() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "aa" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "a,1,0,2" );
    assertTokEqualOff( expect, real );
  }

  /** 3 chars map to 1; token still spans original offsets 0-3. */
  public void test3to1() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "bbb" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "b,1,0,3" );
    assertTokEqualOff( expect, real );
  }

  /** 4 chars map to 2; token still spans original offsets 0-4. */
  public void test4to2() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "cccc" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "cc,1,0,4" );
    assertTokEqualOff( expect, real );
  }

  /** Mapping to the empty string yields no tokens at all. */
  public void test5to0() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "empty" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    assertEquals( 0, real.size() );
  }

  //
  //        1111111111222
  //  01234567890123456789012
  //(in) h i j k ll cccc bbb aa
  //
  //        1111111111222
  //  01234567890123456789012
  //(out) i i jj kkk llll cc b a
  //
  // h, 0, 1 =>    i, 0, 1
  // i, 2, 3 =>    i, 2, 3
  // j, 4, 5 =>   jj, 4, 5
  // k, 6, 7 =>  kkk, 6, 7
  // ll, 8,10 => llll, 8,10
  // cccc,11,15 =>   cc,11,15
  // bbb,16,19 =>    b,16,19
  // aa,20,22 =>    a,20,22
  //
  /** Full sentence mixing all mapping kinds; see the diagram above. */
  public void testTokenStream() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "h i j k ll cccc bbb aa" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "i,1,0,1 i,1,2,3 jj,1,4,5 kkk,1,6,7 llll,1,8,10 cc,1,11,15 b,1,16,19 a,1,20,22" );
    assertTokEqualOff( expect, real );
  }

  //
  //
  //        0123456789
  //(in)    aaaa ll h
  //(out-1) aa llll i
  //(out-2) a llllllll i
  //
  // aaaa,0,4 => a,0,4
  //   ll,5,7 => llllllll,5,7
  //    h,8,9 => i,8,9
  /** Two stacked MappingCharFilters: offsets map back through BOTH layers. */
  public void testChained() throws Exception {
    CharStream cs = new MappingCharFilter( normMap,
      new MappingCharFilter( normMap, new CharReader( new StringReader( "aaaa ll h" ) ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "a,1,0,4 llllllll,1,5,7 i,1,8,9" );
    assertTokEqualOff( expect, real );
  }
}
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestMappingCharFilterFactory extends TestCase {
|
||||
public void testParseString() throws Exception {
|
||||
|
||||
MappingCharFilterFactory f = new MappingCharFilterFactory();
|
||||
|
||||
try {
|
||||
f.parseString( "\\" );
|
||||
fail( "escape character cannot be alone." );
|
||||
}
|
||||
catch( RuntimeException expected ){}
|
||||
|
||||
assertEquals( "unexpected escaped characters",
|
||||
"\\\"\n\t\r\b\f", f.parseString( "\\\\\\\"\\n\\t\\r\\b\\f" ) );
|
||||
assertEquals( "unexpected escaped characters",
|
||||
"A", f.parseString( "\\u0041" ) );
|
||||
assertEquals( "unexpected escaped characters",
|
||||
"AB", f.parseString( "\\u0041\\u0042" ) );
|
||||
|
||||
try {
|
||||
f.parseString( "\\u000" );
|
||||
fail( "invalid length check." );
|
||||
}
|
||||
catch( RuntimeException expected ){}
|
||||
|
||||
try {
|
||||
f.parseString( "\\u123x" );
|
||||
fail( "invalid hex number check." );
|
||||
}
|
||||
catch( NumberFormatException expected ){}
|
||||
}
|
||||
}
|
|
@ -181,9 +181,9 @@
|
|||
TokenizerFactory tfac = tchain.getTokenizerFactory();
|
||||
TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
|
||||
|
||||
TokenStream tstream = tfac.create(reader);
|
||||
TokenStream tstream = tfac.create(tchain.charStream(reader));
|
||||
List<Token> tokens = getTokens(tstream);
|
||||
tstream = tfac.create(reader);
|
||||
tstream = tfac.create(tchain.charStream(reader));
|
||||
if (verbose) {
|
||||
writeHeader(out, tfac.getClass(), tfac.getArgs());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue