mirror of https://github.com/apache/lucene.git
LUCENE-4095: remove deprecations from trunk (just the easy ones for now)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344531 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 8deb16bcf9
commit bc3a3dc5d4
@@ -137,8 +137,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
       Reader reader) {
-    final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ?
-        new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     // the order here is important: the stopword list is not normalized!
     result = new StopFilter( matchVersion, result, stopwords);
@@ -1,96 +0,0 @@
-package org.apache.lucene.analysis.ar;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.Reader;
-
-import org.apache.lucene.analysis.core.LetterTokenizer;
-import org.apache.lucene.analysis.util.CharTokenizer;
-import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.Version;
-
-/**
- * Tokenizer that breaks text into runs of letters and diacritics.
- * <p>
- * The problem with the standard Letter tokenizer is that it fails on diacritics.
- * Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc.
- * </p>
- * <p>
- * <a name="version"/>
- * You must specify the required {@link Version} compatibility when creating
- * {@link ArabicLetterTokenizer}:
- * <ul>
- * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
- * detect token characters. See {@link #isTokenChar(int)} and
- * {@link #normalize(int)} for details.</li>
- * </ul>
- * @deprecated (3.1) Use {@link StandardTokenizer} instead.
- */
-@Deprecated
-public class ArabicLetterTokenizer extends LetterTokenizer {
-  /**
-   * Construct a new ArabicLetterTokenizer.
-   * @param matchVersion Lucene version
-   * to match See {@link <a href="#version">above</a>}
-   *
-   * @param in
-   *          the input to split up into tokens
-   */
-  public ArabicLetterTokenizer(Version matchVersion, Reader in) {
-    super(matchVersion, in);
-  }
-
-  /**
-   * Construct a new ArabicLetterTokenizer using a given {@link AttributeSource}.
-   *
-   * @param matchVersion
-   *          Lucene version to match See {@link <a href="#version">above</a>}
-   * @param source
-   *          the attribute source to use for this Tokenizer
-   * @param in
-   *          the input to split up into tokens
-   */
-  public ArabicLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) {
-    super(matchVersion, source, in);
-  }
-
-  /**
-   * Construct a new ArabicLetterTokenizer using a given
-   * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. * @param
-   * matchVersion Lucene version to match See
-   * {@link <a href="#version">above</a>}
-   *
-   * @param factory
-   *          the attribute factory to use for this Tokenizer
-   * @param in
-   *          the input to split up into tokens
-   */
-  public ArabicLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
-    super(matchVersion, factory, in);
-  }
-
-  /**
-   * Allows for Letter category or NonspacingMark category
-   * @see org.apache.lucene.analysis.core.LetterTokenizer#isTokenChar(int)
-   */
-  @Override
-  protected boolean isTokenChar(int c) {
-    return super.isTokenChar(c) || Character.getType(c) == Character.NON_SPACING_MARK;
-  }
-
-}
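The deprecation note above points to StandardTokenizer as the replacement. A minimal migration sketch for callers that constructed the removed tokenizer directly; the helper class and the LUCENE_50 constant are illustrative, not part of this commit:

import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical helper, not part of this commit.
public class ArabicTokenizerMigration {
  public static Tokenizer create(Reader in) {
    // before: new ArabicLetterTokenizer(matchVersion, in)
    // StandardTokenizer keeps combining marks (diacritics) attached to their
    // base letters, which is what the removed class added on top of LetterTokenizer.
    return new StandardTokenizer(Version.LUCENE_50, in);
  }
}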
@@ -38,14 +38,6 @@ import org.tartarus.snowball.ext.CatalanStemmer;
 
 /**
  * {@link Analyzer} for Catalan.
- * <p>
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating CatalanAnalyzer:
- * <ul>
- *   <li> As of 3.6, ElisionFilter with a set of Catalan
- *        contractions is used by default.
- * </ul>
  */
 public final class CatalanAnalyzer extends StopwordAnalyzerBase {
   private final CharArraySet stemExclusionSet;

@@ -126,8 +118,8 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
    * @return A
    *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
-   *         , {@link KeywordMarkerFilter} if a stem exclusion set is
+   *         {@link StandardFilter}, {@link ElisionFilter}, {@link LowerCaseFilter},
+   *         {@link StopFilter}, {@link KeywordMarkerFilter} if a stem exclusion set is
    *         provided and {@link SnowballFilter}.
    */
   @Override

@@ -135,9 +127,7 @@ public final class CatalanAnalyzer extends StopwordAnalyzerBase {
       Reader reader) {
     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
-    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
-      result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
-    }
+    result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
@@ -89,16 +89,11 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
       Reader reader) {
-    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
-      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-      // run the widthfilter first before bigramming, it sometimes combines characters.
-      TokenStream result = new CJKWidthFilter(source);
-      result = new LowerCaseFilter(matchVersion, result);
-      result = new CJKBigramFilter(result);
-      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
-    } else {
-      final Tokenizer source = new CJKTokenizer(reader);
-      return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
-    }
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    // run the widthfilter first before bigramming, it sometimes combines characters.
+    TokenStream result = new CJKWidthFilter(source);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new CJKBigramFilter(result);
+    return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
   }
 }
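The retained branch can also be assembled by hand outside the analyzer. A sketch assuming the LUCENE_50 constant; the wrapper class name is illustrative:

import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical helper mirroring the retained createComponents branch above.
public class CJKChain {
  public static TokenStream build(Reader reader) {
    Tokenizer source = new StandardTokenizer(Version.LUCENE_50, reader);
    TokenStream result = new CJKWidthFilter(source);   // width first: it can combine characters
    result = new LowerCaseFilter(Version.LUCENE_50, result);
    return new CJKBigramFilter(result);                // overlapping CJK bigrams
  }
}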
@@ -1,317 +0,0 @@
-package org.apache.lucene.analysis.cjk;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
-
-/**
- * CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
- * <p>
- * The tokens returned are every two adjacent characters with overlap match.
- * </p>
- * <p>
- * Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
- * </p>
- * Additionally, the following is applied to Latin text (such as English):
- * <ul>
- * <li>Text is converted to lowercase.
- * <li>Numeric digits, '+', '#', and '_' are tokenized as letters.
- * <li>Full-width forms are converted to half-width forms.
- * </ul>
- * For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
- * please search <a
- * href="http://www.google.com/search?q=word+chinese+segment">google</a>
- *
- * @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.
- */
-@Deprecated
-public final class CJKTokenizer extends Tokenizer {
-  //~ Static fields/initializers ---------------------------------------------
-  /** Word token type */
-  static final int WORD_TYPE = 0;
-
-  /** Single byte token type */
-  static final int SINGLE_TOKEN_TYPE = 1;
-
-  /** Double byte token type */
-  static final int DOUBLE_TOKEN_TYPE = 2;
-
-  /** Names for token types */
-  static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
-
-  /** Max word length */
-  private static final int MAX_WORD_LEN = 255;
-
-  /** buffer size: */
-  private static final int IO_BUFFER_SIZE = 256;
-
-  //~ Instance fields --------------------------------------------------------
-
-  /** word offset, used to imply which character(in ) is parsed */
-  private int offset = 0;
-
-  /** the index used only for ioBuffer */
-  private int bufferIndex = 0;
-
-  /** data length */
-  private int dataLen = 0;
-
-  /**
-   * character buffer, store the characters which are used to compose <br>
-   * the returned Token
-   */
-  private final char[] buffer = new char[MAX_WORD_LEN];
-
-  /**
-   * I/O buffer, used to store the content of the input(one of the <br>
-   * members of Tokenizer)
-   */
-  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-  /** word type: single=>ASCII  double=>non-ASCII  word=>default */
-  private int tokenType = WORD_TYPE;
-
-  /**
-   * tag: previous character is a cached double-byte character  "C1C2C3C4"
-   * ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
-   * C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
-   */
-  private boolean preIsTokened = false;
-
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-
-  //~ Constructors -----------------------------------------------------------
-
-  /**
-   * Construct a token stream processing the given input.
-   *
-   * @param in I/O reader
-   */
-  public CJKTokenizer(Reader in) {
-    super(in);
-  }
-
-  public CJKTokenizer(AttributeSource source, Reader in) {
-    super(source, in);
-  }
-
-  public CJKTokenizer(AttributeFactory factory, Reader in) {
-    super(factory, in);
-  }
-
-  //~ Methods ----------------------------------------------------------------
-
-  /**
-   * Returns true for the next token in the stream, or false at EOS.
-   * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
-   * for detail.
-   *
-   * @return false for end of stream, true otherwise
-   *
-   * @throws java.io.IOException - throw IOException when read error <br>
-   *         happened in the InputStream
-   *
-   */
-  @Override
-  public boolean incrementToken() throws IOException {
-    clearAttributes();
-    /** how many character(s) has been stored in buffer */
-
-    while(true) { // loop until we find a non-empty token
-
-      int length = 0;
-
-      /** the position used to create Token */
-      int start = offset;
-
-      while (true) { // loop until we've found a full token
-        /** current character */
-        char c;
-
-        /** unicode block of current character for detail */
-        Character.UnicodeBlock ub;
-
-        offset++;
-
-        if (bufferIndex >= dataLen) {
-          dataLen = input.read(ioBuffer);
-          bufferIndex = 0;
-        }
-
-        if (dataLen == -1) {
-          if (length > 0) {
-            if (preIsTokened == true) {
-              length = 0;
-              preIsTokened = false;
-            }
-            else{
-              offset--;
-            }
-
-            break;
-          } else {
-            offset--;
-            return false;
-          }
-        } else {
-          //get current character
-          c = ioBuffer[bufferIndex++];
-
-          //get the UnicodeBlock of the current character
-          ub = Character.UnicodeBlock.of(c);
-        }
-
-        //if the current character is ASCII or Extend ASCII
-        if ((ub == Character.UnicodeBlock.BASIC_LATIN)
-            || (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
-           ) {
-          if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
-            int i = (int) c;
-            if (i >= 65281 && i <= 65374) {
-              // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
-              i = i - 65248;
-              c = (char) i;
-            }
-          }
-
-          // if the current character is a letter or "_" "+" "#"
-          if (Character.isLetterOrDigit(c)
-              || ((c == '_') || (c == '+') || (c == '#'))
-             ) {
-            if (length == 0) {
-              // "javaC1C2C3C4linux" <br>
-              //      ^--: the current character begin to token the ASCII
-              // letter
-              start = offset - 1;
-            } else if (tokenType == DOUBLE_TOKEN_TYPE) {
-              // "javaC1C2C3C4linux" <br>
-              //              ^--: the previous non-ASCII
-              // : the current character
-              offset--;
-              bufferIndex--;
-
-              if (preIsTokened == true) {
-                // there is only one non-ASCII has been stored
-                length = 0;
-                preIsTokened = false;
-                break;
-              } else {
-                break;
-              }
-            }
-
-            // store the LowerCase(c) in the buffer
-            buffer[length++] = Character.toLowerCase(c);
-            tokenType = SINGLE_TOKEN_TYPE;
-
-            // break the procedure if buffer overflowed!
-            if (length == MAX_WORD_LEN) {
-              break;
-            }
-          } else if (length > 0) {
-            if (preIsTokened == true) {
-              length = 0;
-              preIsTokened = false;
-            } else {
-              break;
-            }
-          }
-        } else {
-          // non-ASCII letter, e.g."C1C2C3C4"
-          if (Character.isLetter(c)) {
-            if (length == 0) {
-              start = offset - 1;
-              buffer[length++] = c;
-              tokenType = DOUBLE_TOKEN_TYPE;
-            } else {
-              if (tokenType == SINGLE_TOKEN_TYPE) {
-                offset--;
-                bufferIndex--;
-
-                //return the previous ASCII characters
-                break;
-              } else {
-                buffer[length++] = c;
-                tokenType = DOUBLE_TOKEN_TYPE;
-
-                if (length == 2) {
-                  offset--;
-                  bufferIndex--;
-                  preIsTokened = true;
-
-                  break;
-                }
-              }
-            }
-          } else if (length > 0) {
-            if (preIsTokened == true) {
-              // empty the buffer
-              length = 0;
-              preIsTokened = false;
-            } else {
-              break;
-            }
-          }
-        }
-      }
-
-      if (length > 0) {
-        termAtt.copyBuffer(buffer, 0, length);
-        offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
-        typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
-        return true;
-      } else if (dataLen == -1) {
-        offset--;
-        return false;
-      }
-
-      // Cycle back and try for the next token (don't
-      // return an empty string)
-    }
-  }
-
-  @Override
-  public final void end() {
-    // set final offset
-    final int finalOffset = correctOffset(offset);
-    this.offsetAtt.setOffset(finalOffset, finalOffset);
-  }
-
-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    offset = bufferIndex = dataLen = 0;
-    preIsTokened = false;
-    tokenType = WORD_TYPE;
-  }
-
-  @Override
-  public void reset(Reader reader) throws IOException {
-    super.reset(reader);
-    reset();
-  }
-}
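For reference, a hedged consumer sketch that prints the terms a TokenStream produces, which makes the overlapping bigrams described in the removed javadoc ("java C1C2C3C4" -> "java" "C1C2" "C2C3" "C3C4") easy to observe; the class name is illustrative:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical utility, not part of this commit.
public class TokenPrinter {
  public static void print(TokenStream ts) throws Exception {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                        // consumers must reset before the first increment
    while (ts.incrementToken()) {
      System.out.println(term.toString());
    }
    ts.end();                          // records the final offset state
    ts.close();
  }
}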
@@ -1,50 +0,0 @@
-package org.apache.lucene.analysis.cn;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.Reader;
-
-import org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Tokenizer;
-
-/**
- * An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and
- * filters with {@link ChineseFilter}
- * @deprecated (3.1) Use {@link StandardAnalyzer} instead, which has the same functionality.
- * This analyzer will be removed in Lucene 5.0
- */
-@Deprecated
-public final class ChineseAnalyzer extends Analyzer {
-
-  /**
-   * Creates
-   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
-   * used to tokenize all the text in the provided {@link Reader}.
-   *
-   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
-   *         built from a {@link ChineseTokenizer} filtered with
-   *         {@link ChineseFilter}
-   */
-  @Override
-  protected TokenStreamComponents createComponents(String fieldName,
-      Reader reader) {
-    final Tokenizer source = new ChineseTokenizer(reader);
-    return new TokenStreamComponents(source, new ChineseFilter(source));
-  }
-}
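Per the @deprecated note, StandardAnalyzer yields the same unigram behavior for Chinese text; a one-line migration sketch (the LUCENE_50 constant is assumed):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;

// Hypothetical helper, not part of this commit.
public class ChineseAnalyzerMigration {
  public static Analyzer replacement() {
    // before: new ChineseAnalyzer()
    return new StandardAnalyzer(Version.LUCENE_50);
  }
}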
@@ -1,104 +0,0 @@
-package org.apache.lucene.analysis.cn;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Arrays;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.Version;
-
-/**
- * A {@link TokenFilter} with a stop word table.
- * <ul>
- * <li>Numeric tokens are removed.
- * <li>English tokens must be larger than 1 character.
- * <li>One Chinese character as one Chinese word.
- * </ul>
- * TO DO:
- * <ol>
- * <li>Add Chinese stop words, such as \ue400
- * <li>Dictionary based Chinese word extraction
- * <li>Intelligent Chinese word extraction
- * </ol>
- *
- * @deprecated (3.1) Use {@link StopFilter} instead, which has the same functionality.
- * This filter will be removed in Lucene 5.0
- */
-@Deprecated
-public final class ChineseFilter extends TokenFilter {
-
-
-  // Only English now, Chinese to be added later.
-  public static final String[] STOP_WORDS = {
-    "and", "are", "as", "at", "be", "but", "by",
-    "for", "if", "in", "into", "is", "it",
-    "no", "not", "of", "on", "or", "such",
-    "that", "the", "their", "then", "there", "these",
-    "they", "this", "to", "was", "will", "with"
-  };
-
-
-  private CharArraySet stopTable;
-
-  private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-
-  public ChineseFilter(TokenStream in) {
-    super(in);
-
-    stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
-  }
-
-  @Override
-  public boolean incrementToken() throws IOException {
-
-    while (input.incrementToken()) {
-      char text[] = termAtt.buffer();
-      int termLength = termAtt.length();
-
-      // why not key off token type here assuming ChineseTokenizer comes first?
-      if (!stopTable.contains(text, 0, termLength)) {
-        switch (Character.getType(text[0])) {
-
-        case Character.LOWERCASE_LETTER:
-        case Character.UPPERCASE_LETTER:
-
-          // English word/token should larger than 1 character.
-          if (termLength>1) {
-            return true;
-          }
-          break;
-        case Character.OTHER_LETTER:
-
-          // One Chinese character as one Chinese word.
-          // Chinese word extraction to be added later here.
-
-          return true;
-        }
-
-      }
-
-    }
-    return false;
-  }
-
-}
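The @deprecated note names StopFilter as the replacement. A sketch that rebuilds the English stop list above as a CharArraySet and applies the standard filter; it covers only the stop-list behavior (the numeric and single-character rules would need something like a LengthFilter), and the LUCENE_50 constant is assumed:

import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

// Hypothetical helper, not part of this commit.
public class ChineseFilterMigration {
  private static final String[] STOP_WORDS = {
    "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  public static TokenStream wrap(TokenStream in) {
    CharArraySet stops =
        new CharArraySet(Version.LUCENE_50, Arrays.asList(STOP_WORDS), false);
    return new StopFilter(Version.LUCENE_50, in, stops);
  }
}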
@@ -1,175 +0,0 @@
-package org.apache.lucene.analysis.cn;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.util.AttributeSource;
-
-
-/**
- * Tokenize Chinese text as individual chinese characters.
- *
- * <p>
- * The difference between ChineseTokenizer and
- * CJKTokenizer is that they have different
- * token parsing logic.
- * </p>
- * <p>
- * For example, if the Chinese text
- * "C1C2C3C4" is to be indexed:
- * <ul>
- * <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.
- * <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.
- * </ul>
- * </p>
- * <p>
- * Therefore the index created by CJKTokenizer is much larger.
- * </p>
- * <p>
- * The problem is that when searching for C1, C1C2, C1C3,
- * C4C2, C1C2C3 ... the ChineseTokenizer works, but the
- * CJKTokenizer will not work.
- * </p>
- * @deprecated (3.1) Use {@link StandardTokenizer} instead, which has the same functionality.
- * This filter will be removed in Lucene 5.0
- */
-@Deprecated
-public final class ChineseTokenizer extends Tokenizer {
-
-
-  public ChineseTokenizer(Reader in) {
-    super(in);
-  }
-
-  public ChineseTokenizer(AttributeSource source, Reader in) {
-    super(source, in);
-  }
-
-  public ChineseTokenizer(AttributeFactory factory, Reader in) {
-    super(factory, in);
-  }
-
-  private int offset = 0, bufferIndex=0, dataLen=0;
-  private final static int MAX_WORD_LEN = 255;
-  private final static int IO_BUFFER_SIZE = 1024;
-  private final char[] buffer = new char[MAX_WORD_LEN];
-  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
-
-
-  private int length;
-  private int start;
-
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
-  private final void push(char c) {
-
-    if (length == 0) start = offset-1;            // start of token
-    buffer[length++] = Character.toLowerCase(c);  // buffer it
-
-  }
-
-  private final boolean flush() {
-
-    if (length>0) {
-      //System.out.println(new String(buffer, 0,
-      //length));
-      termAtt.copyBuffer(buffer, 0, length);
-      offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
-      return true;
-    }
-    else
-      return false;
-  }
-
-  @Override
-  public boolean incrementToken() throws IOException {
-    clearAttributes();
-
-    length = 0;
-    start = offset;
-
-
-    while (true) {
-
-      final char c;
-      offset++;
-
-      if (bufferIndex >= dataLen) {
-        dataLen = input.read(ioBuffer);
-        bufferIndex = 0;
-      }
-
-      if (dataLen == -1) {
-        offset--;
-        return flush();
-      } else
-        c = ioBuffer[bufferIndex++];
-
-
-      switch(Character.getType(c)) {
-
-      case Character.DECIMAL_DIGIT_NUMBER:
-      case Character.LOWERCASE_LETTER:
-      case Character.UPPERCASE_LETTER:
-        push(c);
-        if (length == MAX_WORD_LEN) return flush();
-        break;
-
-      case Character.OTHER_LETTER:
-        if (length>0) {
-          bufferIndex--;
-          offset--;
-          return flush();
-        }
-        push(c);
-        return flush();
-
-      default:
-        if (length>0) return flush();
-        break;
-      }
-    }
-  }
-
-  @Override
-  public final void end() {
-    // set final offset
-    final int finalOffset = correctOffset(offset);
-    this.offsetAtt.setOffset(finalOffset, finalOffset);
-  }
-
-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    offset = bufferIndex = dataLen = 0;
-  }
-
-  @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
-    reset();
-  }
-}
@@ -1,41 +0,0 @@
-<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<html>
-<head>
-    <META http-equiv="Content-Type" content="text/html; charset=UTF-8">
-</head>
-<body>
-Analyzer for Chinese, which indexes unigrams (individual chinese characters).
-<p>
-Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
-<ul>
-  <li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
-  <li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
-  <li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
-</ul>
-
-Example phrase: "我是中国人"
-<ol>
-  <li>StandardAnalyzer: 我-是-中-国-人</li>
-  <li>CJKAnalyzer: 我是-是中-中国-国人</li>
-  <li>SmartChineseAnalyzer: 我-是-中国-人</li>
-</ol>
-</p>
-
-</body>
-</html>
@@ -40,17 +40,6 @@ import java.io.*;
  * all). A default set of stopwords is used unless an alternative list is
  * specified.
  * </p>
- *
- * <a name="version"/>
- * <p>
- * You must specify the required {@link Version} compatibility when creating
- * CzechAnalyzer:
- * <ul>
- *   <li>As of 3.1, words are stemmed with {@link CzechStemFilter}
- *   <li>As of 2.9, StopFilter preserves position increments
- *   <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
- *   <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)
- * </ul>
  */
 public final class CzechAnalyzer extends StopwordAnalyzerBase {
   /** File containing default Czech stopwords. */

@@ -86,8 +75,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet()}).
    *
-   * @param matchVersion Lucene version to match See
-   *          {@link <a href="#version">above</a>}
+   * @param matchVersion Lucene version to match
    */
   public CzechAnalyzer(Version matchVersion) {
     this(matchVersion, DefaultSetHolder.DEFAULT_SET);

@@ -96,8 +84,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
   /**
    * Builds an analyzer with the given stop words.
    *
-   * @param matchVersion Lucene version to match See
-   *          {@link <a href="#version">above</a>}
+   * @param matchVersion Lucene version to match
    * @param stopwords a stopword set
    */
   public CzechAnalyzer(Version matchVersion, CharArraySet stopwords) {

@@ -108,8 +95,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
    * Builds an analyzer with the given stop words and a set of work to be
    * excluded from the {@link CzechStemFilter}.
    *
-   * @param matchVersion Lucene version to match See
-   *          {@link <a href="#version">above</a>}
+   * @param matchVersion Lucene version to match
    * @param stopwords a stopword set
    * @param stemExclusionTable a stemming exclusion set
    */

@@ -127,7 +113,7 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
    *         built from a {@link StandardTokenizer} filtered with
    *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
    *         , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
-   *         a version is >= LUCENE_31 and a stem exclusion set is provided via
+   *         a stem exclusion set is provided via
    *         {@link #CzechAnalyzer(Version, CharArraySet, CharArraySet)} a
    *         {@link KeywordMarkerFilter} is added before
    *         {@link CzechStemFilter}.

@@ -139,11 +125,9 @@ public final class CzechAnalyzer extends StopwordAnalyzerBase {
     TokenStream result = new StandardFilter(matchVersion, source);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter( matchVersion, result, stopwords);
-    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
-      if(!this.stemExclusionTable.isEmpty())
-        result = new KeywordMarkerFilter(result, stemExclusionTable);
-      result = new CzechStemFilter(result);
-    }
+    if(!this.stemExclusionTable.isEmpty())
+      result = new KeywordMarkerFilter(result, stemExclusionTable);
+    result = new CzechStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
 }
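The stem-exclusion tail that this hunk makes unconditional is a generic pattern: KeywordMarkerFilter flags protected terms, and the stemmer leaves flagged terms untouched. A standalone sketch (LUCENE_50 constant assumed; helper class illustrative):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;

// Hypothetical helper, not part of this commit.
public class CzechStemChain {
  public static TokenStream stem(TokenStream in, CharArraySet stemExclusions) {
    if (!stemExclusions.isEmpty()) {
      in = new KeywordMarkerFilter(in, stemExclusions); // marked terms pass through unstemmed
    }
    return new CzechStemFilter(in);
  }
}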
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.de;
 
 import java.io.IOException;
 import java.io.Reader;
-import java.util.Arrays;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;

@@ -37,7 +36,6 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
-import org.tartarus.snowball.ext.German2Stemmer;
 
 /**
  * {@link Analyzer} for German language.

@@ -49,39 +47,11 @@ import org.tartarus.snowball.ext.German2Stemmer;
  * exclusion list is empty by default.
  * </p>
  *
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating GermanAnalyzer:
- * <ul>
- *   <li> As of 3.6, GermanLightStemFilter is used for less aggressive stemming.
- *   <li> As of 3.1, Snowball stemming is done with SnowballFilter, and
- *        Snowball stopwords are used by default.
- *   <li> As of 2.9, StopFilter preserves position
- *        increments
- * </ul>
- *
- * <p><b>NOTE</b>: This class uses the same {@link Version}
- * dependent settings as {@link StandardAnalyzer}.</p>
  */
 public final class GermanAnalyzer extends StopwordAnalyzerBase {
 
-  /** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */
-  @Deprecated
-  private final static String[] GERMAN_STOP_WORDS = {
-    "einer", "eine", "eines", "einem", "einen",
-    "der", "die", "das", "dass", "daß",
-    "du", "er", "sie", "es",
-    "was", "wer", "wie", "wir",
-    "und", "oder", "ohne", "mit",
-    "am", "im", "in", "aus", "auf",
-    "ist", "sein", "war", "wird",
-    "ihr", "ihre", "ihres",
-    "als", "für", "von", "mit",
-    "dich", "dir", "mich", "mir",
-    "mein", "sein", "kein",
-    "durch", "wegen", "wird"
-  };
-
   /** File containing default German stopwords. */
   public final static String DEFAULT_STOPWORD_FILE = "german_stop.txt";
 

@@ -94,10 +64,6 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   }
 
   private static class DefaultSetHolder {
-    /** @deprecated in 3.1, remove in Lucene 5.0 (index bw compat) */
-    @Deprecated
-    private static final CharArraySet DEFAULT_SET_30 = CharArraySet.unmodifiableSet(new CharArraySet(
-        Version.LUCENE_CURRENT, Arrays.asList(GERMAN_STOP_WORDS), false));
     private static final CharArraySet DEFAULT_SET;
     static {
       try {

@@ -125,9 +91,7 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
    * {@link #getDefaultStopSet()}.
    */
   public GermanAnalyzer(Version matchVersion) {
-    this(matchVersion,
-        matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_SET
-            : DefaultSetHolder.DEFAULT_SET_30);
+    this(matchVersion, DefaultSetHolder.DEFAULT_SET);
   }
 
   /**

@@ -176,14 +140,8 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter( matchVersion, result, stopwords);
     result = new KeywordMarkerFilter(result, exclusionSet);
-    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
-      result = new GermanNormalizationFilter(result);
-      result = new GermanLightStemFilter(result);
-    } else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
-      result = new SnowballFilter(result, new German2Stemmer());
-    } else {
-      result = new GermanStemFilter(result);
-    }
+    result = new GermanNormalizationFilter(result);
+    result = new GermanLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
 }
@@ -37,15 +37,6 @@ import org.apache.lucene.util.Version;
  * that will not be indexed at all).
  * A default set of stopwords is used unless an alternative list is specified.
  * </p>
- *
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating GreekAnalyzer:
- * <ul>
- *   <li> As of 3.1, StandardFilter and GreekStemmer are used by default.
- *   <li> As of 2.9, StopFilter preserves position
- *        increments
- * </ul>
  *
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>

@@ -78,8 +69,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
 
   /**
    * Builds an analyzer with the default stop words.
-   * @param matchVersion Lucene compatibility version,
-   *   See <a href="#version">above</a>
+   * @param matchVersion Lucene compatibility version
    */
   public GreekAnalyzer(Version matchVersion) {
     this(matchVersion, DefaultSetHolder.DEFAULT_SET);

@@ -91,8 +81,7 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
    * <b>NOTE:</b> The stopwords set should be pre-processed with the logic of
    * {@link GreekLowerCaseFilter} for best results.
    *
-   * @param matchVersion Lucene compatibility version,
-   *   See <a href="#version">above</a>
+   * @param matchVersion Lucene compatibility version
    * @param stopwords a stopword set
    */
   public GreekAnalyzer(Version matchVersion, CharArraySet stopwords) {

@@ -114,11 +103,9 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase {
       Reader reader) {
     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new GreekLowerCaseFilter(matchVersion, source);
-    if (matchVersion.onOrAfter(Version.LUCENE_31))
-      result = new StandardFilter(matchVersion, result);
+    result = new StandardFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
-    if (matchVersion.onOrAfter(Version.LUCENE_31))
-      result = new GreekStemFilter(result);
+    result = new GreekStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
 }
@@ -196,7 +196,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
       false);
 

@@ -222,7 +222,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
           "αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
          "μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",

@@ -247,7 +247,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
           "πεθ", "πικρ", "ποτ", "σιχ", "χ"),
       false);

@@ -274,11 +274,11 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("τρ", "τσ"),
       false);
 
-  private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
           "καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
           "π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",

@@ -337,7 +337,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
           "βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
           "σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),

@@ -425,11 +425,11 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
       false);
 
-  private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
       false);
 

@@ -449,7 +449,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
       false);
 

@@ -483,7 +483,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
           "λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
           "ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",

@@ -521,7 +521,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
           "αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
           "ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",

@@ -530,7 +530,7 @@ public class GreekStemmer {
           "ουλαμ", "ουρ", "π", "τρ", "μ"),
       false);
 
-  private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("ψοφ", "ναυλοχ"),
       false);
 

@@ -567,7 +567,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
       false);
 

@@ -587,7 +587,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
       false);
 

@@ -601,7 +601,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
       false);
 

@@ -625,7 +625,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_31,
+  private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_50,
       Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
       false);
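All of the GreekStemmer hunks above only swap the Version constant passed to CharArraySet. A brief usage sketch of that class, whose point is allocation-free lookups against a term's char buffer (constant and snippet assumed, not from this commit):

import java.util.Arrays;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

// Hypothetical snippet, not part of this commit.
public class CharArraySetDemo {
  private static final CharArraySet EXC = new CharArraySet(Version.LUCENE_50,
      Arrays.asList("θ", "δ", "ελ"), false /* matchVersion, entries, ignoreCase */);

  static boolean matches(char[] termBuffer, int length) {
    return EXC.contains(termBuffer, 0, length); // no String allocated per lookup
  }
}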
@@ -94,7 +94,8 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
    * @return A
    *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
    *         built from an {@link StandardTokenizer} filtered with
-   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         {@link StandardFilter}, {@link EnglishPossessiveFilter},
+   *         {@link LowerCaseFilter}, {@link StopFilter}
    *         , {@link KeywordMarkerFilter} if a stem exclusion set is
    *         provided and {@link PorterStemFilter}.
    */

@@ -103,9 +104,7 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
       Reader reader) {
     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
-    // prior to this we get the classic behavior, standardfilter does it for us.
-    if (matchVersion.onOrAfter(Version.LUCENE_31))
-      result = new EnglishPossessiveFilter(matchVersion, result);
+    result = new EnglishPossessiveFilter(matchVersion, result);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
@@ -26,30 +26,13 @@ import org.apache.lucene.util.Version;
 
 /**
  * TokenFilter that removes possessives (trailing 's) from words.
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating EnglishPossessiveFilter:
- * <ul>
- *    <li> As of 3.6, U+2019 RIGHT SINGLE QUOTATION MARK and
- *         U+FF07 FULLWIDTH APOSTROPHE are also treated as
- *         quotation marks.
- * </ul>
 */
 public final class EnglishPossessiveFilter extends TokenFilter {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private Version matchVersion;
-
-  /**
-   * @deprecated Use {@link #EnglishPossessiveFilter(Version, TokenStream)} instead.
-   */
-  @Deprecated
-  public EnglishPossessiveFilter(TokenStream input) {
-    this(Version.LUCENE_35, input);
-  }
 
+  // NOTE: version now unused
   public EnglishPossessiveFilter(Version version, TokenStream input) {
     super(input);
-    this.matchVersion = version;
   }
 
   @Override

@@ -63,7 +46,8 @@ public final class EnglishPossessiveFilter extends TokenFilter {
 
     if (bufferLength >= 2 &&
         (buffer[bufferLength-2] == '\'' ||
-        (matchVersion.onOrAfter(Version.LUCENE_36) && (buffer[bufferLength-2] == '\u2019' || buffer[bufferLength-2] == '\uFF07'))) &&
+         buffer[bufferLength-2] == '\u2019' ||
+         buffer[bufferLength-2] == '\uFF07') &&
         (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
       termAtt.setLength(bufferLength - 2); // Strip last 2 characters off
     }
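After this change the filter always treats U+0027, U+2019, and U+FF07 as possessive apostrophes. A wiring sketch (LUCENE_50 constant assumed; per the NOTE above, the version argument is accepted but unused):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

// Hypothetical helper, not part of this commit.
public class PossessiveChain {
  public static TokenStream build(String text) {
    Tokenizer source = new StandardTokenizer(Version.LUCENE_50, new StringReader(text));
    // strips a trailing 's regardless of which of the three apostrophes is used
    return new EnglishPossessiveFilter(Version.LUCENE_50, source);
  }
}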
@@ -281,9 +281,9 @@ public class KStemmer {
     DictEntry entry;
 
     CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(
-        Version.LUCENE_31, 1000, false);
+        Version.LUCENE_50, 1000, false);
 
-    d = new CharArrayMap<DictEntry>(Version.LUCENE_31, 1000, false);
+    d = new CharArrayMap<DictEntry>(Version.LUCENE_50, 1000, false);
     for (int i = 0; i < exceptionWords.length; i++) {
       if (!d.containsKey(exceptionWords[i])) {
         entry = new DictEntry(exceptionWords[i], true);
@@ -34,17 +34,9 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
-import org.tartarus.snowball.ext.SpanishStemmer;
 
 /**
  * {@link Analyzer} for Spanish.
- * <p>
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating SpanishAnalyzer:
- * <ul>
- *   <li> As of 3.6, SpanishLightStemFilter is used for less aggressive stemming.
- * </ul>
  */
 public final class SpanishAnalyzer extends StopwordAnalyzerBase {
   private final CharArraySet stemExclusionSet;

@@ -132,11 +124,7 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new KeywordMarkerFilter(result, stemExclusionSet);
-    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
-      result = new SpanishLightStemFilter(result);
-    } else {
-      result = new SnowballFilter(result, new SpanishStemmer());
-    }
+    result = new SpanishLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
 }
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;

@@ -36,7 +35,7 @@ import org.apache.lucene.util.Version;
 /**
  * {@link Analyzer} for Persian.
  * <p>
- * This Analyzer uses {@link ArabicLetterTokenizer} which implies tokenizing around
+ * This Analyzer uses {@link PersianCharFilter} which implies tokenizing around
  * zero-width non-joiner in addition to whitespace. Some persian-specific variant forms (such as farsi
  * yeh and keheh) are standardized. "Stemming" is accomplished via stopwords.
  * </p>

@@ -118,12 +117,7 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
       Reader reader) {
-    final Tokenizer source;
-    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
-      source = new StandardTokenizer(matchVersion, reader);
-    } else {
-      source = new ArabicLetterTokenizer(matchVersion, reader);
-    }
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     result = new ArabicNormalizationFilter(result);
     /* additional persian-specific normalization */

@@ -140,8 +134,6 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
    */
   @Override
   protected Reader initReader(Reader reader) {
-    return matchVersion.onOrAfter(Version.LUCENE_31) ?
-        new PersianCharFilter(CharReader.get(reader)) :
-        reader;
+    return new PersianCharFilter(CharReader.get(reader));
   }
 }
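PersianCharFilter is now applied unconditionally through initReader. It can also be used standalone; the point, per the javadoc above, is that zero-width non-joiner becomes a token break just like whitespace (sketch; class name illustrative):

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.fa.PersianCharFilter;

// Hypothetical helper mirroring the new initReader above.
public class PersianReaderSetup {
  public static Reader wrap(String text) {
    // same wrapping as initReader: every analyzed Reader goes through the char filter
    return new PersianCharFilter(CharReader.get(new StringReader(text)));
  }
}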
@ -35,7 +35,6 @@ import org.apache.lucene.util.Version;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for French language.
|
||||
|
@@ -47,53 +46,11 @@ import java.util.Arrays;
  * exclusion list is empty by default.
  * </p>
  *
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating FrenchAnalyzer:
- * <ul>
- *   <li> As of 3.6, FrenchLightStemFilter is used for less aggressive stemming.
- *   <li> As of 3.1, Snowball stemming is done with SnowballFilter,
- *        LowerCaseFilter is used prior to StopFilter, and ElisionFilter and
- *        Snowball stopwords are used by default.
- *   <li> As of 2.9, StopFilter preserves position
- *        increments
- * </ul>
- *
  * <p><b>NOTE</b>: This class uses the same {@link Version}
  * dependent settings as {@link StandardAnalyzer}.</p>
  */
 public final class FrenchAnalyzer extends StopwordAnalyzerBase {
 
-  /**
-   * Extended list of typical French stopwords.
-   * @deprecated (3.1) remove in Lucene 5.0 (index bw compat)
-   */
-  @Deprecated
-  private final static String[] FRENCH_STOP_WORDS = {
-    "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
-    "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
-    "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
-    "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
-    "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
-    "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
-    "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
-    "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
-    "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
-    "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
-    "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
-    "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
-    "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
-    "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
-    "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
-    "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
-    "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
-    "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
-    "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
-    "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
-    "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
-    "été", "être", "ô"
-  };
-
   /** File containing default French stopwords. */
   public final static String DEFAULT_STOPWORD_FILE = "french_stop.txt";
 
@@ -111,11 +68,6 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   }
 
   private static class DefaultSetHolder {
-    /** @deprecated (3.1) remove this in Lucene 5.0, index bw compat */
-    @Deprecated
-    static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
-        .unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(FRENCH_STOP_WORDS),
-            false));
     static final CharArraySet DEFAULT_STOP_SET;
     static {
       try {
@@ -133,9 +85,7 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
    * Builds an analyzer with the default stop words ({@link #getDefaultStopSet}).
    */
   public FrenchAnalyzer(Version matchVersion) {
-    this(matchVersion,
-        matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
-            : DefaultSetHolder.DEFAULT_STOP_SET_30);
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
   }
 
   /**
@@ -182,30 +132,15 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
       Reader reader) {
-    if (matchVersion.onOrAfter(Version.LUCENE_31)) {
-      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-      TokenStream result = new StandardFilter(matchVersion, source);
-      result = new ElisionFilter(matchVersion, result);
-      result = new LowerCaseFilter(matchVersion, result);
-      result = new StopFilter(matchVersion, result, stopwords);
-      if(!excltable.isEmpty())
-        result = new KeywordMarkerFilter(result, excltable);
-      if (matchVersion.onOrAfter(Version.LUCENE_36)) {
-        result = new FrenchLightStemFilter(result);
-      } else {
-        result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
-      }
-      return new TokenStreamComponents(source, result);
-    } else {
-      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
-      TokenStream result = new StandardFilter(matchVersion, source);
-      result = new StopFilter(matchVersion, result, stopwords);
-      if(!excltable.isEmpty())
-        result = new KeywordMarkerFilter(result, excltable);
-      result = new FrenchStemFilter(result);
-      // Convert to lowercase after stemming!
-      return new TokenStreamComponents(source, new LowerCaseFilter(matchVersion, result));
-    }
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+    TokenStream result = new StandardFilter(matchVersion, source);
+    result = new ElisionFilter(matchVersion, result);
+    result = new LowerCaseFilter(matchVersion, result);
+    result = new StopFilter(matchVersion, result, stopwords);
+    if(!excltable.isEmpty())
+      result = new KeywordMarkerFilter(result, excltable);
+    result = new FrenchLightStemFilter(result);
+    return new TokenStreamComponents(source, result);
   }
 }
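The surviving branch is now the only code path: StandardTokenizer, StandardFilter, ElisionFilter, LowerCaseFilter, StopFilter, optional KeywordMarkerFilter, then FrenchLightStemFilter. A hedged sketch of the user-visible effect (the printed output is an expectation based on the filters' documented behaviour, not something this commit asserts):

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.fr.FrenchAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class FrenchAnalyzerDemo {
      public static void main(String[] args) throws Exception {
        FrenchAnalyzer analyzer = new FrenchAnalyzer(Version.LUCENE_50);
        // ElisionFilter should strip the article from "l'avion" before
        // lowercasing, stopping and light stemming.
        TokenStream ts = analyzer.tokenStream("body", new StringReader("l'avion"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // expected: "avion"
        }
        ts.end();
        ts.close();
      }
    }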
@@ -1,90 +0,0 @@
-package org.apache.lucene.analysis.fr;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.snowball.SnowballFilter;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-import java.io.IOException;
-
-/**
- * A {@link TokenFilter} that stems french words.
- * <p>
- * The used stemmer can be changed at runtime after the
- * filter object is created (as long as it is a {@link FrenchStemmer}).
- * </p>
- * <p>
- * To prevent terms from being stemmed use an instance of
- * {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
- * the {@link KeywordAttribute} before this {@link TokenStream}.
- * </p>
- * @see KeywordMarkerFilter
- * @deprecated (3.1) Use {@link SnowballFilter} with
- * {@link org.tartarus.snowball.ext.FrenchStemmer} instead, which has the
- * same functionality. This filter will be removed in Lucene 5.0
- */
-@Deprecated
-public final class FrenchStemFilter extends TokenFilter {
-
-  /**
-   * The actual token in the input stream.
-   */
-  private FrenchStemmer stemmer = new FrenchStemmer();
-
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
-
-  public FrenchStemFilter( TokenStream in ) {
-    super(in);
-  }
-
-  /**
-   * @return Returns true for the next token in the stream, or false at EOS
-   */
-  @Override
-  public boolean incrementToken() throws IOException {
-    if (input.incrementToken()) {
-      String term = termAtt.toString();
-
-      // Check the exclusion table
-      if (!keywordAttr.isKeyword()) {
-        String s = stemmer.stem( term );
-        // If not stemmed, don't waste the time adjusting the token.
-        if ((s != null) && !s.equals( term ) )
-          termAtt.setEmpty().append(s);
-      }
-      return true;
-    } else {
-      return false;
-    }
-  }
-  /**
-   * Set a alternative/custom {@link FrenchStemmer} for this filter.
-   */
-  public void setStemmer( FrenchStemmer stemmer ) {
-    if ( stemmer != null ) {
-      this.stemmer = stemmer;
-    }
-  }
-}
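The deprecation notice above names the replacement directly: SnowballFilter wrapping org.tartarus.snowball.ext.FrenchStemmer. A minimal sketch of that chain (Version.LUCENE_50 is an assumed trunk constant); note the filter order, since the Snowball stemmer expects lowercased input, whereas the removed pre-3.1 FrenchAnalyzer path lowercased after FrenchStemFilter:

    import java.io.StringReader;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.LowerCaseFilter;
    import org.apache.lucene.analysis.snowball.SnowballFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class FrenchSnowballDemo {
      public static void main(String[] args) throws Exception {
        Tokenizer source = new StandardTokenizer(Version.LUCENE_50,
            new StringReader("chevaux"));
        TokenStream result = new LowerCaseFilter(Version.LUCENE_50, source);
        // "Same functionality" as the deleted filter, per its own javadoc.
        result = new SnowballFilter(result, new org.tartarus.snowball.ext.FrenchStemmer());
        CharTermAttribute term = result.addAttribute(CharTermAttribute.class);
        result.reset();
        while (result.incrementToken()) {
          System.out.println(term.toString());
        }
        result.end();
        result.close();
      }
    }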
@@ -1,712 +0,0 @@
-package org.apache.lucene.analysis.fr;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * A stemmer for French words.
- * <p>
- * The algorithm is based on the work of
- * Dr Martin Porter on his snowball project<br>
- * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
- * (French stemming algorithm) for details
- * </p>
- * @deprecated Use {@link org.tartarus.snowball.ext.FrenchStemmer} instead,
- * which has the same functionality. This filter will be removed in Lucene 4.0
- */
-@Deprecated
-public class FrenchStemmer {
-
-  /**
-   * Buffer for the terms while stemming them.
-   */
-  private StringBuilder sb = new StringBuilder();
-
-  /**
-   * A temporary buffer, used to reconstruct R2
-   */
-  private StringBuilder tb = new StringBuilder();
-
-  /**
-   * Region R0 is equal to the whole buffer
-   */
-  private String R0;
-
-  /**
-   * Region RV
-   * "If the word begins with two vowels, RV is the region after the third letter,
-   * otherwise the region after the first vowel not at the beginning of the word,
-   * or the end of the word if these positions cannot be found."
-   */
-  private String RV;
-
-  /**
-   * Region R1
-   * "R1 is the region after the first non-vowel following a vowel
-   * or is the null region at the end of the word if there is no such non-vowel"
-   */
-  private String R1;
-
-  /**
-   * Region R2
-   * "R2 is the region after the first non-vowel in R1 following a vowel
-   * or is the null region at the end of the word if there is no such non-vowel"
-   */
-  private String R2;
-
-
-  /**
-   * Set to true if we need to perform step 2
-   */
-  private boolean suite;
-
-  /**
-   * Set to true if the buffer was modified
-   */
-  private boolean modified;
-
-
-  /**
-   * Stems the given term to a unique <tt>discriminator</tt>.
-   *
-   * @param term java.langString The term that should be stemmed
-   * @return java.lang.String Discriminator for <tt>term</tt>
-   */
-  protected String stem( String term ) {
-    if ( !isStemmable( term ) ) {
-      return term;
-    }
-
-    // Use lowercase for medium stemming.
-    term = term.toLowerCase();
-
-    // Reset the StringBuilder.
-    sb.delete( 0, sb.length() );
-    sb.insert( 0, term );
-
-    // reset the booleans
-    modified = false;
-    suite = false;
-
-    sb = treatVowels( sb );
-
-    setStrings();
-
-    step1();
-
-    if (!modified || suite)
-    {
-      if (RV != null)
-      {
-        suite = step2a();
-        if (!suite)
-          step2b();
-      }
-    }
-
-    if (modified || suite)
-      step3();
-    else
-      step4();
-
-    step5();
-
-    step6();
-
-    return sb.toString();
-  }
-
-  /**
-   * Sets the search region Strings<br>
-   * it needs to be done each time the buffer was modified
-   */
-  private void setStrings() {
-    // set the strings
-    R0 = sb.toString();
-    RV = retrieveRV( sb );
-    R1 = retrieveR( sb );
-    if ( R1 != null )
-    {
-      tb.delete( 0, tb.length() );
-      tb.insert( 0, R1 );
-      R2 = retrieveR( tb );
-    }
-    else
-      R2 = null;
-  }
-
-  /**
-   * First step of the Porter Algorithm<br>
-   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
-   */
-  private void step1( ) {
-    String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };
-    deleteFrom( R2, suffix );
-
-    replaceFrom( R2, new String[] { "logies", "logie" }, "log" );
-    replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );
-    replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );
-
-    String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};
-    deleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" );
-
-    deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );
-    deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );
-    deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );
-    deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );
-    deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );
-
-    deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );
-    deleteFrom( RV, new String[] { "ements", "ement" } );
-
-    deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );
-    deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );
-    deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );
-
-    String[] autre = { "ifs", "ives", "if", "ive" };
-    deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );
-    deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );
-
-    replaceFrom( R0, new String[] { "eaux" }, "eau" );
-
-    replaceFrom( R1, new String[] { "aux" }, "al" );
-
-    deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );
-
-    deleteFrom( R2, new String[] { "eux" } );
-
-    // if one of the next steps is performed, we will need to perform step2a
-    boolean temp = false;
-    temp = replaceFrom( RV, new String[] { "amment" }, "ant" );
-    if (temp == true)
-      suite = true;
-    temp = replaceFrom( RV, new String[] { "emment" }, "ent" );
-    if (temp == true)
-      suite = true;
-    temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );
-    if (temp == true)
-      suite = true;
-
-  }
-
-  /**
-   * Second step (A) of the Porter Algorithm<br>
-   * Will be performed if nothing changed from the first step
-   * or changed were done in the amment, emment, ments or ment suffixes<br>
-   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
-   *
-   * @return boolean - true if something changed in the StringBuilder
-   */
-  private boolean step2a() {
-    String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",
-                        "irent", "iriez", "irez", "irions", "irons", "iront",
-                        "issaIent", "issais", "issantes", "issante", "issants", "issant",
-                        "issait", "issais", "issions", "issons", "issiez", "issez", "issent",
-                        "isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };
-    return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );
-  }
-
-  /**
-   * Second step (B) of the Porter Algorithm<br>
-   * Will be performed if step 2 A was performed unsuccessfully<br>
-   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
-   */
-  private void step2b() {
-    String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",
-                        "erons", "eront","erez", "èrent", "era", "ées", "iez",
-                        "ée", "és", "er", "ez", "é" };
-    deleteFrom( RV, suffix );
-
-    String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",
-                        "antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",
-                        "ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };
-    deleteButSuffixFrom( RV, search, "e", true );
-
-    deleteFrom( R2, new String[] { "ions" } );
-  }
-
-  /**
-   * Third step of the Porter Algorithm<br>
-   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
-   */
-  private void step3() {
-    if (sb.length()>0)
-    {
-      char ch = sb.charAt( sb.length()-1 );
-      if (ch == 'Y')
-      {
-        sb.setCharAt( sb.length()-1, 'i' );
-        setStrings();
-      }
-      else if (ch == 'ç')
-      {
-        sb.setCharAt( sb.length()-1, 'c' );
-        setStrings();
-      }
-    }
-  }
-
-  /**
-   * Fourth step of the Porter Algorithm<br>
-   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
-   */
-  private void step4() {
-    if (sb.length() > 1)
-    {
-      char ch = sb.charAt( sb.length()-1 );
-      if (ch == 's')
-      {
-        char b = sb.charAt( sb.length()-2 );
-        if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')
-        {
-          sb.delete( sb.length() - 1, sb.length());
-          setStrings();
-        }
-      }
-    }
-    boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );
-    if (!found)
-      found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );
-
-    replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );
-    deleteFrom( RV, new String[] { "e" } );
-    deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );
-  }
-
-  /**
-   * Fifth step of the Porter Algorithm<br>
-   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
-   */
-  private void step5() {
-    if (R0 != null)
-    {
-      if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))
-      {
-        sb.delete( sb.length() - 1, sb.length() );
-        setStrings();
-      }
-    }
-  }
-
-  /**
-   * Sixth (and last!) step of the Porter Algorithm<br>
-   * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
-   */
-  private void step6() {
-    if (R0!=null && R0.length()>0)
-    {
-      boolean seenVowel = false;
-      boolean seenConson = false;
-      int pos = -1;
-      for (int i = R0.length()-1; i > -1; i--)
-      {
-        char ch = R0.charAt(i);
-        if (isVowel(ch))
-        {
-          if (!seenVowel)
-          {
-            if (ch == 'é' || ch == 'è')
-            {
-              pos = i;
-              break;
-            }
-          }
-          seenVowel = true;
-        }
-        else
-        {
-          if (seenVowel)
-            break;
-          else
-            seenConson = true;
-        }
-      }
-      if (pos > -1 && seenConson && !seenVowel)
-        sb.setCharAt(pos, 'e');
-    }
-  }
-
-  /**
-   * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string
-   *
-   * @param source java.lang.String - the primary source zone for search
-   * @param search java.lang.String[] - the strings to search for suppression
-   * @param from java.lang.String - the secondary source zone for search
-   * @param prefix java.lang.String - the prefix to add to the search string to test
-   * @return boolean - true if modified
-   */
-  private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {
-    boolean found = false;
-    if (source!=null )
-    {
-      for (int i = 0; i < search.length; i++) {
-        if ( source.endsWith( search[i] ))
-        {
-          if (from!=null && from.endsWith( prefix + search[i] ))
-          {
-            sb.delete( sb.length() - search[i].length(), sb.length());
-            found = true;
-            setStrings();
-            break;
-          }
-        }
-      }
-    }
-    return found;
-  }
-
-  /**
-   * Delete a suffix searched in zone "source" if the preceding letter is (or isn't) a vowel
-   *
-   * @param source java.lang.String - the primary source zone for search
-   * @param search java.lang.String[] - the strings to search for suppression
-   * @param vowel boolean - true if we need a vowel before the search string
-   * @param from java.lang.String - the secondary source zone for search (where vowel could be)
-   * @return boolean - true if modified
-   */
-  private boolean deleteFromIfTestVowelBeforeIn( String source, String[] search, boolean vowel, String from ) {
-    boolean found = false;
-    if (source!=null && from!=null)
-    {
-      for (int i = 0; i < search.length; i++) {
-        if ( source.endsWith( search[i] ))
-        {
-          if ((search[i].length() + 1) <= from.length())
-          {
-            boolean test = isVowel(sb.charAt(sb.length()-(search[i].length()+1)));
-            if (test == vowel)
-            {
-              sb.delete( sb.length() - search[i].length(), sb.length());
-              modified = true;
-              found = true;
-              setStrings();
-              break;
-            }
-          }
-        }
-      }
-    }
-    return found;
-  }
-
-  /**
-   * Delete a suffix searched in zone "source" if preceded by the prefix
-   *
-   * @param source java.lang.String - the primary source zone for search
-   * @param search java.lang.String[] - the strings to search for suppression
-   * @param prefix java.lang.String - the prefix to add to the search string to test
-   * @param without boolean - true if it will be deleted even without prefix found
-   */
-  private void deleteButSuffixFrom( String source, String[] search, String prefix, boolean without ) {
-    if (source!=null)
-    {
-      for (int i = 0; i < search.length; i++) {
-        if ( source.endsWith( prefix + search[i] ))
-        {
-          sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
-          modified = true;
-          setStrings();
-          break;
-        }
-        else if ( without && source.endsWith( search[i] ))
-        {
-          sb.delete( sb.length() - search[i].length(), sb.length() );
-          modified = true;
-          setStrings();
-          break;
-        }
-      }
-    }
-  }
-
-  /**
-   * Delete a suffix searched in zone "source" if preceded by prefix<br>
-   * or replace it with the replace string if preceded by the prefix in the zone "from"<br>
-   * or delete the suffix if specified
-   *
-   * @param source java.lang.String - the primary source zone for search
-   * @param search java.lang.String[] - the strings to search for suppression
-   * @param prefix java.lang.String - the prefix to add to the search string to test
-   * @param without boolean - true if it will be deleted even without prefix found
-   */
-  private void deleteButSuffixFromElseReplace( String source, String[] search, String prefix, boolean without, String from, String replace ) {
-    if (source!=null)
-    {
-      for (int i = 0; i < search.length; i++) {
-        if ( source.endsWith( prefix + search[i] ))
-        {
-          sb.delete( sb.length() - (prefix.length() + search[i].length()), sb.length() );
-          modified = true;
-          setStrings();
-          break;
-        }
-        else if ( from!=null && from.endsWith( prefix + search[i] ))
-        {
-          sb.replace( sb.length() - (prefix.length() + search[i].length()), sb.length(), replace );
-          modified = true;
-          setStrings();
-          break;
-        }
-        else if ( without && source.endsWith( search[i] ))
-        {
-          sb.delete( sb.length() - search[i].length(), sb.length() );
-          modified = true;
-          setStrings();
-          break;
-        }
-      }
-    }
-  }
-
-  /**
-   * Replace a search string with another within the source zone
-   *
-   * @param source java.lang.String - the source zone for search
-   * @param search java.lang.String[] - the strings to search for replacement
-   * @param replace java.lang.String - the replacement string
-   */
-  private boolean replaceFrom( String source, String[] search, String replace ) {
-    boolean found = false;
-    if (source!=null)
-    {
-      for (int i = 0; i < search.length; i++) {
-        if ( source.endsWith( search[i] ))
-        {
-          sb.replace( sb.length() - search[i].length(), sb.length(), replace );
-          modified = true;
-          found = true;
-          setStrings();
-          break;
-        }
-      }
-    }
-    return found;
-  }
-
-  /**
-   * Delete a search string within the source zone
-   *
-   * @param source the source zone for search
-   * @param suffix the strings to search for suppression
-   */
-  private void deleteFrom(String source, String[] suffix ) {
-    if (source!=null)
-    {
-      for (int i = 0; i < suffix.length; i++) {
-        if (source.endsWith( suffix[i] ))
-        {
-          sb.delete( sb.length() - suffix[i].length(), sb.length());
-          modified = true;
-          setStrings();
-          break;
-        }
-      }
-    }
-  }
-
-  /**
-   * Test if a char is a french vowel, including accentuated ones
-   *
-   * @param ch the char to test
-   * @return boolean - true if the char is a vowel
-   */
-  private boolean isVowel(char ch) {
-    switch (ch)
-    {
-      case 'a':
-      case 'e':
-      case 'i':
-      case 'o':
-      case 'u':
-      case 'y':
-      case 'â':
-      case 'à':
-      case 'ë':
-      case 'é':
-      case 'ê':
-      case 'è':
-      case 'ï':
-      case 'î':
-      case 'ô':
-      case 'ü':
-      case 'ù':
-      case 'û':
-        return true;
-      default:
-        return false;
-    }
-  }
-
-  /**
-   * Retrieve the "R zone" (1 or 2 depending on the buffer) and return the corresponding string<br>
-   * "R is the region after the first non-vowel following a vowel
-   * or is the null region at the end of the word if there is no such non-vowel"<br>
-   * @param buffer java.lang.StringBuilder - the in buffer
-   * @return java.lang.String - the resulting string
-   */
-  private String retrieveR( StringBuilder buffer ) {
-    int len = buffer.length();
-    int pos = -1;
-    for (int c = 0; c < len; c++) {
-      if (isVowel( buffer.charAt( c )))
-      {
-        pos = c;
-        break;
-      }
-    }
-    if (pos > -1)
-    {
-      int consonne = -1;
-      for (int c = pos; c < len; c++) {
-        if (!isVowel(buffer.charAt( c )))
-        {
-          consonne = c;
-          break;
-        }
-      }
-      if (consonne > -1 && (consonne+1) < len)
-        return buffer.substring( consonne+1, len );
-      else
-        return null;
-    }
-    else
-      return null;
-  }
-
-  /**
-   * Retrieve the "RV zone" from a buffer an return the corresponding string<br>
-   * "If the word begins with two vowels, RV is the region after the third letter,
-   * otherwise the region after the first vowel not at the beginning of the word,
-   * or the end of the word if these positions cannot be found."<br>
-   * @param buffer java.lang.StringBuilder - the in buffer
-   * @return java.lang.String - the resulting string
-   */
-  private String retrieveRV( StringBuilder buffer ) {
-    int len = buffer.length();
-    if ( buffer.length() > 3)
-    {
-      if ( isVowel(buffer.charAt( 0 )) && isVowel(buffer.charAt( 1 ))) {
-        return buffer.substring(3,len);
-      }
-      else
-      {
-        int pos = 0;
-        for (int c = 1; c < len; c++) {
-          if (isVowel( buffer.charAt( c )))
-          {
-            pos = c;
-            break;
-          }
-        }
-        if ( pos+1 < len )
-          return buffer.substring( pos+1, len );
-        else
-          return null;
-      }
-    }
-    else
-      return null;
-  }
-
-
-
-  /**
-   * Turns u and i preceded AND followed by a vowel to UpperCase<br>
-   * Turns y preceded OR followed by a vowel to UpperCase<br>
-   * Turns u preceded by q to UpperCase<br>
-   *
-   * @param buffer java.util.StringBuilder - the buffer to treat
-   * @return java.util.StringBuilder - the treated buffer
-   */
-  private StringBuilder treatVowels( StringBuilder buffer ) {
-    for ( int c = 0; c < buffer.length(); c++ ) {
-      char ch = buffer.charAt( c );
-
-      if (c == 0) // first char
-      {
-        if (buffer.length()>1)
-        {
-          if (ch == 'y' && isVowel(buffer.charAt( c + 1 )))
-            buffer.setCharAt( c, 'Y' );
-        }
-      }
-      else if (c == buffer.length()-1) // last char
-      {
-        if (ch == 'u' && buffer.charAt( c - 1 ) == 'q')
-          buffer.setCharAt( c, 'U' );
-        if (ch == 'y' && isVowel(buffer.charAt( c - 1 )))
-          buffer.setCharAt( c, 'Y' );
-      }
-      else // other cases
-      {
-        if (ch == 'u')
-        {
-          if (buffer.charAt( c - 1) == 'q')
-            buffer.setCharAt( c, 'U' );
-          else if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
-            buffer.setCharAt( c, 'U' );
-        }
-        if (ch == 'i')
-        {
-          if (isVowel(buffer.charAt( c - 1 )) && isVowel(buffer.charAt( c + 1 )))
-            buffer.setCharAt( c, 'I' );
-        }
-        if (ch == 'y')
-        {
-          if (isVowel(buffer.charAt( c - 1 )) || isVowel(buffer.charAt( c + 1 )))
-            buffer.setCharAt( c, 'Y' );
-        }
-      }
-    }
-
-    return buffer;
-  }
-
-  /**
-   * Checks a term if it can be processed correctly.
-   *
-   * @return boolean - true if, and only if, the given term consists in letters.
-   */
-  private boolean isStemmable( String term ) {
-    boolean upper = false;
-    int first = -1;
-    for ( int c = 0; c < term.length(); c++ ) {
-      // Discard terms that contain non-letter characters.
-      if ( !Character.isLetter( term.charAt( c ) ) ) {
-        return false;
-      }
-      // Discard terms that contain multiple uppercase letters.
-      if ( Character.isUpperCase( term.charAt( c ) ) ) {
-        if ( upper ) {
-          return false;
-        }
-        // First encountered uppercase letter, set flag and save
-        // position.
-        else {
-          first = c;
-          upper = true;
-        }
-      }
-    }
-    // Discard the term if it contains a single uppercase letter that
-    // is not starting the term.
-    if ( first > 0 ) {
-      return false;
-    }
-    return true;
-  }
-}
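The R0/R1/R2/RV definitions quoted in the deleted stemmer's javadoc come from the Snowball description of the algorithm. A standalone sketch (not Lucene code) of the R1 rule, "the region after the first non-vowel following a vowel, or the null region at the end of the word if there is no such non-vowel":

    public final class R1RegionDemo {
      // French vowels, including accented ones, as in the deleted isVowel().
      private static final String VOWELS = "aeiouyâàëéêèïîôüùû";

      static boolean isVowel(char ch) {
        return VOWELS.indexOf(ch) >= 0;
      }

      static String r1(String word) {
        for (int i = 1; i < word.length(); i++) {
          // first non-vowel that follows a vowel: R1 starts right after it
          if (!isVowel(word.charAt(i)) && isVowel(word.charAt(i - 1))) {
            return word.substring(i + 1);
          }
        }
        return ""; // null region at the end of the word
      }

      public static void main(String[] args) {
        System.out.println(r1("fameusement")); // "eusement" (fam|eusement)
      }
    }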
@@ -29,18 +29,10 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;
-import org.apache.lucene.analysis.in.IndicTokenizer;
 import org.apache.lucene.util.Version;
 
 /**
  * Analyzer for Hindi.
- * <p>
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating HindiAnalyzer:
- * <ul>
- *   <li> As of 3.6, StandardTokenizer is used for tokenization
- * </ul>
  */
 public final class HindiAnalyzer extends StopwordAnalyzerBase {
   private final CharArraySet stemExclusionSet;
@@ -126,12 +118,7 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
       Reader reader) {
-    final Tokenizer source;
-    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
-      source = new StandardTokenizer(matchVersion, reader);
-    } else {
-      source = new IndicTokenizer(matchVersion, reader);
-    }
+    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new LowerCaseFilter(matchVersion, source);
     if (!stemExclusionSet.isEmpty())
       result = new KeywordMarkerFilter(result, stemExclusionSet);
@@ -1,53 +0,0 @@
-package org.apache.lucene.analysis.in;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.Reader;
-
-import org.apache.lucene.analysis.util.CharTokenizer;
-import org.apache.lucene.analysis.standard.StandardTokenizer; // javadocs
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.Version;
-
-/**
- * Simple Tokenizer for text in Indian Languages.
- * @deprecated (3.6) Use {@link StandardTokenizer} instead.
- */
-@Deprecated
-public final class IndicTokenizer extends CharTokenizer {
-
-  public IndicTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
-    super(matchVersion, factory, input);
-  }
-
-  public IndicTokenizer(Version matchVersion, AttributeSource source, Reader input) {
-    super(matchVersion, source, input);
-  }
-
-  public IndicTokenizer(Version matchVersion, Reader input) {
-    super(matchVersion, input);
-  }
-
-  @Override
-  protected boolean isTokenChar(int c) {
-    return Character.isLetter(c)
-        || Character.getType(c) == Character.NON_SPACING_MARK
-        || Character.getType(c) == Character.FORMAT
-        || Character.getType(c) == Character.COMBINING_SPACING_MARK;
-  }
-}
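Per the @deprecated tag, StandardTokenizer is the intended replacement. For code that depended on the exact character classes above, a hypothetical drop-in subclass of CharTokenizer (the name LegacyIndicTokenizer is illustrative, not part of Lucene):

    import java.io.Reader;
    import org.apache.lucene.analysis.util.CharTokenizer;
    import org.apache.lucene.util.Version;

    public final class LegacyIndicTokenizer extends CharTokenizer {
      public LegacyIndicTokenizer(Version matchVersion, Reader input) {
        super(matchVersion, input);
      }

      @Override
      protected boolean isTokenChar(int c) {
        // Same predicate as the deleted class: letters plus combining and
        // format characters, so diacritics stay attached to their token.
        return Character.isLetter(c)
            || Character.getType(c) == Character.NON_SPACING_MARK
            || Character.getType(c) == Character.FORMAT
            || Character.getType(c) == Character.COMBINING_SPACING_MARK;
      }
    }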
@@ -36,19 +36,9 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.util.WordlistLoader;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.Version;
-import org.tartarus.snowball.ext.ItalianStemmer;
 
 /**
  * {@link Analyzer} for Italian.
- * <p>
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating ItalianAnalyzer:
- * <ul>
- *   <li> As of 3.6, ItalianLightStemFilter is used for less aggressive stemming.
- *   <li> As of 3.2, ElisionFilter with a set of Italian
- *        contractions is used by default.
- * </ul>
  */
 public final class ItalianAnalyzer extends StopwordAnalyzerBase {
   private final CharArraySet stemExclusionSet;
@@ -139,18 +129,12 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
       Reader reader) {
     final Tokenizer source = new StandardTokenizer(matchVersion, reader);
     TokenStream result = new StandardFilter(matchVersion, source);
-    if (matchVersion.onOrAfter(Version.LUCENE_32)) {
-      result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
-    }
+    result = new ElisionFilter(matchVersion, result, DEFAULT_ARTICLES);
     result = new LowerCaseFilter(matchVersion, result);
     result = new StopFilter(matchVersion, result, stopwords);
     if(!stemExclusionSet.isEmpty())
       result = new KeywordMarkerFilter(result, stemExclusionSet);
-    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
-      result = new ItalianLightStemFilter(result);
-    } else {
-      result = new SnowballFilter(result, new ItalianStemmer());
-    }
+    result = new ItalianLightStemFilter(result);
     return new TokenStreamComponents(source, result);
   }
 }
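Elision is now applied on every path, not only for matchVersion >= 3.2. A sketch of that step in isolation; the article set here is a stand-in (ItalianAnalyzer's DEFAULT_ARTICLES is not public API), and the org.apache.lucene.analysis.util location of ElisionFilter is assumed from trunk at the time:

    import java.io.StringReader;
    import java.util.Arrays;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.analysis.util.ElisionFilter;
    import org.apache.lucene.util.Version;

    public class ElisionDemo {
      public static void main(String[] args) throws Exception {
        CharArraySet articles = new CharArraySet(Version.LUCENE_50,
            Arrays.asList("dell", "l", "un"), true);
        Tokenizer tok = new StandardTokenizer(Version.LUCENE_50,
            new StringReader("dell'arte"));
        // "dell'arte" -> "arte": the contraction is stripped before
        // lowercasing, stopping and stemming.
        TokenStream ts = new ElisionFilter(Version.LUCENE_50, tok, articles);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString());
        }
        ts.end();
        ts.close();
      }
    }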
@ -1,518 +0,0 @@
|
|||
package org.apache.lucene.analysis.miscellaneous;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
|
||||
* {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
|
||||
* (with behaviour identical to {@link String#split(String)}),
|
||||
* and that combines the functionality of
|
||||
* {@link org.apache.lucene.analysis.core.LetterTokenizer},
|
||||
* {@link org.apache.lucene.analysis.core.LowerCaseTokenizer},
|
||||
* {@link org.apache.lucene.analysis.core.WhitespaceTokenizer},
|
||||
* {@link org.apache.lucene.analysis.core.StopFilter} into a single efficient
|
||||
* multi-purpose class.
|
||||
* <p>
|
||||
* If you are unsure how exactly a regular expression should look like, consider
|
||||
* prototyping by simply trying various expressions on some test texts via
|
||||
* {@link String#split(String)}. Once you are satisfied, give that regex to
|
||||
* PatternAnalyzer. Also see <a target="_blank"
|
||||
* href="http://java.sun.com/docs/books/tutorial/extra/regex/">Java Regular Expression Tutorial</a>.
|
||||
* <p>
|
||||
* This class can be considerably faster than the "normal" Lucene tokenizers.
|
||||
* It can also serve as a building block in a compound Lucene
|
||||
* {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this
|
||||
* stemming example:
|
||||
* <pre>
|
||||
* PatternAnalyzer pat = ...
|
||||
* TokenStream tokenStream = new SnowballFilter(
|
||||
* pat.tokenStream("content", "James is running round in the woods"),
|
||||
* "English"));
|
||||
* </pre>
|
||||
* @deprecated (4.0) use the pattern-based analysis in the analysis/pattern package instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public final class PatternAnalyzer extends Analyzer {
|
||||
|
||||
/** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
|
||||
public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
|
||||
|
||||
/** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
|
||||
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
|
||||
|
||||
private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
|
||||
CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
|
||||
Arrays.asList(
|
||||
"a", "about", "above", "across", "adj", "after", "afterwards",
|
||||
"again", "against", "albeit", "all", "almost", "alone", "along",
|
||||
"already", "also", "although", "always", "among", "amongst", "an",
|
||||
"and", "another", "any", "anyhow", "anyone", "anything",
|
||||
"anywhere", "are", "around", "as", "at", "be", "became", "because",
|
||||
"become", "becomes", "becoming", "been", "before", "beforehand",
|
||||
"behind", "being", "below", "beside", "besides", "between",
|
||||
"beyond", "both", "but", "by", "can", "cannot", "co", "could",
|
||||
"down", "during", "each", "eg", "either", "else", "elsewhere",
|
||||
"enough", "etc", "even", "ever", "every", "everyone", "everything",
|
||||
"everywhere", "except", "few", "first", "for", "former",
|
||||
"formerly", "from", "further", "had", "has", "have", "he", "hence",
|
||||
"her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
|
||||
"herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
|
||||
"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
|
||||
"latter", "latterly", "least", "less", "ltd", "many", "may", "me",
|
||||
"meanwhile", "might", "more", "moreover", "most", "mostly", "much",
|
||||
"must", "my", "myself", "namely", "neither", "never",
|
||||
"nevertheless", "next", "no", "nobody", "none", "noone", "nor",
|
||||
"not", "nothing", "now", "nowhere", "of", "off", "often", "on",
|
||||
"once one", "only", "onto", "or", "other", "others", "otherwise",
|
||||
"our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
|
||||
"rather", "s", "same", "seem", "seemed", "seeming", "seems",
|
||||
"several", "she", "should", "since", "so", "some", "somehow",
|
||||
"someone", "something", "sometime", "sometimes", "somewhere",
|
||||
"still", "such", "t", "than", "that", "the", "their", "them",
|
||||
"themselves", "then", "thence", "there", "thereafter", "thereby",
|
||||
"therefor", "therein", "thereupon", "these", "they", "this",
|
||||
"those", "though", "through", "throughout", "thru", "thus", "to",
|
||||
"together", "too", "toward", "towards", "under", "until", "up",
|
||||
"upon", "us", "very", "via", "was", "we", "well", "were", "what",
|
||||
"whatever", "whatsoever", "when", "whence", "whenever",
|
||||
"whensoever", "where", "whereafter", "whereas", "whereat",
|
||||
"whereby", "wherefrom", "wherein", "whereinto", "whereof",
|
||||
"whereon", "whereto", "whereunto", "whereupon", "wherever",
|
||||
"wherewith", "whether", "which", "whichever", "whichsoever",
|
||||
"while", "whilst", "whither", "who", "whoever", "whole", "whom",
|
||||
"whomever", "whomsoever", "whose", "whosoever", "why", "will",
|
||||
"with", "within", "without", "would", "xsubj", "xcal", "xauthor",
|
||||
"xother ", "xnote", "yet", "you", "your", "yours", "yourself",
|
||||
"yourselves"
|
||||
), true));
|
||||
|
||||
/**
|
||||
* A lower-casing word analyzer with English stop words (can be shared
|
||||
* freely across threads without harm); global per class loader.
|
||||
*/
|
||||
public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
|
||||
Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
|
||||
|
||||
/**
|
||||
* A lower-casing word analyzer with <b>extended </b> English stop words
|
||||
* (can be shared freely across threads without harm); global per class
|
||||
* loader. The stop words are borrowed from
|
||||
* http://thomas.loc.gov/home/stopwords.html, see
|
||||
* http://thomas.loc.gov/home/all.about.inquery.html
|
||||
*/
|
||||
public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
|
||||
Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
|
||||
|
||||
private final Pattern pattern;
|
||||
private final boolean toLowerCase;
|
||||
private final CharArraySet stopWords;
|
||||
|
||||
private final Version matchVersion;
|
||||
|
||||
/**
|
||||
* Constructs a new instance with the given parameters.
|
||||
*
|
||||
* @param matchVersion currently does nothing
|
||||
* @param pattern
|
||||
* a regular expression delimiting tokens
|
||||
* @param toLowerCase
|
||||
* if <code>true</code> returns tokens after applying
|
||||
* String.toLowerCase()
|
||||
* @param stopWords
|
||||
* if non-null, ignores all tokens that are contained in the
|
||||
* given stop set (after previously having applied toLowerCase()
|
||||
* if applicable). For example, created via
|
||||
* {@link StopFilter#makeStopSet(Version, String[])}and/or
|
||||
* {@link org.apache.lucene.analysis.util.WordlistLoader}as in
|
||||
* <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")</code>
|
||||
* or <a href="http://www.unine.ch/info/clef/">other stop words
|
||||
* lists </a>.
|
||||
*/
|
||||
public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, CharArraySet stopWords) {
|
||||
if (pattern == null)
|
||||
throw new IllegalArgumentException("pattern must not be null");
|
||||
|
||||
if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
|
||||
else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
|
||||
|
||||
if (stopWords != null && stopWords.size() == 0) stopWords = null;
|
||||
|
||||
this.pattern = pattern;
|
||||
this.toLowerCase = toLowerCase;
|
||||
this.stopWords = stopWords;
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a token stream that tokenizes the given string into token terms
|
||||
* (aka words).
|
||||
*
|
||||
* @param fieldName
|
||||
* the name of the field to tokenize (currently ignored).
|
||||
* @param reader
|
||||
* reader (e.g. charfilter) of the original text. can be null.
|
||||
* @param text
|
||||
* the string to tokenize
|
||||
* @return a new token stream
|
||||
*/
|
||||
public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
|
||||
// Ideally the Analyzer superclass should have a method with the same signature,
|
||||
// with a default impl that simply delegates to the StringReader flavour.
|
||||
if (text == null)
|
||||
throw new IllegalArgumentException("text must not be null");
|
||||
|
||||
if (pattern == NON_WORD_PATTERN) { // fast path
|
||||
return new TokenStreamComponents(new FastStringTokenizer(reader, text, true, toLowerCase, stopWords));
|
||||
} else if (pattern == WHITESPACE_PATTERN) { // fast path
|
||||
return new TokenStreamComponents(new FastStringTokenizer(reader, text, false, toLowerCase, stopWords));
|
||||
}
|
||||
|
||||
Tokenizer tokenizer = new PatternTokenizer(reader, text, pattern, toLowerCase);
|
||||
TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer;
|
||||
return new TokenStreamComponents(tokenizer, result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a token stream that tokenizes all the text in the given Reader;
|
||||
* This implementation forwards to <code>tokenStream(String, Reader, String)</code> and is
|
||||
* less efficient than <code>tokenStream(String, Reader, String)</code>.
|
||||
*
|
||||
* @param fieldName
|
||||
* the name of the field to tokenize (currently ignored).
|
||||
* @param reader
|
||||
* the reader delivering the text
|
||||
* @return a new token stream
|
||||
*/
|
||||
@Override
|
||||
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
try {
|
||||
String text = toString(reader);
|
||||
return createComponents(fieldName, reader, text);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates whether some other object is "equal to" this one.
|
||||
*
|
||||
* @param other
|
||||
* the reference object with which to compare.
|
||||
* @return true if equal, false otherwise
|
||||
*/
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if (this == other) return true;
|
||||
if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
|
||||
if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
|
||||
|
||||
if (other instanceof PatternAnalyzer) {
|
||||
PatternAnalyzer p2 = (PatternAnalyzer) other;
|
||||
return
|
||||
toLowerCase == p2.toLowerCase &&
|
||||
eqPattern(pattern, p2.pattern) &&
|
||||
eq(stopWords, p2.stopWords);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a hash code value for the object.
|
||||
*
|
||||
* @return the hash code.
|
||||
*/
|
||||
@Override
|
||||
public int hashCode() {
|
||||
if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
|
||||
if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
|
||||
|
||||
int h = 1;
|
||||
h = 31*h + pattern.pattern().hashCode();
|
||||
h = 31*h + pattern.flags();
|
||||
h = 31*h + (toLowerCase ? 1231 : 1237);
|
||||
h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
|
||||
return h;
|
||||
}
|
||||
|
||||
/** equality where o1 and/or o2 can be null */
|
||||
private static boolean eq(Object o1, Object o2) {
|
||||
return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
|
||||
}
|
||||
|
||||
/** assumes p1 and p2 are not null */
|
||||
private static boolean eqPattern(Pattern p1, Pattern p2) {
|
||||
return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads until end-of-stream and returns all read chars, finally closes the stream.
|
||||
*
|
||||
* @param input the input stream
|
||||
* @throws IOException if an I/O error occurs while reading the stream
|
||||
*/
|
||||
private static String toString(Reader input) throws IOException {
|
||||
if (input instanceof FastStringReader) { // fast path
|
||||
return ((FastStringReader) input).getString();
|
||||
}
|
||||
|
||||
try {
|
||||
int len = 256;
|
||||
char[] buffer = new char[len];
|
||||
char[] output = new char[len];
|
||||
|
||||
len = 0;
|
||||
int n;
|
||||
while ((n = input.read(buffer)) >= 0) {
|
||||
if (len + n > output.length) { // grow capacity
|
||||
char[] tmp = new char[Math.max(output.length << 1, len + n)];
|
||||
System.arraycopy(output, 0, tmp, 0, len);
|
||||
System.arraycopy(buffer, 0, tmp, len, n);
|
||||
buffer = output; // use larger buffer for future larger bulk reads
|
||||
output = tmp;
|
||||
} else {
|
||||
System.arraycopy(buffer, 0, output, len, n);
|
||||
}
|
||||
len += n;
|
||||
}
|
||||
|
||||
return new String(output, 0, len);
|
||||
} finally {
|
||||
input.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Nested classes:
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/**
|
||||
* The work horse; performance isn't fantastic, but it's not nearly as bad
|
||||
* as one might think - kudos to the Sun regex developers.
|
||||
*/
|
||||
private static final class PatternTokenizer extends Tokenizer {
|
||||
|
||||
private final Pattern pattern;
|
||||
private String str;
|
||||
private final boolean toLowerCase;
|
||||
private Matcher matcher;
|
||||
private int pos = 0;
|
||||
private static final Locale locale = Locale.getDefault();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
public PatternTokenizer(Reader input, String str, Pattern pattern, boolean toLowerCase) {
|
||||
super(input);
|
||||
this.pattern = pattern;
|
||||
this.str = str;
|
||||
this.matcher = pattern.matcher(str);
|
||||
this.toLowerCase = toLowerCase;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() {
|
||||
if (matcher == null) return false;
|
||||
clearAttributes();
|
||||
while (true) { // loop takes care of leading and trailing boundary cases
|
||||
int start = pos;
|
||||
int end;
|
||||
boolean isMatch = matcher.find();
|
||||
if (isMatch) {
|
||||
end = matcher.start();
|
||||
pos = matcher.end();
|
||||
} else {
|
||||
end = str.length();
|
||||
matcher = null; // we're finished
|
||||
}
|
||||
|
||||
if (start != end) { // non-empty match (header/trailer)
|
||||
String text = str.substring(start, end);
|
||||
if (toLowerCase) text = text.toLowerCase(locale);
|
||||
termAtt.setEmpty().append(text);
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(end));
|
||||
return true;
|
||||
}
|
||||
if (!isMatch) return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
final int finalOffset = correctOffset(str.length());
|
||||
this.offsetAtt.setOffset(finalOffset, finalOffset);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
this.str = PatternAnalyzer.toString(input);
|
||||
this.matcher = pattern.matcher(this.str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
this.pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Nested classes:
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/**
|
||||
* Special-case class for best performance in common cases; this class is
|
||||
* otherwise unnecessary.
|
||||
*/
|
||||
private static final class FastStringTokenizer extends Tokenizer {
|
||||
|
||||
private String str;
|
||||
private int pos;
|
||||
private final boolean isLetter;
|
||||
private final boolean toLowerCase;
|
||||
private final CharArraySet stopWords;
|
||||
private static final Locale locale = Locale.getDefault();
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
|
||||
public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, CharArraySet stopWords) {
|
||||
super(input);
|
||||
this.str = str;
|
||||
this.isLetter = isLetter;
|
||||
this.toLowerCase = toLowerCase;
|
||||
this.stopWords = stopWords;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() {
|
||||
clearAttributes();
|
||||
// cache loop instance vars (performance)
|
||||
String s = str;
|
||||
int len = s.length();
|
||||
int i = pos;
|
||||
boolean letter = isLetter;
|
||||
|
||||
int start = 0;
|
||||
String text;
|
||||
do {
|
||||
// find beginning of token
|
||||
text = null;
|
||||
while (i < len && !isTokenChar(s.charAt(i), letter)) {
|
||||
i++;
|
||||
}
|
||||
|
||||
if (i < len) { // found beginning; now find end of token
|
||||
start = i;
|
||||
while (i < len && isTokenChar(s.charAt(i), letter)) {
|
||||
i++;
|
||||
}
|
||||
|
||||
text = s.substring(start, i);
|
||||
if (toLowerCase) text = text.toLowerCase(locale);
|
||||
// if (toLowerCase) {
|
||||
//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
|
||||
//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
|
||||
// text = s.substring(start, i).toLowerCase();
|
||||
//// char[] chars = new char[i-start];
|
||||
//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
|
||||
//// text = new String(chars);
|
||||
// } else {
|
||||
// text = s.substring(start, i);
|
||||
// }
|
||||
}
|
||||
} while (text != null && isStopWord(text));
|
||||
|
||||
pos = i;
|
||||
if (text == null)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
termAtt.setEmpty().append(text);
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(i));
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void end() {
|
||||
// set final offset
|
||||
final int finalOffset = str.length();
|
||||
this.offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
|
||||
}
|
||||
|
||||
private boolean isTokenChar(char c, boolean isLetter) {
|
||||
return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
|
||||
}
|
||||
|
||||
private boolean isStopWord(String text) {
|
||||
return stopWords != null && stopWords.contains(text);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset(Reader input) throws IOException {
|
||||
super.reset(input);
|
||||
this.str = PatternAnalyzer.toString(input);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
this.pos = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Nested classes:
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
/**
|
||||
* A StringReader that exposes its contained string for fast direct access.
|
||||
* Might make sense to generalize this to CharSequence and make it public?
|
||||
*/
|
||||
static final class FastStringReader extends StringReader {
|
||||
|
||||
private final String s;
|
||||
|
||||
FastStringReader(String s) {
|
||||
super(s);
|
||||
this.s = s;
|
||||
}
|
||||
|
||||
String getString() {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
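The deleted FastStringTokenizer boils down to a two-phase scan: skip characters that cannot start a token, then consume the token body, stop-filtering as it goes. A minimal standalone sketch of that scan, assuming plain JDK types and none of the Lucene attribute machinery (FastScanSketch and its names are hypothetical):

import java.util.ArrayList;
import java.util.List;

final class FastScanSketch {
  // The same two-phase scan as FastStringTokenizer.incrementToken(), minus attributes.
  static List<String> scan(String s, boolean letterOnly) {
    List<String> tokens = new ArrayList<String>();
    int i = 0, len = s.length();
    while (i < len) {
      while (i < len && !isTokenChar(s.charAt(i), letterOnly)) i++; // find token start
      int start = i;
      while (i < len && isTokenChar(s.charAt(i), letterOnly)) i++;  // find token end
      if (i > start) tokens.add(s.substring(start, i));
    }
    return tokens;
  }

  static boolean isTokenChar(char c, boolean letterOnly) {
    return letterOnly ? Character.isLetter(c) : !Character.isWhitespace(c);
  }
}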
@@ -35,7 +35,7 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
|
|||
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
|
||||
|
||||
// use a fixed version, as we don't care about case sensitivity.
|
||||
private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false);
|
||||
private final CharArraySet previous = new CharArraySet(Version.LUCENE_50, 8, false);
|
||||
|
||||
/**
|
||||
* Creates a new RemoveDuplicatesTokenFilter
|
||||
|
|
|
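Pinning the replacement set to a concrete Version constant is safe here because the set only answers exact lookups on terms the filter has already seen at the current position, so no analysis behavior can leak across versions. The lookup pattern the filter relies on, sketched against the trunk CharArraySet API:

CharArraySet previous = new CharArraySet(Version.LUCENE_50, 8, false);
char[] term = "duplicate".toCharArray();
if (!previous.contains(term, 0, term.length)) {
  previous.add(term); // first occurrence at this position: keep the token
}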
@@ -47,20 +47,6 @@ import java.io.Reader;
|
|||
* A default set of stopwords is used unless an alternative list is specified, but the
|
||||
* exclusion list is empty by default.
|
||||
* </p>
|
||||
*
|
||||
* <a name="version"/>
|
||||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating DutchAnalyzer:
|
||||
* <ul>
|
||||
* <li> As of 3.6, {@link #DutchAnalyzer(Version, CharArraySet)} and
|
||||
* {@link #DutchAnalyzer(Version, CharArraySet, CharArraySet)} also populate
|
||||
* the default entries for the stem override dictionary
|
||||
* <li> As of 3.1, Snowball stemming is done with SnowballFilter,
|
||||
* LowerCaseFilter is used prior to StopFilter, and Snowball
|
||||
* stopwords are used by default.
|
||||
* <li> As of 2.9, StopFilter preserves position
|
||||
* increments
|
||||
* </ul>
|
||||
*
|
||||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}.</p>
|
||||
|
@@ -119,26 +105,15 @@ public final class DutchAnalyzer extends Analyzer {
|
|||
*
|
||||
*/
|
||||
public DutchAnalyzer(Version matchVersion) {
|
||||
// historically, only this ctor populated the stem dict!!!!!
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
|
||||
}
|
||||
|
||||
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords){
|
||||
// historically, this ctor never populated the stem dict!
|
||||
// so we populate it only for >= 3.6
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET,
|
||||
matchVersion.onOrAfter(Version.LUCENE_36)
|
||||
? DefaultSetHolder.DEFAULT_STEM_DICT
|
||||
: CharArrayMap.<String>emptyMap());
|
||||
this(matchVersion, stopwords, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT);
|
||||
}
|
||||
|
||||
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable){
|
||||
// historically, this ctor never populated the stem dict!
|
||||
// so we populate it only for >= 3.6
|
||||
this(matchVersion, stopwords, stemExclusionTable,
|
||||
matchVersion.onOrAfter(Version.LUCENE_36)
|
||||
? DefaultSetHolder.DEFAULT_STEM_DICT
|
||||
: CharArrayMap.<String>emptyMap());
|
||||
this(matchVersion, stopwords, stemExclusionTable, DefaultSetHolder.DEFAULT_STEM_DICT);
|
||||
}
|
||||
|
||||
public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<String> stemOverrideDict) {
|
||||
|
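With the version check gone, every constructor funnels into the four-argument form with the default stem-override dictionary, so the historical asymmetry the comments describe disappears. A sketch (getDefaultStopSet() is the usual StopwordAnalyzerBase-style accessor, assumed unchanged by this commit):

// Both now populate the default stem-override dictionary:
Analyzer a1 = new DutchAnalyzer(Version.LUCENE_50);
Analyzer a2 = new DutchAnalyzer(Version.LUCENE_50, DutchAnalyzer.getDefaultStopSet());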
@@ -160,25 +135,15 @@ public final class DutchAnalyzer extends Analyzer {
|
|||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader aReader) {
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stoptable);
|
||||
if (!excltable.isEmpty())
|
||||
result = new KeywordMarkerFilter(result, excltable);
|
||||
if (!stemdict.isEmpty())
|
||||
result = new StemmerOverrideFilter(matchVersion, result, stemdict);
|
||||
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
} else {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new StopFilter(matchVersion, result, stoptable);
|
||||
if (!excltable.isEmpty())
|
||||
result = new KeywordMarkerFilter(result, excltable);
|
||||
result = new DutchStemFilter(result, stemdict);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, aReader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stoptable);
|
||||
if (!excltable.isEmpty())
|
||||
result = new KeywordMarkerFilter(result, excltable);
|
||||
if (!stemdict.isEmpty())
|
||||
result = new StemmerOverrideFilter(matchVersion, result, stemdict);
|
||||
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
||||
|
|
|
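The surviving branch above is the entire chain: StandardTokenizer, StandardFilter, LowerCaseFilter, StopFilter, the optional keyword-marker and stemmer-override stages, then SnowballFilter. Consuming it follows the usual TokenStream contract; a sketch, assuming trunk-era reset/incrementToken/end/close semantics:

Analyzer analyzer = new DutchAnalyzer(Version.LUCENE_50);
TokenStream ts = analyzer.tokenStream("body", new StringReader("katten en honden"));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(term.toString()); // prints the stemmed, stop-filtered terms
}
ts.end();
ts.close();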
@@ -1,108 +0,0 @@
|
|||
package org.apache.lucene.analysis.nl;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
* A {@link TokenFilter} that stems Dutch words.
|
||||
* <p>
|
||||
* It supports a table of words that should
|
||||
* not be stemmed at all. The stemmer used can be changed at runtime after the
|
||||
* filter object is created (as long as it is a {@link DutchStemmer}).
|
||||
* </p>
|
||||
* <p>
|
||||
* To prevent terms from being stemmed use an instance of
|
||||
* {@link KeywordMarkerFilter} or a custom {@link TokenFilter} that sets
|
||||
* the {@link KeywordAttribute} before this {@link TokenStream}.
|
||||
* </p>
|
||||
* @see KeywordMarkerFilter
|
||||
* @deprecated (3.1) Use {@link SnowballFilter} with
|
||||
* {@link org.tartarus.snowball.ext.DutchStemmer} instead, which has the
|
||||
* same functionality. This filter will be removed in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
public final class DutchStemFilter extends TokenFilter {
|
||||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private DutchStemmer stemmer = new DutchStemmer();
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
|
||||
|
||||
public DutchStemFilter(TokenStream _in) {
|
||||
super(_in);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param stemdictionary Dictionary of word/stem pairs that overrule the algorithm
|
||||
*/
|
||||
public DutchStemFilter(TokenStream _in, Map<?,?> stemdictionary) {
|
||||
this(_in);
|
||||
stemmer.setStemDictionary(stemdictionary);
|
||||
}
|
||||
|
||||
/**
|
||||
* Advances to the next token in the stream and returns true, or false at EOS.
|
||||
*/
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
final String term = termAtt.toString();
|
||||
|
||||
// Check the exclusion table.
|
||||
if (!keywordAttr.isKeyword()) {
|
||||
final String s = stemmer.stem(term);
|
||||
// If not stemmed, don't waste time adjusting the token.
|
||||
if ((s != null) && !s.equals(term))
|
||||
termAtt.setEmpty().append(s);
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set an alternative/custom {@link DutchStemmer} for this filter.
|
||||
*/
|
||||
public void setStemmer(DutchStemmer stemmer) {
|
||||
if (stemmer != null) {
|
||||
this.stemmer = stemmer;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a dictionary for stemming; this dictionary overrules the algorithm,
|
||||
* so you can correct for a particular unwanted word-stem pair.
|
||||
*/
|
||||
public void setStemDictionary(HashMap<?,?> dict) {
|
||||
if (stemmer != null)
|
||||
stemmer.setStemDictionary(dict);
|
||||
}
|
||||
}
|
|
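The migration named in the deprecation notice above is mechanical, and the stem-dictionary constructor maps onto StemmerOverrideFilter. A sketch using the filters referenced elsewhere in this commit (input and stemOverrides are stand-ins for the caller's TokenStream and CharArrayMap<String>):

// old: TokenStream result = new DutchStemFilter(input, stemOverrides);
TokenStream result = new StemmerOverrideFilter(Version.LUCENE_50, input, stemOverrides);
result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());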
@@ -1,409 +0,0 @@
|
|||
package org.apache.lucene.analysis.nl;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A stemmer for Dutch words.
|
||||
* <p>
|
||||
* The algorithm is an implementation of
|
||||
* the <a href="http://snowball.tartarus.org/algorithms/dutch/stemmer.html">Dutch stemming</a>
|
||||
* algorithm in Martin Porter's Snowball project.
|
||||
* </p>
|
||||
* @deprecated (3.1) Use {@link org.tartarus.snowball.ext.DutchStemmer} instead,
|
||||
* which has the same functionality. This class will be removed in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
public class DutchStemmer {
|
||||
/**
|
||||
* Buffer for the terms while stemming them.
|
||||
*/
|
||||
private StringBuilder sb = new StringBuilder();
|
||||
private boolean _removedE;
|
||||
private Map _stemDict;
|
||||
|
||||
private int _R1;
|
||||
private int _R2;
|
||||
|
||||
//TODO convert to internal
|
||||
/*
|
||||
* Stems the given term to a unique <tt>discriminator</tt>.
|
||||
*
|
||||
* @param term The term that should be stemmed.
|
||||
* @return Discriminator for <tt>term</tt>
|
||||
*/
|
||||
public String stem(String term) {
|
||||
term = term.toLowerCase();
|
||||
if (!isStemmable(term))
|
||||
return term;
|
||||
if (_stemDict != null && _stemDict.containsKey(term))
|
||||
if (_stemDict.get(term) instanceof String)
|
||||
return (String) _stemDict.get(term);
|
||||
else
|
||||
return null;
|
||||
|
||||
// Reset the StringBuilder.
|
||||
sb.delete(0, sb.length());
|
||||
sb.insert(0, term);
|
||||
// Stemming starts here...
|
||||
substitute(sb);
|
||||
storeYandI(sb);
|
||||
_R1 = getRIndex(sb, 0);
|
||||
_R1 = Math.max(3, _R1);
|
||||
step1(sb);
|
||||
step2(sb);
|
||||
_R2 = getRIndex(sb, _R1);
|
||||
step3a(sb);
|
||||
step3b(sb);
|
||||
step4(sb);
|
||||
reStoreYandI(sb);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private boolean enEnding(StringBuilder sb) {
|
||||
String[] enend = new String[]{"ene", "en"};
|
||||
for (int i = 0; i < enend.length; i++) {
|
||||
String end = enend[i];
|
||||
String s = sb.toString();
|
||||
int index = s.length() - end.length();
|
||||
if (s.endsWith(end) &&
|
||||
index >= _R1 &&
|
||||
isValidEnEnding(sb, index - 1)
|
||||
) {
|
||||
sb.delete(index, index + end.length());
|
||||
unDouble(sb, index);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private void step1(StringBuilder sb) {
|
||||
if (_R1 >= sb.length())
|
||||
return;
|
||||
|
||||
String s = sb.toString();
|
||||
int lengthR1 = sb.length() - _R1;
|
||||
int index;
|
||||
|
||||
if (s.endsWith("heden")) {
|
||||
sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
|
||||
return;
|
||||
}
|
||||
|
||||
if (enEnding(sb))
|
||||
return;
|
||||
|
||||
if (s.endsWith("se") &&
|
||||
(index = s.length() - 2) >= _R1 &&
|
||||
isValidSEnding(sb, index - 1)
|
||||
) {
|
||||
sb.delete(index, index + 2);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("s") &&
|
||||
(index = s.length() - 1) >= _R1 &&
|
||||
isValidSEnding(sb, index - 1)) {
|
||||
sb.delete(index, index + 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete suffix e if in R1 and
|
||||
* preceded by a non-vowel, and then undouble the ending
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step2(StringBuilder sb) {
|
||||
_removedE = false;
|
||||
if (_R1 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index = s.length() - 1;
|
||||
if (index >= _R1 &&
|
||||
s.endsWith("e") &&
|
||||
!isVowel(sb.charAt(index - 1))) {
|
||||
sb.delete(index, index + 1);
|
||||
unDouble(sb);
|
||||
_removedE = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete "heid"
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step3a(StringBuilder sb) {
|
||||
if (_R2 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index = s.length() - 4;
|
||||
if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
|
||||
sb.delete(index, index + 4); //remove heid
|
||||
enEnding(sb);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>A d-suffix, or derivational suffix, enables a new word,
|
||||
* often with a different grammatical category, or with a different
|
||||
* sense, to be built from another word. Whether a d-suffix can be
|
||||
* attached is discovered not from the rules of grammar, but by
|
||||
* referring to a dictionary. So in English, ness can be added to
|
||||
* certain adjectives to form corresponding nouns (littleness,
|
||||
* kindness, foolishness ...) but not to all adjectives
|
||||
* (not for example, to big, cruel, wise ...) d-suffixes can be
|
||||
* used to change meaning, often in rather exotic ways.</p>
|
||||
* Remove "ing", "end", "ig", "lijk", "baar" and "bar"
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step3b(StringBuilder sb) {
|
||||
if (_R2 >= sb.length())
|
||||
return;
|
||||
String s = sb.toString();
|
||||
int index = 0;
|
||||
|
||||
if ((s.endsWith("end") || s.endsWith("ing")) &&
|
||||
(index = s.length() - 3) >= _R2) {
|
||||
sb.delete(index, index + 3);
|
||||
if (sb.charAt(index - 2) == 'i' &&
|
||||
sb.charAt(index - 1) == 'g') {
|
||||
if (sb.charAt(index - 3) != 'e' && index - 2 >= _R2) {
|
||||
index -= 2;
|
||||
sb.delete(index, index + 2);
|
||||
}
|
||||
} else {
|
||||
unDouble(sb, index);
|
||||
}
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("ig") &&
|
||||
(index = s.length() - 2) >= _R2
|
||||
) {
|
||||
if (sb.charAt(index - 1) != 'e')
|
||||
sb.delete(index, index + 2);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("lijk") &&
|
||||
(index = s.length() - 4) >= _R2
|
||||
) {
|
||||
sb.delete(index, index + 4);
|
||||
step2(sb);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("baar") &&
|
||||
(index = s.length() - 4) >= _R2
|
||||
) {
|
||||
sb.delete(index, index + 4);
|
||||
return;
|
||||
}
|
||||
if (s.endsWith("bar") &&
|
||||
(index = s.length() - 3) >= _R2
|
||||
) {
|
||||
if (_removedE)
|
||||
sb.delete(index, index + 3);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* undouble vowel
|
||||
* If the word ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is a doubled a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
|
||||
*
|
||||
* @param sb String being stemmed
|
||||
*/
|
||||
private void step4(StringBuilder sb) {
|
||||
if (sb.length() < 4)
|
||||
return;
|
||||
String end = sb.substring(sb.length() - 4, sb.length());
|
||||
char c = end.charAt(0);
|
||||
char v1 = end.charAt(1);
|
||||
char v2 = end.charAt(2);
|
||||
char d = end.charAt(3);
|
||||
if (v1 == v2 &&
|
||||
d != 'I' &&
|
||||
v1 != 'i' &&
|
||||
isVowel(v1) &&
|
||||
!isVowel(d) &&
|
||||
!isVowel(c)) {
|
||||
sb.delete(sb.length() - 2, sb.length() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a term could be stemmed.
|
||||
*
|
||||
* @return true if, and only if, the given term consists only of letters.
|
||||
*/
|
||||
private boolean isStemmable(String term) {
|
||||
for (int c = 0; c < term.length(); c++) {
|
||||
if (!Character.isLetter(term.charAt(c))) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Substitute ä, ë, ï, ö, ü, á, é, í, ó, ú
|
||||
*/
|
||||
private void substitute(StringBuilder buffer) {
|
||||
for (int i = 0; i < buffer.length(); i++) {
|
||||
switch (buffer.charAt(i)) {
|
||||
case 'ä':
|
||||
case 'á':
|
||||
{
|
||||
buffer.setCharAt(i, 'a');
|
||||
break;
|
||||
}
|
||||
case 'ë':
|
||||
case 'é':
|
||||
{
|
||||
buffer.setCharAt(i, 'e');
|
||||
break;
|
||||
}
|
||||
case 'ü':
|
||||
case 'ú':
|
||||
{
|
||||
buffer.setCharAt(i, 'u');
|
||||
break;
|
||||
}
|
||||
case 'ï':
|
||||
case 'í':
|
||||
{
|
||||
buffer.setCharAt(i, 'i');
|
||||
break;
|
||||
}
|
||||
case 'ö':
|
||||
case 'ó':
|
||||
{
|
||||
buffer.setCharAt(i, 'o');
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*private boolean isValidSEnding(StringBuilder sb) {
|
||||
return isValidSEnding(sb, sb.length() - 1);
|
||||
}*/
|
||||
|
||||
private boolean isValidSEnding(StringBuilder sb, int index) {
|
||||
char c = sb.charAt(index);
|
||||
if (isVowel(c) || c == 'j')
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*private boolean isValidEnEnding(StringBuilder sb) {
|
||||
return isValidEnEnding(sb, sb.length() - 1);
|
||||
}*/
|
||||
|
||||
private boolean isValidEnEnding(StringBuilder sb, int index) {
|
||||
char c = sb.charAt(index);
|
||||
if (isVowel(c))
|
||||
return false;
|
||||
if (index < 2) // need look-behind for the "gem" check below
|
||||
return false;
|
||||
// ends with "gem"?
|
||||
if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private void unDouble(StringBuilder sb) {
|
||||
unDouble(sb, sb.length());
|
||||
}
|
||||
|
||||
private void unDouble(StringBuilder sb, int endIndex) {
|
||||
String s = sb.substring(0, endIndex);
|
||||
if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
|
||||
sb.delete(endIndex - 1, endIndex);
|
||||
}
|
||||
}
|
||||
|
||||
private int getRIndex(StringBuilder sb, int start) {
|
||||
if (start == 0)
|
||||
start = 1;
|
||||
int i = start;
|
||||
for (; i < sb.length(); i++) {
|
||||
//first non-vowel preceded by a vowel
|
||||
if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
|
||||
return i + 1;
|
||||
}
|
||||
}
|
||||
return i + 1;
|
||||
}
|
||||
|
||||
private void storeYandI(StringBuilder sb) {
|
||||
if (sb.charAt(0) == 'y')
|
||||
sb.setCharAt(0, 'Y');
|
||||
|
||||
int last = sb.length() - 1;
|
||||
|
||||
for (int i = 1; i < last; i++) {
|
||||
switch (sb.charAt(i)) {
|
||||
case 'i':
|
||||
{
|
||||
if (isVowel(sb.charAt(i - 1)) &&
|
||||
isVowel(sb.charAt(i + 1))
|
||||
)
|
||||
sb.setCharAt(i, 'I');
|
||||
break;
|
||||
}
|
||||
case 'y':
|
||||
{
|
||||
if (isVowel(sb.charAt(i - 1)))
|
||||
sb.setCharAt(i, 'Y');
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
|
||||
sb.setCharAt(last, 'Y');
|
||||
}
|
||||
|
||||
private void reStoreYandI(StringBuilder sb) {
|
||||
String tmp = sb.toString();
|
||||
sb.delete(0, sb.length());
|
||||
sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
|
||||
}
|
||||
|
||||
private boolean isVowel(char c) {
|
||||
switch (c) {
|
||||
case 'e':
|
||||
case 'a':
|
||||
case 'o':
|
||||
case 'i':
|
||||
case 'u':
|
||||
case 'y':
|
||||
case 'è':
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void setStemDictionary(Map dict) {
|
||||
_stemDict = dict;
|
||||
}
|
||||
|
||||
}
|
|
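Callers who used DutchStemmer.stem(String) directly can drive the Snowball replacement through SnowballProgram's mutable-current API instead of a one-shot call. A sketch, assuming the org.tartarus.snowball contract:

org.tartarus.snowball.ext.DutchStemmer stemmer = new org.tartarus.snowball.ext.DutchStemmer();
stemmer.setCurrent("lichamelijkheden");
stemmer.stem();                     // mutates the internal buffer
String stem = stemmer.getCurrent(); // the stemmed form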
@@ -49,8 +49,6 @@ import org.apache.lucene.analysis.charfilter.BaseCharFilter;
|
|||
* @since Solr 1.5
|
||||
*/
|
||||
public class PatternReplaceCharFilter extends BaseCharFilter {
|
||||
@Deprecated
|
||||
public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
|
||||
|
||||
private final Pattern pattern;
|
||||
private final String replacement;
|
||||
|
@@ -62,12 +60,6 @@ public class PatternReplaceCharFilter extends BaseCharFilter {
|
|||
this.replacement = replacement;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public PatternReplaceCharFilter(Pattern pattern, String replacement,
|
||||
int maxBlockChars, String blockDelimiter, CharStream in) {
|
||||
this(pattern, replacement, in);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
// Buffer all input on the first call.
|
||||
|
|
|
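The surviving constructor drops the block parameters, which the class stopped honoring once it started buffering all input on the first read. A usage sketch, assuming the CharStream/CharReader pairing still present on trunk at this point:

Pattern p = Pattern.compile("(\\d+)-(\\d+)");
CharStream in = CharReader.get(new StringReader("call 555-1234"));
Reader filtered = new PatternReplaceCharFilter(p, "$1$2", in); // reads back "call 5551234"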
@@ -34,17 +34,9 @@ import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
|||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
import org.tartarus.snowball.ext.PortugueseStemmer;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} for Portuguese.
|
||||
* <p>
|
||||
* <a name="version"/>
|
||||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating PortugueseAnalyzer:
|
||||
* <ul>
|
||||
* <li> As of 3.6, PortugueseLightStemFilter is used for less aggressive stemming.
|
||||
* </ul>
|
||||
*/
|
||||
public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
@@ -132,11 +124,7 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
|
|||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if(!stemExclusionSet.isEmpty())
|
||||
result = new KeywordMarkerFilter(result, stemExclusionSet);
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
|
||||
result = new PortugueseLightStemFilter(result);
|
||||
} else {
|
||||
result = new SnowballFilter(result, new PortugueseStemmer());
|
||||
}
|
||||
result = new PortugueseLightStemFilter(result);
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
||||
|
|
|
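After this change the light stemmer is unconditional. Anyone who still wants the aggressive Snowball behavior has to assemble the chain by hand; a sketch, where stopped stands in for the upstream stop-filtered TokenStream:

// Default chain now always ends in PortugueseLightStemFilter:
Analyzer pt = new PortugueseAnalyzer(Version.LUCENE_50);

// Hand-built Snowball variant:
TokenStream aggressive = new SnowballFilter(stopped, new PortugueseStemmer());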
@@ -134,7 +134,7 @@ public abstract class RSLPStemmerBase {
|
|||
if (!exceptions[i].endsWith(suffix))
|
||||
System.err.println("warning: useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
|
||||
}
|
||||
this.exceptions = new CharArraySet(Version.LUCENE_31,
|
||||
this.exceptions = new CharArraySet(Version.LUCENE_50,
|
||||
Arrays.asList(exceptions), false);
|
||||
}
|
||||
|
||||
|
|
|
@@ -31,14 +31,6 @@ import java.io.IOException;
|
|||
* that character. For example, with a marker of \u0001, "country" =>
|
||||
* "\u0001yrtnuoc". This is useful when implementing efficient leading
|
||||
* wildcards search.
|
||||
* </p>
|
||||
* <a name="version"/>
|
||||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating ReverseStringFilter, or when using any of
|
||||
* its static methods:
|
||||
* <ul>
|
||||
* <li> As of 3.1, supplementary characters are handled correctly
|
||||
* </ul>
|
||||
*/
|
||||
public final class ReverseStringFilter extends TokenFilter {
|
||||
|
||||
|
@@ -74,7 +66,7 @@ public final class ReverseStringFilter extends TokenFilter {
|
|||
* The reversed tokens will not be marked.
|
||||
* </p>
|
||||
*
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param matchVersion Lucene compatibility version
|
||||
* @param in {@link TokenStream} to filter
|
||||
*/
|
||||
public ReverseStringFilter(Version matchVersion, TokenStream in) {
|
||||
|
@@ -89,7 +81,7 @@ public final class ReverseStringFilter extends TokenFilter {
|
|||
* character.
|
||||
* </p>
|
||||
*
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param matchVersion compatibility version
|
||||
* @param in {@link TokenStream} to filter
|
||||
* @param marker A character used to mark reversed tokens
|
||||
*/
|
||||
|
@@ -119,7 +111,7 @@ public final class ReverseStringFilter extends TokenFilter {
|
|||
/**
|
||||
* Reverses the given input string
|
||||
*
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param matchVersion compatibility version
|
||||
* @param input the string to reverse
|
||||
* @return the given input string in reversed order
|
||||
*/
|
||||
|
@@ -131,7 +123,7 @@ public final class ReverseStringFilter extends TokenFilter {
|
|||
|
||||
/**
|
||||
* Reverses the given input buffer in-place
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param matchVersion compatibility version
|
||||
* @param buffer the input char array to reverse
|
||||
*/
|
||||
public static void reverse(Version matchVersion, final char[] buffer) {
|
||||
|
@@ -141,7 +133,7 @@ public final class ReverseStringFilter extends TokenFilter {
|
|||
/**
|
||||
* Partially reverses the given input buffer in-place from offset 0
|
||||
* up to the given length.
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param matchVersion compatibility version
|
||||
* @param buffer the input char array to reverse
|
||||
* @param len the length in the buffer up to where the
|
||||
* buffer should be reversed
|
||||
|
@@ -151,24 +143,10 @@ public final class ReverseStringFilter extends TokenFilter {
|
|||
reverse( matchVersion, buffer, 0, len );
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated (3.1) Remove this when support for 3.0 indexes is no longer needed.
|
||||
*/
|
||||
@Deprecated
|
||||
private static void reverseUnicode3( char[] buffer, int start, int len ){
|
||||
if( len <= 1 ) return;
|
||||
int num = len>>1;
|
||||
for( int i = start; i < ( start + num ); i++ ){
|
||||
char c = buffer[i];
|
||||
buffer[i] = buffer[start * 2 + len - i - 1];
|
||||
buffer[start * 2 + len - i - 1] = c;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Partially reverses the given input buffer in-place from the given offset
|
||||
* up to the given length.
|
||||
* @param matchVersion See <a href="#version">above</a>
|
||||
* @param matchVersion compatibility version
|
||||
* @param buffer the input char array to reverse
|
||||
* @param start the offset from where to reverse the buffer
|
||||
* @param len the length in the buffer up to where the
|
||||
|
@@ -176,10 +154,6 @@ public final class ReverseStringFilter extends TokenFilter {
|
|||
*/
|
||||
public static void reverse(Version matchVersion, final char[] buffer,
|
||||
final int start, final int len) {
|
||||
if (!matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||
reverseUnicode3(buffer, start, len);
|
||||
return;
|
||||
}
|
||||
/* modified version of Apache Harmony AbstractStringBuilder reverse0() */
|
||||
if (len < 2)
|
||||
return;
|
||||
|
|
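With reverseUnicode3 removed above, the static helpers always take the supplementary-character-aware path. The in-place helper in action, matching the marker example from the class javadoc:

char[] buf = "country".toCharArray();
ReverseStringFilter.reverse(Version.LUCENE_50, buf);
// buf now holds "yrtnuoc"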
|
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.ru;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
|
@@ -42,44 +41,13 @@ import org.apache.lucene.util.Version;
|
|||
* Supports an external list of stopwords (words that
|
||||
* will not be indexed at all).
|
||||
* A default set of stopwords is used unless an alternative list is specified.
|
||||
* </p>
|
||||
* <a name="version"/>
|
||||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating RussianAnalyzer:
|
||||
* <ul>
|
||||
* <li> As of 3.1, StandardTokenizer is used, Snowball stemming is done with
|
||||
* SnowballFilter, and Snowball stopwords are used by default.
|
||||
* </ul>
|
||||
*/
|
||||
public final class RussianAnalyzer extends StopwordAnalyzerBase
|
||||
{
|
||||
/**
|
||||
* List of typical Russian stopwords. (for backwards compatibility)
|
||||
* @deprecated (3.1) Remove this for LUCENE 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
private static final String[] RUSSIAN_STOP_WORDS_30 = {
|
||||
"а", "без", "более", "бы", "был", "была", "были", "было", "быть", "в",
|
||||
"вам", "вас", "весь", "во", "вот", "все", "всего", "всех", "вы", "где",
|
||||
"да", "даже", "для", "до", "его", "ее", "ей", "ею", "если", "есть",
|
||||
"еще", "же", "за", "здесь", "и", "из", "или", "им", "их", "к", "как",
|
||||
"ко", "когда", "кто", "ли", "либо", "мне", "может", "мы", "на", "надо",
|
||||
"наш", "не", "него", "нее", "нет", "ни", "них", "но", "ну", "о", "об",
|
||||
"однако", "он", "она", "они", "оно", "от", "очень", "по", "под", "при",
|
||||
"с", "со", "так", "также", "такой", "там", "те", "тем", "то", "того",
|
||||
"тоже", "той", "только", "том", "ты", "у", "уже", "хотя", "чего", "чей",
|
||||
"чем", "что", "чтобы", "чье", "чья", "эта", "эти", "это", "я"
|
||||
};
|
||||
public final class RussianAnalyzer extends StopwordAnalyzerBase {
|
||||
|
||||
/** File containing default Russian stopwords. */
|
||||
public final static String DEFAULT_STOPWORD_FILE = "russian_stop.txt";
|
||||
|
||||
private static class DefaultSetHolder {
|
||||
/** @deprecated (3.1) remove this for Lucene 5.0 */
|
||||
@Deprecated
|
||||
static final CharArraySet DEFAULT_STOP_SET_30 = CharArraySet
|
||||
.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT,
|
||||
Arrays.asList(RUSSIAN_STOP_WORDS_30), false));
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
|
@@ -106,9 +74,7 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
|||
}
|
||||
|
||||
public RussianAnalyzer(Version matchVersion) {
|
||||
this(matchVersion,
|
||||
matchVersion.onOrAfter(Version.LUCENE_31) ? DefaultSetHolder.DEFAULT_STOP_SET
|
||||
: DefaultSetHolder.DEFAULT_STOP_SET_30);
|
||||
this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -151,23 +117,13 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
|
|||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(
|
||||
result, stemExclusionSet);
|
||||
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
} else {
|
||||
final Tokenizer source = new RussianLetterTokenizer(matchVersion, reader);
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if (!stemExclusionSet.isEmpty()) result = new KeywordMarkerFilter(
|
||||
result, stemExclusionSet);
|
||||
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
final Tokenizer source = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(matchVersion, source);
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
result = new StopFilter(matchVersion, result, stopwords);
|
||||
if (!stemExclusionSet.isEmpty())
|
||||
result = new KeywordMarkerFilter(result, stemExclusionSet);
|
||||
result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
}
|
||||
|
|
|
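The inlined RUSSIAN_STOP_WORDS_30 array is gone, so the Snowball stop set loaded from russian_stop.txt is the only default left. A sketch, assuming the usual getDefaultStopSet() accessor:

CharArraySet stops = RussianAnalyzer.getDefaultStopSet();
Analyzer ru = new RussianAnalyzer(Version.LUCENE_50, stops);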
@@ -1,97 +0,0 @@
|
|||
package org.apache.lucene.analysis.ru;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.Tokenizer; // for javadocs
|
||||
import org.apache.lucene.analysis.util.CharTokenizer;
|
||||
import org.apache.lucene.analysis.core.LetterTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* A RussianLetterTokenizer is a {@link Tokenizer} that extends {@link LetterTokenizer}
|
||||
* by also allowing the basic Latin digits 0-9.
|
||||
* <p>
|
||||
* <a name="version"/>
|
||||
* You must specify the required {@link Version} compatibility when creating
|
||||
* {@link RussianLetterTokenizer}:
|
||||
* <ul>
|
||||
* <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
|
||||
* detect token characters. See {@link CharTokenizer#isTokenChar(int)} and
|
||||
* {@link CharTokenizer#normalize(int)} for details.</li>
|
||||
* </ul>
|
||||
* @deprecated (3.1) Use {@link StandardTokenizer} instead, which has the same functionality.
|
||||
* This tokenizer will be removed in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
public class RussianLetterTokenizer extends CharTokenizer
|
||||
{
|
||||
private static final int DIGIT_0 = '0';
|
||||
private static final int DIGIT_9 = '9';
|
||||
|
||||
/**
|
||||
* Construct a new RussianLetterTokenizer.
|
||||
* @param matchVersion Lucene version to match; see <a href="#version">above</a>
|
||||
*
|
||||
* @param in
|
||||
* the input to split up into tokens
|
||||
*/
|
||||
public RussianLetterTokenizer(Version matchVersion, Reader in) {
|
||||
super(matchVersion, in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new RussianLetterTokenizer using a given {@link AttributeSource}.
|
||||
*
|
||||
* @param matchVersion
|
||||
* Lucene version to match; see <a href="#version">above</a>
|
||||
* @param source
|
||||
* the attribute source to use for this {@link Tokenizer}
|
||||
* @param in
|
||||
* the input to split up into tokens
|
||||
*/
|
||||
public RussianLetterTokenizer(Version matchVersion, AttributeSource source, Reader in) {
|
||||
super(matchVersion, source, in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a new RussianLetterTokenizer using a given
|
||||
* {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
|
||||
* @param matchVersion Lucene version to match; see
|
||||
* <a href="#version">above</a>
|
||||
*
|
||||
* @param factory
|
||||
* the attribute factory to use for this {@link Tokenizer}
|
||||
* @param in
|
||||
* the input to split up into tokens
|
||||
*/
|
||||
public RussianLetterTokenizer(Version matchVersion, AttributeFactory factory, Reader in) {
|
||||
super(matchVersion, factory, in);
|
||||
}
|
||||
|
||||
/**
|
||||
* Collects only characters which satisfy
|
||||
* {@link Character#isLetter(int)}, plus the basic Latin digits 0-9.
|
||||
*/
|
||||
@Override
|
||||
protected boolean isTokenChar(int c) {
|
||||
return Character.isLetter(c) || (c >= DIGIT_0 && c <= DIGIT_9);
|
||||
}
|
||||
}
|
|
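The tokenizer's only behavioral difference from LetterTokenizer was the predicate above, which also admits the ASCII digits. In isolation, as a standalone sketch:

static boolean isRussianTokenChar(int c) {
  return Character.isLetter(c) || (c >= '0' && c <= '9');
}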
@@ -1,88 +0,0 @@
|
|||
package org.apache.lucene.analysis.snowball;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
|
||||
import org.apache.lucene.analysis.standard.*;
|
||||
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
|
||||
* LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}.
|
||||
*
|
||||
* Available stemmers are listed in org.tartarus.snowball.ext. The name of a
|
||||
* stemmer is the part of the class name before "Stemmer", e.g., the stemmer in
|
||||
* {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English".
|
||||
*
|
||||
* <p><b>NOTE</b>: This class uses the same {@link Version}
|
||||
* dependent settings as {@link StandardAnalyzer}, with the following addition:
|
||||
* <ul>
|
||||
* <li> As of 3.1, uses {@link TurkishLowerCaseFilter} for Turkish language.
|
||||
* </ul>
|
||||
* </p>
|
||||
* @deprecated (3.1) Use the language-specific analyzer in modules/analysis instead.
|
||||
* This analyzer will be removed in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
public final class SnowballAnalyzer extends Analyzer {
|
||||
private String name;
|
||||
private CharArraySet stopSet;
|
||||
private final Version matchVersion;
|
||||
|
||||
/** Builds the named analyzer with no stop words. */
|
||||
public SnowballAnalyzer(Version matchVersion, String name) {
|
||||
this.name = name;
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
/** Builds the named analyzer with the given stop words. */
|
||||
public SnowballAnalyzer(Version matchVersion, String name, CharArraySet stopWords) {
|
||||
this(matchVersion, name);
|
||||
stopSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion,
|
||||
stopWords));
|
||||
}
|
||||
|
||||
/** Constructs a {@link StandardTokenizer} filtered by a {@link
|
||||
StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter},
|
||||
and a {@link SnowballFilter} */
|
||||
@Override
|
||||
public TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
|
||||
TokenStream result = new StandardFilter(matchVersion, tokenizer);
|
||||
// remove the possessive 's for English stemmers
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31) &&
|
||||
(name.equals("English") || name.equals("Porter") || name.equals("Lovins")))
|
||||
result = new EnglishPossessiveFilter(result);
|
||||
// Use a special lowercase filter for Turkish; the stemmer expects it.
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31) && name.equals("Turkish"))
|
||||
result = new TurkishLowerCaseFilter(result);
|
||||
else
|
||||
result = new LowerCaseFilter(matchVersion, result);
|
||||
if (stopSet != null)
|
||||
result = new StopFilter(matchVersion,
|
||||
result, stopSet);
|
||||
result = new SnowballFilter(result, name);
|
||||
return new TokenStreamComponents(tokenizer, result);
|
||||
}
|
||||
}
|
|
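The deprecation points at the per-language analyzers, which bundle the same possessive, lower-casing, and Turkish special-casing shown above. A migration sketch for the English case (stopWords stands in for the caller's CharArraySet):

// old: Analyzer a = new SnowballAnalyzer(Version.LUCENE_31, "English", stopWords);
Analyzer a = new org.apache.lucene.analysis.en.EnglishAnalyzer(Version.LUCENE_50, stopWords);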
@@ -21,61 +21,19 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* Normalizes tokens extracted with {@link StandardTokenizer}.
|
||||
*/
|
||||
public class StandardFilter extends TokenFilter {
|
||||
private final Version matchVersion;
|
||||
|
||||
public StandardFilter(Version matchVersion, TokenStream in) {
|
||||
super(in);
|
||||
this.matchVersion = matchVersion;
|
||||
}
|
||||
|
||||
private static final String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
|
||||
private static final String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
|
||||
|
||||
// this filter uses the type attribute
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
||||
@Override
|
||||
public final boolean incrementToken() throws IOException {
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31))
|
||||
return input.incrementToken(); // TODO: add some niceties for the new grammar
|
||||
else
|
||||
return incrementTokenClassic();
|
||||
}
|
||||
|
||||
public final boolean incrementTokenClassic() throws IOException {
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
final char[] buffer = termAtt.buffer();
|
||||
final int bufferLength = termAtt.length();
|
||||
final String type = typeAtt.type();
|
||||
|
||||
if (type == APOSTROPHE_TYPE && // remove 's
|
||||
bufferLength >= 2 &&
|
||||
buffer[bufferLength-2] == '\'' &&
|
||||
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
|
||||
// Strip last 2 characters off
|
||||
termAtt.setLength(bufferLength - 2);
|
||||
} else if (type == ACRONYM_TYPE) { // remove dots
|
||||
int upto = 0;
|
||||
for(int i=0;i<bufferLength;i++) {
|
||||
char c = buffer[i];
|
||||
if (c != '.')
|
||||
buffer[upto++] = c;
|
||||
}
|
||||
termAtt.setLength(upto);
|
||||
}
|
||||
|
||||
return true;
|
||||
return input.incrementToken(); // TODO: add some niceties for the new grammar
|
||||
}
|
||||
}
|
||||
|
|
|
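The deleted classic path only ever did two things: strip a trailing possessive from <APOSTROPHE> tokens and strip dots from <ACRONYM> tokens. The same logic on plain strings, as a standalone sketch with the ClassicTokenizer type constants written out:

static String normalizeClassic(String term, String type) {
  if ("<APOSTROPHE>".equals(type)
      && (term.endsWith("'s") || term.endsWith("'S"))) {
    return term.substring(0, term.length() - 2); // "lucene's" -> "lucene"
  }
  if ("<ACRONYM>".equals(type)) {
    return term.replace(".", "");                // "u.s.a." -> "usa"
  }
  return term;
}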
@@ -21,7 +21,6 @@ import java.io.IOException;
|
|||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
@@ -31,31 +30,20 @@ import org.apache.lucene.util.Version;
|
|||
|
||||
/** A grammar-based tokenizer constructed with JFlex.
|
||||
* <p>
|
||||
* As of Lucene version 3.1, this class implements the Word Break rules from the
|
||||
* This class implements the Word Break rules from the
|
||||
* Unicode Text Segmentation algorithm, as specified in
|
||||
* <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
|
||||
* <p/>
|
||||
* <p>Many applications have specific tokenizer needs. If this tokenizer does
|
||||
* not suit your application, please consider copying this source code
|
||||
* directory to your project and maintaining your own grammar-based tokenizer.
|
||||
*
|
||||
* <a name="version"/>
|
||||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating StandardTokenizer:
|
||||
* <ul>
|
||||
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
|
||||
* from their combining characters. If you use a previous version number,
|
||||
* you get the exact broken behavior for backwards compatibility.
|
||||
* <li> As of 3.1, StandardTokenizer implements Unicode text segmentation.
|
||||
* If you use a previous version number, you get the exact behavior of
|
||||
* {@link ClassicTokenizer} for backwards compatibility.
|
||||
* </ul>
|
||||
*/
|
||||
|
||||
public final class StandardTokenizer extends Tokenizer {
|
||||
/** A private instance of the JFlex-constructed scanner */
|
||||
private StandardTokenizerInterface scanner;
|
||||
|
||||
// TODO: how can we remove these old types?!
|
||||
public static final int ALPHANUM = 0;
|
||||
/** @deprecated (3.1) */
|
||||
@Deprecated
|
||||
|
@@ -146,13 +134,7 @@ public final class StandardTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
private final void init(Version matchVersion) {
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_34)) {
|
||||
this.scanner = new StandardTokenizerImpl(input);
|
||||
} else if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||
this.scanner = new StandardTokenizerImpl31(input);
|
||||
} else {
|
||||
this.scanner = new ClassicTokenizerImpl(input);
|
||||
}
|
||||
this.scanner = new StandardTokenizerImpl(input);
|
||||
}
|
||||
|
||||
// this tokenizer generates three attributes:
|
||||
|
@@ -184,15 +166,7 @@ public final class StandardTokenizer extends Tokenizer {
|
|||
scanner.getText(termAtt);
|
||||
final int start = scanner.yychar();
|
||||
offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
|
||||
// This 'if' should be removed in the next release. For now, it converts
|
||||
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||
// remain.
|
||||
if (tokenType == StandardTokenizer.ACRONYM_DEP) {
|
||||
typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST]);
|
||||
termAtt.setLength(termAtt.length() - 1); // remove extra '.'
|
||||
} else {
|
||||
typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
|
||||
}
|
||||
typeAtt.setType(StandardTokenizer.TOKEN_TYPES[tokenType]);
|
||||
return true;
|
||||
} else
|
||||
// When we skip a too-long term, we still increment the
|
||||
|
|
|
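init() now unconditionally instantiates the current JFlex scanner, so matchVersion no longer selects a grammar; construction is unchanged for callers. A sketch:

Tokenizer tok = new StandardTokenizer(Version.LUCENE_50, new StringReader("Lucene 4 trunk"));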
@@ -23,8 +23,6 @@ import java.io.InputStreamReader;
|
|||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.std31.UAX29URLEmailTokenizerImpl31;
|
||||
import org.apache.lucene.analysis.standard.std34.UAX29URLEmailTokenizerImpl34;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
@@ -50,14 +48,6 @@ import org.apache.lucene.util.AttributeSource.AttributeFactory;
|
|||
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
|
||||
* <li><HIRAGANA>: A single hiragana character</li>
|
||||
* </ul>
|
||||
* <a name="version"/>
|
||||
* <p>You must specify the required {@link Version}
|
||||
* compatibility when creating UAX29URLEmailTokenizer:
|
||||
* <ul>
|
||||
* <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
|
||||
* from their combining characters. If you use a previous version number,
|
||||
* you get the exact broken behavior for backwards compatibility.
|
||||
* </ul>
|
||||
*/
|
||||
|
||||
public final class UAX29URLEmailTokenizer extends Tokenizer {
|
||||
|
@@ -128,13 +118,7 @@ public final class UAX29URLEmailTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
private static StandardTokenizerInterface getScannerFor(Version matchVersion, Reader input) {
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_36)) {
|
||||
return new UAX29URLEmailTokenizerImpl(input);
|
||||
} else if (matchVersion.onOrAfter(Version.LUCENE_34)) {
|
||||
return new UAX29URLEmailTokenizerImpl34(input);
|
||||
} else {
|
||||
return new UAX29URLEmailTokenizerImpl31(input);
|
||||
}
|
||||
return new UAX29URLEmailTokenizerImpl(input);
|
||||
}
|
||||
|
||||
// this tokenizer generates three attributes:
|
||||
|
|
|
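getScannerFor collapses the same way: one generated scanner, no per-version implementations. A usage sketch:

Tokenizer tok = new UAX29URLEmailTokenizer(Version.LUCENE_50,
    new StringReader("mail dev@lucene.apache.org or visit http://lucene.apache.org"));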
@@ -1,330 +0,0 @@
|
|||
/*
|
||||
* Copyright 2001-2005 The Apache Software Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
|
||||
// file version from Wednesday, February 9, 2011 12:34:10 PM UTC
|
||||
// generated on Wednesday, February 9, 2011 4:45:18 PM UTC
|
||||
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
|
||||
|
||||
ASCIITLD = "." (
|
||||
[aA][cC]
|
||||
| [aA][dD]
|
||||
| [aA][eE]
|
||||
| [aA][eE][rR][oO]
|
||||
| [aA][fF]
|
||||
| [aA][gG]
|
||||
| [aA][iI]
|
||||
| [aA][lL]
|
||||
| [aA][mM]
|
||||
| [aA][nN]
|
||||
| [aA][oO]
|
||||
| [aA][qQ]
|
||||
| [aA][rR]
|
||||
| [aA][rR][pP][aA]
|
||||
| [aA][sS]
|
||||
| [aA][sS][iI][aA]
|
||||
| [aA][tT]
|
||||
| [aA][uU]
|
||||
| [aA][wW]
|
||||
| [aA][xX]
|
||||
| [aA][zZ]
|
||||
| [bB][aA]
|
||||
| [bB][bB]
|
||||
| [bB][dD]
|
||||
| [bB][eE]
|
||||
| [bB][fF]
|
||||
| [bB][gG]
|
||||
| [bB][hH]
|
||||
| [bB][iI]
|
||||
| [bB][iI][zZ]
|
||||
| [bB][jJ]
|
||||
| [bB][mM]
|
||||
| [bB][nN]
|
||||
| [bB][oO]
|
||||
| [bB][rR]
|
||||
| [bB][sS]
|
||||
| [bB][tT]
|
||||
| [bB][vV]
|
||||
| [bB][wW]
|
||||
| [bB][yY]
|
||||
| [bB][zZ]
|
||||
| [cC][aA]
|
||||
| [cC][aA][tT]
|
||||
| [cC][cC]
|
||||
| [cC][dD]
|
||||
| [cC][fF]
|
||||
| [cC][gG]
|
||||
| [cC][hH]
|
||||
| [cC][iI]
|
||||
| [cC][kK]
|
||||
| [cC][lL]
|
||||
| [cC][mM]
|
||||
| [cC][nN]
|
||||
| [cC][oO]
|
||||
| [cC][oO][mM]
|
||||
| [cC][oO][oO][pP]
|
||||
| [cC][rR]
|
||||
| [cC][uU]
|
||||
| [cC][vV]
|
||||
| [cC][xX]
|
||||
| [cC][yY]
|
||||
| [cC][zZ]
|
||||
| [dD][eE]
|
||||
| [dD][jJ]
|
||||
| [dD][kK]
|
||||
| [dD][mM]
|
||||
| [dD][oO]
|
||||
| [dD][zZ]
|
||||
| [eE][cC]
|
||||
| [eE][dD][uU]
|
||||
| [eE][eE]
|
||||
| [eE][gG]
|
||||
| [eE][rR]
|
||||
| [eE][sS]
|
||||
| [eE][tT]
|
||||
| [eE][uU]
|
||||
| [fF][iI]
|
||||
| [fF][jJ]
|
||||
| [fF][kK]
|
||||
| [fF][mM]
|
||||
| [fF][oO]
|
||||
| [fF][rR]
|
||||
| [gG][aA]
|
||||
| [gG][bB]
|
||||
| [gG][dD]
|
||||
| [gG][eE]
|
||||
| [gG][fF]
|
||||
| [gG][gG]
|
||||
| [gG][hH]
|
||||
| [gG][iI]
|
||||
| [gG][lL]
|
||||
| [gG][mM]
|
||||
| [gG][nN]
|
||||
| [gG][oO][vV]
|
||||
| [gG][pP]
|
||||
| [gG][qQ]
|
||||
| [gG][rR]
|
||||
| [gG][sS]
|
||||
| [gG][tT]
|
||||
| [gG][uU]
|
||||
| [gG][wW]
|
||||
| [gG][yY]
|
||||
| [hH][kK]
|
||||
| [hH][mM]
|
||||
| [hH][nN]
|
||||
| [hH][rR]
|
||||
| [hH][tT]
|
||||
| [hH][uU]
|
||||
| [iI][dD]
|
||||
| [iI][eE]
|
||||
| [iI][lL]
|
||||
| [iI][mM]
|
||||
| [iI][nN]
|
||||
| [iI][nN][fF][oO]
|
||||
| [iI][nN][tT]
|
||||
| [iI][oO]
|
||||
| [iI][qQ]
|
||||
| [iI][rR]
|
||||
| [iI][sS]
|
||||
| [iI][tT]
|
||||
| [jJ][eE]
|
||||
| [jJ][mM]
|
||||
| [jJ][oO]
|
||||
| [jJ][oO][bB][sS]
|
||||
| [jJ][pP]
|
||||
| [kK][eE]
|
||||
| [kK][gG]
|
||||
| [kK][hH]
|
||||
| [kK][iI]
|
||||
| [kK][mM]
|
||||
| [kK][nN]
|
||||
| [kK][pP]
|
||||
| [kK][rR]
|
||||
| [kK][wW]
|
||||
| [kK][yY]
|
||||
| [kK][zZ]
|
||||
| [lL][aA]
|
||||
| [lL][bB]
|
||||
| [lL][cC]
|
||||
| [lL][iI]
|
||||
| [lL][kK]
|
||||
| [lL][rR]
|
||||
| [lL][sS]
|
||||
| [lL][tT]
|
||||
| [lL][uU]
|
||||
| [lL][vV]
|
||||
| [lL][yY]
|
||||
| [mM][aA]
|
||||
| [mM][cC]
|
||||
| [mM][dD]
|
||||
| [mM][eE]
|
||||
| [mM][gG]
|
||||
| [mM][hH]
|
||||
| [mM][iI][lL]
|
||||
| [mM][kK]
|
||||
| [mM][lL]
|
||||
| [mM][mM]
|
||||
| [mM][nN]
|
||||
| [mM][oO]
|
    | [mM][oO][bB][iI]
    | [mM][pP]
    | [mM][qQ]
    | [mM][rR]
    | [mM][sS]
    | [mM][tT]
    | [mM][uU]
    | [mM][uU][sS][eE][uU][mM]
    | [mM][vV]
    | [mM][wW]
    | [mM][xX]
    | [mM][yY]
    | [mM][zZ]
    | [nN][aA]
    | [nN][aA][mM][eE]
    | [nN][cC]
    | [nN][eE]
    | [nN][eE][tT]
    | [nN][fF]
    | [nN][gG]
    | [nN][iI]
    | [nN][lL]
    | [nN][oO]
    | [nN][pP]
    | [nN][rR]
    | [nN][uU]
    | [nN][zZ]
    | [oO][mM]
    | [oO][rR][gG]
    | [pP][aA]
    | [pP][eE]
    | [pP][fF]
    | [pP][gG]
    | [pP][hH]
    | [pP][kK]
    | [pP][lL]
    | [pP][mM]
    | [pP][nN]
    | [pP][rR]
    | [pP][rR][oO]
    | [pP][sS]
    | [pP][tT]
    | [pP][wW]
    | [pP][yY]
    | [qQ][aA]
    | [rR][eE]
    | [rR][oO]
    | [rR][sS]
    | [rR][uU]
    | [rR][wW]
    | [sS][aA]
    | [sS][bB]
    | [sS][cC]
    | [sS][dD]
    | [sS][eE]
    | [sS][gG]
    | [sS][hH]
    | [sS][iI]
    | [sS][jJ]
    | [sS][kK]
    | [sS][lL]
    | [sS][mM]
    | [sS][nN]
    | [sS][oO]
    | [sS][rR]
    | [sS][tT]
    | [sS][uU]
    | [sS][vV]
    | [sS][yY]
    | [sS][zZ]
    | [tT][cC]
    | [tT][dD]
    | [tT][eE][lL]
    | [tT][fF]
    | [tT][gG]
    | [tT][hH]
    | [tT][jJ]
    | [tT][kK]
    | [tT][lL]
    | [tT][mM]
    | [tT][nN]
    | [tT][oO]
    | [tT][pP]
    | [tT][rR]
    | [tT][rR][aA][vV][eE][lL]
    | [tT][tT]
    | [tT][vV]
    | [tT][wW]
    | [tT][zZ]
    | [uU][aA]
    | [uU][gG]
    | [uU][kK]
    | [uU][sS]
    | [uU][yY]
    | [uU][zZ]
    | [vV][aA]
    | [vV][cC]
    | [vV][eE]
    | [vV][gG]
    | [vV][iI]
    | [vV][nN]
    | [vV][uU]
    | [wW][fF]
    | [wW][sS]
    | [xX][nN]--0[zZ][wW][mM]56[dD]
    | [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
    | [xX][nN]--3[eE]0[bB]707[eE]
    | [xX][nN]--45[bB][rR][jJ]9[cC]
    | [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
    | [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
    | [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
    | [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
    | [xX][nN]--[fF][iI][qQ][sS]8[sS]
    | [xX][nN]--[fF][iI][qQ][zZ]9[sS]
    | [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
    | [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
    | [xX][nN]--[gG]6[wW]251[dD]
    | [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
    | [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
    | [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
    | [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
    | [xX][nN]--[jJ]6[wW]193[gG]
    | [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
    | [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
    | [xX][nN]--[kK][pP][rR][wW]13[dD]
    | [xX][nN]--[kK][pP][rR][yY]57[dD]
    | [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
    | [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
    | [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
    | [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
    | [xX][nN]--[oO]3[cC][wW]4[hH]
    | [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
    | [xX][nN]--[pP]1[aA][iI]
    | [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
    | [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
    | [xX][nN]--[wW][gG][bB][hH]1[cC]
    | [xX][nN]--[wW][gG][bB][lL]6[aA]
    | [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
    | [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
    | [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
    | [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
    | [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
    | [yY][eE]
    | [yY][tT]
    | [zZ][aA]
    | [zZ][mM]
    | [zZ][wW]
    ) "."?   // Accept trailing root (empty) domain
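A note on the notation above: each alternative spells out both cases per character ([mM][oO][bB][iI] and so on), so the TLD match is case-insensitive without any scanner-wide case-folding option. A rough, illustrative Java equivalent of the check this macro performs (TldCheck, KNOWN_TLDS, and hasKnownTld are made-up names for the sketch, not Lucene API):

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Locale;
    import java.util.Set;

    public class TldCheck {
      // Illustrative subset of the alternatives listed above.
      static final Set<String> KNOWN_TLDS =
          new HashSet<String>(Arrays.asList("mobi", "museum", "name", "net", "org", "travel"));

      // Case-insensitive test of the final label, mirroring the [mM][oO][bB][iI] classes.
      static boolean hasKnownTld(String host) {
        if (host.endsWith(".")) host = host.substring(0, host.length() - 1); // trailing root label
        int dot = host.lastIndexOf('.');
        return dot >= 0 && KNOWN_TLDS.contains(host.substring(dot + 1).toLowerCase(Locale.ROOT));
      }

      public static void main(String[] args) {
        System.out.println(hasKnownTld("lucene.apache.ORG")); // true
        System.out.println(hasKnownTld("localhost"));         // false
      }
    }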
@@ -1,125 +0,0 @@
/*
 * Copyright 2010 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Generated using ICU4J 4.6.0.0 on Wednesday, February 9, 2011 4:45:11 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros


ALetterSupp = (
      ([\ud80d][\uDC00-\uDC2E])
    | ([\ud80c][\uDC00-\uDFFF])
    | ([\ud809][\uDC00-\uDC62])
    | ([\ud808][\uDC00-\uDF6E])
    | ([\ud81a][\uDC00-\uDE38])
    | ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
    | ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
    | ([\ud801][\uDC00-\uDC9D])
    | ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
    | ([\ud803][\uDC00-\uDC48])
    | ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
FormatSupp = (
      ([\ud804][\uDCBD])
    | ([\ud834][\uDD73-\uDD7A])
    | ([\udb40][\uDC01\uDC20-\uDC7F])
)
ExtendSupp = (
      ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
    | ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
    | ([\ud800][\uDDFD])
    | ([\udb40][\uDD00-\uDDEF])
    | ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
      ([\ud804][\uDC66-\uDC6F])
    | ([\ud835][\uDFCE-\uDFFF])
    | ([\ud801][\uDCA0-\uDCA9])
)
KatakanaSupp = (
      ([\ud82c][\uDC00])
)
MidLetterSupp = (
      []
)
MidNumSupp = (
      []
)
MidNumLetSupp = (
      []
)
ExtendNumLetSupp = (
      []
)
ExtendNumLetSupp = (
      []
)
ComplexContextSupp = (
      []
)
HanSupp = (
      ([\ud87e][\uDC00-\uDE1D])
    | ([\ud86b][\uDC00-\uDFFF])
    | ([\ud86a][\uDC00-\uDFFF])
    | ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
    | ([\ud868][\uDC00-\uDFFF])
    | ([\ud86e][\uDC00-\uDC1D])
    | ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
    | ([\ud86c][\uDC00-\uDFFF])
    | ([\ud863][\uDC00-\uDFFF])
    | ([\ud862][\uDC00-\uDFFF])
    | ([\ud861][\uDC00-\uDFFF])
    | ([\ud860][\uDC00-\uDFFF])
    | ([\ud867][\uDC00-\uDFFF])
    | ([\ud866][\uDC00-\uDFFF])
    | ([\ud865][\uDC00-\uDFFF])
    | ([\ud864][\uDC00-\uDFFF])
    | ([\ud858][\uDC00-\uDFFF])
    | ([\ud859][\uDC00-\uDFFF])
    | ([\ud85a][\uDC00-\uDFFF])
    | ([\ud85b][\uDC00-\uDFFF])
    | ([\ud85c][\uDC00-\uDFFF])
    | ([\ud85d][\uDC00-\uDFFF])
    | ([\ud85e][\uDC00-\uDFFF])
    | ([\ud85f][\uDC00-\uDFFF])
    | ([\ud850][\uDC00-\uDFFF])
    | ([\ud851][\uDC00-\uDFFF])
    | ([\ud852][\uDC00-\uDFFF])
    | ([\ud853][\uDC00-\uDFFF])
    | ([\ud854][\uDC00-\uDFFF])
    | ([\ud855][\uDC00-\uDFFF])
    | ([\ud856][\uDC00-\uDFFF])
    | ([\ud857][\uDC00-\uDFFF])
    | ([\ud849][\uDC00-\uDFFF])
    | ([\ud848][\uDC00-\uDFFF])
    | ([\ud84b][\uDC00-\uDFFF])
    | ([\ud84a][\uDC00-\uDFFF])
    | ([\ud84d][\uDC00-\uDFFF])
    | ([\ud84c][\uDC00-\uDFFF])
    | ([\ud84f][\uDC00-\uDFFF])
    | ([\ud84e][\uDC00-\uDFFF])
    | ([\ud841][\uDC00-\uDFFF])
    | ([\ud840][\uDC00-\uDFFF])
    | ([\ud843][\uDC00-\uDFFF])
    | ([\ud842][\uDC00-\uDFFF])
    | ([\ud845][\uDC00-\uDFFF])
    | ([\ud844][\uDC00-\uDFFF])
    | ([\ud847][\uDC00-\uDFFF])
    | ([\ud846][\uDC00-\uDFFF])
)
HiraganaSupp = (
      ([\ud83c][\uDE00])
    | ([\ud82c][\uDC01])
)
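A note on the macros above: JFlex at this point matched 16-bit code units, so each supplementary range is written as a high surrogate followed by a class of low surrogates. The correspondence is easy to verify with plain java.lang.Character (a small self-contained sketch):

    public class SurrogateRanges {
      public static void main(String[] args) {
        // ([\ud80d][\uDC00-\uDC2E]) above encodes one contiguous code point range.
        int lo = Character.toCodePoint('\ud80d', '\udc00');
        int hi = Character.toCodePoint('\ud80d', '\udc2e');
        System.out.printf("U+%04X..U+%04X%n", lo, hi); // U+13400..U+1342E

        // And the direction the generator works in: code point -> surrogate pair.
        char[] pair = Character.toChars(0x13400);
        System.out.printf("\\u%04X \\u%04X%n", (int) pair[0], (int) pair[1]); // D80D DC00
      }
    }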
File diff suppressed because it is too large
@@ -1,184 +0,0 @@
package org.apache.lucene.analysis.standard.std31;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * This class implements StandardTokenizer, except with a bug
 * (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana
 * characters would be split from combining characters:
 * @deprecated This class is only for exact backwards compatibility
 */
@Deprecated
%%

%unicode 6.0
%integer
%final
%public
%class StandardTokenizerImpl31
%implements StandardTokenizerInterface
%function getNextToken
%char

%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})

// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*


%{
  /** Alphanumeric sequences */
  public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;

  /** Numbers */
  public static final int NUMERIC_TYPE = StandardTokenizer.NUM;

  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
   * together as a single token rather than broken up, because the logic
   * required to break them at word boundaries is too complex for UAX#29.
   * <p>
   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
   */
  public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;

  public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;

  public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;

  public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;

  public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;

  public final int yychar()
  {
    return yychar;
  }

  /**
   * Fills CharTermAttribute with the current token text.
   */
  public final void getText(CharTermAttribute t) {
    t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
  }
%}

%%

// UAX#29 WB1.   sot   ÷
//        WB2.     ÷   eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }

// UAX#29 WB8.   Numeric × Numeric
//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
                              | {MidNumericEx} {NumericEx}
                              | {NumericEx})*
{ExtendNumLetEx}*
  { return NUMERIC_TYPE; }

// subset of the below for typing purposes only!
{HangulEx}+
  { return HANGUL_TYPE; }

{KatakanaEx}+
  { return KATAKANA_TYPE; }

// UAX#29 WB5.   ALetter × ALetter
//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
//        WB9.   ALetter × Numeric
//        WB10.  Numeric × ALetter
//        WB13.  Katakana × Katakana
//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
                   | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
                     | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
                   | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
                     | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
  { return WORD_TYPE; }


// From UAX #29:
//
//    [C]haracters with the Line_Break property values of Contingent_Break (CB),
//    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
//    boundary property values based on criteria outside of the scope of this
//    annex.  That means that satisfactory treatment of languages like Chinese
//    or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together.  This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
//    http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }

// UAX#29 WB14.  Any ÷ Any
//
{Han} { return IDEOGRAPHIC_TYPE; }
{Hiragana} { return HIRAGANA_TYPE; }


// UAX#29 WB3.   CR × LF
//        WB3a.  (Newline | CR | LF) ÷
//        WB3b.  ÷ (Newline | CR | LF)
//        WB14.  Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
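The %function getNextToken, %char, and the getText/yychar helpers in the %{ ... %} block make up the surface that the tokenizer classes drive. A sketch of the consumption loop (this illustrates the calling pattern only, not the actual StandardTokenizer code; the scanner instance would come from the JFlex-generated class):

    import java.io.IOException;
    import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    class ScannerLoopSketch {
      // Pulls tokens until the generated scanner signals end-of-input.
      static void drain(StandardTokenizerInterface scanner, CharTermAttribute termAtt)
          throws IOException {
        int tokenType;
        while ((tokenType = scanner.getNextToken()) != StandardTokenizerInterface.YYEOF) {
          scanner.getText(termAtt);           // copies the matched text into the attribute
          int startOffset = scanner.yychar(); // %char makes character offsets available
          // ... a real tokenizer would now set offsets and type, then emit the token
        }
      }
    }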
File diff suppressed because it is too large
@@ -1,269 +0,0 @@
package org.apache.lucene.analysis.standard.std31;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * This class implements UAX29URLEmailTokenizer, except with a bug
 * (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana
 * characters would be split from combining characters:
 * @deprecated This class is only for exact backwards compatibility
 */
@Deprecated
%%

%unicode 6.0
%integer
%final
%public
%class UAX29URLEmailTokenizerImpl31
%implements StandardTokenizerInterface
%function getNextToken
%char

%include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})

// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*


// URL and E-mail syntax specifications:
//
//     RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
//     RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
//     RFC-1123: Requirements for Internet Hosts - Application and Support
//     RFC-1738: Uniform Resource Locators (URL)
//     RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
//     RFC-5234: Augmented BNF for Syntax Specifications: ABNF
//     RFC-5321: Simple Mail Transfer Protocol
//     RFC-5322: Internet Message Format

%include src/java/org/apache/lucene/analysis/standard/std31/ASCIITLD.jflex-macro

DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose  = {DomainLabel} ("." {DomainLabel})*

IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address  = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
            | "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
            | {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"

URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery    = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose  = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}

URIauthorityStrict =             {URIhostStrict} {URIport}?
URIauthorityLoose  = {URIlogin}? {URIhostLoose}  {URIport}?

HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}

FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?

FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?

URL = {HTTPurl} | {FTPurl} | {FILEurl}

EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})


%{
  /** Alphanumeric sequences */
  public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;

  /** Numbers */
  public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;

  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
   * together as a single token rather than broken up, because the logic
   * required to break them at word boundaries is too complex for UAX#29.
   * <p>
   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
   */
  public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;

  public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;

  public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;

  public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;

  public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;

  public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;

  public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;

  public final int yychar()
  {
    return yychar;
  }

  /**
   * Fills CharTermAttribute with the current token text.
   */
  public final void getText(CharTermAttribute t) {
    t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
  }
%}

%%

// UAX#29 WB1.   sot   ÷
//        WB2.     ÷   eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }

{URL}   { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }

// UAX#29 WB8.   Numeric × Numeric
//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
                              | {MidNumericEx} {NumericEx}
                              | {NumericEx})*
{ExtendNumLetEx}*
  { return NUMERIC_TYPE; }

// subset of the below for typing purposes only!
{HangulEx}+
  { return HANGUL_TYPE; }

{KatakanaEx}+
  { return KATAKANA_TYPE; }

// UAX#29 WB5.   ALetter × ALetter
//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
//        WB9.   ALetter × Numeric
//        WB10.  Numeric × ALetter
//        WB13.  Katakana × Katakana
//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
                   | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
                     | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
                   | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
                     | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
  { return WORD_TYPE; }


// From UAX #29:
//
//    [C]haracters with the Line_Break property values of Contingent_Break (CB),
//    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
//    boundary property values based on criteria outside of the scope of this
//    annex.  That means that satisfactory treatment of languages like Chinese
//    or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together.  This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
//    http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }

// UAX#29 WB14.  Any ÷ Any
//
{Han} { return IDEOGRAPHIC_TYPE; }
{Hiragana} { return HIRAGANA_TYPE; }


// UAX#29 WB3.   CR × LF
//        WB3a.  (Newline | CR | LF) ÷
//        WB3b.  ÷ (Newline | CR | LF)
//        WB14.  Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
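The {IPv4DecimalOctet} macro in the grammar above is a plain enumeration of 0-255 that tolerates up to two leading zeros. A hedged translation into java.util.regex, with a couple of spot checks:

    import java.util.regex.Pattern;

    public class OctetCheck {
      // Same alternatives as the JFlex macro: 0-9 (with 0-2 leading zeros),
      // 10-99 (optional leading zero), 100-199, 200-249, 250-255.
      static final Pattern OCTET = Pattern.compile(
          "0{0,2}[0-9]|0?[1-9][0-9]|1[0-9][0-9]|2(?:[0-4][0-9]|5[0-5])");

      public static void main(String[] args) {
        for (String s : new String[] {"0", "007", "255", "256", "999"}) {
          System.out.println(s + " -> " + OCTET.matcher(s).matches());
        }
        // 0, 007, and 255 match; 256 and 999 do not.
      }
    }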
@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Backwards-compatible implementation to match {@link org.apache.lucene.util.Version#LUCENE_31}
</body>
</html>
@@ -1,334 +0,0 @@
/*
 * Copyright 2001-2005 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Thursday, August 4, 2011 11:34:20 AM UTC
// generated on Thursday, August 4, 2011 11:46:19 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros

ASCIITLD = "." (
      [aA][cC]
    | [aA][dD]
    | [aA][eE]
    | [aA][eE][rR][oO]
    | [aA][fF]
    | [aA][gG]
    | [aA][iI]
    | [aA][lL]
    | [aA][mM]
    | [aA][nN]
    | [aA][oO]
    | [aA][qQ]
    | [aA][rR]
    | [aA][rR][pP][aA]
    | [aA][sS]
    | [aA][sS][iI][aA]
    | [aA][tT]
    | [aA][uU]
    | [aA][wW]
    | [aA][xX]
    | [aA][zZ]
    | [bB][aA]
    | [bB][bB]
    | [bB][dD]
    | [bB][eE]
    | [bB][fF]
    | [bB][gG]
    | [bB][hH]
    | [bB][iI]
    | [bB][iI][zZ]
    | [bB][jJ]
    | [bB][mM]
    | [bB][nN]
    | [bB][oO]
    | [bB][rR]
    | [bB][sS]
    | [bB][tT]
    | [bB][vV]
    | [bB][wW]
    | [bB][yY]
    | [bB][zZ]
    | [cC][aA]
    | [cC][aA][tT]
    | [cC][cC]
    | [cC][dD]
    | [cC][fF]
    | [cC][gG]
    | [cC][hH]
    | [cC][iI]
    | [cC][kK]
    | [cC][lL]
    | [cC][mM]
    | [cC][nN]
    | [cC][oO]
    | [cC][oO][mM]
    | [cC][oO][oO][pP]
    | [cC][rR]
    | [cC][uU]
    | [cC][vV]
    | [cC][xX]
    | [cC][yY]
    | [cC][zZ]
    | [dD][eE]
    | [dD][jJ]
    | [dD][kK]
    | [dD][mM]
    | [dD][oO]
    | [dD][zZ]
    | [eE][cC]
    | [eE][dD][uU]
    | [eE][eE]
    | [eE][gG]
    | [eE][rR]
    | [eE][sS]
    | [eE][tT]
    | [eE][uU]
    | [fF][iI]
    | [fF][jJ]
    | [fF][kK]
    | [fF][mM]
    | [fF][oO]
    | [fF][rR]
    | [gG][aA]
    | [gG][bB]
    | [gG][dD]
    | [gG][eE]
    | [gG][fF]
    | [gG][gG]
    | [gG][hH]
    | [gG][iI]
    | [gG][lL]
    | [gG][mM]
    | [gG][nN]
    | [gG][oO][vV]
    | [gG][pP]
    | [gG][qQ]
    | [gG][rR]
    | [gG][sS]
    | [gG][tT]
    | [gG][uU]
    | [gG][wW]
    | [gG][yY]
    | [hH][kK]
    | [hH][mM]
    | [hH][nN]
    | [hH][rR]
    | [hH][tT]
    | [hH][uU]
    | [iI][dD]
    | [iI][eE]
    | [iI][lL]
    | [iI][mM]
    | [iI][nN]
    | [iI][nN][fF][oO]
    | [iI][nN][tT]
    | [iI][oO]
    | [iI][qQ]
    | [iI][rR]
    | [iI][sS]
    | [iI][tT]
    | [jJ][eE]
    | [jJ][mM]
    | [jJ][oO]
    | [jJ][oO][bB][sS]
    | [jJ][pP]
    | [kK][eE]
    | [kK][gG]
    | [kK][hH]
    | [kK][iI]
    | [kK][mM]
    | [kK][nN]
    | [kK][pP]
    | [kK][rR]
    | [kK][wW]
    | [kK][yY]
    | [kK][zZ]
    | [lL][aA]
    | [lL][bB]
    | [lL][cC]
    | [lL][iI]
    | [lL][kK]
    | [lL][rR]
    | [lL][sS]
    | [lL][tT]
    | [lL][uU]
    | [lL][vV]
    | [lL][yY]
    | [mM][aA]
    | [mM][cC]
    | [mM][dD]
    | [mM][eE]
    | [mM][gG]
    | [mM][hH]
    | [mM][iI][lL]
    | [mM][kK]
    | [mM][lL]
    | [mM][mM]
    | [mM][nN]
    | [mM][oO]
    | [mM][oO][bB][iI]
    | [mM][pP]
    | [mM][qQ]
    | [mM][rR]
    | [mM][sS]
    | [mM][tT]
    | [mM][uU]
    | [mM][uU][sS][eE][uU][mM]
    | [mM][vV]
    | [mM][wW]
    | [mM][xX]
    | [mM][yY]
    | [mM][zZ]
    | [nN][aA]
    | [nN][aA][mM][eE]
    | [nN][cC]
    | [nN][eE]
    | [nN][eE][tT]
    | [nN][fF]
    | [nN][gG]
    | [nN][iI]
    | [nN][lL]
    | [nN][oO]
    | [nN][pP]
    | [nN][rR]
    | [nN][uU]
    | [nN][zZ]
    | [oO][mM]
    | [oO][rR][gG]
    | [pP][aA]
    | [pP][eE]
    | [pP][fF]
    | [pP][gG]
    | [pP][hH]
    | [pP][kK]
    | [pP][lL]
    | [pP][mM]
    | [pP][nN]
    | [pP][rR]
    | [pP][rR][oO]
    | [pP][sS]
    | [pP][tT]
    | [pP][wW]
    | [pP][yY]
    | [qQ][aA]
    | [rR][eE]
    | [rR][oO]
    | [rR][sS]
    | [rR][uU]
    | [rR][wW]
    | [sS][aA]
    | [sS][bB]
    | [sS][cC]
    | [sS][dD]
    | [sS][eE]
    | [sS][gG]
    | [sS][hH]
    | [sS][iI]
    | [sS][jJ]
    | [sS][kK]
    | [sS][lL]
    | [sS][mM]
    | [sS][nN]
    | [sS][oO]
    | [sS][rR]
    | [sS][tT]
    | [sS][uU]
    | [sS][vV]
    | [sS][yY]
    | [sS][zZ]
    | [tT][cC]
    | [tT][dD]
    | [tT][eE][lL]
    | [tT][fF]
    | [tT][gG]
    | [tT][hH]
    | [tT][jJ]
    | [tT][kK]
    | [tT][lL]
    | [tT][mM]
    | [tT][nN]
    | [tT][oO]
    | [tT][pP]
    | [tT][rR]
    | [tT][rR][aA][vV][eE][lL]
    | [tT][tT]
    | [tT][vV]
    | [tT][wW]
    | [tT][zZ]
    | [uU][aA]
    | [uU][gG]
    | [uU][kK]
    | [uU][sS]
    | [uU][yY]
    | [uU][zZ]
    | [vV][aA]
    | [vV][cC]
    | [vV][eE]
    | [vV][gG]
    | [vV][iI]
    | [vV][nN]
    | [vV][uU]
    | [wW][fF]
    | [wW][sS]
    | [xX][nN]--0[zZ][wW][mM]56[dD]
    | [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
    | [xX][nN]--3[eE]0[bB]707[eE]
    | [xX][nN]--45[bB][rR][jJ]9[cC]
    | [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
    | [xX][nN]--90[aA]3[aA][cC]
    | [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
    | [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
    | [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
    | [xX][nN]--[fF][iI][qQ][sS]8[sS]
    | [xX][nN]--[fF][iI][qQ][zZ]9[sS]
    | [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
    | [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
    | [xX][nN]--[gG]6[wW]251[dD]
    | [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
    | [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
    | [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
    | [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
    | [xX][nN]--[jJ]6[wW]193[gG]
    | [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
    | [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
    | [xX][nN]--[kK][pP][rR][wW]13[dD]
    | [xX][nN]--[kK][pP][rR][yY]57[dD]
    | [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
    | [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
    | [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
    | [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
    | [xX][nN]--[mM][gG][bB][cC]0[aA]9[aA][zZ][cC][gG]
    | [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
    | [xX][nN]--[oO]3[cC][wW]4[hH]
    | [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
    | [xX][nN]--[pP]1[aA][iI]
    | [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
    | [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
    | [xX][nN]--[wW][gG][bB][hH]1[cC]
    | [xX][nN]--[wW][gG][bB][lL]6[aA]
    | [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
    | [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
    | [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
    | [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
    | [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
    | [xX][xX][xX]
    | [yY][eE]
    | [yY][tT]
    | [zZ][aA]
    | [zZ][mM]
    | [zZ][wW]
    ) "."?   // Accept trailing root (empty) domain
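The closing `"."?` accepts a fully-qualified name with an explicit, empty root label (e.g. lucene.apache.org.). A simplified, illustrative shape of {DomainLabel} plus a TLD with that optional trailing dot (the three-TLD subset here is made up for the example, not the full macro):

    import java.util.regex.Pattern;

    public class RootDomainDemo {
      // Labels as in {DomainLabel}, a known TLD, then an optional trailing dot.
      static final Pattern STRICT = Pattern.compile(
          "(?:[A-Za-z0-9](?:[-A-Za-z0-9]*[A-Za-z0-9])?\\.)+(?i:org|com|net)\\.?");

      public static void main(String[] args) {
        System.out.println(STRICT.matcher("lucene.apache.org").matches());  // true
        System.out.println(STRICT.matcher("lucene.apache.org.").matches()); // true
        System.out.println(STRICT.matcher("lucene.apache.").matches());     // false
      }
    }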
@@ -1,125 +0,0 @@
/*
 * Copyright 2010 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Generated using ICU4J 4.8.0.0 on Friday, September 30, 2011 4:10:42 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros


ALetterSupp = (
      ([\ud80d][\uDC00-\uDC2E])
    | ([\ud80c][\uDC00-\uDFFF])
    | ([\ud809][\uDC00-\uDC62])
    | ([\ud808][\uDC00-\uDF6E])
    | ([\ud81a][\uDC00-\uDE38])
    | ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
    | ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
    | ([\ud801][\uDC00-\uDC9D])
    | ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
    | ([\ud803][\uDC00-\uDC48])
    | ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
FormatSupp = (
      ([\ud804][\uDCBD])
    | ([\ud834][\uDD73-\uDD7A])
    | ([\udb40][\uDC01\uDC20-\uDC7F])
)
ExtendSupp = (
      ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
    | ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
    | ([\ud800][\uDDFD])
    | ([\udb40][\uDD00-\uDDEF])
    | ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
      ([\ud804][\uDC66-\uDC6F])
    | ([\ud835][\uDFCE-\uDFFF])
    | ([\ud801][\uDCA0-\uDCA9])
)
KatakanaSupp = (
      ([\ud82c][\uDC00])
)
MidLetterSupp = (
      []
)
MidNumSupp = (
      []
)
MidNumLetSupp = (
      []
)
ExtendNumLetSupp = (
      []
)
ExtendNumLetSupp = (
      []
)
ComplexContextSupp = (
      []
)
HanSupp = (
      ([\ud87e][\uDC00-\uDE1D])
    | ([\ud86b][\uDC00-\uDFFF])
    | ([\ud86a][\uDC00-\uDFFF])
    | ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
    | ([\ud868][\uDC00-\uDFFF])
    | ([\ud86e][\uDC00-\uDC1D])
    | ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
    | ([\ud86c][\uDC00-\uDFFF])
    | ([\ud863][\uDC00-\uDFFF])
    | ([\ud862][\uDC00-\uDFFF])
    | ([\ud861][\uDC00-\uDFFF])
    | ([\ud860][\uDC00-\uDFFF])
    | ([\ud867][\uDC00-\uDFFF])
    | ([\ud866][\uDC00-\uDFFF])
    | ([\ud865][\uDC00-\uDFFF])
    | ([\ud864][\uDC00-\uDFFF])
    | ([\ud858][\uDC00-\uDFFF])
    | ([\ud859][\uDC00-\uDFFF])
    | ([\ud85a][\uDC00-\uDFFF])
    | ([\ud85b][\uDC00-\uDFFF])
    | ([\ud85c][\uDC00-\uDFFF])
    | ([\ud85d][\uDC00-\uDFFF])
    | ([\ud85e][\uDC00-\uDFFF])
    | ([\ud85f][\uDC00-\uDFFF])
    | ([\ud850][\uDC00-\uDFFF])
    | ([\ud851][\uDC00-\uDFFF])
    | ([\ud852][\uDC00-\uDFFF])
    | ([\ud853][\uDC00-\uDFFF])
    | ([\ud854][\uDC00-\uDFFF])
    | ([\ud855][\uDC00-\uDFFF])
    | ([\ud856][\uDC00-\uDFFF])
    | ([\ud857][\uDC00-\uDFFF])
    | ([\ud849][\uDC00-\uDFFF])
    | ([\ud848][\uDC00-\uDFFF])
    | ([\ud84b][\uDC00-\uDFFF])
    | ([\ud84a][\uDC00-\uDFFF])
    | ([\ud84d][\uDC00-\uDFFF])
    | ([\ud84c][\uDC00-\uDFFF])
    | ([\ud84f][\uDC00-\uDFFF])
    | ([\ud84e][\uDC00-\uDFFF])
    | ([\ud841][\uDC00-\uDFFF])
    | ([\ud840][\uDC00-\uDFFF])
    | ([\ud843][\uDC00-\uDFFF])
    | ([\ud842][\uDC00-\uDFFF])
    | ([\ud845][\uDC00-\uDFFF])
    | ([\ud844][\uDC00-\uDFFF])
    | ([\ud847][\uDC00-\uDFFF])
    | ([\ud846][\uDC00-\uDFFF])
)
HiraganaSupp = (
      ([\ud83c][\uDE00])
    | ([\ud82c][\uDC01])
)
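Most of the HanSupp alternatives span an entire low-surrogate block; each high surrogate covers exactly 1024 supplementary code points. A quick check of what ([\ud840][\uDC00-\uDFFF]) covers, using only the standard library:

    public class HanBlocks {
      public static void main(String[] args) {
        int lo = Character.toCodePoint('\ud840', '\udc00');
        int hi = Character.toCodePoint('\ud840', '\udfff');
        // Prints U+20000..U+203FF (1024 code points), the start of CJK Extension B.
        System.out.printf("U+%04X..U+%04X (%d code points)%n", lo, hi, hi - lo + 1);
      }
    }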
File diff suppressed because it is too large
@@ -1,272 +0,0 @@
package org.apache.lucene.analysis.standard.std34;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * This class implements UAX29URLEmailTokenizer, except with a bug
 * (https://issues.apache.org/jira/browse/LUCENE-3880) where "mailto:"
 * URI scheme prepended to an email address will disrupt recognition
 * of the email address.
 * @deprecated This class is only for exact backwards compatibility
 */
@Deprecated
%%

%unicode 6.0
%integer
%final
%public
%class UAX29URLEmailTokenizerImpl34
%implements StandardTokenizerInterface
%function getNextToken
%char

%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})

// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*

HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*

// URL and E-mail syntax specifications:
//
//     RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
//     RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
//     RFC-1123: Requirements for Internet Hosts - Application and Support
//     RFC-1738: Uniform Resource Locators (URL)
//     RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
//     RFC-5234: Augmented BNF for Syntax Specifications: ABNF
//     RFC-5321: Simple Mail Transfer Protocol
//     RFC-5322: Internet Message Format

%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro

DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
DomainNameLoose  = {DomainLabel} ("." {DomainLabel})*

IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
IPv4Address  = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
IPv6Address = ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
            | "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
            | {IPv6Hex16Bit}? "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit} ":" {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::" {IPv6LeastSignificant32Bits}
            | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::" {IPv6Hex16Bit}
            | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"

URIunreserved = [-._~A-Za-z0-9]
URIpercentEncoded = "%" [0-9A-Fa-f]{2}
URIsubDelims = [!$&'()*+,;=]
URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
URIquery    = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
URIport = ":" [0-9]{1,5}
URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
URIhostLoose  = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}

URIauthorityStrict =             {URIhostStrict} {URIport}?
URIauthorityLoose  = {URIlogin}? {URIhostLoose}  {URIport}?

HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
HTTPpath = ("/" {HTTPsegment})*
HTTPscheme = [hH][tT][tT][pP][sS]? "://"
HTTPurlFull = {HTTPscheme} {URIauthorityLoose} {HTTPpath}? {URIquery}? {URIfragment}?
// {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
HTTPurlNoScheme = {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}

FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
FTPscheme = [fF][tT][pP] "://"
FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?

FILEscheme = [fF][iI][lL][eE] "://"
FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?

URL = {HTTPurl} | {FTPurl} | {FILEurl}

EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
// DFA minimization allows {IPv6Address} and {IPv4Address} to be included
// in the {EMAILbracketedHost} definition without incurring any size penalties,
// since {EMAILdomainLiteralText} recognizes all valid IP addresses.
// The IP address regexes are included in {EMAILbracketedHost} simply as a
// reminder that they are acceptable bracketed host forms.
EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})


%{
  /** Alphanumeric sequences */
  public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;

  /** Numbers */
  public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;

  /**
   * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
   * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
   * together as a single token rather than broken up, because the logic
   * required to break them at word boundaries is too complex for UAX#29.
   * <p>
   * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
   */
  public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;

  public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;

  public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;

  public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;

  public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;

  public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;

  public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;

  public final int yychar()
  {
    return yychar;
  }

  /**
   * Fills CharTermAttribute with the current token text.
   */
  public final void getText(CharTermAttribute t) {
    t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
  }
%}

%%

// UAX#29 WB1.   sot   ÷
//        WB2.     ÷   eot
//
<<EOF>> { return StandardTokenizerInterface.YYEOF; }

{URL}   { return URL_TYPE; }
{EMAIL} { return EMAIL_TYPE; }

// UAX#29 WB8.   Numeric × Numeric
//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
                              | {MidNumericEx} {NumericEx}
                              | {NumericEx})*
{ExtendNumLetEx}*
  { return NUMERIC_TYPE; }

// subset of the below for typing purposes only!
{HangulEx}+
  { return HANGUL_TYPE; }

{KatakanaEx}+
  { return KATAKANA_TYPE; }

// UAX#29 WB5.   ALetter × ALetter
//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
//        WB9.   ALetter × Numeric
//        WB10.  Numeric × ALetter
//        WB13.  Katakana × Katakana
//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
//
{ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
                   | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
                     | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
                   | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
                     | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
  { return WORD_TYPE; }


// From UAX #29:
//
//    [C]haracters with the Line_Break property values of Contingent_Break (CB),
//    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
//    boundary property values based on criteria outside of the scope of this
//    annex.  That means that satisfactory treatment of languages like Chinese
//    or Thai requires special handling.
//
// In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
// character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
// Lao, etc.) are kept together.  This grammar does the same below.
//
// See also the Unicode Line Breaking Algorithm:
//
//    http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }

// UAX#29 WB14.  Any ÷ Any
//
{HanEx} { return IDEOGRAPHIC_TYPE; }
{HiraganaEx} { return HIRAGANA_TYPE; }


// UAX#29 WB3.   CR × LF
//        WB3a.  (Newline | CR | LF) ÷
//        WB3b.  ÷ (Newline | CR | LF)
//        WB14.  Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
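One visible difference from the std31 grammars above: the ideographic rules here use {HanEx} and {HiraganaEx}, which append trailing {Format}/{Extend} characters. That is the LUCENE-3358 fix, so a hiragana base character plus a combining mark stays one token. What such input looks like, sketched with java.text.Normalizer (illustrative only):

    import java.text.Normalizer;

    public class CombiningDemo {
      public static void main(String[] args) {
        // NFD splits が (U+304C) into か (U+304B) plus the combining voiced mark (U+3099).
        String decomposed = Normalizer.normalize("\u304C", Normalizer.Form.NFD);
        for (int i = 0; i < decomposed.length(); i++) {
          System.out.printf("U+%04X ", (int) decomposed.charAt(i));
        }
        System.out.println(); // U+304B U+3099
        // Under the bare {Hiragana} rule the U+3099 was split off;
        // {HiraganaEx} keeps it attached to the base character.
      }
    }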
@@ -1,22 +0,0 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html><head></head>
<body>
Backwards-compatible implementation to match {@link org.apache.lucene.util.Version#LUCENE_34}
</body>
</html>
@@ -33,13 +33,6 @@ import org.apache.lucene.util.Version;

/**
 * {@link Analyzer} for Thai language. It uses {@link java.text.BreakIterator} to break words.
- * <p>
- * <a name="version"/>
- * <p>You must specify the required {@link Version}
- * compatibility when creating ThaiAnalyzer:
- * <ul>
- * <li> As of 3.6, a set of Thai stopwords is used by default
- * </ul>
 */
public final class ThaiAnalyzer extends StopwordAnalyzerBase {

@@ -84,7 +77,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
   * @param matchVersion lucene compatibility version
   */
  public ThaiAnalyzer(Version matchVersion) {
-    this(matchVersion, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STOP_SET : StopAnalyzer.ENGLISH_STOP_WORDS_SET);
+    this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET);
  }

  /**
@@ -112,8 +105,7 @@ public final class ThaiAnalyzer extends StopwordAnalyzerBase {
      Reader reader) {
    final Tokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream result = new StandardFilter(matchVersion, source);
-    if (matchVersion.onOrAfter(Version.LUCENE_31))
-      result = new LowerCaseFilter(matchVersion, result);
+    result = new LowerCaseFilter(matchVersion, result);
    result = new ThaiWordFilter(matchVersion, result);
    return new TokenStreamComponents(source, new StopFilter(matchVersion,
        result, stopwords));
@ -23,7 +23,6 @@ import java.util.Locale;
|
|||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
|
@ -34,10 +33,6 @@ import org.apache.lucene.util.Version;
/**
 * {@link TokenFilter} that uses {@link java.text.BreakIterator} to break each
 * Token that is Thai into separate Token(s) for each Thai word.
 * <p>Please note: Since matchVersion 3.1 on, this filter no longer lowercases non-Thai text.
 * {@link ThaiAnalyzer} will insert a {@link LowerCaseFilter} before this filter
 * so the behaviour of the Analyzer does not change. With version 3.1, the filter handles
 * position increments correctly.
 * <p>WARNING: this filter may not be supported by all JREs.
 * It is known to work with Sun/Oracle and Harmony JREs.
 * If your application needs to be fully portable, consider using ICUTokenizer instead,
@ -58,8 +53,6 @@ public final class ThaiWordFilter extends TokenFilter {
  private final BreakIterator breaker = (BreakIterator) proto.clone();
  private final CharArrayIterator charIterator = CharArrayIterator.newWordInstance();

  private final boolean handlePosIncr;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
@ -72,11 +65,9 @@ public final class ThaiWordFilter extends TokenFilter {

  /** Creates a new ThaiWordFilter with the specified match version. */
  public ThaiWordFilter(Version matchVersion, TokenStream input) {
    super(matchVersion.onOrAfter(Version.LUCENE_31) ?
      input : new LowerCaseFilter(matchVersion, input));
    super(input);
    if (!DBBI_AVAILABLE)
      throw new UnsupportedOperationException("This JRE does not have support for Thai segmentation");
    handlePosIncr = matchVersion.onOrAfter(Version.LUCENE_31);
  }

  @Override
@ -92,7 +83,7 @@ public final class ThaiWordFilter extends TokenFilter {
      } else {
        offsetAtt.setOffset(clonedOffsetAtt.startOffset() + start, clonedOffsetAtt.startOffset() + end);
      }
      if (handlePosIncr) posAtt.setPositionIncrement(1);
      posAtt.setPositionIncrement(1);
      return true;
    }
    hasMoreTokensInClone = false;
@ -30,40 +30,6 @@ import org.apache.lucene.analysis.util.CharacterUtils.CharacterBuffer;

/**
 * An abstract base class for simple, character-oriented tokenizers.
 * <p>
 * <a name="version">You must specify the required {@link Version} compatibility
 * when creating {@link CharTokenizer}:
 * <ul>
 * <li>As of 3.1, {@link CharTokenizer} uses an int based API to normalize and
 * detect token codepoints. See {@link #isTokenChar(int)} and
 * {@link #normalize(int)} for details.</li>
 * </ul>
 * <p>
 * A new {@link CharTokenizer} API has been introduced with Lucene 3.1. This API
 * moved from UTF-16 code units to UTF-32 codepoints to eventually add support
 * for <a href=
 * "http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Character.html#supplementary"
 * >supplementary characters</a>. The old <i>char</i> based API has been
 * deprecated and should be replaced with the <i>int</i> based methods
 * {@link #isTokenChar(int)} and {@link #normalize(int)}.
 * </p>
 * <p>
 * As of Lucene 3.1 each {@link CharTokenizer} constructor expects a
 * {@link Version} argument. Based on the given {@link Version} either the new
 * API or a backwards compatibility layer is used at runtime. For
 * {@link Version} < 3.1 the backwards compatibility layer ensures correct
 * behavior even for indexes built with previous versions of Lucene. If a
 * {@link Version} >= 3.1 is used {@link CharTokenizer} requires the new API to
 * be implemented by the instantiated class. Yet, the old <i>char</i> based API
 * is not required anymore even if backwards compatibility must be preserved.
 * {@link CharTokenizer} subclasses implementing the new API are fully backwards
 * compatible if instantiated with {@link Version} < 3.1.
 * </p>
 * <p>
 * <strong>Note:</strong> If you use a subclass of {@link CharTokenizer} with {@link Version} >=
 * 3.1 on an index built with a version < 3.1, created tokens might not be
 * compatible with the terms in your index.
 * </p>
 **/
public abstract class CharTokenizer extends Tokenizer {

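The int-based API described in the javadoc above is small. A minimal subclass sketch under the constructor signatures shown in the following hunks; the class name and the whitespace rule are my own choices, not part of this commit:

// SimpleWhitespaceTokenizer.java - illustrative subclass, not in the Lucene sources.
import java.io.Reader;

import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.util.Version;

public final class SimpleWhitespaceTokenizer extends CharTokenizer {

  public SimpleWhitespaceTokenizer(Version matchVersion, Reader in) {
    super(matchVersion, in);
  }

  // Called once per code point rather than per UTF-16 unit, so a
  // supplementary character arrives here as a single int value.
  @Override
  protected boolean isTokenChar(int c) {
    return !Character.isWhitespace(c);
  }
}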
@ -71,7 +37,7 @@ public abstract class CharTokenizer extends Tokenizer {
   * Creates a new {@link CharTokenizer} instance
   *
   * @param matchVersion
   *          Lucene version to match See {@link <a href="#version">above</a>}
   *          Lucene version to match
   * @param input
   *          the input to split up into tokens
   */
@ -84,7 +50,7 @@ public abstract class CharTokenizer extends Tokenizer {
   * Creates a new {@link CharTokenizer} instance
   *
   * @param matchVersion
   *          Lucene version to match See {@link <a href="#version">above</a>}
   *          Lucene version to match
   * @param source
   *          the attribute source to use for this {@link Tokenizer}
   * @param input
@ -100,7 +66,7 @@ public abstract class CharTokenizer extends Tokenizer {
   * Creates a new {@link CharTokenizer} instance
   *
   * @param matchVersion
   *          Lucene version to match See {@link <a href="#version">above</a>}
   *          Lucene version to match
   * @param factory
   *          the attribute factory to use for this {@link Tokenizer}
   * @param input
@ -43,7 +43,12 @@ public abstract class CharacterUtils {
   * {@link Version} instance.
   */
  public static CharacterUtils getInstance(final Version matchVersion) {
    return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
    return JAVA_5;
  }

  /** explicitly returns a version matching java 4 semantics */
  public static CharacterUtils getJava4Instance() {
    return JAVA_4;
  }

  /**
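The practical difference between the two instances is how a surrogate pair is read. A sketch, assuming the codePointAt(CharSequence, int) method this class exposes; the demo class is hypothetical:

// CharacterUtilsDemo.java - illustrative only.
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;

public class CharacterUtilsDemo {
  public static void main(String[] args) {
    String supplementary = "\uD801\uDC1C";  // one code point, two UTF-16 units

    // getInstance now always returns full code-point (Java 5) semantics.
    CharacterUtils j5 = CharacterUtils.getInstance(Version.LUCENE_40);
    System.out.println(j5.codePointAt(supplementary, 0));  // 66588 (U+1041C)

    // Java 4 semantics remain reachable, but only by explicit request.
    CharacterUtils j4 = CharacterUtils.getJava4Instance();
    System.out.println(j4.codePointAt(supplementary, 0));  // 55297, just the lead surrogate
  }
}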
@ -98,7 +98,7 @@ public abstract class StopwordAnalyzerBase extends Analyzer {
    Reader reader = null;
    try {
      reader = IOUtils.getDecodingReader(aClass.getResourceAsStream(resource), IOUtils.CHARSET_UTF_8);
      return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_31, 16, ignoreCase));
      return WordlistLoader.getWordSet(reader, comment, new CharArraySet(Version.LUCENE_CURRENT, 16, ignoreCase));
    } finally {
      IOUtils.close(reader);
    }
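The same load pattern works outside the base class. A sketch using the calls visible in this hunk; the resource name and demo class are made up:

// StopwordLoadDemo.java - illustrative only.
import java.io.Reader;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.Version;

public class StopwordLoadDemo {
  public static CharArraySet loadStopwords() throws Exception {
    Reader reader = null;
    try {
      // "my-stopwords.txt" is a hypothetical classpath resource, one word per line.
      reader = IOUtils.getDecodingReader(
          StopwordLoadDemo.class.getResourceAsStream("my-stopwords.txt"),
          IOUtils.CHARSET_UTF_8);
      // '#' marks comment lines; the resulting set ignores case.
      return WordlistLoader.getWordSet(reader, "#",
          new CharArraySet(Version.LUCENE_CURRENT, 16, true));
    } finally {
      IOUtils.close(reader);
    }
  }
}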
@ -20,7 +20,6 @@ package org.apache.lucene.collation;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
import org.apache.lucene.util.Version;

import java.text.Collator;
@ -28,12 +27,11 @@ import java.io.Reader;

/**
 * <p>
 *   Filters {@link KeywordTokenizer} with {@link CollationKeyFilter}.
 *   Configures {@link KeywordTokenizer} with {@link CollationAttributeFactory}.
 * </p>
 * <p>
 *   Converts the token into its {@link java.text.CollationKey}, and then
 *   encodes the CollationKey either directly or with
 *   {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow
 *   encodes the CollationKey directly to allow
 *   it to be stored as an index term.
 * </p>
 * <p>
@ -74,49 +72,24 @@ import java.io.Reader;
 *   CollationKeyAnalyzer to generate index terms, do not use
 *   ICUCollationKeyAnalyzer on the query side, or vice versa.
 * </p>
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating CollationKeyAnalyzer:
 * <ul>
 *   <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
 *   versions will encode the bytes with {@link IndexableBinaryStringTools}.
 * </ul>
 */
public final class CollationKeyAnalyzer extends Analyzer {
  private final Collator collator;
  private final CollationAttributeFactory factory;
  private final Version matchVersion;

  /**
   * Create a new CollationKeyAnalyzer, using the specified collator.
   *
   * @param matchVersion See <a href="#version">above</a>
   * @param matchVersion compatibility version
   * @param collator CollationKey generator
   */
  public CollationKeyAnalyzer(Version matchVersion, Collator collator) {
    this.matchVersion = matchVersion;
    this.collator = collator;
    this.factory = new CollationAttributeFactory(collator);
  }

  /**
   * @deprecated Use {@link CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)}
   *   and specify a version instead. This ctor will be removed in Lucene 5.0
   */
  @Deprecated
  public CollationKeyAnalyzer(Collator collator) {
    this(Version.LUCENE_31, collator);
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_40)) {
      KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
      return new TokenStreamComponents(tokenizer, tokenizer);
    } else {
      KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
    }
    KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
    return new TokenStreamComponents(tokenizer, tokenizer);
  }
}
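As a usage reference, a hedged sketch of the index/query symmetry the class javadoc warns about; the locale, strength, and demo class are illustrative:

// CollationDemo.java - illustrative only.
import java.text.Collator;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.util.Version;

public class CollationDemo {
  public static void main(String[] args) {
    // Keys are only comparable when produced by identically configured Collators,
    // so the index- and query-time analyzers must share this configuration.
    Collator collator = Collator.getInstance(new Locale("de", "DE"));
    collator.setStrength(Collator.PRIMARY);

    Analyzer indexAnalyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
    Analyzer queryAnalyzer = new CollationKeyAnalyzer(Version.LUCENE_40, collator);
    // Pass indexAnalyzer to IndexWriterConfig and queryAnalyzer to the query parser.
  }
}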
@ -1,108 +0,0 @@
package org.apache.lucene.collation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;

import java.io.IOException;
import java.text.Collator;

/**
 * <p>
 *   Converts each token into its {@link java.text.CollationKey}, and then
 *   encodes the CollationKey with {@link IndexableBinaryStringTools}, to allow
 *   it to be stored as an index term.
 * </p>
 * <p>
 *   <strong>WARNING:</strong> Make sure you use exactly the same Collator at
 *   index and query time -- CollationKeys are only comparable when produced by
 *   the same Collator.  Since {@link java.text.RuleBasedCollator}s are not
 *   independently versioned, it is unsafe to search against stored
 *   CollationKeys unless the following are exactly the same (best practice is
 *   to store this information with the index and check that they remain the
 *   same at query time):
 * </p>
 * <ol>
 *   <li>JVM vendor</li>
 *   <li>JVM version, including patch version</li>
 *   <li>
 *     The language (and country and variant, if specified) of the Locale
 *     used when constructing the collator via
 *     {@link Collator#getInstance(java.util.Locale)}.
 *   </li>
 *   <li>
 *     The collation strength used - see {@link Collator#setStrength(int)}
 *   </li>
 * </ol>
 * <p>
 *   The <code>ICUCollationKeyFilter</code> in the analysis-icu package
 *   uses ICU4J's Collator, which makes its
 *   version available, thus allowing collation to be versioned independently
 *   from the JVM.  ICUCollationKeyFilter is also significantly faster and
 *   generates significantly shorter keys than CollationKeyFilter.  See
 *   <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
 *   >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
 *   generation timing and key length comparisons between ICU4J and
 *   java.text.Collator over several languages.
 * </p>
 * <p>
 *   CollationKeys generated by java.text.Collators are not compatible
 *   with those generated by ICU Collators.  Specifically, if you use
 *   CollationKeyFilter to generate index terms, do not use
 *   ICUCollationKeyFilter on the query side, or vice versa.
 * </p>
 * @deprecated Use {@link CollationAttributeFactory} instead, which encodes
 *   terms directly as bytes. This filter will be removed in Lucene 5.0
 */
@Deprecated
public final class CollationKeyFilter extends TokenFilter {
  private final Collator collator;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   * @param input Source token stream
   * @param collator CollationKey generator
   */
  public CollationKeyFilter(TokenStream input, Collator collator) {
    super(input);
    // clone in case JRE doesn't properly sync,
    // or to reduce contention in case they do
    this.collator = (Collator) collator.clone();
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      byte[] collationKey = collator.getCollationKey(termAtt.toString()).toByteArray();
      int encodedLength = IndexableBinaryStringTools.getEncodedLength(
          collationKey, 0, collationKey.length);
      termAtt.resizeBuffer(encodedLength);
      termAtt.setLength(encodedLength);
      IndexableBinaryStringTools.encode(collationKey, 0, collationKey.length,
          termAtt.buffer(), 0, encodedLength);
      return true;
    } else {
      return false;
    }
  }
}
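The migration path named in the deprecation note is the attribute factory. A minimal sketch mirroring what CollationKeyAnalyzer.createComponents now does; the helper class and locale are illustrative:

// CollationFactoryDemo.java - illustrative only.
import java.io.Reader;
import java.text.Collator;
import java.util.Locale;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.collation.CollationAttributeFactory;

public class CollationFactoryDemo {
  // The factory makes the tokenizer emit collation keys directly as term bytes,
  // so no trailing filter is required.
  public static Tokenizer collatedKeywordTokenizer(Reader reader) {
    Collator collator = Collator.getInstance(Locale.GERMAN);
    CollationAttributeFactory factory = new CollationAttributeFactory(collator);
    return new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
  }
}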
@ -1,47 +0,0 @@
package org.apache.lucene.analysis.ar;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

/**
 * Testcase for {@link ArabicLetterTokenizer}
 * @deprecated (3.1) Remove in Lucene 5.0
 */
@Deprecated
public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase {

  public void testArabicLetterTokenizer() throws IOException {
    StringReader reader = new StringReader("1234567890 Tokenizer \ud801\udc1c\u0300test");
    ArabicLetterTokenizer tokenizer = new ArabicLetterTokenizer(Version.LUCENE_31,
        reader);
    assertTokenStreamContents(tokenizer, new String[] {"Tokenizer",
        "\ud801\udc1c\u0300test"});
  }

  public void testArabicLetterTokenizerBWCompat() throws IOException {
    StringReader reader = new StringReader("1234567890 Tokenizer \ud801\udc1c\u0300test");
    ArabicLetterTokenizer tokenizer = new ArabicLetterTokenizer(Version.LUCENE_30,
        reader);
    assertTokenStreamContents(tokenizer, new String[] {"Tokenizer", "\u0300test"});
  }
}
@ -23,6 +23,7 @@ import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

@ -88,7 +89,7 @@ public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
  }

  private void check(final String input, final String expected) throws IOException {
    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    MockTokenizer tokenStream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
    assertTokenStreamContents(filter, new String[]{expected});
  }
@ -23,6 +23,7 @@ import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
@ -121,14 +122,14 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
  public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("ساهدهات");
    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader("ساهدهات"));
    MockTokenizer tokenStream = new MockTokenizer(new StringReader("ساهدهات"), MockTokenizer.WHITESPACE, false);

    ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerFilter(tokenStream, set));
    assertTokenStreamContents(filter, new String[]{"ساهدهات"});
  }

  private void check(final String input, final String expected) throws IOException {
    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    MockTokenizer tokenStream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
    assertTokenStreamContents(filter, new String[]{expected});
  }
@ -68,7 +68,7 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
  }

  public void testWithStemExclusionSet() throws IOException {
    CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("строеве");
    Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
    assertAnalyzesTo(a, "строевете строеве", new String[] { "строй", "строеве" });
@ -217,7 +217,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
  }

  public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("строеве");
    MockTokenizer tokenStream = new MockTokenizer(new StringReader("строевете строеве"), MockTokenizer.WHITESPACE, false);

@ -1,281 +0,0 @@
package org.apache.lucene.analysis.cjk;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.util.Version;

/** @deprecated Remove when CJKTokenizer is removed (5.0) */
@Deprecated
public class TestCJKTokenizer extends BaseTokenStreamTestCase {

  class TestToken {
    String termText;
    int start;
    int end;
    String type;
  }

  public TestToken newToken(String termText, int start, int end, int type) {
    TestToken token = new TestToken();
    token.termText = termText;
    token.type = CJKTokenizer.TOKEN_TYPE_NAMES[type];
    token.start = start;
    token.end = end;
    return token;
  }

  public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String terms[] = new String[out_tokens.length];
    int startOffsets[] = new int[out_tokens.length];
    int endOffsets[] = new int[out_tokens.length];
    String types[] = new String[out_tokens.length];
    for (int i = 0; i < out_tokens.length; i++) {
      terms[i] = out_tokens[i].termText;
      startOffsets[i] = out_tokens[i].start;
      endOffsets[i] = out_tokens[i].end;
      types[i] = out_tokens[i].type;
    }
    assertAnalyzesTo(analyzer, str, terms, startOffsets, endOffsets, types, null);
  }

  public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String terms[] = new String[out_tokens.length];
    int startOffsets[] = new int[out_tokens.length];
    int endOffsets[] = new int[out_tokens.length];
    String types[] = new String[out_tokens.length];
    for (int i = 0; i < out_tokens.length; i++) {
      terms[i] = out_tokens[i].termText;
      startOffsets[i] = out_tokens[i].start;
      endOffsets[i] = out_tokens[i].end;
      types[i] = out_tokens[i].type;
    }
    assertAnalyzesToReuse(analyzer, str, terms, startOffsets, endOffsets, types, null);
  }

  public void testJa1() throws IOException {
    String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";

    TestToken[] out_tokens = {
      newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e5d\u5341", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  public void testJa2() throws IOException {
    String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";

    TestToken[] out_tokens = {
      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u5341", 12, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  public void testC() throws IOException {
    String str = "abc defgh ijklmn opqrstu vwxy z";

    TestToken[] out_tokens = {
      newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
    };
    checkCJKToken(str, out_tokens);
  }

  public void testMix() throws IOException {
    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

    TestToken[] out_tokens = {
      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304f\u3051", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3051\u3053", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  public void testMix2() throws IOException {
    String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";

    TestToken[] out_tokens = {
      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304f\u3051", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3053", 14, 15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  public void testSingleChar() throws IOException {
    String str = "\u4e00";

    TestToken[] out_tokens = {
      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
    };
    checkCJKToken(str, out_tokens);
  }

  /*
   * Full-width text is normalized to half-width
   */
  public void testFullWidth() throws Exception {
    String str = "Test 1234";
    TestToken[] out_tokens = {
      newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("1234", 5, 9, CJKTokenizer.SINGLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  /*
   * Non-English text (not just CJK) is treated the same as CJK: C1C2 C2C3
   */
  public void testNonIdeographic() throws Exception {
    String str = "\u4e00 روبرت موير";
    TestToken[] out_tokens = {
      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("رو", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("وب", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("بر", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("رت", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("مو", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("وي", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("ير", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  /*
   * Non-English text with nonletters (non-spacing marks, etc) is treated as C1C2 C2C3,
   * except that words are split around non-letters.
   */
  public void testNonIdeographicNonLetter() throws Exception {
    String str = "\u4e00 رُوبرت موير";
    TestToken[] out_tokens = {
      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("ر", 2, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("وب", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("بر", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("رت", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("مو", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("وي", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("ير", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKToken(str, out_tokens);
  }

  public void testTokenStream() throws Exception {
    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
        new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

    TestToken[] out_tokens = {
      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304f\u3051", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3051\u3053", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKTokenReusable(analyzer, str, out_tokens);

    str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
    TestToken[] out_tokens2 = {
      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u304f\u3051", 11, 13, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3053", 14, 15, CJKTokenizer.DOUBLE_TOKEN_TYPE)
    };
    checkCJKTokenReusable(analyzer, str, out_tokens2);
  }

  /**
   * LUCENE-2207: wrong offset calculated by end()
   */
  public void testFinalOffset() throws IOException {
    checkCJKToken("あい", new TestToken[] {
        newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
    checkCJKToken("あい   ", new TestToken[] {
        newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
    checkCJKToken("test", new TestToken[] {
        newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
    checkCJKToken("test   ", new TestToken[] {
        newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE) });
    checkCJKToken("あいtest", new TestToken[] {
        newToken("あい", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
        newToken("test", 2, 6, CJKTokenizer.SINGLE_TOKEN_TYPE) });
    checkCJKToken("testあい    ", new TestToken[] {
        newToken("test", 0, 4, CJKTokenizer.SINGLE_TOKEN_TYPE),
        newToken("あい", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE) });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new CJKAnalyzer(Version.LUCENE_30), 10000*RANDOM_MULTIPLIER);
  }
}
@ -1,126 +0,0 @@
package org.apache.lucene.analysis.cn;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

/** @deprecated Remove this test when ChineseAnalyzer is removed. */
@Deprecated
public class TestChineseTokenizer extends BaseTokenStreamTestCase
{
  public void testOtherLetterOffset() throws IOException
  {
    String s = "a天b";
    ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));

    int correctStartOffset = 0;
    int correctEndOffset = 1;
    OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    while (tokenizer.incrementToken()) {
      assertEquals(correctStartOffset, offsetAtt.startOffset());
      assertEquals(correctEndOffset, offsetAtt.endOffset());
      correctStartOffset++;
      correctEndOffset++;
    }
  }

  public void testReusableTokenStream() throws Exception
  {
    Analyzer a = new ChineseAnalyzer();
    assertAnalyzesToReuse(a, "中华人民共和国",
        new String[] { "中", "华", "人", "民", "共", "和", "国" },
        new int[] { 0, 1, 2, 3, 4, 5, 6 },
        new int[] { 1, 2, 3, 4, 5, 6, 7 });
    assertAnalyzesToReuse(a, "北京市",
        new String[] { "北", "京", "市" },
        new int[] { 0, 1, 2 },
        new int[] { 1, 2, 3 });
  }

  /*
   * Analyzer that just uses ChineseTokenizer, not ChineseFilter.
   * Convenience to show the behavior of the tokenizer
   */
  private class JustChineseTokenizerAnalyzer extends Analyzer {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      return new TokenStreamComponents(new ChineseTokenizer(reader));
    }
  }

  /*
   * Analyzer that just uses ChineseFilter, not ChineseTokenizer.
   * Convenience to show the behavior of the filter.
   */
  private class JustChineseFilterAnalyzer extends Analyzer {
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
      return new TokenStreamComponents(tokenizer, new ChineseFilter(tokenizer));
    }
  }

  /*
   * ChineseTokenizer tokenizes numbers as one token, but they are filtered by ChineseFilter
   */
  public void testNumerics() throws Exception
  {
    Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
    assertAnalyzesTo(justTokenizer, "中1234", new String[] { "中", "1234" });

    // in this case the ChineseAnalyzer (which applies ChineseFilter) will remove the numeric token.
    Analyzer a = new ChineseAnalyzer();
    assertAnalyzesTo(a, "中1234", new String[] { "中" });
  }

  /*
   * ChineseTokenizer tokenizes English similarly to SimpleAnalyzer.
   * It will lowercase terms automatically.
   *
   * ChineseFilter has an English stopword list; it also removes any single-character tokens.
   * The stopword list is case-sensitive.
   */
  public void testEnglish() throws Exception
  {
    Analyzer chinese = new ChineseAnalyzer();
    assertAnalyzesTo(chinese, "This is a Test. b c d",
        new String[] { "test" });

    Analyzer justTokenizer = new JustChineseTokenizerAnalyzer();
    assertAnalyzesTo(justTokenizer, "This is a Test. b c d",
        new String[] { "this", "is", "a", "test", "b", "c", "d" });

    Analyzer justFilter = new JustChineseFilterAnalyzer();
    assertAnalyzesTo(justFilter, "This is a Test. b c d",
        new String[] { "This", "Test." });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new ChineseAnalyzer(), 10000*RANDOM_MULTIPLIER);
  }

}
@ -27,7 +27,6 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.util.Version;

public class TestAnalyzers extends BaseTokenStreamTestCase {

@ -182,15 +181,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
        "\ud801\udc44test" });
  }

  /** @deprecated (3.1) */
  @Deprecated
  public void testLowerCaseTokenizerBWCompat() throws IOException {
    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
    LowerCaseTokenizer tokenizer = new LowerCaseTokenizer(Version.LUCENE_30,
        reader);
    assertTokenStreamContents(tokenizer, new String[] { "tokenizer", "test" });
  }

  public void testWhitespaceTokenizer() throws IOException {
    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
@ -198,16 +188,6 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
    assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
        "\ud801\udc1ctest" });
  }

  /** @deprecated (3.1) */
  @Deprecated
  public void testWhitespaceTokenizerBWCompat() throws IOException {
    StringReader reader = new StringReader("Tokenizer \ud801\udc1ctest");
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_30,
        reader);
    assertTokenStreamContents(tokenizer, new String[] { "Tokenizer",
        "\ud801\udc1ctest" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
@ -5,8 +5,8 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.standard.ClassicAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@ -15,7 +15,6 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.util.Arrays;
@ -137,7 +136,7 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {

    // 2.4 should not show the bug. But, alas, it's also obsolete,
    // so we check latest released (Robert's gonna break this on 4.0 soon :) )
    a2 = new ClassicAnalyzer(Version.LUCENE_31);
    a2 = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a2, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
  }

@ -244,7 +243,7 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
  }

  public void testJava14BWCompatibility() throws Exception {
    ClassicAnalyzer sa = new ClassicAnalyzer(Version.LUCENE_30);
    ClassicAnalyzer sa = new ClassicAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(sa, "test\u02C6test", new String[] { "test", "test" });
  }

@ -272,7 +271,7 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    IndexReader reader = DirectoryReader.open(dir);

    // Make sure all terms < max size were indexed
    assertEquals(2, reader.docFreq(new Term("content", "abc")));
@ -306,7 +305,7 @@ public class TestClassicAnalyzer extends BaseTokenStreamTestCase {
    writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, sa));
    writer.addDocument(doc);
    writer.close();
    reader = IndexReader.open(dir);
    reader = DirectoryReader.open(dir);
    assertEquals(1, reader.docFreq(new Term("content", bigTerm)));
    reader.close();

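Several test hunks in this commit make the same mechanical swap from the removed IndexReader.open to DirectoryReader.open. A sketch of the resulting open/read/close pattern; the helper class is illustrative and assumes the directory already holds an index:

// ReaderOpenDemo.java - illustrative only.
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;

public class ReaderOpenDemo {
  // Count the documents containing the given term, 4.0-style.
  public static int docFreq(Directory dir, String field, String text) throws Exception {
    IndexReader reader = DirectoryReader.open(dir);  // replaces IndexReader.open(dir)
    try {
      return reader.docFreq(new Term(field, text));
    } finally {
      reader.close();
    }
  }
}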
@ -26,6 +26,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@ -58,7 +59,7 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {

    writer.close();

    reader = IndexReader.open(directory);
    reader = DirectoryReader.open(directory);
    searcher = new IndexSearcher(reader);
  }

@ -95,7 +96,7 @@ public class TestKeywordAnalyzer extends BaseTokenStreamTestCase {
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    IndexReader reader = DirectoryReader.open(dir);
    DocsEnum td = _TestUtil.docs(random(),
                                 reader,
                                 "partnum",
@ -230,16 +230,6 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
    checkOneTerm(a, "壹゙", "壹゙"); // ideographic
    checkOneTerm(a, "아゙", "아゙"); // hangul
  }

  /** @deprecated remove this and sophisticated backwards layer in 5.0 */
  @Deprecated
  public void testCombiningMarksBackwards() throws Exception {
    Analyzer a = new StandardAnalyzer(Version.LUCENE_33);
    checkOneTerm(a, "ざ", "さ"); // hiragana Bug
    checkOneTerm(a, "ザ", "ザ"); // katakana Works
    checkOneTerm(a, "壹゙", "壹"); // ideographic Bug
    checkOneTerm(a, "아゙", "아゙"); // hangul Works
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
@ -209,16 +209,6 @@ public class TestUAX29URLEmailAnalyzer extends BaseTokenStreamTestCase {
    checkOneTerm(a, "壹゙", "壹゙"); // ideographic
    checkOneTerm(a, "아゙", "아゙"); // hangul
  }

  /** @deprecated remove this and sophisticated backwards layer in 5.0 */
  @Deprecated
  public void testCombiningMarksBackwards() throws Exception {
    Analyzer a = new UAX29URLEmailAnalyzer(Version.LUCENE_33);
    checkOneTerm(a, "ざ", "さ"); // hiragana Bug
    checkOneTerm(a, "ザ", "ザ"); // katakana Works
    checkOneTerm(a, "壹゙", "壹"); // ideographic Bug
    checkOneTerm(a, "아゙", "아゙"); // hangul Works
  }

  public void testBasicEmails() throws Exception {
    BaseTokenStreamTestCase.assertAnalyzesTo(a,
@ -453,39 +453,6 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
    checkOneTerm(a, "아゙", "아゙"); // hangul
  }

  /** @deprecated remove this and sophisticated backwards layer in 5.0 */
  @Deprecated
  public void testCombiningMarksBackwards() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents
        (String fieldName, Reader reader) {

        Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_31, reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    checkOneTerm(a, "ざ", "さ"); // hiragana Bug
    checkOneTerm(a, "ザ", "ザ"); // katakana Works
    checkOneTerm(a, "壹゙", "壹"); // ideographic Bug
    checkOneTerm(a, "아゙", "아゙"); // hangul Works
  }

  // LUCENE-3880
  /** @deprecated remove this and sophisticated backwards layer in 5.0 */
  @Deprecated
  public void testMailtoBackwards() throws Exception {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new UAX29URLEmailTokenizer(Version.LUCENE_34, reader);
        return new TokenStreamComponents(tokenizer);
      }
    };
    assertAnalyzesTo(a, "mailto:test@example.org",
        new String[] { "mailto:test", "example.org" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
@ -31,30 +31,12 @@ import org.apache.lucene.util.Version;
 *
 */
public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
  /**
   * @deprecated (3.1) Remove this test when support for 3.0 indexes is no longer needed.
   */
  @Deprecated
  public void testStopWordLegacy() throws Exception {
    assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_30), "Pokud mluvime o volnem",
        new String[] { "mluvime", "volnem" });
  }

  public void testStopWord() throws Exception {
    assertAnalyzesTo(new CzechAnalyzer(TEST_VERSION_CURRENT), "Pokud mluvime o volnem",
        new String[] { "mluvim", "voln" });
  }

  /**
   * @deprecated (3.1) Remove this test when support for 3.0 indexes is no longer needed.
   */
  @Deprecated
  public void testReusableTokenStreamLegacy() throws Exception {
    Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_30);
    assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvime", "volnem" });
    assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česká", "republika" });
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer analyzer = new CzechAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
@ -25,7 +25,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
  public void testReusableTokenStream() throws Exception {
@ -58,10 +57,6 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
    // a/o/u + e is equivalent to the umlaut form
    checkOneTermReuse(a, "Schaltflächen", "schaltflach");
    checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
    // here they are with the old stemmer
    a = new GermanAnalyzer(Version.LUCENE_30);
    checkOneTermReuse(a, "Schaltflächen", "schaltflach");
    checkOneTermReuse(a, "Schaltflaechen", "schaltflaech");
  }

  /** blast some random strings through the analyzer */
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.el;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

/**
 * A unit test class for verifying the correct operation of the GreekAnalyzer.
@ -47,29 +46,6 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
    assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ  Άψογος, ο μεστός και οι άλλοι",
        new String[] { "προυποθεσ", "αψογ", "μεστ", "αλλ" });
  }

  /**
   * Test the analysis of various Greek strings.
   *
   * @throws Exception in case an error occurs
   * @deprecated (3.1) Remove this test when support for 3.0 is no longer needed
   */
  @Deprecated
  public void testAnalyzerBWCompat() throws Exception {
    Analyzer a = new GreekAnalyzer(Version.LUCENE_30);
    // Verify the correct analysis of capitals and small accented letters
    assertAnalyzesTo(a, "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
        new String[] { "μια", "εξαιρετικα", "καλη", "πλουσια", "σειρα", "χαρακτηρων",
        "ελληνικησ", "γλωσσασ" });
    // Verify the correct analysis of small letters with diaeresis and the elimination
    // of punctuation marks
    assertAnalyzesTo(a, "Προϊόντα (και) [πολλαπλές] - ΑΝΑΓΚΕΣ",
        new String[] { "προιοντα", "πολλαπλεσ", "αναγκεσ" });
    // Verify the correct analysis of capital accented letters and capital letters with diaeresis,
    // as well as the elimination of stop words
    assertAnalyzesTo(a, "ΠΡΟΫΠΟΘΕΣΕΙΣ  Άψογος, ο μεστός και οι άλλοι",
        new String[] { "προυποθεσεισ", "αψογοσ", "μεστοσ", "αλλοι" });
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
@ -23,8 +23,8 @@ import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;

/**
@ -58,8 +58,7 @@ public class TestPersianNormalizationFilter extends BaseTokenStreamTestCase {
  }

  private void check(final String input, final String expected) throws IOException {
    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT,
        new StringReader(input));
    MockTokenizer tokenStream = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    PersianNormalizationFilter filter = new PersianNormalizationFilter(
        tokenStream);
    assertTokenStreamContents(filter, new String[]{expected});
@ -115,94 +115,6 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {

  }

  /**
   * @deprecated (3.1) remove this test for Lucene 5.0
   */
  @Deprecated
  public void testAnalyzer30() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_30);

    assertAnalyzesTo(fa, "", new String[] {
    });

    assertAnalyzesTo(
        fa,
        "chien chat cheval",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(
        fa,
        "chien CHAT CHEVAL",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(
        fa,
        " chien ,? + = - CHAT /: > CHEVAL",
        new String[] { "chien", "chat", "cheval" });

    assertAnalyzesTo(fa, "chien++", new String[] { "chien" });

    assertAnalyzesTo(
        fa,
        "mot \"entreguillemet\"",
        new String[] { "mot", "entreguillemet" });

    // let's do some French specific tests now

    /* 1. couldn't resist
       I would expect this to stay one term as in French the minus
       sign is often used for composing words */
    assertAnalyzesTo(
        fa,
        "Jean-François",
        new String[] { "jean", "françois" });

    // 2. stopwords
    assertAnalyzesTo(
        fa,
        "le la chien les aux chat du des à cheval",
        new String[] { "chien", "chat", "cheval" });

    // some nouns and adjectives
    assertAnalyzesTo(
        fa,
        "lances chismes habitable chiste éléments captifs",
        new String[] {
            "lanc",
            "chism",
            "habit",
            "chist",
            "élément",
            "captif" });

    // some verbs
    assertAnalyzesTo(
        fa,
        "finissions souffrirent rugissante",
        new String[] { "fin", "souffr", "rug" });

    // some everything else
    // aujourd'hui stays one term which is OK
    assertAnalyzesTo(
        fa,
        "C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ ",
        new String[] {
            "c3po",
            "aujourd'hui",
            "oeuf",
            "ïâöûàä",
            "anticonstitutionnel",
            "jav" });

    // some more everything else
    // here 1940-1945 stays as one term, 1940:1945 not ?
    assertAnalyzesTo(
        fa,
        "33Bis 1940-1945 1940:1945 (---i+++)*",
        new String[] { "33bis", "1940-1945", "1940", "1945", "i" });

  }

  public void testReusableTokenStream() throws Exception {
    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
    // stopwords
@ -242,22 +154,11 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
    assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouil" });
  }

  /**
   * Prior to 3.1, this analyzer had no lowercase filter.
   * Stopwords were case sensitive. Preserve this for back compat.
   * @deprecated (3.1) Remove this test in Lucene 5.0
   */
  @Deprecated
  public void testBuggyStopwordsCasing() throws IOException {
    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(a, "Votre", new String[] { "votr" });
  }

  /**
   * Test that stopwords are not case sensitive
   */
  public void testStopwordsCasing() throws IOException {
    FrenchAnalyzer a = new FrenchAnalyzer(Version.LUCENE_31);
    FrenchAnalyzer a = new FrenchAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "Votre", new String[] { });
  }

@ -63,11 +63,4 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
    assertAnalyzesTo(a, "dell'Italia", new String[] { "ital" });
    assertAnalyzesTo(a, "l'Italiano", new String[] { "italian" });
  }

  /** test that we don't enable this before 3.2 */
  public void testContractionsBackwards() throws IOException {
    Analyzer a = new ItalianAnalyzer(Version.LUCENE_31);
    assertAnalyzesTo(a, "dell'Italia", new String[] { "dell'ital" });
    assertAnalyzesTo(a, "l'Italiano", new String[] { "l'ital" });
  }
}
@@ -1,181 +0,0 @@
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;
import java.lang.Thread.UncaughtExceptionHandler;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopAnalyzer;

/**
 * Verifies the behavior of PatternAnalyzer.
 */
public class PatternAnalyzerTest extends BaseTokenStreamTestCase {

  /**
   * Test PatternAnalyzer when it is configured with a non-word pattern.
   * Behavior can be similar to SimpleAnalyzer (depending upon options)
   */
  public void testNonWordPattern() throws IOException {
    // Split on non-letter pattern, do not lowercase, no stopwords
    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
        false, null);
    check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
        "The", "quick", "brown", "Fox", "the", "abcd", "dc" });

    // split on non-letter pattern, lowercase, english stopwords
    PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
        true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
        "quick", "brown", "fox", "abcd", "dc" });
  }

  /**
   * Test PatternAnalyzer when it is configured with a whitespace pattern.
   * Behavior can be similar to WhitespaceAnalyzer (depending upon options)
   */
  public void testWhitespacePattern() throws IOException {
    // Split on whitespace patterns, do not lowercase, no stopwords
    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
        false, null);
    check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
        "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });

    // Split on whitespace patterns, lowercase, english stopwords
    PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
        true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
        "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
  }

  /**
   * Test PatternAnalyzer when it is configured with a custom pattern. In this
   * case, text is tokenized on the comma ","
   */
  public void testCustomPattern() throws IOException {
    // Split on comma, do not lowercase, no stopwords
    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false, null);
    check(a, "Here,Are,some,Comma,separated,words,", new String[] { "Here",
        "Are", "some", "Comma", "separated", "words" });

    // split on comma, lowercase, english stopwords
    PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true,
        StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    check(b, "Here,Are,some,Comma,separated,words,", new String[] { "here",
        "some", "comma", "separated", "words" });
  }

  /**
   * Test PatternAnalyzer against a large document.
   */
  public void testHugeDocument() throws IOException {
    StringBuilder document = new StringBuilder();
    // 5000 a's
    char largeWord[] = new char[5000];
    Arrays.fill(largeWord, 'a');
    document.append(largeWord);

    // a space
    document.append(' ');

    // 2000 b's
    char largeWord2[] = new char[2000];
    Arrays.fill(largeWord2, 'b');
    document.append(largeWord2);

    // Split on whitespace patterns, do not lowercase, no stopwords
    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
        false, null);
    check(a, document.toString(), new String[] { new String(largeWord),
        new String(largeWord2) });
  }

  /**
   * Verify the analyzer analyzes to the expected contents. For PatternAnalyzer,
   * several methods are verified:
   * <ul>
   * <li>Analysis with a normal Reader
   * <li>Analysis with a FastStringReader
   * <li>Analysis with a String
   * </ul>
   */
  private void check(PatternAnalyzer analyzer, String document,
      String expected[]) throws IOException {
    // ordinary analysis of a Reader
    assertAnalyzesTo(analyzer, document, expected);

    // analysis with a "FastStringReader"
    TokenStream ts = analyzer.tokenStream("dummy",
        new PatternAnalyzer.FastStringReader(document));
    assertTokenStreamContents(ts, expected);

    // analysis of a String, uses PatternAnalyzer.tokenStream(String, String)
    TokenStream ts2 = analyzer.tokenStream("dummy", new StringReader(document));
    assertTokenStreamContents(ts2, expected);
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

    // dodge jre bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=7104012
    final UncaughtExceptionHandler savedHandler = Thread.getDefaultUncaughtExceptionHandler();
    Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
      @Override
      public void uncaughtException(Thread thread, Throwable throwable) {
        assumeTrue("not failing due to jre bug ", !isJREBug7104012(throwable));
        // otherwise its some other bug, pass to default handler
        savedHandler.uncaughtException(thread, throwable);
      }
    });

    try {
      Thread.getDefaultUncaughtExceptionHandler();
      checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
    } catch (ArrayIndexOutOfBoundsException ex) {
      assumeTrue("not failing due to jre bug ", !isJREBug7104012(ex));
      throw ex; // otherwise rethrow
    } finally {
      Thread.setDefaultUncaughtExceptionHandler(savedHandler);
    }
  }

  static boolean isJREBug7104012(Throwable t) {
    if (!(t instanceof ArrayIndexOutOfBoundsException)) {
      // BaseTokenStreamTestCase now wraps exc in a new RuntimeException:
      t = t.getCause();
      if (!(t instanceof ArrayIndexOutOfBoundsException)) {
        return false;
      }
    }
    StackTraceElement trace[] = t.getStackTrace();
    for (StackTraceElement st : trace) {
      if ("java.text.RuleBasedBreakIterator".equals(st.getClassName())
          && "lookupBackwardState".equals(st.getMethodName())) {
        return true;
      }
    }
    return false;
  }
}

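The testRandomStrings method above uses a pattern worth noting on its own: temporarily swap the JVM's default uncaught-exception handler to tolerate one known JRE bug in background threads, then always restore it. Here is a standalone plain-JDK sketch of that pattern; the isKnownJreBug predicate is a stand-in for the real stack-trace inspection.

public class HandlerSwapSketch {
  static boolean isKnownJreBug(Throwable t) {
    // stand-in predicate; the real check walks the stack trace
    return t instanceof ArrayIndexOutOfBoundsException;
  }

  public static void main(String[] args) {
    final Thread.UncaughtExceptionHandler saved = Thread.getDefaultUncaughtExceptionHandler();
    Thread.setDefaultUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() {
      @Override
      public void uncaughtException(Thread thread, Throwable throwable) {
        if (isKnownJreBug(throwable)) {
          return; // swallow only the known bug; a test framework would "assume" here
        }
        if (saved != null) {
          saved.uncaughtException(thread, throwable); // everything else stays fatal
        }
      }
    });
    try {
      // ... run work that may crash background threads ...
    } finally {
      Thread.setDefaultUncaughtExceptionHandler(saved); // always restore
    }
  }
}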
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;

@@ -61,7 +62,7 @@ public class TestLimitTokenCountAnalyzer extends BaseTokenStreamTestCase {
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = IndexReader.open(dir);
    IndexReader reader = DirectoryReader.open(dir);
    Term t = new Term("field", "x");
    assertEquals(1, reader.docFreq(t));
    reader.close();

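The IndexReader.open to DirectoryReader.open change above repeats across many files in this commit. A minimal sketch of the migration, assuming a Lucene 4.x classpath; the index path is hypothetical.

import java.io.File;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class OpenReaderMigration {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/tmp/example-index")); // hypothetical path
    // Before (removed by LUCENE-4095): IndexReader reader = IndexReader.open(dir);
    IndexReader reader = DirectoryReader.open(dir); // the static factory now lives on DirectoryReader
    System.out.println("numDocs=" + reader.numDocs());
    reader.close();
    dir.close();
  }
}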
@@ -112,17 +112,6 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
    check("ophouden", "ophoud");
  }

  /**
   * @deprecated (3.1) remove this test in Lucene 5.0
   */
  @Deprecated
  public void testOldBuggyStemmer() throws Exception {
    Analyzer a = new DutchAnalyzer(Version.LUCENE_30);
    checkOneTermReuse(a, "opheffen", "ophef"); // versus snowball 'opheff'
    checkOneTermReuse(a, "opheffende", "ophef"); // versus snowball 'opheff'
    checkOneTermReuse(a, "opheffing", "ophef"); // versus snowball 'opheff'
  }

  public void testSnowballCorrectness() throws Exception {
    Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
    checkOneTermReuse(a, "opheffen", "opheff");

@@ -139,7 +128,7 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
  }

  public void testExclusionTableViaCtor() throws IOException {
    CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
    set.add("lichamelijk");
    DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
    assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });

@@ -158,33 +147,11 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
    checkOneTerm(a, "fiets", "fiets");
  }

  /**
   * prior to 3.6, this confusingly did not happen if
   * you specified your own stoplist!!!!
   * @deprecated (3.6) Remove this test in Lucene 5.0
   */
  @Deprecated
  public void testBuggyStemOverrides() throws IOException {
    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_35, CharArraySet.EMPTY_SET);
    checkOneTerm(a, "fiets", "fiet");
  }

  /**
   * Prior to 3.1, this analyzer had no lowercase filter.
   * stopwords were case sensitive. Preserve this for back compat.
   * @deprecated (3.1) Remove this test in Lucene 5.0
   */
  @Deprecated
  public void testBuggyStopwordsCasing() throws IOException {
    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(a, "Zelf", new String[] { "zelf" });
  }

  /**
   * Test that stopwords are not case sensitive
   */
  public void testStopwordsCasing() throws IOException {
    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_31);
    DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesTo(a, "Zelf", new String[] { });
  }

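The exclusion-table test above shows how a stemming exclusion set protects exact terms from the stemmer. A minimal sketch of that setup, assuming the Lucene 4.x trunk constructors used in the test exist on the classpath; the version constant and words are illustrative.

import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ExclusionTableSketch {
  public static DutchAnalyzer build() {
    CharArraySet exclusions = new CharArraySet(Version.LUCENE_40, 1, true);
    exclusions.add("lichamelijk"); // this exact term bypasses the stemmer
    // empty stopword set, one excluded term (ctor mirrors the test above)
    return new DutchAnalyzer(Version.LUCENE_40, CharArraySet.EMPTY_SET, exclusions);
    // "lichamelijk" is emitted unstemmed; "lichamelijke" still stems to "licham"
  }
}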
@@ -20,6 +20,7 @@ import org.apache.lucene.analysis.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;

@@ -53,7 +54,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
      writer.addDocument(doc);
    }
    writer.close();
    reader = IndexReader.open(dir);
    reader = DirectoryReader.open(dir);
  }

  @Override

@@ -57,15 +57,6 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
    assertEquals( "ABEDCF", new String( buffer ) );
  }

  /**
   * Test the broken 3.0 behavior, for back compat
   * @deprecated (3.1) Remove in Lucene 5.0
   */
  @Deprecated
  public void testBackCompat() throws Exception {
    assertEquals("\uDF05\uD866\uDF05\uD866", ReverseStringFilter.reverse(Version.LUCENE_30, "𩬅𩬅"));
  }

  public void testReverseSupplementary() throws Exception {
    // supplementary at end
    assertEquals("𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "瀛愯䇹鍟艱𩬅"));

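The removed back-compat test pinned down the pre-3.1 bug: reversing char-by-char splits UTF-16 surrogate pairs, corrupting supplementary characters like 𩬅 (U+29B05). A plain-Java illustration (not Lucene's implementation) of the broken and the codepoint-safe behavior:

public class SurrogateSafeReverse {
  // Naive char-by-char reversal: splits surrogate pairs, reproducing the
  // "broken 3.0 behavior" the removed test asserted.
  static String naiveReverse(String s) {
    char[] chars = s.toCharArray();
    for (int i = 0, j = chars.length - 1; i < j; i++, j--) {
      char tmp = chars[i]; chars[i] = chars[j]; chars[j] = tmp;
    }
    return new String(chars);
  }

  // Codepoint-aware reversal, matching the post-3.1 semantics.
  static String codePointReverse(String s) {
    StringBuilder out = new StringBuilder(s.length());
    for (int i = s.length(); i > 0; ) {
      int cp = s.codePointBefore(i);
      out.appendCodePoint(cp);
      i -= Character.charCount(cp);
    }
    return out.toString();
  }

  public static void main(String[] args) {
    String two = "\uD866\uDF05\uD866\uDF05"; // "𩬅𩬅": two supplementary characters
    System.out.println(naiveReverse(two).equals("\uDF05\uD866\uDF05\uD866")); // true: pairs split
    System.out.println(codePointReverse(two).equals(two)); // true: codepoint palindrome preserved
  }
}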
@@ -37,16 +37,6 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase {
    assertAnalyzesTo(ra, "text 1000", new String[] { "text", "1000" });
  }

  /** @deprecated (3.1) remove this test in Lucene 5.0: stopwords changed */
  @Deprecated
  public void testReusableTokenStream30() throws Exception {
    Analyzer a = new RussianAnalyzer(Version.LUCENE_30);
    assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
        new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
    assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
        new String[] { "знан", "хран", "тайн" });
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT);
    assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",

@@ -1,47 +0,0 @@
package org.apache.lucene.analysis.ru;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.util.Version;

/**
 * Testcase for {@link RussianLetterTokenizer}
 * @deprecated (3.1) Remove this test class in Lucene 5.0
 */
@Deprecated
public class TestRussianLetterTokenizer extends BaseTokenStreamTestCase {

  public void testRussianLetterTokenizer() throws IOException {
    StringReader reader = new StringReader("1234567890 Вместе \ud801\udc1ctest");
    RussianLetterTokenizer tokenizer = new RussianLetterTokenizer(Version.LUCENE_CURRENT,
        reader);
    assertTokenStreamContents(tokenizer, new String[] {"1234567890", "Вместе",
        "\ud801\udc1ctest"});
  }

  public void testRussianLetterTokenizerBWCompat() throws IOException {
    StringReader reader = new StringReader("1234567890 Вместе \ud801\udc1ctest");
    RussianLetterTokenizer tokenizer = new RussianLetterTokenizer(Version.LUCENE_30,
        reader);
    assertTokenStreamContents(tokenizer, new String[] {"1234567890", "Вместе", "test"});
  }
}

@@ -29,6 +29,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;

@@ -74,7 +75,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {

    writer.close();

    reader = IndexReader.open(directory);
    reader = DirectoryReader.open(directory);
    searcher = new IndexSearcher(reader);
  }

@@ -29,6 +29,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;

@@ -103,7 +104,7 @@ public class TestTeeSinkTokenFilter extends BaseTokenStreamTestCase {
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir);
    IndexReader r = DirectoryReader.open(dir);
    Terms vector = r.getTermVectors(0).terms("field");
    assertEquals(1, vector.size());
    TermsEnum termsEnum = vector.iterator(null);

@@ -22,6 +22,7 @@ import java.io.Reader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.Payload;
import org.apache.lucene.analysis.TokenStream;

@@ -38,65 +39,18 @@ import org.apache.lucene.util.Version;
public class TestSnowball extends BaseTokenStreamTestCase {

  public void testEnglish() throws Exception {
    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader);
        return new TokenStreamComponents(tokenizer, new SnowballFilter(tokenizer, "English"));
      }
    };

    assertAnalyzesTo(a, "he abhorred accents",
        new String[]{"he", "abhor", "accent"});
  }

  public void testStopwords() throws Exception {
    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English",
        StandardAnalyzer.STOP_WORDS_SET);
    assertAnalyzesTo(a, "the quick brown fox jumped",
        new String[]{"quick", "brown", "fox", "jump"});
  }

  /**
   * Test english lowercasing. Test both cases (pre-3.1 and post-3.1) to ensure
   * we lowercase I correct for non-Turkish languages in either case.
   */
  public void testEnglishLowerCase() throws Exception {
    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
    assertAnalyzesTo(a, "cryogenic", new String[] { "cryogen" });
    assertAnalyzesTo(a, "CRYOGENIC", new String[] { "cryogen" });

    Analyzer b = new SnowballAnalyzer(Version.LUCENE_30, "English");
    assertAnalyzesTo(b, "cryogenic", new String[] { "cryogen" });
    assertAnalyzesTo(b, "CRYOGENIC", new String[] { "cryogen" });
  }

  /**
   * Test turkish lowercasing
   */
  public void testTurkish() throws Exception {
    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "Turkish");

    assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
    assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
  }

  /**
   * Test turkish lowercasing (old buggy behavior)
   * @deprecated (3.1) Remove this when support for 3.0 indexes is no longer required (5.0)
   */
  @Deprecated
  public void testTurkishBWComp() throws Exception {
    Analyzer a = new SnowballAnalyzer(Version.LUCENE_30, "Turkish");
    // AĞACI in turkish lowercases to ağacı, but with lowercase filter ağaci.
    // this fails due to wrong casing, because the stemmer
    // will only remove -ı, not -i
    assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
    assertAnalyzesTo(a, "AĞACI", new String[] { "ağaci" });
  }

  public void testReusableTokenStream() throws Exception {
    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
    assertAnalyzesToReuse(a, "he abhorred accents",
        new String[]{"he", "abhor", "accent"});
    assertAnalyzesToReuse(a, "she abhorred him",
        new String[]{"she", "abhor", "him"});
  }

  public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);

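The testEnglish change above shows the replacement pattern for the removed SnowballAnalyzer: build an inline Analyzer from a tokenizer plus SnowballFilter. A sketch of the same pattern for production code, swapping the test-only MockTokenizer for WhitespaceTokenizer and adding the lowercasing the packaged analyzer used to provide; assumes a Lucene 4.x classpath, and the version constant is illustrative.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.util.Version;

public class InlineSnowballAnalyzer {
  public static Analyzer english() {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_40, reader);
        // lowercase first so the stemmer sees normalized input
        return new TokenStreamComponents(source,
            new SnowballFilter(new LowerCaseFilter(Version.LUCENE_40, source), "English"));
      }
    };
  }
}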
@@ -62,13 +62,6 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
        new int[] { 5, 2, 1 });
  }

  public void testBackwardsStopWords() throws Exception {
    assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_35), "การที่ได้ต้องแสดงว่างานดี",
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" },
        new int[] { 0, 3, 6, 9, 13, 17, 20, 23 },
        new int[] { 3, 6, 9, 13, 17, 20, 23, 25 });
  }

  public void testTokenType() throws Exception {
    assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },

@@ -79,43 +72,6 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
        "<NUM>" });
  }

  /**
   * Thai numeric tokens were typed as <ALPHANUM> instead of <NUM>.
   * @deprecated (3.1) testing backwards behavior
   */
  @Deprecated
  public void testBuggyTokenType30() throws Exception {
    assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_30), "การที่ได้ต้องแสดงว่างานดี ๑๒๓",
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี", "๑๒๓" },
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
            "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
            "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
  }

  /** @deprecated (3.1) testing backwards behavior */
  @Deprecated
  public void testAnalyzer30() throws Exception {
    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);

    assertAnalyzesTo(analyzer, "", new String[] {});

    assertAnalyzesTo(
        analyzer,
        "การที่ได้ต้องแสดงว่างานดี",
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});

    assertAnalyzesTo(
        analyzer,
        "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
        new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });

    // English stop words
    assertAnalyzesTo(
        analyzer,
        "ประโยคว่า The quick brown fox jumped over the lazy dogs",
        new String[] { "ประโยค", "ว่า", "quick", "brown", "fox", "jumped", "over", "lazy", "dogs" });
  }

  /*
   * Test that position increments are adjusted correctly for stopwords.
   */

@@ -151,23 +107,6 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
        new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
  }

  /** @deprecated (3.1) for version back compat */
  @Deprecated
  public void testReusableTokenStream30() throws Exception {
    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
    assertAnalyzesToReuse(analyzer, "", new String[] {});

    assertAnalyzesToReuse(
        analyzer,
        "การที่ได้ต้องแสดงว่างานดี",
        new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี"});

    assertAnalyzesToReuse(
        analyzer,
        "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
        new String[] { "บริษัท", "ชื่อ", "xy&z", "คุย", "กับ", "xyz@demo.com" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), new ThaiAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);

@@ -181,7 +120,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {

  // LUCENE-3044
  public void testAttributeReuse() throws Exception {
    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_30);
    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
    // just consume
    TokenStream ts = analyzer.tokenStream("dummy", new StringReader("ภาษาไทย"));
    assertTokenStreamContents(ts, new String[] { "ภาษา", "ไทย" });

@@ -250,77 +250,6 @@ public class TestCharArraySet extends LuceneTestCase {
    }
  }

  /**
   * @deprecated (3.1) remove this test when lucene 3.0 "broken unicode 4" support is
   * no longer needed.
   */
  @Deprecated
  public void testSupplementaryCharsBWCompat() {
    String missing = "Term %s is missing in the set";
    String falsePos = "Term %s is in the set but shouldn't";
    // for reference see
    // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[[%3ACase_Sensitive%3DTrue%3A]%26[^[\u0000-\uFFFF]]]&esc=on
    String[] upperArr = new String[] {"Abc\ud801\udc1c",
        "\ud801\udc1c\ud801\udc1cCDE", "A\ud801\udc1cB"};
    String[] lowerArr = new String[] {"abc\ud801\udc44",
        "\ud801\udc44\ud801\udc44cde", "a\ud801\udc44b"};
    CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), true);
    for (String upper : upperArr) {
      set.add(upper);
    }
    for (int i = 0; i < upperArr.length; i++) {
      assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
      assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
    }
    set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS), false);
    for (String upper : upperArr) {
      set.add(upper);
    }
    for (int i = 0; i < upperArr.length; i++) {
      assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
      assertFalse(String.format(falsePos, lowerArr[i]), set.contains(lowerArr[i]));
    }
  }

  /**
   * @deprecated (3.1) remove this test when lucene 3.0 "broken unicode 4" support is
   * no longer needed.
   */
  @Deprecated
  public void testSingleHighSurrogateBWComapt() {
    String missing = "Term %s is missing in the set";
    String falsePos = "Term %s is in the set but shouldn't";
    String[] upperArr = new String[] { "ABC\uD800", "ABC\uD800EfG",
        "\uD800EfG", "\uD800\ud801\udc1cB" };

    String[] lowerArr = new String[] { "abc\uD800", "abc\uD800efg",
        "\uD800efg", "\uD800\ud801\udc44b" };
    CharArraySet set = new CharArraySet(Version.LUCENE_30, Arrays
        .asList(TEST_STOP_WORDS), true);
    for (String upper : upperArr) {
      set.add(upper);
    }
    for (int i = 0; i < upperArr.length; i++) {
      assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
      if (i == lowerArr.length - 1)
        assertFalse(String.format(falsePos, lowerArr[i]), set
            .contains(lowerArr[i]));
      else
        assertTrue(String.format(missing, lowerArr[i]), set
            .contains(lowerArr[i]));
    }
    set = new CharArraySet(Version.LUCENE_30, Arrays.asList(TEST_STOP_WORDS),
        false);
    for (String upper : upperArr) {
      set.add(upper);
    }
    for (int i = 0; i < upperArr.length; i++) {
      assertTrue(String.format(missing, upperArr[i]), set.contains(upperArr[i]));
      assertFalse(String.format(falsePos, lowerArr[i]), set
          .contains(lowerArr[i]));
    }
  }

  @SuppressWarnings("deprecated")
  public void testCopyCharArraySetBWCompat() {
    CharArraySet setIngoreCase = new CharArraySet(TEST_VERSION_CURRENT, 10, true);

@@ -499,10 +428,5 @@ public class TestCharArraySet extends LuceneTestCase {
    assertEquals("[test]", set.toString());
    set.add("test2");
    assertTrue(set.toString().contains(", "));

    set = CharArraySet.copy(Version.LUCENE_30, Collections.singleton("test"));
    assertEquals("[test]", set.toString());
    set.add("test2");
    assertTrue(set.toString().contains(", "));
  }
}

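The removed tests above exercised CharArraySet's case folding over supplementary characters. A minimal sketch of ordinary case-insensitive CharArraySet matching, assuming a Lucene 4.x trunk classpath; the words and version constant are illustrative.

import java.util.Arrays;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class CharArraySetSketch {
  public static void main(String[] args) {
    CharArraySet set = new CharArraySet(Version.LUCENE_40,
        Arrays.asList("stop", "words"), /*ignoreCase=*/ true);
    System.out.println(set.contains("STOP"));  // true: folded at lookup time
    System.out.println(set.contains("Words")); // true
    System.out.println(set.contains("other")); // false
  }
}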
@@ -33,7 +33,7 @@ public class TestCharacterUtils extends LuceneTestCase {

  @Test
  public void testCodePointAtCharArrayInt() {
    CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
    CharacterUtils java4 = CharacterUtils.getJava4Instance();
    char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
    char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
    assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));

@@ -59,7 +59,7 @@ public class TestCharacterUtils extends LuceneTestCase {

  @Test
  public void testCodePointAtCharSequenceInt() {
    CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
    CharacterUtils java4 = CharacterUtils.getJava4Instance();
    String cpAt3 = "Abc\ud801\udc1c";
    String highSurrogateAt3 = "Abc\ud801";
    assertEquals((int) 'A', java4.codePointAt(cpAt3, 0));

@@ -86,7 +86,7 @@ public class TestCharacterUtils extends LuceneTestCase {

  @Test
  public void testCodePointAtCharArrayIntInt() {
    CharacterUtils java4 = CharacterUtils.getInstance(Version.LUCENE_30);
    CharacterUtils java4 = CharacterUtils.getJava4Instance();
    char[] cpAt3 = "Abc\ud801\udc1c".toCharArray();
    char[] highSurrogateAt3 = "Abc\ud801".toCharArray();
    assertEquals((int) 'A', java4.codePointAt(cpAt3, 0, 2));

@@ -122,9 +122,10 @@ public class TestCharacterUtils extends LuceneTestCase {

  @Test
  public void testFillNoHighSurrogate() throws IOException {
    Version[] versions = new Version[] { Version.LUCENE_30, TEST_VERSION_CURRENT };
    for (Version version : versions) {
      CharacterUtils instance = CharacterUtils.getInstance(version);
    CharacterUtils versions[] = new CharacterUtils[] {
        CharacterUtils.getInstance(TEST_VERSION_CURRENT),
        CharacterUtils.getJava4Instance() };
    for (CharacterUtils instance : versions) {
      Reader reader = new StringReader("helloworld");
      CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(6);
      assertTrue(instance.fill(buffer,reader));

@@ -172,7 +173,7 @@ public class TestCharacterUtils extends LuceneTestCase {
  @Test
  public void testFillJava14() throws IOException {
    String input = "1234\ud801\udc1c789123\ud801\ud801\udc1c\ud801";
    CharacterUtils instance = CharacterUtils.getInstance(Version.LUCENE_30);
    CharacterUtils instance = CharacterUtils.getJava4Instance();
    Reader reader = new StringReader(input);
    CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(5);
    assertTrue(instance.fill(buffer, reader));

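The change above replaces version-based dispatch with an explicit getJava4Instance(). The distinction it names is purely about UTF-16: pre-Java-5 semantics treat each 16-bit unit as its own "character", while Java 5+ combines surrogate pairs into one supplementary codepoint. A plain-JDK illustration:

public class CodePointSemantics {
  public static void main(String[] args) {
    String s = "Abc\ud801\udc1c"; // 'A','b','c' plus one supplementary character
    // Java 5+ semantics (what the current CharacterUtils instance uses):
    System.out.println(s.codePointAt(3));  // 66588 (0x1041C), the full codepoint
    // Java 4 semantics (what the deprecated LUCENE_30 instance emulated):
    System.out.println((int) s.charAt(3)); // 55297 (0xD801), just the high surrogate
  }
}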
@@ -1,102 +0,0 @@
package org.apache.lucene.collation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.BytesRef;

import java.text.Collator;
import java.util.Locale;
import java.io.Reader;

/**
 * @deprecated remove when CollationKeyFilter is removed.
 */
@Deprecated
public class TestCollationKeyFilter extends CollationTestBase {
  // the sort order of Ø versus U depends on the version of the rules being used
  // for the inherited root locale: Ø's order isnt specified in Locale.US since
  // its not used in english.
  boolean oStrokeFirst = Collator.getInstance(new Locale("")).compare("Ø", "U") < 0;

  // Neither Java 1.4.2 nor 1.5.0 has Farsi Locale collation available in
  // RuleBasedCollator.  However, the Arabic Locale seems to order the Farsi
  // characters properly.
  private Collator collator = Collator.getInstance(new Locale("ar"));
  private Analyzer analyzer = new TestAnalyzer(collator);

  private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
      (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
  private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
      (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
  private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
      (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
  private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
      (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));


  public final class TestAnalyzer extends Analyzer {
    private Collator _collator;

    TestAnalyzer(Collator collator) {
      _collator = collator;
    }

    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer result = new KeywordTokenizer(reader);
      return new TokenStreamComponents(result, new CollationKeyFilter(result, _collator));
    }
  }

  public void testFarsiRangeFilterCollating() throws Exception {
    testFarsiRangeFilterCollating
        (analyzer, firstRangeBeginning, firstRangeEnd,
         secondRangeBeginning, secondRangeEnd);
  }

  public void testFarsiRangeQueryCollating() throws Exception {
    testFarsiRangeQueryCollating
        (analyzer, firstRangeBeginning, firstRangeEnd,
         secondRangeBeginning, secondRangeEnd);
  }

  public void testFarsiTermRangeQuery() throws Exception {
    testFarsiTermRangeQuery
        (analyzer, firstRangeBeginning, firstRangeEnd,
         secondRangeBeginning, secondRangeEnd);
  }

  public void testCollationKeySort() throws Exception {
    Analyzer usAnalyzer = new TestAnalyzer(Collator.getInstance(Locale.US));
    Analyzer franceAnalyzer
        = new TestAnalyzer(Collator.getInstance(Locale.FRANCE));
    Analyzer swedenAnalyzer
        = new TestAnalyzer(Collator.getInstance(new Locale("sv", "se")));
    Analyzer denmarkAnalyzer
        = new TestAnalyzer(Collator.getInstance(new Locale("da", "dk")));

    // The ICU Collator and Sun java.text.Collator implementations differ in their
    // orderings - "BFJDH" is the ordering for java.text.Collator for Locale.US.
    testCollationKeySort
        (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
         oStrokeFirst ? "BFJHD" : "BFJDH", "EACGI", "BJDFH", "BJDHF");
  }
}

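The deleted test rests on locale-sensitive collation: a Collator turns text into opaque binary sort keys that order correctly for a given locale, provided index and query use the identical collator settings. A plain-JDK sketch of the mechanism; the locale and strings are illustrative.

import java.text.Collator;
import java.util.Locale;

public class CollatorSketch {
  public static void main(String[] args) {
    Collator french = Collator.getInstance(Locale.FRANCE);
    french.setStrength(Collator.PRIMARY); // ignore case and accent differences
    System.out.println(french.compare("péché", "PECHE") == 0); // true at PRIMARY strength
    // Keys built by the same collator (same version and strength) sort consistently:
    byte[] key = french.getCollationKey("péché").toByteArray();
    System.out.println(key.length > 0); // opaque, binary-comparable sort key
  }
}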
@@ -22,18 +22,16 @@ import com.ibm.icu.text.Collator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.collation.CollationKeyAnalyzer; // javadocs
import org.apache.lucene.util.IndexableBinaryStringTools; // javadocs
import org.apache.lucene.util.Version;

import java.io.Reader;

/**
 * <p>
 *   Filters {@link KeywordTokenizer} with {@link ICUCollationKeyFilter}.
 *   Configures {@link KeywordTokenizer} with {@link ICUCollationAttributeFactory}.
 * <p>
 *   Converts the token into its {@link com.ibm.icu.text.CollationKey}, and
 *   then encodes the CollationKey either directly or with
 *   {@link IndexableBinaryStringTools} (see <a href="#version">below</a>), to allow it to
 *   then encodes the CollationKey directly to allow it to
 *   be stored as an index term.
 * </p>
 * <p>

@@ -67,48 +65,24 @@ import java.io.Reader;
 * generation timing and key length comparisons between ICU4J and
 * java.text.Collator over several languages.
 * </p>
 * <a name="version"/>
 * <p>You must specify the required {@link Version}
 * compatibility when creating ICUCollationKeyAnalyzer:
 * <ul>
 *   <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
 *   versions will encode the bytes with {@link IndexableBinaryStringTools}.
 * </ul>
 */
public final class ICUCollationKeyAnalyzer extends Analyzer {
  private final Collator collator;
  private final ICUCollationAttributeFactory factory;
  private final Version matchVersion;

  /**
   * Create a new ICUCollationKeyAnalyzer, using the specified collator.
   *
   * @param matchVersion See <a href="#version">above</a>
   * @param matchVersion compatibility version
   * @param collator CollationKey generator
   */
  public ICUCollationKeyAnalyzer(Version matchVersion, Collator collator) {
    this.matchVersion = matchVersion;
    this.collator = collator;
    this.factory = new ICUCollationAttributeFactory(collator);
  }

  /**
   * @deprecated Use {@link ICUCollationKeyAnalyzer#ICUCollationKeyAnalyzer(Version, Collator)}
   *   and specify a version instead. This ctor will be removed in Lucene 5.0
   */
  @Deprecated
  public ICUCollationKeyAnalyzer(Collator collator) {
    this(Version.LUCENE_31, collator);
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    if (matchVersion.onOrAfter(Version.LUCENE_40)) {
      KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
      return new TokenStreamComponents(tokenizer, tokenizer);
    } else {
      KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
      return new TokenStreamComponents(tokenizer, new ICUCollationKeyFilter(tokenizer, collator));
    }
    KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
    return new TokenStreamComponents(tokenizer, tokenizer);
  }
}

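After this change the analyzer is a KeywordTokenizer whose attribute factory encodes each term as raw collation-key bytes, with no filter in the chain. A hedged usage sketch, assuming Lucene's analyzers-icu module and ICU4J on the classpath; the locale keyword is illustrative.

import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.collation.ICUCollationKeyAnalyzer;
import org.apache.lucene.util.Version;

public class ICUCollationSketch {
  public static Analyzer germanPhonebook() {
    // One collator per field; index and query must use identical settings.
    Collator collator = Collator.getInstance(new ULocale("de@collation=phonebook"));
    return new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
  }
}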
@@ -1,114 +0,0 @@
package org.apache.lucene.collation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RawCollationKey;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.IndexableBinaryStringTools;

import org.apache.lucene.collation.CollationKeyFilter; // javadocs

import java.io.IOException;


/**
 * <p>
 *   Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
 *   then encodes the CollationKey with {@link IndexableBinaryStringTools}, to
 *   allow it to be stored as an index term.
 * </p>
 * <p>
 *   <strong>WARNING:</strong> Make sure you use exactly the same Collator at
 *   index and query time -- CollationKeys are only comparable when produced by
 *   the same Collator.  {@link com.ibm.icu.text.RuleBasedCollator}s are
 *   independently versioned, so it is safe to search against stored
 *   CollationKeys if the following are exactly the same (best practice is
 *   to store this information with the index and check that they remain the
 *   same at query time):
 * </p>
 * <ol>
 *   <li>
 *     Collator version - see {@link Collator#getVersion()}
 *   </li>
 *   <li>
 *     The collation strength used - see {@link Collator#setStrength(int)}
 *   </li>
 * </ol>
 * <p>
 *   CollationKeys generated by ICU Collators are not compatible with those
 *   generated by java.text.Collators.  Specifically, if you use
 *   ICUCollationKeyFilter to generate index terms, do not use
 *   {@link CollationKeyFilter} on the query side, or vice versa.
 * </p>
 * <p>
 *   ICUCollationKeyFilter is significantly faster and generates significantly
 *   shorter keys than CollationKeyFilter.  See
 *   <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
 *   >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
 *   generation timing and key length comparisons between ICU4J and
 *   java.text.Collator over several languages.
 * </p>
 * @deprecated Use {@link ICUCollationAttributeFactory} instead, which encodes
 *   terms directly as bytes. This filter will be removed in Lucene 5.0
 */
@Deprecated
public final class ICUCollationKeyFilter extends TokenFilter {
  private Collator collator = null;
  private RawCollationKey reusableKey = new RawCollationKey();
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /**
   *
   * @param input Source token stream
   * @param collator CollationKey generator
   */
  public ICUCollationKeyFilter(TokenStream input, Collator collator) {
    super(input);
    // clone the collator: see http://userguide.icu-project.org/collation/architecture
    try {
      this.collator = (Collator) collator.clone();
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char[] termBuffer = termAtt.buffer();
      String termText = new String(termBuffer, 0, termAtt.length());
      collator.getRawCollationKey(termText, reusableKey);
      int encodedLength = IndexableBinaryStringTools.getEncodedLength(
          reusableKey.bytes, 0, reusableKey.size);
      if (encodedLength > termBuffer.length) {
        termAtt.resizeBuffer(encodedLength);
      }
      termAtt.setLength(encodedLength);
      IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size,
          termAtt.buffer(), 0, encodedLength);
      return true;
    } else {
      return false;
    }
  }
}

@@ -1,98 +0,0 @@
package org.apache.lucene.collation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


import com.ibm.icu.text.Collator;

import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.util.BytesRef;

import java.io.Reader;
import java.util.Locale;

/** @deprecated remove this when ICUCollationKeyFilter is removed */
@Deprecated
public class TestICUCollationKeyFilter extends CollationTestBase {

  private Collator collator = Collator.getInstance(new Locale("fa"));
  private Analyzer analyzer = new TestAnalyzer(collator);

  private BytesRef firstRangeBeginning = new BytesRef(encodeCollationKey
      (collator.getCollationKey(firstRangeBeginningOriginal).toByteArray()));
  private BytesRef firstRangeEnd = new BytesRef(encodeCollationKey
      (collator.getCollationKey(firstRangeEndOriginal).toByteArray()));
  private BytesRef secondRangeBeginning = new BytesRef(encodeCollationKey
      (collator.getCollationKey(secondRangeBeginningOriginal).toByteArray()));
  private BytesRef secondRangeEnd = new BytesRef(encodeCollationKey
      (collator.getCollationKey(secondRangeEndOriginal).toByteArray()));


  public final class TestAnalyzer extends Analyzer {
    private Collator _collator;

    TestAnalyzer(Collator collator) {
      _collator = collator;
    }

    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer result = new KeywordTokenizer(reader);
      return new TokenStreamComponents(result, new ICUCollationKeyFilter(result, _collator));
    }
  }

  public void testFarsiRangeFilterCollating() throws Exception {
    testFarsiRangeFilterCollating(analyzer, firstRangeBeginning, firstRangeEnd,
        secondRangeBeginning, secondRangeEnd);
  }

  public void testFarsiRangeQueryCollating() throws Exception {
    testFarsiRangeQueryCollating(analyzer, firstRangeBeginning, firstRangeEnd,
        secondRangeBeginning, secondRangeEnd);
  }

  public void testFarsiTermRangeQuery() throws Exception {
    testFarsiTermRangeQuery
        (analyzer, firstRangeBeginning, firstRangeEnd,
         secondRangeBeginning, secondRangeEnd);
  }

  // Test using various international locales with accented characters (which
  // sort differently depending on locale)
  //
  // Copied (and slightly modified) from
  // org.apache.lucene.search.TestSort.testInternationalSort()
  //
  public void testCollationKeySort() throws Exception {
    Analyzer usAnalyzer = new TestAnalyzer(Collator.getInstance(Locale.US));
    Analyzer franceAnalyzer
        = new TestAnalyzer(Collator.getInstance(Locale.FRANCE));
    Analyzer swedenAnalyzer
        = new TestAnalyzer(Collator.getInstance(new Locale("sv", "se")));
    Analyzer denmarkAnalyzer
        = new TestAnalyzer(Collator.getInstance(new Locale("da", "dk")));

    // The ICU Collator and java.text.Collator implementations differ in their
    // orderings - "BFJHD" is the ordering for the ICU Collator for Locale.US.
    testCollationKeySort
        (usAnalyzer, franceAnalyzer, swedenAnalyzer, denmarkAnalyzer,
         "BFJHD", "ECAGI", "BJDFH", "BJDHF");
  }
}

@@ -60,7 +60,7 @@ public class NearRealtimeReaderTask extends PerfTask {
    }

    long t = System.currentTimeMillis();
    DirectoryReader r = IndexReader.open(w, true);
    DirectoryReader r = DirectoryReader.open(w, true);
    runData.setIndexReader(r);
    // Transfer our reference to runData
    r.decRef();

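This hunk is the near-real-time variant of the same migration: the reader is opened directly from a live IndexWriter, so it sees uncommitted changes. A hedged sketch, assuming a Lucene 4.x classpath; the analyzer, directory, and version constant are illustrative.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class NrtReaderSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter w = new IndexWriter(dir,
        new IndexWriterConfig(Version.LUCENE_40, new StandardAnalyzer(Version.LUCENE_40)));
    // true = apply deletes; no commit/reopen cycle needed to see new documents
    DirectoryReader r = DirectoryReader.open(w, true);
    System.out.println("numDocs=" + r.numDocs());
    r.close();
    w.close();
    dir.close();
  }
}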
@@ -45,9 +45,9 @@ public class OpenReaderTask extends PerfTask {
    Directory dir = getRunData().getDirectory();
    DirectoryReader r = null;
    if (commitUserData != null) {
      r = IndexReader.open(OpenReaderTask.findIndexCommit(dir, commitUserData));
      r = DirectoryReader.open(OpenReaderTask.findIndexCommit(dir, commitUserData));
    } else {
      r = IndexReader.open(dir);
      r = DirectoryReader.open(dir);
    }
    getRunData().setIndexReader(r);
    // We transfer reference to the run data

@@ -18,6 +18,7 @@ package org.apache.lucene.benchmark.byTask.tasks;
 */

import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;

@@ -47,9 +48,9 @@ public class PrintReaderTask extends PerfTask {
    Directory dir = getRunData().getDirectory();
    IndexReader r = null;
    if (userData == null)
      r = IndexReader.open(dir);
      r = DirectoryReader.open(dir);
    else
      r = IndexReader.open(OpenReaderTask.findIndexCommit(dir, userData));
      r = DirectoryReader.open(OpenReaderTask.findIndexCommit(dir, userData));
    System.out.println("--> numDocs:"+r.numDocs()+" dels:"+r.numDeletedDocs());
    r.close();
    return 1;

@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.benchmark.byTask.PerfRunData;
import org.apache.lucene.benchmark.byTask.feeds.QueryMaker;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;

@@ -84,7 +85,7 @@ public abstract class ReadTask extends PerfTask {
    if (searcher == null) {
      // open our own reader
      Directory dir = getRunData().getDirectory();
      reader = IndexReader.open(dir);
      reader = DirectoryReader.open(dir);
      searcher = new IndexSearcher(reader);
      closeSearcher = true;
    } else {

@@ -20,6 +20,7 @@ package org.apache.lucene.benchmark.quality.trec;
import org.apache.lucene.benchmark.quality.utils.SimpleQQParser;
import org.apache.lucene.benchmark.quality.utils.SubmissionReport;
import org.apache.lucene.benchmark.quality.*;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

@@ -53,7 +54,7 @@ public class QueryDriver {
    SubmissionReport submitLog = new SubmissionReport(new PrintWriter(args[2]), "lucene");
    FSDirectory dir = FSDirectory.open(new File(args[3]));
    String fieldSpec = args.length == 5 ? args[4] : "T"; // default to Title-only if not specified.
    IndexReader reader = IndexReader.open(dir);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);

    int maxResults = 1000;

@@ -19,6 +19,7 @@ package org.apache.lucene.benchmark.quality.utils;
import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Terms;

@@ -86,7 +87,7 @@ public class QualityQueriesFinder {

  private String [] bestTerms(String field,int numTerms) throws IOException {
    PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
    IndexReader ir = IndexReader.open(dir);
    IndexReader ir = DirectoryReader.open(dir);
    try {
      int threshold = ir.maxDoc() / 10; // ignore words too common.
      Terms terms = MultiFields.getTerms(ir, field);